mirror of
https://github.com/katanemo/plano.git
synced 2026-06-08 14:55:14 +02:00
Introduce hermesllm library to handle llm message translation (#501)
This commit is contained in:
parent
96b583c819
commit
6c53510f49
33 changed files with 1693 additions and 690 deletions
|
|
@ -329,7 +329,7 @@ $ archgw up --service archgw --foreground
|
|||
...
|
||||
```
|
||||
|
||||
Log level can be changed to debug to get more details. To enable debug logs edit (Dockerfile)[arch/Dockerfile], change the log level `--component-log-level wasm:info` to `--component-log-level wasm:debug`. And after that you need to rebuild docker image and restart the arch gateway using following set of commands,
|
||||
Log level can be changed to debug to get more details. To enable debug logs edit (supervisord.conf)[arch/supervisord.conf], change the log level `--component-log-level wasm:info` to `--component-log-level wasm:debug`. And after that you need to rebuild docker image and restart the arch gateway using following set of commands,
|
||||
|
||||
```
|
||||
# make sure you are at the root of the repo
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ stdout_logfile_maxbytes=0
|
|||
stderr_logfile_maxbytes=0
|
||||
|
||||
[program:envoy]
|
||||
command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml 2>&1 | tee /var/log//envoy.log"
|
||||
command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log//envoy.log"
|
||||
stdout_logfile=/dev/stdout
|
||||
redirect_stderr=true
|
||||
stdout_logfile_maxbytes=0
|
||||
|
|
|
|||
1147
crates/Cargo.lock
generated
1147
crates/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -1,3 +1,3 @@
|
|||
[workspace]
|
||||
resolver = "2"
|
||||
members = ["llm_gateway", "prompt_gateway", "common", "brightstaff"]
|
||||
members = ["llm_gateway", "prompt_gateway", "common", "brightstaff", "hermesllm"]
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ eventsource-client = "0.15.0"
|
|||
eventsource-stream = "0.2.3"
|
||||
futures = "0.3.31"
|
||||
futures-util = "0.3.31"
|
||||
hermesllm = { version = "0.1.0", path = "../hermesllm" }
|
||||
http-body = "1.0.1"
|
||||
http-body-util = "0.1.3"
|
||||
hyper = { version = "1.6.0", features = ["full"] }
|
||||
|
|
|
|||
|
|
@ -1,14 +1,13 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use common::api::open_ai::ChatCompletionsRequest;
|
||||
use common::consts::ARCH_PROVIDER_HINT_HEADER;
|
||||
use hermesllm::providers::openai::types::ChatCompletionsRequest;
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::{BodyExt, Full, StreamBody};
|
||||
use hyper::body::Frame;
|
||||
use hyper::header::{self};
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
use serde_json::Value;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_stream::StreamExt;
|
||||
|
|
@ -32,13 +31,15 @@ pub async fn chat_completions(
|
|||
let chat_request_bytes = request.collect().await?.to_bytes();
|
||||
|
||||
let chat_completion_request: ChatCompletionsRequest =
|
||||
match serde_json::from_slice(&chat_request_bytes) {
|
||||
match ChatCompletionsRequest::try_from(chat_request_bytes.as_ref()) {
|
||||
Ok(request) => request,
|
||||
Err(err) => {
|
||||
let v: Value = serde_json::from_slice(&chat_request_bytes).unwrap();
|
||||
warn!(
|
||||
"arch-router request body string: {}",
|
||||
String::from_utf8_lossy(&chat_request_bytes)
|
||||
);
|
||||
let err_msg = format!("Failed to parse request body: {}", err);
|
||||
warn!("{}", err_msg);
|
||||
warn!("arch-router request body: {}", v.to_string());
|
||||
let mut bad_request = Response::new(full(err_msg));
|
||||
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return Ok(bad_request);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use bytes::Bytes;
|
||||
use common::api::open_ai::Models;
|
||||
use common::configuration::LlmProvider;
|
||||
use common::configuration::{IntoModels, LlmProvider};
|
||||
use hermesllm::providers::openai::types::Models;
|
||||
use http_body_util::{combinators::BoxBody, BodyExt, Full};
|
||||
use hyper::{Response, StatusCode};
|
||||
use serde_json;
|
||||
|
|
@ -11,7 +11,7 @@ pub async fn list_models(
|
|||
) -> Response<BoxBody<Bytes, hyper::Error>> {
|
||||
let prov = llm_providers.clone();
|
||||
let providers = (*prov).clone();
|
||||
let openai_models = Models::from(providers);
|
||||
let openai_models: Models = providers.into_models();
|
||||
|
||||
match serde_json::to_string(&openai_models) {
|
||||
Ok(json) => {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use common::{
|
||||
api::open_ai::{ChatCompletionsResponse, ContentType, Message},
|
||||
configuration::{LlmProvider, LlmRoute},
|
||||
consts::ARCH_PROVIDER_HINT_HEADER,
|
||||
};
|
||||
use hermesllm::providers::openai::types::{ChatCompletionsResponse, ContentType, Message};
|
||||
use hyper::header;
|
||||
use thiserror::Error;
|
||||
use tracing::{debug, info, warn};
|
||||
|
|
@ -136,6 +136,11 @@ impl RouterService {
|
|||
}
|
||||
};
|
||||
|
||||
if chat_completion_response.choices.is_empty() {
|
||||
warn!("No choices in router response: {}", body);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if let Some(ContentType::Text(content)) =
|
||||
&chat_completion_response.choices[0].message.content
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use common::api::open_ai::{ChatCompletionsRequest, Message};
|
||||
use hermesllm::providers::openai::types::{ChatCompletionsRequest, Message};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
use common::{
|
||||
api::open_ai::{ChatCompletionsRequest, ContentType, Message},
|
||||
configuration::LlmRoute,
|
||||
consts::{SYSTEM_ROLE, TOOL_ROLE, USER_ROLE},
|
||||
};
|
||||
use hermesllm::providers::openai::types::{ChatCompletionsRequest, ContentType, Message};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{debug, warn};
|
||||
|
||||
|
|
@ -121,11 +121,13 @@ impl RouterModel for RouterModelV1 {
|
|||
.iter()
|
||||
.rev()
|
||||
.map(|message| {
|
||||
Message::new(
|
||||
message.role.clone(),
|
||||
Message {
|
||||
role: message.role.clone(),
|
||||
// we can unwrap here because we have already filtered out messages without content
|
||||
message.content.as_ref().unwrap().to_string(),
|
||||
)
|
||||
content: Some(ContentType::Text(
|
||||
message.content.as_ref().unwrap().to_string(),
|
||||
)),
|
||||
}
|
||||
})
|
||||
.collect::<Vec<Message>>();
|
||||
|
||||
|
|
@ -141,14 +143,8 @@ impl RouterModel for RouterModelV1 {
|
|||
messages: vec![Message {
|
||||
content: Some(ContentType::Text(messages_content)),
|
||||
role: USER_ROLE.to_string(),
|
||||
model: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
}],
|
||||
tools: None,
|
||||
stream: false,
|
||||
stream_options: None,
|
||||
metadata: None,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ serde_json = "1.0"
|
|||
hex = "0.4.3"
|
||||
urlencoding = "2.1.3"
|
||||
url = "2.5.4"
|
||||
hermesllm = { version = "0.1.0", path = "../hermesllm" }
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4.1"
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
use hermesllm::providers::openai::types::{ModelDetail, ModelObject, Models};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
|
|
@ -206,6 +207,29 @@ pub struct LlmProvider {
|
|||
pub usage: Option<String>,
|
||||
}
|
||||
|
||||
pub trait IntoModels {
|
||||
fn into_models(self) -> Models;
|
||||
}
|
||||
|
||||
impl IntoModels for Vec<LlmProvider> {
|
||||
fn into_models(self) -> Models {
|
||||
let data = self
|
||||
.iter()
|
||||
.map(|provider| ModelDetail {
|
||||
id: provider.name.clone(),
|
||||
object: "model".to_string(),
|
||||
created: 0,
|
||||
owned_by: "system".to_string(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
Models {
|
||||
object: ModelObject::List,
|
||||
data,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LlmProvider {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use proxy_wasm::types::Status;
|
||||
|
||||
use crate::{api::open_ai::ChatCompletionChunkResponseError, ratelimit};
|
||||
use hermesllm::providers::openai::types::OpenAIError;
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ClientError {
|
||||
|
|
@ -39,4 +40,6 @@ pub enum ServerError {
|
|||
BadRequest { why: String },
|
||||
#[error("error in streaming response")]
|
||||
Streaming(#[from] ChatCompletionChunkResponseError),
|
||||
#[error("error parsing openai message: {0}")]
|
||||
OpenAIPError(#[from] OpenAIError),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
|
|||
);
|
||||
"gpt-4"
|
||||
}
|
||||
true => model_name
|
||||
true => model_name,
|
||||
};
|
||||
|
||||
// Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
|
||||
|
|
|
|||
10
crates/hermesllm/Cargo.toml
Normal file
10
crates/hermesllm/Cargo.toml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
[package]
|
||||
name = "hermesllm"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
serde = {version = "1.0.219", features = ["derive"]}
|
||||
serde_json = "1.0.140"
|
||||
serde_with = "3.12.0"
|
||||
thiserror = "2.0.12"
|
||||
63
crates/hermesllm/README.md
Normal file
63
crates/hermesllm/README.md
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# hermesllm
|
||||
|
||||
A Rust library for translating LLM (Large Language Model) API requests and responses between Mistral, Groq, Gemini, Deepseek, OpenAI, and other provider-compliant formats.
|
||||
|
||||
## Features
|
||||
|
||||
- Unified types for chat completions and model metadata across multiple LLM providers
|
||||
- Builder-pattern API for constructing requests in an idiomatic Rust style
|
||||
- Easy conversion between provider formats
|
||||
- Streaming and non-streaming response support
|
||||
|
||||
## Supported Providers
|
||||
|
||||
- Mistral
|
||||
- Deepseek
|
||||
- Groq
|
||||
- Gemini
|
||||
- OpenAI
|
||||
- Claude
|
||||
- Github
|
||||
|
||||
## Installation
|
||||
|
||||
Add the following to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
hermesllm = { git = "https://github.com/katanemo/archgw", subdir = "crates/hermesllm" }
|
||||
```
|
||||
|
||||
_Replace the path with the appropriate location if using as a workspace member or published crate._
|
||||
|
||||
## Usage
|
||||
|
||||
Construct a chat completion request using the builder pattern:
|
||||
|
||||
```rust
|
||||
use hermesllm::Provider;
|
||||
use hermesllm::providers::openai::types::ChatCompletionsRequest;
|
||||
|
||||
let request = ChatCompletionsRequest::builder("gpt-3.5-turbo", vec![Message::new("Hi".to_string())])
|
||||
.build()
|
||||
.expect("Failed to build OpenAIRequest");
|
||||
|
||||
// Convert to bytes for a specific provider
|
||||
let bytes = request.to_bytes(Provider::OpenAI)?;
|
||||
```
|
||||
|
||||
## API Overview
|
||||
|
||||
- `Provider`: Enum listing all supported LLM providers.
|
||||
- `ChatCompletionsRequest`: Builder-pattern struct for creating chat completion requests.
|
||||
- `ChatCompletionsResponse`: Struct for parsing responses.
|
||||
- Streaming support via `SseChatCompletionIter`.
|
||||
- Error handling via `OpenAIError`.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! Please open issues or pull requests for bug fixes, new features, or provider integrations.
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the terms of the [MIT License](../LICENSE).
|
||||
79
crates/hermesllm/src/lib.rs
Normal file
79
crates/hermesllm/src/lib.rs
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
//! hermesllm: A library for translating LLM API requests and responses
|
||||
//! between Mistral, Grok, Gemini, and OpenAI-compliant formats.
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
pub mod providers;
|
||||
|
||||
pub enum Provider {
|
||||
Arch,
|
||||
Mistral,
|
||||
Deepseek,
|
||||
Groq,
|
||||
Gemini,
|
||||
OpenAI,
|
||||
Claude,
|
||||
Github,
|
||||
}
|
||||
|
||||
impl From<&str> for Provider {
|
||||
fn from(value: &str) -> Self {
|
||||
match value.to_lowercase().as_str() {
|
||||
"arch" => Provider::Arch,
|
||||
"mistral" => Provider::Mistral,
|
||||
"deepseek" => Provider::Deepseek,
|
||||
"groq" => Provider::Groq,
|
||||
"gemini" => Provider::Gemini,
|
||||
"openai" => Provider::OpenAI,
|
||||
"claude" => Provider::Claude,
|
||||
"github" => Provider::Github,
|
||||
_ => panic!("Unknown provider: {}", value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Provider {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Provider::Arch => write!(f, "Arch"),
|
||||
Provider::Mistral => write!(f, "Mistral"),
|
||||
Provider::Deepseek => write!(f, "Deepseek"),
|
||||
Provider::Groq => write!(f, "Groq"),
|
||||
Provider::Gemini => write!(f, "Gemini"),
|
||||
Provider::OpenAI => write!(f, "OpenAI"),
|
||||
Provider::Claude => write!(f, "Claude"),
|
||||
Provider::Github => write!(f, "Github"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::providers::openai::types::{ChatCompletionsRequest, Message};
|
||||
|
||||
#[test]
|
||||
fn openai_builder() {
|
||||
let request =
|
||||
ChatCompletionsRequest::builder("gpt-3.5-turbo", vec![Message::new("Hi".to_string())])
|
||||
.temperature(0.7)
|
||||
.top_p(0.9)
|
||||
.n(1)
|
||||
.max_tokens(100)
|
||||
.stream(false)
|
||||
.stop(vec!["\n".to_string()])
|
||||
.presence_penalty(0.0)
|
||||
.frequency_penalty(0.0)
|
||||
.build()
|
||||
.expect("Failed to build OpenAIRequest");
|
||||
|
||||
assert_eq!(request.model, "gpt-3.5-turbo");
|
||||
assert_eq!(request.temperature, Some(0.7));
|
||||
assert_eq!(request.top_p, Some(0.9));
|
||||
assert_eq!(request.n, Some(1));
|
||||
assert_eq!(request.max_tokens, Some(100));
|
||||
assert_eq!(request.stream, Some(false));
|
||||
assert_eq!(request.stop, Some(vec!["\n".to_string()]));
|
||||
assert_eq!(request.presence_penalty, Some(0.0));
|
||||
assert_eq!(request.frequency_penalty, Some(0.0));
|
||||
}
|
||||
}
|
||||
1
crates/hermesllm/src/providers/deepseek/mod.rs
Normal file
1
crates/hermesllm/src/providers/deepseek/mod.rs
Normal file
|
|
@ -0,0 +1 @@
|
|||
pub mod types;
|
||||
19
crates/hermesllm/src/providers/deepseek/types.rs
Normal file
19
crates/hermesllm/src/providers/deepseek/types.rs
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
use crate::providers::openai::types::{ChatCompletionsRequest, ChatCompletionsResponse};
|
||||
pub use crate::providers::openai::types::{Choice, Message, Usage};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::skip_serializing_none;
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeepSeekRequest {
|
||||
#[serde(flatten)]
|
||||
pub base: ChatCompletionsRequest,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeepSeekResponse {
|
||||
#[serde(flatten)]
|
||||
pub base: ChatCompletionsResponse,
|
||||
}
|
||||
2
crates/hermesllm/src/providers/mod.rs
Normal file
2
crates/hermesllm/src/providers/mod.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pub mod deepseek;
|
||||
pub mod openai;
|
||||
113
crates/hermesllm/src/providers/openai/builder.rs
Normal file
113
crates/hermesllm/src/providers/openai/builder.rs
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
use serde_json::Value;
|
||||
|
||||
use crate::providers::openai::types::{ChatCompletionsRequest, Message, StreamOptions};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OpenAIRequestBuilder {
|
||||
model: String,
|
||||
messages: Vec<Message>,
|
||||
temperature: Option<f32>,
|
||||
top_p: Option<f32>,
|
||||
n: Option<u32>,
|
||||
max_tokens: Option<u32>,
|
||||
stream: Option<bool>,
|
||||
stop: Option<Vec<String>>,
|
||||
presence_penalty: Option<f32>,
|
||||
frequency_penalty: Option<f32>,
|
||||
stream_options: Option<StreamOptions>,
|
||||
tools: Option<Vec<Value>>,
|
||||
}
|
||||
|
||||
impl OpenAIRequestBuilder {
|
||||
pub fn new(model: impl Into<String>, messages: Vec<Message>) -> Self {
|
||||
Self {
|
||||
model: model.into(),
|
||||
messages,
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
n: None,
|
||||
max_tokens: None,
|
||||
stream: None,
|
||||
stop: None,
|
||||
presence_penalty: None,
|
||||
frequency_penalty: None,
|
||||
stream_options: None,
|
||||
tools: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn temperature(mut self, temperature: f32) -> Self {
|
||||
self.temperature = Some(temperature);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn top_p(mut self, top_p: f32) -> Self {
|
||||
self.top_p = Some(top_p);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn n(mut self, n: u32) -> Self {
|
||||
self.n = Some(n);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn max_tokens(mut self, max_tokens: u32) -> Self {
|
||||
self.max_tokens = Some(max_tokens);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn stream(mut self, stream: bool) -> Self {
|
||||
self.stream = Some(stream);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn stop(mut self, stop: Vec<String>) -> Self {
|
||||
self.stop = Some(stop);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn presence_penalty(mut self, presence_penalty: f32) -> Self {
|
||||
self.presence_penalty = Some(presence_penalty);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn frequency_penalty(mut self, frequency_penalty: f32) -> Self {
|
||||
self.frequency_penalty = Some(frequency_penalty);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn stream_options(mut self, include_usage: bool) -> Self {
|
||||
self.stream = Some(true);
|
||||
self.stream_options = Some(StreamOptions { include_usage });
|
||||
self
|
||||
}
|
||||
|
||||
pub fn tools(mut self, tools: Vec<Value>) -> Self {
|
||||
self.tools = Some(tools);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<ChatCompletionsRequest, &'static str> {
|
||||
let request = ChatCompletionsRequest {
|
||||
model: self.model,
|
||||
messages: self.messages,
|
||||
temperature: self.temperature,
|
||||
top_p: self.top_p,
|
||||
n: self.n,
|
||||
max_tokens: self.max_tokens,
|
||||
stream: self.stream,
|
||||
stop: self.stop,
|
||||
presence_penalty: self.presence_penalty,
|
||||
frequency_penalty: self.frequency_penalty,
|
||||
stream_options: self.stream_options,
|
||||
tools: self.tools,
|
||||
};
|
||||
Ok(request)
|
||||
}
|
||||
}
|
||||
|
||||
impl ChatCompletionsRequest {
|
||||
pub fn builder(model: impl Into<String>, messages: Vec<Message>) -> OpenAIRequestBuilder {
|
||||
OpenAIRequestBuilder::new(model, messages)
|
||||
}
|
||||
}
|
||||
2
crates/hermesllm/src/providers/openai/mod.rs
Normal file
2
crates/hermesllm/src/providers/openai/mod.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pub mod builder;
|
||||
pub mod types;
|
||||
497
crates/hermesllm/src/providers/openai/types.rs
Normal file
497
crates/hermesllm/src/providers/openai/types.rs
Normal file
|
|
@ -0,0 +1,497 @@
|
|||
use std::fmt::Display;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use serde_with::skip_serializing_none;
|
||||
use std::convert::TryFrom;
|
||||
use std::str;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::Provider;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum OpenAIError {
|
||||
#[error("json error: {0}")]
|
||||
JsonParseError(#[from] serde_json::Error),
|
||||
#[error("utf8 parsing error: {0}")]
|
||||
Utf8Error(#[from] std::str::Utf8Error),
|
||||
#[error("invalid streaming data err {source}, data: {data}")]
|
||||
InvalidStreamingData {
|
||||
source: serde_json::Error,
|
||||
data: String,
|
||||
},
|
||||
#[error("unsupported provider: {provider}")]
|
||||
UnsupportedProvider { provider: String },
|
||||
}
|
||||
|
||||
type Result<T> = std::result::Result<T, OpenAIError>;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub enum MultiPartContentType {
|
||||
#[serde(rename = "text")]
|
||||
Text,
|
||||
#[serde(rename = "image_url")]
|
||||
ImageUrl,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct MultiPartContent {
|
||||
pub text: Option<String>,
|
||||
#[serde(rename = "type")]
|
||||
pub content_type: MultiPartContentType,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(untagged)]
|
||||
pub enum ContentType {
|
||||
Text(String),
|
||||
MultiPart(Vec<MultiPartContent>),
|
||||
}
|
||||
|
||||
impl Display for ContentType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ContentType::Text(text) => write!(f, "{}", text),
|
||||
ContentType::MultiPart(multi_part) => {
|
||||
let text_parts: Vec<String> = multi_part
|
||||
.iter()
|
||||
.filter_map(|part| {
|
||||
if part.content_type == MultiPartContentType::Text {
|
||||
part.text.clone()
|
||||
} else if part.content_type == MultiPartContentType::ImageUrl {
|
||||
// skip image URLs or their data in text representation
|
||||
None
|
||||
} else {
|
||||
panic!("Unsupported content type: {:?}", part.content_type);
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let combined_text = text_parts.join("\n");
|
||||
write!(f, "{}", combined_text)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Message {
|
||||
pub role: String,
|
||||
pub content: Option<ContentType>,
|
||||
}
|
||||
|
||||
impl Message {
|
||||
pub fn new(content: String) -> Self {
|
||||
Self {
|
||||
role: "user".to_string(),
|
||||
content: Some(ContentType::Text(content)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StreamOptions {
|
||||
pub include_usage: bool,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct ChatCompletionsRequest {
|
||||
pub model: String,
|
||||
pub messages: Vec<Message>,
|
||||
pub temperature: Option<f32>,
|
||||
pub top_p: Option<f32>,
|
||||
pub n: Option<u32>,
|
||||
pub max_tokens: Option<u32>,
|
||||
pub stream: Option<bool>,
|
||||
pub stop: Option<Vec<String>>,
|
||||
pub presence_penalty: Option<f32>,
|
||||
pub frequency_penalty: Option<f32>,
|
||||
pub stream_options: Option<StreamOptions>,
|
||||
pub tools: Option<Vec<Value>>,
|
||||
}
|
||||
|
||||
impl TryFrom<&[u8]> for ChatCompletionsRequest {
|
||||
type Error = OpenAIError;
|
||||
fn try_from(bytes: &[u8]) -> Result<Self> {
|
||||
serde_json::from_slice(bytes).map_err(OpenAIError::from)
|
||||
}
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct ChatCompletionsResponse {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
pub created: u64,
|
||||
pub choices: Vec<Choice>,
|
||||
pub usage: Option<Usage>,
|
||||
}
|
||||
|
||||
impl TryFrom<&[u8]> for ChatCompletionsResponse {
|
||||
type Error = OpenAIError;
|
||||
fn try_from(bytes: &[u8]) -> Result<Self> {
|
||||
serde_json::from_slice(bytes).map_err(OpenAIError::from)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TryFrom<(&'a [u8], &'a Provider)> for ChatCompletionsResponse {
|
||||
type Error = OpenAIError;
|
||||
|
||||
fn try_from(input: (&'a [u8], &'a Provider)) -> Result<Self> {
|
||||
// Use input.provider as needed, if necessary
|
||||
serde_json::from_slice(input.0).map_err(OpenAIError::from)
|
||||
}
|
||||
}
|
||||
|
||||
impl ChatCompletionsRequest {
|
||||
pub fn to_bytes(&self, provider: Provider) -> Result<Vec<u8>> {
|
||||
match provider {
|
||||
Provider::OpenAI
|
||||
| Provider::Arch
|
||||
| Provider::Deepseek
|
||||
| Provider::Mistral
|
||||
| Provider::Groq
|
||||
| Provider::Gemini
|
||||
| Provider::Claude => serde_json::to_vec(self).map_err(OpenAIError::from),
|
||||
_ => Err(OpenAIError::UnsupportedProvider {
|
||||
provider: provider.to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Choice {
|
||||
pub index: u32,
|
||||
pub message: Message,
|
||||
pub finish_reason: Option<String>,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Usage {
|
||||
pub prompt_tokens: usize,
|
||||
pub completion_tokens: usize,
|
||||
pub total_tokens: usize,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeltaMessage {
|
||||
pub role: Option<String>,
|
||||
pub content: Option<ContentType>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct StreamChoice {
|
||||
pub index: u32,
|
||||
pub delta: DeltaMessage,
|
||||
pub finish_reason: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct ChatCompletionStreamResponse {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
pub created: u64,
|
||||
pub model: String,
|
||||
pub choices: Vec<StreamChoice>,
|
||||
pub usage: Option<Usage>,
|
||||
}
|
||||
|
||||
pub struct SseChatCompletionIter<I>
|
||||
where
|
||||
I: Iterator,
|
||||
I::Item: AsRef<str>,
|
||||
{
|
||||
lines: I,
|
||||
}
|
||||
|
||||
impl<I> SseChatCompletionIter<I>
|
||||
where
|
||||
I: Iterator,
|
||||
I::Item: AsRef<str>,
|
||||
{
|
||||
pub fn new(lines: I) -> Self {
|
||||
Self { lines }
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> Iterator for SseChatCompletionIter<I>
|
||||
where
|
||||
I: Iterator,
|
||||
I::Item: AsRef<str>,
|
||||
{
|
||||
type Item = Result<ChatCompletionStreamResponse>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
for line in &mut self.lines {
|
||||
let line = line.as_ref();
|
||||
if let Some(data) = line.strip_prefix("data: ") {
|
||||
let data = data.trim();
|
||||
if data == "[DONE]" {
|
||||
return None;
|
||||
}
|
||||
|
||||
if data == r#"{"type": "ping"}"# {
|
||||
continue; // Skip ping messages - that is usually from anthropic
|
||||
}
|
||||
|
||||
return Some(
|
||||
serde_json::from_str::<ChatCompletionStreamResponse>(data).map_err(|e| {
|
||||
OpenAIError::InvalidStreamingData {
|
||||
source: e,
|
||||
data: data.to_string(),
|
||||
}
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TryFrom<(&'a [u8], &'a Provider)> for SseChatCompletionIter<str::Lines<'a>> {
|
||||
type Error = OpenAIError;
|
||||
|
||||
fn try_from(input: (&'a [u8], &'a Provider)) -> Result<Self> {
|
||||
let s = std::str::from_utf8(input.0)?;
|
||||
// Use input.provider as needed
|
||||
Ok(SseChatCompletionIter::new(s.lines()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TryFrom<&'a [u8]> for SseChatCompletionIter<str::Lines<'a>> {
|
||||
type Error = OpenAIError;
|
||||
|
||||
fn try_from(bytes: &'a [u8]) -> Result<Self> {
|
||||
let s = std::str::from_utf8(bytes)?;
|
||||
Ok(SseChatCompletionIter::new(s.lines()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelDetail {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
pub created: usize,
|
||||
pub owned_by: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum ModelObject {
|
||||
#[serde(rename = "list")]
|
||||
List,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Models {
|
||||
pub object: ModelObject,
|
||||
pub data: Vec<ModelDetail>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_content_type_display() {
|
||||
let text_content = ContentType::Text("Hello, world!".to_string());
|
||||
assert_eq!(text_content.to_string(), "Hello, world!");
|
||||
|
||||
let multi_part_content = ContentType::MultiPart(vec![
|
||||
MultiPartContent {
|
||||
text: Some("This is a text part.".to_string()),
|
||||
content_type: MultiPartContentType::Text,
|
||||
},
|
||||
MultiPartContent {
|
||||
text: Some("https://example.com/image.png".to_string()),
|
||||
content_type: MultiPartContentType::ImageUrl,
|
||||
},
|
||||
]);
|
||||
assert_eq!(multi_part_content.to_string(), "This is a text part.");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chat_completions_request_text_type_array() {
|
||||
const CHAT_COMPLETIONS_REQUEST: &str = r#"
|
||||
{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What city do you want to know the weather for?"
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "hello world"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
"#;
|
||||
|
||||
let chat_completions_request: ChatCompletionsRequest =
|
||||
serde_json::from_str(CHAT_COMPLETIONS_REQUEST).unwrap();
|
||||
assert_eq!(chat_completions_request.model, "gpt-3.5-turbo");
|
||||
if let Some(ContentType::MultiPart(multi_part_content)) =
|
||||
chat_completions_request.messages[0].content.as_ref()
|
||||
{
|
||||
assert_eq!(multi_part_content.len(), 2);
|
||||
assert_eq!(
|
||||
multi_part_content[0].content_type,
|
||||
MultiPartContentType::Text
|
||||
);
|
||||
assert_eq!(
|
||||
multi_part_content[0].text,
|
||||
Some("What city do you want to know the weather for?".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
multi_part_content[1].content_type,
|
||||
MultiPartContentType::Text
|
||||
);
|
||||
assert_eq!(multi_part_content[1].text, Some("hello world".to_string()));
|
||||
} else {
|
||||
panic!("Expected MultiPartContent");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sse_streaming() {
|
||||
let json_data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
|
||||
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"content":"Hello, how can I help you today?"},"finish_reason":null}]}
|
||||
data: [DONE]"#;
|
||||
|
||||
let iter = SseChatCompletionIter::new(json_data.lines());
|
||||
|
||||
println!("Testing SSE Streaming");
|
||||
for item in iter {
|
||||
match item {
|
||||
Ok(response) => {
|
||||
println!("Received response: {:?}", response);
|
||||
if response.choices.is_empty() {
|
||||
continue;
|
||||
}
|
||||
for choice in response.choices {
|
||||
if let Some(content) = choice.delta.content {
|
||||
println!("Content: {}", content);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error parsing JSON: {}", e);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sse_streaming_try_from_bytes() {
|
||||
let json_data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
|
||||
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"content":"Hello, how can I help you today?"},"finish_reason":null}]}
|
||||
data: [DONE]"#;
|
||||
|
||||
let iter = SseChatCompletionIter::try_from(json_data.as_bytes())
|
||||
.expect("Failed to create SSE iterator");
|
||||
|
||||
println!("Testing SSE Streaming");
|
||||
for item in iter {
|
||||
match item {
|
||||
Ok(response) => {
|
||||
println!("Received response: {:?}", response);
|
||||
if response.choices.is_empty() {
|
||||
continue;
|
||||
}
|
||||
for choice in response.choices {
|
||||
if let Some(content) = choice.delta.content {
|
||||
println!("Content: {}", content);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error parsing JSON: {}", e);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_chat_completions_request() {
|
||||
const CHAT_COMPLETIONS_REQUEST: &str = r#"
|
||||
{
|
||||
"model": "None",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
} "#;
|
||||
|
||||
let _chat_completions_request: ChatCompletionsRequest =
|
||||
ChatCompletionsRequest::try_from(CHAT_COMPLETIONS_REQUEST.as_bytes())
|
||||
.expect("Failed to parse ChatCompletionsRequest");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stream_chunk_parse_claude() {
|
||||
const CHUNK_RESPONSE: &str = r#"data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"role":"assistant"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"type": "ping"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":"Hello!"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" How can I assist you today? Whether"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" you have a question, need information"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":", or just want to chat about"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" something, I'm here to help. What woul"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":"d you like to talk about?"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
|
||||
|
||||
data: [DONE]
|
||||
"#;
|
||||
|
||||
let iter = SseChatCompletionIter::try_from(CHUNK_RESPONSE.as_bytes());
|
||||
|
||||
assert!(iter.is_ok(), "Failed to create SSE iterator");
|
||||
let iter: SseChatCompletionIter<str::Lines<'_>> = iter.unwrap();
|
||||
|
||||
let all_text: Vec<String> = iter
|
||||
.map(|item| {
|
||||
let response = item.expect("Failed to parse response");
|
||||
response
|
||||
.choices
|
||||
.into_iter()
|
||||
.filter_map(|choice| choice.delta.content)
|
||||
.map(|content| content.to_string())
|
||||
.collect::<String>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
all_text.len(),
|
||||
8,
|
||||
"Expected 8 chunks of text, but got {}",
|
||||
all_text.len()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
all_text.join(""),
|
||||
"Hello! How can I assist you today? Whether you have a question, need information, or just want to chat about something, I'm here to help. What would you like to talk about?"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -22,6 +22,7 @@ rand = "0.8.5"
|
|||
thiserror = "1.0.64"
|
||||
derivative = "2.2.0"
|
||||
sha2 = "0.10.8"
|
||||
hermesllm = { version = "0.1.0", path = "../hermesllm" }
|
||||
|
||||
[dev-dependencies]
|
||||
proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }
|
||||
|
|
|
|||
|
|
@ -1,8 +1,4 @@
|
|||
use crate::metrics::Metrics;
|
||||
use common::api::open_ai::{
|
||||
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
|
||||
ContentType, Message, StreamOptions,
|
||||
};
|
||||
use common::configuration::{LlmProvider, LlmProviderType, Overrides};
|
||||
use common::consts::{
|
||||
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
|
||||
|
|
@ -14,6 +10,11 @@ use common::ratelimit::Header;
|
|||
use common::stats::{IncrementingMetric, RecordingMetric};
|
||||
use common::tracing::{Event, Span, TraceData, Traceparent};
|
||||
use common::{ratelimit, routing, tokenizer};
|
||||
use hermesllm::providers::openai::types::{ChatCompletionsRequest, SseChatCompletionIter};
|
||||
use hermesllm::providers::openai::types::{
|
||||
ChatCompletionsResponse, ContentType, Message, StreamOptions,
|
||||
};
|
||||
use hermesllm::Provider;
|
||||
use http::StatusCode;
|
||||
use log::{debug, info, warn};
|
||||
use proxy_wasm::hostcalls::get_current_time;
|
||||
|
|
@ -201,14 +202,15 @@ impl HttpContext for StreamContext {
|
|||
return Action::Continue;
|
||||
}
|
||||
|
||||
let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
|
||||
|
||||
let use_agent_orchestrator = match self.overrides.as_ref() {
|
||||
Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
|
||||
None => false,
|
||||
};
|
||||
|
||||
if let Some(routing_header_value) = routing_header_value.as_ref() {
|
||||
let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
|
||||
|
||||
if routing_header_value.is_some() && !routing_header_value.as_ref().unwrap().is_empty() {
|
||||
let routing_header_value = routing_header_value.as_ref().unwrap();
|
||||
info!("routing header already set: {}", routing_header_value);
|
||||
self.llm_provider = Some(Rc::new(LlmProvider {
|
||||
name: routing_header_value.to_string(),
|
||||
|
|
@ -284,27 +286,17 @@ impl HttpContext for StreamContext {
|
|||
}
|
||||
};
|
||||
|
||||
// Deserialize body into spec.
|
||||
// Currently OpenAI API.
|
||||
let mut deserialized_body: ChatCompletionsRequest =
|
||||
match serde_json::from_slice(&body_bytes) {
|
||||
Ok(deserialized) => deserialized,
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"on_http_request_body: request body: {}",
|
||||
String::from_utf8_lossy(&body_bytes)
|
||||
);
|
||||
self.send_server_error(
|
||||
ServerError::Deserialization(e),
|
||||
Some(StatusCode::BAD_REQUEST),
|
||||
);
|
||||
return Action::Pause;
|
||||
}
|
||||
};
|
||||
|
||||
for message in deserialized_body.messages.iter_mut() {
|
||||
message.model = None;
|
||||
}
|
||||
let mut deserialized_body = match ChatCompletionsRequest::try_from(body_bytes.as_slice()) {
|
||||
Ok(deserialized) => deserialized,
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"on_http_request_body: request body: {}",
|
||||
String::from_utf8_lossy(&body_bytes)
|
||||
);
|
||||
self.send_server_error(ServerError::OpenAIPError(e), Some(StatusCode::BAD_REQUEST));
|
||||
return Action::Pause;
|
||||
}
|
||||
};
|
||||
|
||||
self.user_message = deserialized_body
|
||||
.messages
|
||||
|
|
@ -348,17 +340,12 @@ impl HttpContext for StreamContext {
|
|||
model_name.unwrap_or(&"None".to_string()),
|
||||
);
|
||||
|
||||
let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();
|
||||
|
||||
debug!(
|
||||
"on_http_request_body: request body: {}",
|
||||
chat_completion_request_str
|
||||
);
|
||||
|
||||
if deserialized_body.stream {
|
||||
if deserialized_body.stream.unwrap_or_default() {
|
||||
self.streaming_response = true;
|
||||
}
|
||||
if deserialized_body.stream && deserialized_body.stream_options.is_none() {
|
||||
if deserialized_body.stream.unwrap_or_default()
|
||||
&& deserialized_body.stream_options.is_none()
|
||||
{
|
||||
deserialized_body.stream_options = Some(StreamOptions {
|
||||
include_usage: true,
|
||||
});
|
||||
|
|
@ -387,7 +374,20 @@ impl HttpContext for StreamContext {
|
|||
return Action::Continue;
|
||||
}
|
||||
|
||||
self.set_http_request_body(0, body_size, chat_completion_request_str.as_bytes());
|
||||
let llm_provider_str = self.llm_provider().provider_interface.to_string();
|
||||
let hermes_llm_provider = Provider::from(llm_provider_str.as_str());
|
||||
|
||||
// convert chat completion request to llm provider specific request
|
||||
let deserialized_body_bytes = match deserialized_body.to_bytes(hermes_llm_provider) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => {
|
||||
warn!("Failed to serialize request body: {}", e);
|
||||
self.send_server_error(ServerError::OpenAIPError(e), Some(StatusCode::BAD_REQUEST));
|
||||
return Action::Pause;
|
||||
}
|
||||
};
|
||||
|
||||
self.set_http_request_body(0, body_size, &deserialized_body_bytes);
|
||||
|
||||
Action::Continue
|
||||
}
|
||||
|
|
@ -542,58 +542,33 @@ impl HttpContext for StreamContext {
|
|||
}
|
||||
};
|
||||
|
||||
let body_utf8 = match String::from_utf8(body) {
|
||||
Ok(body_utf8) => body_utf8,
|
||||
Err(e) => {
|
||||
warn!("could not convert to utf8: {}", e);
|
||||
return Action::Continue;
|
||||
}
|
||||
};
|
||||
let llm_provider_str = self.llm_provider().provider_interface.to_string();
|
||||
let hermes_llm_provider = Provider::from(llm_provider_str.as_str());
|
||||
|
||||
if self.streaming_response {
|
||||
if body_utf8 == "data: [DONE]\n" {
|
||||
return Action::Continue;
|
||||
}
|
||||
|
||||
let chat_completions_chunk_response_events =
|
||||
match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) {
|
||||
Ok(response) => response,
|
||||
match SseChatCompletionIter::try_from((body.as_slice(), &hermes_llm_provider)) {
|
||||
Ok(events) => events,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"invalid streaming response: body str: {}, {:?}",
|
||||
body_utf8, e
|
||||
);
|
||||
warn!("could not parse response: {}", e);
|
||||
return Action::Continue;
|
||||
}
|
||||
};
|
||||
|
||||
if chat_completions_chunk_response_events.events.is_empty() {
|
||||
warn!(
|
||||
"couldn't parse any streaming events: body str: {}",
|
||||
body_utf8
|
||||
);
|
||||
return Action::Continue;
|
||||
for event in chat_completions_chunk_response_events {
|
||||
match event {
|
||||
Ok(event) => {
|
||||
if let Some(usage) = event.usage.as_ref() {
|
||||
self.response_tokens += usage.completion_tokens;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("error in response event: {}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let model = chat_completions_chunk_response_events
|
||||
.events
|
||||
.first()
|
||||
.unwrap()
|
||||
.model
|
||||
.clone();
|
||||
let tokens_str = chat_completions_chunk_response_events.to_string();
|
||||
|
||||
let token_count =
|
||||
match tokenizer::token_count(model.as_ref().unwrap().as_str(), tokens_str.as_str())
|
||||
{
|
||||
Ok(token_count) => token_count,
|
||||
Err(e) => {
|
||||
warn!("could not get token count: {:?}", e);
|
||||
return Action::Continue;
|
||||
}
|
||||
};
|
||||
self.response_tokens += token_count;
|
||||
|
||||
// Compute TTFT if not already recorded
|
||||
if self.ttft_duration.is_none() {
|
||||
// if let Some(start_time) = self.start_time {
|
||||
|
|
@ -616,24 +591,26 @@ impl HttpContext for StreamContext {
|
|||
}
|
||||
} else {
|
||||
debug!("non streaming response");
|
||||
let chat_completions_response: ChatCompletionsResponse =
|
||||
match serde_json::from_str(body_utf8.as_str()) {
|
||||
let chat_completions_response =
|
||||
match ChatCompletionsResponse::try_from((body.as_slice(), &hermes_llm_provider)) {
|
||||
Ok(de) => de,
|
||||
Err(err) => {
|
||||
info!(
|
||||
"non chat-completion compliant response received err: {}, body: {}",
|
||||
err, body_utf8
|
||||
Err(e) => {
|
||||
warn!("could not parse response: {}", e);
|
||||
debug!(
|
||||
"on_http_response_body: S[{}], response body: {}",
|
||||
self.context_id,
|
||||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
self.send_server_error(
|
||||
ServerError::OpenAIPError(e),
|
||||
Some(StatusCode::BAD_REQUEST),
|
||||
);
|
||||
return Action::Continue;
|
||||
}
|
||||
};
|
||||
|
||||
if chat_completions_response.usage.is_some() {
|
||||
self.response_tokens += chat_completions_response
|
||||
.usage
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.completion_tokens;
|
||||
if let Some(usage) = chat_completions_response.usage {
|
||||
self.response_tokens += usage.completion_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -202,20 +202,7 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
|
|||
request_headers_expectations(&mut module, http_context);
|
||||
|
||||
// Request Body
|
||||
let chat_completions_request_body = "\
|
||||
{\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\",\
|
||||
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem.\"\
|
||||
}\
|
||||
],\
|
||||
\"model\": \"gpt-4\"\
|
||||
}";
|
||||
let chat_completions_request_body = r#"{"model":"gpt-4","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -229,7 +216,6 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
|
|||
.expect_log(Some(LogLevel::Info), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 21)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
|
|
@ -268,18 +254,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
|
|||
request_headers_expectations(&mut module, http_context);
|
||||
|
||||
// Request Body
|
||||
let incomplete_chat_completions_request_body = "\
|
||||
{\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
|
||||
}\
|
||||
]\
|
||||
}";
|
||||
let incomplete_chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -290,7 +265,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
|
|||
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
|
||||
.returning(Some(incomplete_chat_completions_request_body))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"))
|
||||
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"))
|
||||
.expect_send_local_response(
|
||||
Some(StatusCode::BAD_REQUEST.as_u16().into()),
|
||||
None,
|
||||
|
|
@ -300,8 +275,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
|
|||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 14)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 13)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.execute_and_expect(ReturnType::Action(Action::Continue))
|
||||
.unwrap();
|
||||
|
|
@ -359,11 +333,10 @@ fn llm_gateway_request_ratelimited() {
|
|||
.expect_log(Some(LogLevel::Info), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 107)
|
||||
.expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4"))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107"))
|
||||
.expect_log(Some(LogLevel::Warn), Some(r#"server error occurred: exceeded limit provider=gpt-4, selector=Header { key: "selector-key", value: "selector-value" }, tokens_used=107"#))
|
||||
.expect_send_local_response(
|
||||
Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
|
||||
None,
|
||||
|
|
@ -399,20 +372,7 @@ fn llm_gateway_request_not_ratelimited() {
|
|||
normal_flow(&mut module, filter_context, http_context);
|
||||
|
||||
// give shorter body to avoid rate limiting
|
||||
let chat_completions_request_body = "\
|
||||
{\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\",\
|
||||
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
|
||||
}\
|
||||
],\
|
||||
\"model\": \"gpt-4\"\
|
||||
}";
|
||||
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -427,7 +387,6 @@ fn llm_gateway_request_not_ratelimited() {
|
|||
.expect_log(Some(LogLevel::Info), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 29)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
|
|
@ -460,20 +419,7 @@ fn llm_gateway_override_model_name() {
|
|||
normal_flow(&mut module, filter_context, http_context);
|
||||
|
||||
// give shorter body to avoid rate limiting
|
||||
let chat_completions_request_body = "\
|
||||
{\
|
||||
\"model\": \"o1-mini\",\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\",\
|
||||
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
|
||||
}\
|
||||
]
|
||||
}";
|
||||
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -485,8 +431,7 @@ fn llm_gateway_override_model_name() {
|
|||
.returning(Some(chat_completions_request_body))
|
||||
// The actual call is not important in this test, we just need to grab the token_id
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4"))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 29)
|
||||
|
|
@ -521,19 +466,7 @@ fn llm_gateway_override_use_default_model() {
|
|||
normal_flow(&mut module, filter_context, http_context);
|
||||
|
||||
// give shorter body to avoid rate limiting
|
||||
let chat_completions_request_body = "\
|
||||
{\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\",\
|
||||
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
|
||||
}\
|
||||
]
|
||||
}";
|
||||
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -547,14 +480,13 @@ fn llm_gateway_override_use_default_model() {
|
|||
// The actual call is not important in this test, we just need to grab the token_id
|
||||
.expect_log(
|
||||
Some(LogLevel::Info),
|
||||
Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"),
|
||||
Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"),
|
||||
)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_record("input_sequence_length", 29)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4"))
|
||||
.expect_log(Some(LogLevel::Debug), Some(r#"Checking limit for provider=gpt-4, with selector=Header { key: "selector-key", value: "selector-value" }, consuming tokens=29"#))
|
||||
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
|
||||
.execute_and_expect(ReturnType::Action(Action::Continue))
|
||||
.unwrap();
|
||||
|
|
@ -584,20 +516,7 @@ fn llm_gateway_override_use_model_name_none() {
|
|||
normal_flow(&mut module, filter_context, http_context);
|
||||
|
||||
// give shorter body to avoid rate limiting
|
||||
let chat_completions_request_body = "\
|
||||
{\
|
||||
\"model\": \"none\",\
|
||||
\"messages\": [\
|
||||
{\
|
||||
\"role\": \"system\",\
|
||||
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
|
||||
},\
|
||||
{\
|
||||
\"role\": \"user\",\
|
||||
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
|
||||
}\
|
||||
]
|
||||
}";
|
||||
let chat_completions_request_body = r#"{"model":"none","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
|
||||
|
||||
module
|
||||
.call_proxy_on_request_body(
|
||||
|
|
@ -615,7 +534,6 @@ fn llm_gateway_override_use_model_name_none() {
|
|||
.expect_metric_record("input_sequence_length", 29)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
|
||||
.execute_and_expect(ReturnType::Action(Action::Continue))
|
||||
.unwrap();
|
||||
|
|
|
|||
|
|
@ -312,6 +312,7 @@ fn prompt_gateway_bad_request_to_open_ai_chat_completions() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[serial]
|
||||
fn prompt_gateway_request_to_llm_gateway() {
|
||||
let args = tester::MockSettings {
|
||||
|
|
@ -462,6 +463,7 @@ fn prompt_gateway_request_to_llm_gateway() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[serial]
|
||||
fn prompt_gateway_request_no_intent_match() {
|
||||
let args = tester::MockSettings {
|
||||
|
|
@ -608,6 +610,7 @@ ratelimits:
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[serial]
|
||||
fn prompt_gateway_request_no_intent_match_default_target() {
|
||||
let args = tester::MockSettings {
|
||||
|
|
|
|||
|
|
@ -7,7 +7,8 @@ Content-Type: application/json
|
|||
"role": "user",
|
||||
"content": "convert 100 eur"
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": "none"
|
||||
}
|
||||
HTTP 200
|
||||
[Asserts]
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ Content-Type: application/json
|
|||
"content": "convert 100 eur"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
"stream": true,
|
||||
"model": "none"
|
||||
}
|
||||
HTTP 200
|
||||
[Asserts]
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ Content-Type: application/json
|
|||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am running under debt, how should I keep a tab on my expenses?"
|
||||
"content": "hi"
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": "none"
|
||||
}
|
||||
HTTP 200
|
||||
[Asserts]
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ Content-Type: application/json
|
|||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am running under debt, how should I keep a tab on my expenses?"
|
||||
"content": "hi"
|
||||
}
|
||||
],
|
||||
"model": "none",
|
||||
"stream": true
|
||||
}
|
||||
HTTP 200
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ trap 'print_debug' INT TERM ERR
|
|||
|
||||
log starting > ../build.log
|
||||
|
||||
log building and running function_callling demo
|
||||
log building and running function_calling demo
|
||||
log ===========================================
|
||||
cd ../../demos/samples_python/weather_forecast/
|
||||
docker compose up weather_forecast_service --build -d
|
||||
|
|
|
|||
|
|
@ -2,8 +2,33 @@
|
|||
@openai_endpoint = https://api.openai.com
|
||||
@access_key = {{$dotenv OPENAI_API_KEY}}
|
||||
|
||||
### openai request
|
||||
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
content-type: application/json
|
||||
authorization: Bearer
|
||||
accept: */*
|
||||
accept-encoding: deflate
|
||||
user-agent: Python/3.11 aiohttp/3.11.11
|
||||
content-length: 876
|
||||
x-forwarded-proto: https
|
||||
x-request-id: 99d7817d-a646-9497-a38d-710b1ce1325f
|
||||
traceparent: 00-e4c9fc8cf9fc7714c6a15ef34852fb30-573a351a98e0cd01-01
|
||||
tracestate:
|
||||
x-arch-llm-provider-hint: gpt-4o-mini
|
||||
|
||||
|
||||
{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "### Task:\nGenerate 1-3 broad tags categorizing the main themes of the chat history, along with 1-3 more specific subtopic tags.\n\n### Guidelines:\n- Start with high-level domains (e.g. Science, Technology, Philosophy, Arts, Politics, Business, Health, Sports, Entertainment, Education)\n- Consider including relevant subfields/subdomains if they are strongly represented throughout the conversation\n- If content is too short (less than 3 messages) or too diverse, use only [\"General\"]\n- Use the chat's primary language; default to English if multilingual\n- Prioritize accuracy over specificity\n\n### Output:\nJSON format: { \"tags\": [\"tag1\", \"tag2\", \"tag3\"] }\n\n### Chat History:\n<chat_history>\nUSER: hello\nASSISTANT: Hello! How can I assist you today?\n</chat_history>"
|
||||
}
|
||||
],
|
||||
"stream": false
|
||||
}
|
||||
|
||||
### test
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
Authorization: Bearer {{access_key}}
|
||||
|
||||
|
|
@ -15,7 +40,7 @@ Authorization: Bearer {{access_key}}
|
|||
}
|
||||
],
|
||||
"model": "gpt-4o-mini",
|
||||
"stream": true
|
||||
"stream": false
|
||||
}
|
||||
|
||||
### openai request (streaming)
|
||||
|
|
@ -75,3 +100,48 @@ x-arch-llm-provider-hint: gpt-3.5-turbo-0125
|
|||
}
|
||||
]
|
||||
}
|
||||
|
||||
### llm gateway request with function calling (default target)
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"stream": true,
|
||||
"model": "None",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get current weather at a location.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The location to get the weather for",
|
||||
"format": "City, State"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to return the weather in.",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"default": "celsius"
|
||||
},
|
||||
"days": {
|
||||
"type": "string",
|
||||
"description": "The number of days for the request."
|
||||
}
|
||||
},
|
||||
"required": ["location", "days"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue