Merge branch 'main' into adil/agent_format

This commit is contained in:
Adil Hafeez 2025-09-30 19:07:49 -07:00
commit d588e3da98
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
34 changed files with 292 additions and 2525 deletions

View file

@ -30,7 +30,7 @@ jobs:
- name: build arch docker image
run: |
cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.14 -t katanemo/archgw:latest
cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.15 -t katanemo/archgw:latest
- name: start archgw
env:

View file

@ -24,7 +24,7 @@ jobs:
- name: build arch docker image
run: |
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.14
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.15
- name: install poetry
run: |

View file

@ -24,7 +24,7 @@ jobs:
- name: build arch docker image
run: |
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.14
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.15
- name: install poetry
run: |

View file

@ -24,7 +24,7 @@ jobs:
- name: build arch docker image
run: |
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.14
docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.15
- name: validate arch config
run: |

View file

@ -83,7 +83,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
```console
$ python3.12 -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install archgw==0.3.14
$ pip install archgw==0.3.15
```
### Use Arch as a LLM Router
@ -272,7 +272,7 @@ endpoints:
```sh
$ archgw up arch_config.yaml
2024-12-05 16:56:27,979 - cli.main - INFO - Starting archgw cli version: 0.3.14
2024-12-05 16:56:27,979 - cli.main - INFO - Starting archgw cli version: 0.3.15
2024-12-05 16:56:28,485 - cli.utils - INFO - Schema validation successful!
2024-12-05 16:56:28,485 - cli.main - INFO - Starting arch model server and arch gateway
2024-12-05 16:56:51,647 - cli.core - INFO - Container is healthy!

View file

@ -64,6 +64,8 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_ingress.log"
format: |
[%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" "%UPSTREAM_CLUSTER%"
route_config:
name: local_routes
virtual_hosts:
@ -117,6 +119,8 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_ingress_prompt.log"
format: |
[%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" "%UPSTREAM_CLUSTER%"
route_config:
name: local_routes
virtual_hosts:
@ -425,6 +429,8 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_llm.log"
format: |
[%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" "%UPSTREAM_CLUSTER%"
route_config:
name: local_routes
virtual_hosts:
@ -515,6 +521,8 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_llm.log"
format: |
[%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" "%UPSTREAM_CLUSTER%"
route_config:
name: local_routes
virtual_hosts:
@ -705,6 +713,56 @@ static_resources:
tls_minimum_protocol_version: TLSv1_2
tls_maximum_protocol_version: TLSv1_3
- name: moonshotai
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: moonshotai
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: api.moonshot.ai
port_value: 443
hostname: "api.moonshot.ai"
transport_socket:
name: envoy.transport_sockets.tls
typed_config:
"@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
sni: api.moonshot.ai
common_tls_context:
tls_params:
tls_minimum_protocol_version: TLSv1_2
tls_maximum_protocol_version: TLSv1_3
- name: zhipu
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: zhipu
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: open.bigmodel.cn
port_value: 443
hostname: "open.bigmodel.cn"
transport_socket:
name: envoy.transport_sockets.tls
typed_config:
"@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
sni: open.bigmodel.cn
common_tls_context:
tls_params:
tls_minimum_protocol_version: TLSv1_2
tls_maximum_protocol_version: TLSv1_3
- name: together_ai
connect_timeout: 0.5s
type: LOGICAL_DNS

View file

@ -2,21 +2,21 @@
nodaemon=true
[program:brightstaff]
command=sh -c "RUST_LOG=debug /app/brightstaff 2>&1 | tee /var/log/brightstaff.log"
command=sh -c "RUST_LOG=info /app/brightstaff 2>&1 | tee /var/log/brightstaff.log | while IFS= read -r line; do echo '[brightstaff]' \"$line\"; done"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0
[program:envoy]
command=/bin/sh -c "python -m cli.config_generator && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info --log-format '[%%Y-%%m-%%d %%T.%%e][%%l] %%v' 2>&1 | tee /var/log/envoy.log"
command=/bin/sh -c "python -m cli.config_generator && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info --log-format '[%%Y-%%m-%%d %%T.%%e][%%l] %%v' 2>&1 | tee /var/log/envoy.log | while IFS= read -r line; do echo '[envoy_logs] ' \"$line\"; done"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0
[program:tail_access_logs]
command=/bin/sh -c "tail -F /var/log/access_*.log"
command=/bin/sh -c "tail -F /var/log/access_*.log | while IFS= read -r line; do echo '[access_logs]' \"$line\"; done"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0

View file

@ -19,7 +19,7 @@ source venv/bin/activate
### Step 3: Run the build script
```bash
pip install archgw==0.3.14
pip install archgw==0.3.15
```
## Uninstall Instructions: archgw CLI

View file

@ -20,6 +20,8 @@ SUPPORTED_PROVIDERS = [
"azure_openai",
"xai",
"ollama",
"moonshotai",
"zhipu",
]

View file

@ -10,4 +10,4 @@ SERVICE_NAME_MODEL_SERVER = "model_server"
SERVICE_ALL = "all"
MODEL_SERVER_LOG_FILE = "~/archgw_logs/modelserver.log"
ARCHGW_DOCKER_NAME = "archgw"
ARCHGW_DOCKER_IMAGE = os.getenv("ARCHGW_DOCKER_IMAGE", "katanemo/archgw:0.3.14")
ARCHGW_DOCKER_IMAGE = os.getenv("ARCHGW_DOCKER_IMAGE", "katanemo/archgw:0.3.15")

View file

@ -2,7 +2,7 @@
[[package]]
name = "archgw_modelserver"
version = "0.3.14"
version = "0.3.15"
description = "A model server for serving models"
optional = false
python-versions = "*"
@ -635,4 +635,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "18a3144509286d68efc9711706943ace163d0d35cd3e700511b77d8c2ac1b303"
content-hash = "70deeecce65e4760e1a50c3d232004842cbd6397ef45163d315269ac7de8e9fc"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "archgw"
version = "0.3.14"
version = "0.3.15"
description = "Python-based CLI tool to manage Arch Gateway."
authors = ["Katanemo Labs, Inc."]
packages = [
@ -10,7 +10,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
archgw_modelserver = "^0.3.14"
archgw_modelserver = "^0.3.15"
click = "^8.1.7"
jinja2 = "^3.1.4"
jsonschema = "^4.23.0"

1243
crates/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -29,6 +29,7 @@ serde_yaml = "0.9.34"
thiserror = "2.0.12"
tokio = { version = "1.44.2", features = ["full"] }
tokio-stream = "0.1"
time = { version = "0.3", features = ["formatting", "macros"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
@ -37,4 +38,4 @@ mockito = "1.0"
tokio-stream = "0.1.17"
tracing = "0.1.41"
tracing-opentelemetry = "0.30.0"
tracing-subscriber = { version = "0.3.19", features = ["env-filter", "fmt"] }
tracing-subscriber = { version = "0.3.19", features = ["env-filter", "fmt", "time"] }

View file

@ -162,7 +162,7 @@ pub async fn chat(
Ok(route) => match route {
Some((_, model_name)) => model_name,
None => {
debug!(
info!(
"No route determined, using default model from request: {}",
chat_completions_request_for_arch_router.model
);

View file

@ -1,9 +1,46 @@
use std::sync::OnceLock;
use std::fmt;
use opentelemetry::global;
use opentelemetry_sdk::{propagation::TraceContextPropagator, trace::SdkTracerProvider};
use opentelemetry_stdout::SpanExporter;
use tracing_subscriber::EnvFilter;
use tracing_subscriber::fmt::{format, time::FormatTime, FmtContext, FormatEvent, FormatFields};
use tracing::{Event, Subscriber};
use time::macros::format_description;
struct BracketedTime;
impl FormatTime for BracketedTime {
fn format_time(&self, w: &mut format::Writer<'_>) -> fmt::Result {
let now = time::OffsetDateTime::now_utc();
write!(w, "[{}]", now.format(&format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond digits:3]")).unwrap())
}
}
struct BracketedFormatter;
impl<S, N> FormatEvent<S, N> for BracketedFormatter
where
S: Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a>,
N: for<'a> FormatFields<'a> + 'static,
{
fn format_event(
&self,
ctx: &FmtContext<'_, S, N>,
mut writer: format::Writer<'_>,
event: &Event<'_>,
) -> fmt::Result {
let timer = BracketedTime;
timer.format_time(&mut writer)?;
write!(writer, "[{}] ", event.metadata().level().to_string().to_lowercase())?;
ctx.field_format().format_fields(writer.by_ref(), event)?;
writeln!(writer)
}
}
static INIT_LOGGER: OnceLock<SdkTracerProvider> = OnceLock::new();
@ -22,6 +59,7 @@ pub fn init_tracer() -> &'static SdkTracerProvider {
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
)
.event_format(BracketedFormatter)
.init();
provider

View file

@ -200,6 +200,10 @@ pub enum LlmProviderType {
AzureOpenAI,
#[serde(rename = "ollama")]
Ollama,
#[serde(rename = "moonshotai")]
Moonshotai,
#[serde(rename = "zhipu")]
Zhipu,
}
impl Display for LlmProviderType {
@ -216,6 +220,8 @@ impl Display for LlmProviderType {
LlmProviderType::TogetherAI => write!(f, "together_ai"),
LlmProviderType::AzureOpenAI => write!(f, "azure_openai"),
LlmProviderType::Ollama => write!(f, "ollama"),
LlmProviderType::Moonshotai => write!(f, "moonshotai"),
LlmProviderType::Zhipu => write!(f, "zhipu"),
}
}
}
@ -267,7 +273,7 @@ impl IntoModels for Vec<LlmProvider> {
.iter()
.map(|provider| ModelDetail {
id: provider.name.clone(),
object: "model".to_string(),
object: Some("model".to_string()),
created: 0,
owned_by: "system".to_string(),
})

View file

@ -380,7 +380,7 @@ pub enum StaticContentType {
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ChatCompletionsResponse {
pub id: String,
pub object: String,
pub object: Option<String>,
pub created: u64,
pub model: String,
pub choices: Vec<Choice>,
@ -393,7 +393,7 @@ impl Default for ChatCompletionsResponse {
fn default() -> Self {
ChatCompletionsResponse {
id: String::new(),
object: String::new(),
object: None,
created: 0,
model: String::new(),
choices: vec![],
@ -486,7 +486,7 @@ impl Default for Choice {
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ChatCompletionsStreamResponse {
pub id: String,
pub object: String,
pub object: Option<String>,
pub created: u64,
pub model: String,
pub choices: Vec<StreamChoice>,
@ -549,7 +549,7 @@ pub struct StreamOptions {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelDetail {
pub id: String,
pub object: String,
pub object: Option<String>,
pub created: usize,
pub owned_by: String,
}
@ -1233,7 +1233,7 @@ mod tests {
let response: ChatCompletionsResponse = serde_json::from_str(json_response).unwrap();
assert_eq!(response.id, "chatcmpl-CAJc2Df6QCc7Mv3RP0Cf2xlbDV1x2");
assert_eq!(response.object, "chat.completion");
assert_eq!(response.object.as_deref(), Some("chat.completion"));
assert_eq!(response.created, 1756574706);
assert_eq!(response.model, "gpt-4o-2024-08-06");
assert_eq!(response.service_tier, Some("default".to_string()));

View file

@ -80,6 +80,13 @@ impl SupportedAPIs {
default_endpoint
}
}
ProviderId::Zhipu => {
if request_path.starts_with("/v1/") {
"/api/paas/v4/chat/completions".to_string()
} else {
default_endpoint
}
}
ProviderId::AzureOpenAI => {
if request_path.starts_with("/v1/") {
format!("/openai/deployments/{}/chat/completions?api-version=2025-01-01-preview", model_id)

View file

@ -210,7 +210,7 @@ impl TryFrom<MessagesResponse> for ChatCompletionsResponse {
Ok(ChatCompletionsResponse {
id: resp.id,
object: "chat.completion".to_string(),
object: Some("chat.completion".to_string()),
created: current_timestamp(),
model: resp.model,
choices: vec![choice],
@ -329,7 +329,7 @@ impl TryFrom<MessagesStreamEvent> for ChatCompletionsStreamResponse {
MessagesStreamEvent::Ping => {
Ok(ChatCompletionsStreamResponse {
id: "stream".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: current_timestamp(),
model: "unknown".to_string(),
choices: vec![],
@ -709,7 +709,7 @@ fn create_openai_chunk(
) -> ChatCompletionsStreamResponse {
ChatCompletionsStreamResponse {
id: id.to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: current_timestamp(),
model: model.to_string(),
choices: vec![StreamChoice {
@ -1254,7 +1254,7 @@ mod tests {
let openai_resp: ChatCompletionsStreamResponse = event.try_into().unwrap();
assert_eq!(openai_resp.id, "msg_stream_123");
assert_eq!(openai_resp.object, "chat.completion.chunk");
assert_eq!(openai_resp.object.as_deref(), Some("chat.completion.chunk"));
assert_eq!(openai_resp.model, "claude-3");
assert_eq!(openai_resp.choices.len(), 1);
@ -1276,7 +1276,7 @@ mod tests {
let openai_resp: ChatCompletionsStreamResponse = event.try_into().unwrap();
assert_eq!(openai_resp.object, "chat.completion.chunk");
assert_eq!(openai_resp.object.as_deref(), Some("chat.completion.chunk"));
assert_eq!(openai_resp.choices.len(), 1);
let choice = &openai_resp.choices[0];
@ -1376,7 +1376,7 @@ mod tests {
let openai_resp: ChatCompletionsStreamResponse = event.try_into().unwrap();
assert_eq!(openai_resp.object, "chat.completion.chunk");
assert_eq!(openai_resp.object.as_deref(), Some("chat.completion.chunk"));
assert_eq!(openai_resp.choices.len(), 0); // Ping has no choices
}
@ -1384,7 +1384,7 @@ mod tests {
fn test_openai_to_anthropic_streaming_role_start() {
let openai_resp = ChatCompletionsStreamResponse {
id: "chatcmpl-123".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "gpt-4".to_string(),
choices: vec![StreamChoice {
@ -1420,7 +1420,7 @@ mod tests {
fn test_openai_to_anthropic_streaming_content_delta() {
let openai_resp = ChatCompletionsStreamResponse {
id: "chatcmpl-123".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "gpt-4".to_string(),
choices: vec![StreamChoice {
@ -1460,7 +1460,7 @@ mod tests {
fn test_openai_to_anthropic_streaming_tool_calls() {
let openai_resp = ChatCompletionsStreamResponse {
id: "chatcmpl-123".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "gpt-4".to_string(),
choices: vec![StreamChoice {
@ -1509,7 +1509,7 @@ mod tests {
fn test_openai_to_anthropic_streaming_final_usage() {
let openai_resp = ChatCompletionsStreamResponse {
id: "chatcmpl-123".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "gpt-4".to_string(),
choices: vec![StreamChoice {
@ -1551,7 +1551,7 @@ mod tests {
fn test_openai_empty_choices_to_anthropic_ping() {
let openai_resp = ChatCompletionsStreamResponse {
id: "chatcmpl-123".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "gpt-4".to_string(),
choices: vec![], // Empty choices
@ -1690,7 +1690,7 @@ mod tests {
// Test that malformed streaming events are handled gracefully
let openai_resp_with_missing_data = ChatCompletionsStreamResponse {
id: "test".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 1234567890,
model: "test".to_string(),
choices: vec![StreamChoice {
@ -1722,7 +1722,7 @@ mod tests {
let openai_resp: ChatCompletionsStreamResponse = event.try_into().unwrap();
// ContentBlockStop should produce an empty chunk
assert_eq!(openai_resp.object, "chat.completion.chunk");
assert_eq!(openai_resp.object.as_deref(), Some("chat.completion.chunk"));
assert_eq!(openai_resp.choices.len(), 1);
let choice = &openai_resp.choices[0];

View file

@ -17,6 +17,8 @@ pub enum ProviderId {
XAI,
TogetherAI,
Ollama,
Moonshotai,
Zhipu,
}
impl From<&str> for ProviderId {
@ -34,6 +36,8 @@ impl From<&str> for ProviderId {
"xai" => ProviderId::XAI,
"together_ai" => ProviderId::TogetherAI,
"ollama" => ProviderId::Ollama,
"moonshotai" => ProviderId::Moonshotai,
"zhipu" => ProviderId::Zhipu,
_ => panic!("Unknown provider: {}", value),
}
}
@ -58,7 +62,9 @@ impl ProviderId {
| ProviderId::AzureOpenAI
| ProviderId::XAI
| ProviderId::TogetherAI
| ProviderId::Ollama,
| ProviderId::Ollama
| ProviderId::Moonshotai
| ProviderId::Zhipu,
SupportedAPIs::AnthropicMessagesAPI(_)) => SupportedAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
(ProviderId::OpenAI
@ -71,7 +77,9 @@ impl ProviderId {
| ProviderId::AzureOpenAI
| ProviderId::XAI
| ProviderId::TogetherAI
| ProviderId::Ollama,
| ProviderId::Ollama
| ProviderId::Moonshotai
| ProviderId::Zhipu,
SupportedAPIs::OpenAIChatCompletions(_)) => SupportedAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
}
}
@ -92,6 +100,8 @@ impl Display for ProviderId {
ProviderId::XAI => write!(f, "xai"),
ProviderId::TogetherAI => write!(f, "together_ai"),
ProviderId::Ollama => write!(f, "ollama"),
ProviderId::Moonshotai => write!(f, "moonshotai"),
ProviderId::Zhipu => write!(f, "zhipu"),
}
}
}

View file

@ -787,7 +787,7 @@ mod tests {
// Test OpenAI event type (should be None)
let openai_event = ChatCompletionsStreamResponse {
id: "test".to_string(),
object: "chat.completion.chunk".to_string(),
object: Some("chat.completion.chunk".to_string()),
created: 123456789,
model: "gpt-4".to_string(),
choices: vec![],

View file

@ -25,5 +25,4 @@ sha2 = "0.10.8"
hermesllm = { version = "0.1.0", path = "../hermesllm" }
[dev-dependencies]
proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }
serial_test = "3.1.1"

View file

@ -204,7 +204,7 @@ impl StreamContext {
// Tokenize and record token count.
let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
info!(
debug!(
"[ARCHGW_REQ_ID:{}] TOKEN_COUNT: model='{}' input_tokens={}",
self.request_identifier(),
model,
@ -492,7 +492,7 @@ impl StreamContext {
body: &[u8],
provider_id: ProviderId,
) -> Result<Vec<u8>, Action> {
info!(
debug!(
"[ARCHGW_REQ_ID:{}] NON_STREAMING_PROCESS: provider_id={:?} body_size={}",
self.request_identifier(),
provider_id,
@ -531,7 +531,7 @@ impl StreamContext {
if let Some((prompt_tokens, completion_tokens, total_tokens)) =
response.extract_usage_counts()
{
info!(
debug!(
"[ARCHGW_REQ_ID:{}] RESPONSE_USAGE: prompt_tokens={} completion_tokens={} total_tokens={}",
self.request_identifier(),
prompt_tokens,
@ -697,7 +697,7 @@ impl HttpContext for StreamContext {
//We need to deserialize the request body based on the resolved API
let mut deserialized_client_request: ProviderRequestType = match self.client_api.as_ref() {
Some(the_client_api) => {
info!(
debug!(
"[ARCHGW_REQ_ID:{}] CLIENT_REQUEST_RECEIVED: api={:?} body_size={}",
self.request_identifier(),
the_client_api,
@ -785,7 +785,7 @@ impl HttpContext for StreamContext {
// Extract user message for tracing
self.user_message = deserialized_client_request.get_recent_user_message();
info!(
debug!(
"[ARCHGW_REQ_ID:{}] MODEL_RESOLUTION: req_model='{}' -> resolved_model='{}' provider='{}' streaming={}",
self.request_identifier(),
model_requested,
@ -871,7 +871,7 @@ impl HttpContext for StreamContext {
if let Ok(status_code) = status_str.parse::<u16>() {
self.upstream_status_code = StatusCode::from_u16(status_code).ok();
info!(
debug!(
"[ARCHGW_REQ_ID:{}] UPSTREAM_RESPONSE_STATUS: {}",
self.request_identifier(),
status_code

View file

@ -1,562 +0,0 @@
use http::StatusCode;
use proxy_wasm_test_framework::tester::{self, Tester};
use proxy_wasm_test_framework::types::{
Action, BufferType, LogLevel, MapType, MetricType, ReturnType,
};
use serial_test::serial;
use std::path::Path;
fn wasm_module() -> String {
let wasm_file = Path::new("../target/wasm32-wasip1/release/llm_gateway.wasm");
assert!(
wasm_file.exists(),
"Run `cargo build --release --target=wasm32-wasip1` first"
);
wasm_file.to_string_lossy().to_string()
}
fn request_headers_expectations(module: &mut Tester, http_context: i32) {
module
.call_proxy_on_request_headers(http_context, 0, false)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
.returning(Some("/v1/chat/completions"))
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider"),
)
.returning(None)
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider-hint"),
)
.returning(None)
.expect_log(
Some(LogLevel::Info),
None, // Dynamic request ID - could be context_id or x-request-id
)
.expect_add_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider"),
Some("openai"),
)
.expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-api-key"))
.expect_replace_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("Authorization"),
Some("Bearer secret_key"),
)
.expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider-hint"),
)
.returning(Some("default"))
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-ratelimit-selector"),
)
.returning(Some("selector-key"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("selector-key"))
.returning(Some("selector-value"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
.returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
.returning(None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(module, http_context);
}
fn setup_filter(module: &mut Tester, config: &str) -> i32 {
let filter_context = 1;
module
.call_proxy_on_context_create(filter_context, 0)
.expect_metric_creation(MetricType::Gauge, "active_http_calls")
.expect_metric_creation(MetricType::Counter, "ratelimited_rq")
.expect_metric_creation(MetricType::Histogram, "time_to_first_token")
.expect_metric_creation(MetricType::Histogram, "time_per_output_token")
.expect_metric_creation(MetricType::Histogram, "tokens_per_second")
.expect_metric_creation(MetricType::Histogram, "request_latency")
.expect_metric_creation(MetricType::Histogram, "output_sequence_length")
.expect_metric_creation(MetricType::Histogram, "input_sequence_length")
.execute_and_expect(ReturnType::None)
.unwrap();
module
.call_proxy_on_configure(filter_context, config.len() as i32)
.expect_get_buffer_bytes(Some(BufferType::PluginConfiguration))
.returning(Some(config))
.execute_and_expect(ReturnType::Bool(true))
.unwrap();
filter_context
}
fn default_config() -> &'static str {
r#"
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
endpoints:
api_server:
endpoint: api_server:80
connect_timeout: 0.005s
llm_providers:
- name: open-ai-gpt-4
provider_interface: openai
access_key: secret_key
model: gpt-4
default: true
- name: open-ai-gpt-4o
provider_interface: openai
access_key: secret_key
model: gpt-4o
overrides:
# confidence threshold for prompt target intent matching
prompt_target_intent_matching_threshold: 0.6
system_prompt: |
You are a helpful assistant.
prompt_guards:
input_guards:
jailbreak:
on_exception:
message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
prompt_targets:
- name: weather_forecast
description: This function provides realtime weather forecast information for a given city.
parameters:
- name: city
required: true
description: The city for which the weather forecast is requested.
- name: days
description: The number of days for which the weather forecast is requested.
- name: units
description: The units in which the weather forecast is requested.
endpoint:
name: api_server
path: /weather
system_prompt: |
You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
- Use farenheight for temperature
- Use miles per hour for wind speed
ratelimits:
- model: gpt-4
selector:
key: selector-key
value: selector-value
limit:
tokens: 100
unit: minute
"#
}
#[test]
#[serial]
fn llm_gateway_successful_request_to_open_ai_chat_completions() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(&mut module, http_context);
// Request Body
let chat_completions_request_body = r#"{"model":"gpt-4","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem."}]}"#;
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID - REQUEST_BODY_CHUNK
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - CLIENT_REQUEST_RECEIVED
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID - CLIENT_REQUEST_PAYLOAD
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - MODEL_RESOLUTION
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - TOKEN_COUNT
.expect_metric_record("input_sequence_length", 21)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - RATELIMIT_CHECK
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=21"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - UPSTREAM_TRANSFORM
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID - UPSTREAM_REQUEST_PAYLOAD
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_bad_request_to_open_ai_chat_completions() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(&mut module, http_context);
// Request Body
let incomplete_chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
http_context,
incomplete_chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID - REQUEST_BODY_CHUNK
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(incomplete_chat_completions_request_body))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - CLIENT_REQUEST_RECEIVED
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID - CLIENT_REQUEST_PAYLOAD
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - MODEL_RESOLUTION
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - TOKEN_COUNT
.expect_metric_record("input_sequence_length", 13)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - RATELIMIT_CHECK
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=13"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID - RATELIMIT_CHECK
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] UPSTREAM_REQUEST_PAYLOAD: {\"messages\":[{\"role\":\"system\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}],\"model\":\"gpt-4\"}"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_request_ratelimited() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// Request Body
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a helpful poetic assistant!, skilled in explaining complex programming concepts with creative flair. Be sure to be concise and to the point.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. And also summarize it how a 4th graded would understand it. Compose a poem that explains the concept of recursion in programming. And also summarize it how a 4th graded would understand it.\"\
}\
],\
\"model\": \"gpt-4\"\
}";
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] CLIENT_REQUEST_PAYLOAD: {\"messages\": [{\"role\": \"system\",\"content\": \"You are a helpful poetic assistant!, skilled in explaining complex programming concepts with creative flair. Be sure to be concise and to the point.\"},{\"role\": \"user\",\"content\": \"Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. And also summarize it how a 4th graded would understand it. Compose a poem that explains the concept of recursion in programming. And also summarize it how a 4th graded would understand it.\"}],\"model\": \"gpt-4\"}"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None)// Dynamic request ID)
.expect_metric_record("input_sequence_length", 107)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=107"))
.expect_log(Some(LogLevel::Warn), Some(r#"server error occurred: exceeded limit provider=gpt-4, selector=Header { key: "selector-key", value: "selector-value" }, tokens_used=107"#))
.expect_send_local_response(
Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
None,
None,
None,
)
.expect_metric_increment("ratelimited_rq", 1)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_request_not_ratelimited() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Info), None)
// Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] CLIENT_REQUEST_PAYLOAD: {\"model\":\"gpt-1\",\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}]}"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=29"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] UPSTREAM_REQUEST_PAYLOAD: {\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}],\"model\":\"gpt-4\"}"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_override_model_name() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] CLIENT_REQUEST_PAYLOAD: {\"model\":\"gpt-1\",\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}]}"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=29"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] UPSTREAM_REQUEST_PAYLOAD: {\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}],\"model\":\"gpt-4\"}"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_override_use_default_model() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] CLIENT_REQUEST_PAYLOAD: {\"model\":\"gpt-1\",\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}]}"))
.expect_log(
Some(LogLevel::Info),
None // Dynamic request ID,
)
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=29"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] UPSTREAM_REQUEST_PAYLOAD: {\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}],\"model\":\"gpt-4\"}"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_override_use_model_name_none() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = r#"{"model":"none","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None) // Dynamic request ID)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Info), None)
// Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] CLIENT_REQUEST_PAYLOAD: {\"model\":\"none\",\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}]}"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("TOKENIZER: computing token count for model=gpt-4"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("Checking limit for provider=gpt-4, with selector=Header { key: \"selector-key\", value: \"selector-value\" }, consuming tokens=29"))
.expect_log(Some(LogLevel::Info), None) // Dynamic request ID)
.expect_log(Some(LogLevel::Debug), Some("[ARCHGW_REQ_ID:NO_REQUEST_ID] UPSTREAM_REQUEST_PAYLOAD: {\"messages\":[{\"role\":\"system\",\"content\":\"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"},{\"role\":\"user\",\"content\":\"Compose a poem that explains the concept of recursion in programming.\"}],\"model\":\"gpt-4\"}"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}

View file

@ -24,6 +24,5 @@ derivative = "2.2.0"
sha2 = "0.10.8"
[dev-dependencies]
proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }
serial_test = "3.1.1"
pretty_assertions = "1.4.1"

View file

@ -1,694 +0,0 @@
use common::api::open_ai::{
ChatCompletionsResponse, Choice, ContentType, FunctionCallDetail, Message, ToolCall, ToolType,
Usage,
};
use common::configuration::Configuration;
use http::StatusCode;
use proxy_wasm_test_framework::tester::{self, Tester};
use proxy_wasm_test_framework::types::{
Action, BufferType, LogLevel, MapType, MetricType, ReturnType,
};
use serde_yaml::Value;
use serial_test::serial;
use std::collections::HashMap;
use std::path::Path;
fn wasm_module() -> String {
let wasm_file = Path::new("../target/wasm32-wasip1/release/prompt_gateway.wasm");
assert!(
wasm_file.exists(),
"Run `cargo build --release --target=wasm32-wasip1` first"
);
wasm_file.to_str().unwrap().to_string()
}
fn request_headers_expectations(module: &mut Tester, http_context: i32) {
module
.call_proxy_on_request_headers(http_context, 0, false)
.expect_log(Some(LogLevel::Debug), None)
.expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
.returning(Some("/v1/chat/completions"))
.expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
.returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
.returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
.returning(None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(module, http_context);
// Request Body
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
],\
\"model\": \"gpt-4\"\
}";
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(
Some("arch_internal"),
Some(vec![
("x-arch-upstream", "model_server"),
(":method", "POST"),
(":path", "/function_calling"),
("content-type", "application/json"),
(":authority", "model_server"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
None,
None,
Some(5000),
)
.returning(Some(1))
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::Action(Action::Pause))
.unwrap();
}
fn setup_filter(module: &mut Tester, config: &str) -> i32 {
let filter_context = 1;
module
.call_proxy_on_context_create(filter_context, 0)
.expect_metric_creation(MetricType::Gauge, "active_http_calls")
.execute_and_expect(ReturnType::None)
.unwrap();
module
.call_proxy_on_configure(filter_context, config.len() as i32)
.expect_get_buffer_bytes(Some(BufferType::PluginConfiguration))
.returning(Some(config))
.execute_and_expect(ReturnType::Bool(true))
.unwrap();
filter_context
}
fn default_config() -> &'static str {
r#"
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
endpoints:
api_server:
endpoint: api_server:80
connect_timeout: 0.005s
llm_providers:
- name: open-ai-gpt-4
provider_interface: openai
access_key: secret_key
model: gpt-4
default: true
overrides:
# confidence threshold for prompt target intent matching
prompt_target_intent_matching_threshold: 0.0
system_prompt: |
You are a helpful assistant.
prompt_guards:
input_guards:
jailbreak:
on_exception:
message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
prompt_targets:
- name: weather_forecast
description: This function provides realtime weather forecast information for a given city.
parameters:
- name: city
required: true
description: The city for which the weather forecast is requested.
- name: days
description: The number of days for which the weather forecast is requested.
- name: units
description: The units in which the weather forecast is requested.
endpoint:
name: api_server
path: /weather
http_method: POST
system_prompt: |
You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
- Use farenheight for temperature
- Use miles per hour for wind speed
ratelimits:
- model: gpt-4
selector:
key: selector-key
value: selector-value
limit:
tokens: 1
unit: minute
"#
}
#[test]
#[serial]
fn prompt_gateway_successful_request_to_open_ai_chat_completions() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(&mut module, http_context);
// Request Body
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
],\
\"model\": \"gpt-4\"\
}";
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(Some("arch_internal"), None, None, None, None)
.returning(Some(4))
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::Action(Action::Pause))
.unwrap();
}
#[test]
#[serial]
fn prompt_gateway_bad_request_to_open_ai_chat_completions() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
module
.call_proxy_on_context_create(http_context, filter_context)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
request_headers_expectations(&mut module, http_context);
// Request Body
let incomplete_chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]\
}";
module
.call_proxy_on_request_body(
http_context,
incomplete_chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(incomplete_chat_completions_request_body))
.expect_log(Some(LogLevel::Debug), None)
.expect_send_local_response(
Some(StatusCode::BAD_REQUEST.as_u16().into()),
None,
None,
None,
)
.execute_and_expect(ReturnType::Action(Action::Pause))
.unwrap();
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_to_llm_gateway() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let mut config: Configuration = serde_yaml::from_str(default_config()).unwrap();
config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
let config_str = serde_json::to_string(&config).unwrap();
let filter_context = setup_filter(&mut module, &config_str);
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
let arch_fc_resp = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "system".to_string(),
content: None,
tool_calls: Some(vec![ToolCall {
id: String::from("test"),
tool_type: ToolType::Function,
function: FunctionCallDetail {
name: String::from("weather_forecast"),
arguments: Some(HashMap::from([(
String::from("city"),
Value::String(String::from("seattle")),
)])),
},
}]),
model: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: {
let mut map: HashMap<String, String> = HashMap::new();
map.insert("function_latency".to_string(), "0.0".to_string());
Some(map)
},
};
let expected_body = "{\"city\":\"seattle\"}";
let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
module
.call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(
Some("arch_internal"),
Some(vec![
("x-envoy-max-retries", "3"),
("x-arch-upstream", "api_server"),
("content-type", "application/json"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
(":path", "/weather"),
(":method", "POST"),
(":authority", "api_server"),
]),
Some(expected_body),
None,
Some(5000),
)
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
let body_text = String::from("test body");
module
.call_proxy_on_http_call_response(http_context, 2, 0, body_text.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&body_text))
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
.returning(Some("200"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.expect_log(Some(LogLevel::Debug), None)
.execute_and_expect(ReturnType::None)
.unwrap();
let chat_completion_response = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "assistant".to_string(),
content: Some(ContentType::Text("hello from fake llm gateway".to_string())),
model: None,
tool_calls: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: None,
};
let chat_completion_response_str = serde_json::to_string(&chat_completion_response).unwrap();
module
.call_proxy_on_response_body(
http_context,
chat_completion_response_str.len() as i32,
true,
)
.expect_get_buffer_bytes(Some(BufferType::HttpResponseBody))
.returning(Some(chat_completion_response_str.as_str()))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_set_buffer_bytes(Some(BufferType::HttpResponseBody), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_no_intent_match() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let mut config: Configuration = serde_yaml::from_str(default_config()).unwrap();
config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
let config_str = serde_json::to_string(&config).unwrap();
let filter_context = setup_filter(&mut module, &config_str);
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
let arch_fc_resp = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "assistant".to_string(),
content: None,
tool_calls: None,
model: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: None,
};
let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
module
.call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("intent matched: false"))
.expect_log(
Some(LogLevel::Info),
Some("no default prompt target found, forwarding request to upstream llm"),
)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Info), None)
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::None)
.unwrap();
}
fn arch_config_default_target() -> &'static str {
r#"
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
endpoints:
api_server:
endpoint: api_server:80
connect_timeout: 0.005s
llm_providers:
- name: open-ai-gpt-4
provider_interface: openai
access_key: secret_key
model: gpt-4
default: true
overrides:
# confidence threshold for prompt target intent matching
prompt_target_intent_matching_threshold: 0.0
system_prompt: |
You are a helpful assistant.
prompt_guards:
input_guards:
jailbreak:
on_exception:
message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
prompt_targets:
- name: weather_forecast
description: This function provides realtime weather forecast information for a given city.
parameters:
- name: city
required: true
description: The city for which the weather forecast is requested.
- name: days
description: The number of days for which the weather forecast is requested.
- name: units
description: The units in which the weather forecast is requested.
endpoint:
name: api_server
path: /weather
http_method: POST
system_prompt: |
You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
- Use farenheight for temperature
- Use miles per hour for wind speed
- name: default_target
default: true
description: This is the default target for all unmatched prompts.
endpoint:
name: weather_forecast_service
path: /default_target
http_method: POST
system_prompt: |
You are a helpful assistant! Summarize the user's request and provide a helpful response.
# if it is set to false arch will send response that it received from this prompt target to the user
# if true arch will forward the response to the default LLM
auto_llm_dispatch_on_response: false
ratelimits:
- model: gpt-4
selector:
key: selector-key
value: selector-value
limit:
tokens: 1
unit: minute
"#
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_no_intent_match_default_target() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let mut config: Configuration = serde_yaml::from_str(arch_config_default_target()).unwrap();
config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
let config_str = serde_json::to_string(&config).unwrap();
let filter_context = setup_filter(&mut module, &config_str);
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
let arch_fc_resp = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "system".to_string(),
content: None,
tool_calls: None,
model: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: None,
};
let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
module
.call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("intent matched: false"))
.expect_log(
Some(LogLevel::Info),
Some("default prompt target found, forwarding request to default prompt target"),
)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_http_call(
Some("arch_internal"),
Some(vec![
(":method", "POST"),
("x-arch-upstream", "weather_forecast_service"),
(":path", "/default_target"),
(":authority", "weather_forecast_service"),
("content-type", "application/json"),
("x-envoy-max-retries", "3"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
None,
None,
Some(5000),
)
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::None)
.unwrap();
}

View file

@ -12,7 +12,7 @@ python = ">=3.10,<3.13.3"
pydantic = "^2.0"
openai = "^1.0"
pyyaml = "^6.0"
archgw ="^0.3.14"
archgw ="^0.3.15"
[tool.poetry.group.dev.dependencies]
pytest = "^8.3"

View file

@ -14,9 +14,9 @@ Make sure your machine is up to date with [latest version of archgw]([url](https
2. start archgw in the foreground
```bash
(venv) $ archgw up --service archgw --foreground
2025-05-30 18:00:09,953 - cli.main - INFO - Starting archgw cli version: 0.3.14
2025-05-30 18:00:09,953 - cli.main - INFO - Starting archgw cli version: 0.3.15
2025-05-30 18:00:09,953 - cli.main - INFO - Validating /Users/adilhafeez/src/intelligent-prompt-gateway/demos/use_cases/preference_based_routing/arch_config.yaml
2025-05-30 18:00:10,422 - cli.core - INFO - Starting arch gateway, image name: archgw, tag: katanemo/archgw:0.3.14
2025-05-30 18:00:10,422 - cli.core - INFO - Starting arch gateway, image name: archgw, tag: katanemo/archgw:0.3.15
2025-05-30 18:00:10,662 - cli.core - INFO - archgw status: running, health status: starting
2025-05-30 18:00:11,712 - cli.core - INFO - archgw status: running, health status: starting
2025-05-30 18:00:12,761 - cli.core - INFO - archgw is running and is healthy!

View file

@ -402,6 +402,94 @@ xAI
- model: xai/grok-beta
access_key: $XAI_API_KEY
Moonshot AI
~~~~~~~~~~~
**Provider Prefix:** ``moonshotai/``
**API Endpoint:** ``/v1/chat/completions``
**Authentication:** API Key - Get your Moonshot AI API key from `Moonshot AI Platform <https://platform.moonshot.ai/>`_.
**Supported Chat Models:** All Moonshot AI chat models including Kimi K2, Moonshot v1, and all future releases.
.. list-table::
:header-rows: 1
:widths: 30 20 50
* - Model Name
- Model ID for Config
- Description
* - Kimi K2 Preview
- ``moonshotai/kimi-k2-0905-preview``
- Foundation model optimized for agentic tasks with 32B activated parameters
* - Moonshot v1 32K
- ``moonshotai/moonshot-v1-32k``
- Extended context model with 32K tokens
* - Moonshot v1 128K
- ``moonshotai/moonshot-v1-128k``
- Long context model with 128K tokens
**Configuration Examples:**
.. code-block:: yaml
llm_providers:
# Latest K2 models for agentic tasks
- model: moonshotai/kimi-k2-0905-preview
access_key: $MOONSHOTAI_API_KEY
# V1 models with different context lengths
- model: moonshotai/moonshot-v1-32k
access_key: $MOONSHOTAI_API_KEY
- model: moonshotai/moonshot-v1-128k
access_key: $MOONSHOTAI_API_KEY
Zhipu AI
~~~~~~~~
**Provider Prefix:** ``zhipu/``
**API Endpoint:** ``/api/paas/v4/chat/completions``
**Authentication:** API Key - Get your Zhipu AI API key from `Zhipu AI Platform <https://open.bigmodel.cn/console/overview/>`_.
**Supported Chat Models:** All Zhipu AI GLM models including GLM-4, GLM-4 Flash, and all future releases.
.. list-table::
:header-rows: 1
:widths: 30 20 50
* - Model Name
- Model ID for Config
- Description
* - GLM-4.6
- ``zhipu/glm-4.6``
- Latest and most capable GLM model with enhanced reasoning abilities
* - GLM-4.5
- ``zhipu/glm-4.5``
- High-performance model with multimodal capabilities
* - GLM-4.5 Air
- ``zhipu/glm-4.5-air``
- Lightweight and fast model optimized for efficiency
**Configuration Examples:**
.. code-block:: yaml
llm_providers:
# Latest GLM models
- model: zhipu/glm-4.6
access_key: $ZHIPU_API_KEY
- model: zhipu/glm-4.5
access_key: $ZHIPU_API_KEY
- model: zhipu/glm-4.5-air
access_key: $ZHIPU_API_KEY
Providers Requiring Base URL
----------------------------
@ -451,6 +539,7 @@ Ollama
- model: ollama/codellama
base_url: http://host.docker.internal:11434
OpenAI-Compatible Providers
~~~~~~~~~~~~~~~~~~~~~~~~~~~

View file

@ -15,7 +15,7 @@ from sphinxawesome_theme.postprocess import Icons
project = "Arch Docs"
copyright = "2025, Katanemo Labs, Inc"
author = "Katanemo Labs, Inc"
release = " v0.3.14"
release = " v0.3.15"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

View file

@ -25,7 +25,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
$ python -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install archgw==0.3.14
$ pip install archgw==0.3.15
Build AI Agent with Arch Gateway

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "archgw_modelserver"
version = "0.3.14"
version = "0.3.15"
description = "A model server for serving models"
authors = ["Katanemo Labs, Inc <info@katanemo.com>"]
license = "Apache 2.0"

View file

@ -45,7 +45,8 @@ Content-Type: application/json
"role": "user",
"content": "hello"
}
]
],
"model": "gpt-4o-mini"
}
### llm gateway request (streaming)