Introduce brightstaff a new terminal service for llm routing (#477)

This commit is contained in:
Adil Hafeez 2025-05-19 09:59:22 -07:00 committed by GitHub
parent 1f95fac4af
commit 27c0f2fdce
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 2817 additions and 150 deletions

View file

@ -24,7 +24,7 @@ jobs:
- name: build arch docker image
run: |
cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.2.8
cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.2.8 -t katanemo/archgw:latest
- name: start archgw
env:

View file

@ -24,7 +24,8 @@ jobs:
run: rustup target add wasm32-wasip1
- name: Build wasm module
run: cargo build --release --target=wasm32-wasip1
run: |
cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway
- name: Run unit tests
run: cargo test --lib

View file

@ -1,11 +1,11 @@
# build filter using rust toolchain
# build docker image for arch gateway
FROM rust:1.82.0 as builder
RUN rustup -v target add wasm32-wasip1
WORKDIR /arch
COPY crates .
RUN cd prompt_gateway && cargo build --release --target wasm32-wasip1
RUN cd llm_gateway && cargo build --release --target wasm32-wasip1
RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway
RUN cargo build --release -p brightstaff
# copy built filter into envoy image
FROM docker.io/envoyproxy/envoy:v1.32-latest as envoy
@ -13,20 +13,27 @@ FROM docker.io/envoyproxy/envoy:v1.32-latest as envoy
#Build config generator, so that we have a single build image for both Rust and Python
FROM python:3.12-slim as arch
RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y supervisor gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
COPY --from=builder /arch/target/release/brightstaff /app/brightstaff
COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
WORKDIR /app
COPY arch/requirements.txt .
RUN pip install -r requirements.txt
COPY arch/tools/cli/config_generator.py .
COPY arch/envoy.template.yaml .
COPY arch/arch_config_schema.yaml .
COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
RUN pip install requests
RUN touch /var/log/envoy.log
RUN mkdir -p /var/log/supervisor/
RUN touch /var/log/supervisor/supervisord.log
ENTRYPOINT ["sh","-c", "/usr/bin/supervisord"]
# ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --log-level trace 2>&1 | tee /var/log/envoy.log"]
ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]
# ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]

View file

@ -90,6 +90,8 @@ properties:
- https
http_host:
type: string
usage:
type: string
additionalProperties: false
required:
- name
@ -225,6 +227,12 @@ properties:
enum:
- llm
- prompt
routing:
type: object
properties:
model:
type: string
additionalProperties: false
prompt_guards:
type: object
properties:

View file

@ -328,11 +328,15 @@ static_resources:
domains:
- "*"
routes:
- match:
prefix: "/healthz"
direct_response:
status: 200
- match:
prefix: "/"
route:
auto_host_rewrite: true
cluster: arch_listener_llm
cluster: bright_staff
timeout: {{ llm_gateway_listener.timeout }}
http_filters:
- name: envoy.filters.http.router
@ -380,12 +384,6 @@ static_resources:
domains:
- "*"
routes:
- match:
prefix: "/healthz"
route:
auto_host_rewrite: true
cluster: openai
timeout: 60s
{% for provider in arch_llm_providers %}
# if endpoint is set then use custom cluster for upstream llm
{% if provider.endpoint %}
@ -615,6 +613,38 @@ static_resources:
port_value: 11000
hostname: arch_internal
- name: bright_staff
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: bright_staff
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 0.0.0.0
port_value: 9091
hostname: localhost
- name: router_model_host
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: router_model_host
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 34.30.16.38
port_value: 8000
hostname: router_model_host
- name: arch_prompt_gateway_listener
connect_timeout: 0.5s
type: LOGICAL_DNS

16
arch/supervisord.conf Normal file
View file

@ -0,0 +1,16 @@
[supervisord]
nodaemon=true
[program:brightstaff]
command=sh -c "/app/brightstaff 2>&1 | tee /var/log/brightstaff.log"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0
[program:envoy]
command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml 2>&1 | tee /var/log//envoy.log"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0

View file

@ -6,6 +6,7 @@ import sys
import yaml
from cli.utils import getLogger
from cli.consts import (
ARCHGW_DOCKER_IMAGE,
ARCHGW_DOCKER_NAME,
KATANEMO_LOCAL_MODEL_LIST,
)
@ -55,7 +56,9 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
path (str): The path where the prompt_config.yml file is located.
log_timeout (int): Time in seconds to show logs before checking for healthy state.
"""
log.info("Starting arch gateway")
log.info(
f"Starting arch gateway, image name: {ARCHGW_DOCKER_NAME}, tag: {ARCHGW_DOCKER_IMAGE}"
)
try:
archgw_container_status = docker_container_status(ARCHGW_DOCKER_NAME)
@ -92,10 +95,15 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
current_time = time.time()
elapsed_time = current_time - start_time
if archgw_status == "exited":
log.info("archgw container exited unexpectedly.")
stream_gateway_logs(follow=False)
sys.exit(1)
# Check if timeout is reached
if elapsed_time > log_timeout:
log.info(f"stopping log monitoring after {log_timeout} seconds.")
break
sys.exit(1)
if prompt_gateway_health_check_status or llm_gateway_health_check_status:
log.info("archgw is running and is healthy!")
@ -109,27 +117,27 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
except KeyboardInterrupt:
log.info("Keyboard interrupt received, stopping arch gateway service.")
stop_arch()
stop_docker_container()
def stop_arch():
def stop_docker_container(service=ARCHGW_DOCKER_NAME):
"""
Shutdown all Docker Compose services by running `docker-compose down`.
Args:
path (str): The path where the docker-compose.yml file is located.
"""
log.info("Shutting down arch gateway service.")
log.info(f"Shutting down {service} service.")
try:
subprocess.run(
["docker", "stop", ARCHGW_DOCKER_NAME],
["docker", "stop", service],
)
subprocess.run(
["docker", "rm", ARCHGW_DOCKER_NAME],
["docker", "rm", service],
)
log.info("Successfully shut down arch gateway service.")
log.info(f"Successfully shut down {service} service.")
except subprocess.CalledProcessError as e:
log.info(f"Failed to shut down services: {str(e)}")

View file

@ -3,7 +3,10 @@ import json
import sys
import requests
from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
from cli.consts import (
ARCHGW_DOCKER_IMAGE,
ARCHGW_DOCKER_NAME,
)
from cli.utils import getLogger
log = getLogger(__name__)
@ -54,7 +57,6 @@ def docker_start_archgw_detached(
port_mappings_args = [item for port in port_mappings for item in ("-p", port)]
volume_mappings = [
f"{logs_path_abs}:/var/log:rw",
f"{arch_config_file}:/app/arch_config.yaml:ro",
# "/Users/adilhafeez/src/intelligent-prompt-gateway/crates/target/wasm32-wasip1/release:/etc/envoy/proxy-wasm-plugins:ro",
]
@ -90,7 +92,7 @@ def health_check_endpoint(endpoint: str) -> bool:
return False
def stream_gateway_logs(follow):
def stream_gateway_logs(follow, service="archgw"):
"""
Stream logs from the arch gateway service.
"""
@ -99,7 +101,7 @@ def stream_gateway_logs(follow):
options = ["docker", "logs"]
if follow:
options.append("-f")
options.append(ARCHGW_DOCKER_NAME)
options.append(service)
try:
# Run `docker-compose logs` to stream logs from the gateway service
subprocess.run(

View file

@ -16,7 +16,7 @@ from cli.core import (
start_arch_modelserver,
stop_arch_modelserver,
start_arch,
stop_arch,
stop_docker_container,
download_models_from_hf,
)
from cli.consts import (
@ -51,6 +51,18 @@ def get_version():
return "version not found"
def verify_service_name(service):
"""Verify if the service name is valid."""
if service not in [
SERVICE_NAME_ARCHGW,
SERVICE_NAME_MODEL_SERVER,
SERVICE_ALL,
]:
print(f"Error: Invalid service {service}. Exiting")
sys.exit(1)
return True
@click.group(invoke_without_command=True)
@click.option("--version", is_flag=True, help="Show the archgw cli version and exit.")
@click.pass_context
@ -75,9 +87,8 @@ def main(ctx, version):
)
def build(service):
"""Build Arch from source. Must be in root of cloned repo."""
if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
print(f"Error: Invalid service {service}. Exiting")
sys.exit(1)
verify_service_name(service)
# Check if /arch/Dockerfile exists
if service == SERVICE_NAME_ARCHGW or service == SERVICE_ALL:
if os.path.exists(ARCHGW_DOCKERFILE):
@ -146,9 +157,7 @@ def build(service):
)
def up(file, path, service, foreground):
"""Starts Arch."""
if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
log.info(f"Error: Invalid service {service}. Exiting")
sys.exit(1)
verify_service_name(service)
if service == SERVICE_ALL and foreground:
# foreground can only be specified when starting individual services
@ -156,7 +165,7 @@ def up(file, path, service, foreground):
sys.exit(1)
if service == SERVICE_NAME_MODEL_SERVER:
log.info("Download archgw models from HuggingFace...")
log.info("Download models from HuggingFace...")
download_models_from_hf()
start_arch_modelserver(foreground)
return
@ -186,8 +195,6 @@ def up(file, path, service, foreground):
log.info(f"Validation stderr: {validation_stderr}")
sys.exit(1)
log.info("Starting arch model server and arch gateway")
# Set the ARCH_CONFIG_FILE environment variable
env_stage = {
"OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
@ -210,7 +217,6 @@ def up(file, path, service, foreground):
else:
app_env_file = os.path.abspath(os.path.join(path, ".env"))
print(f"app_env_file: {app_env_file}")
if not os.path.exists(
app_env_file
): # check to see if the environment variables in the current environment or not
@ -248,17 +254,15 @@ def up(file, path, service, foreground):
def down(service):
"""Stops Arch."""
if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
log.info(f"Error: Invalid service {service}. Exiting")
sys.exit(1)
verify_service_name(service)
if service == SERVICE_NAME_MODEL_SERVER:
stop_arch_modelserver()
elif service == SERVICE_NAME_ARCHGW:
stop_arch()
stop_docker_container()
else:
stop_arch_modelserver()
stop_arch()
stop_docker_container(SERVICE_NAME_ARCHGW)
@click.command()

1736
crates/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,3 @@
[workspace]
resolver = "2"
members = ["llm_gateway", "prompt_gateway", "common"]
members = ["llm_gateway", "prompt_gateway", "common", "brightstaff"]

View file

@ -0,0 +1,32 @@
[package]
name = "brightstaff"
version = "0.1.0"
edition = "2021"
[dependencies]
bytes = "1.10.1"
common = { version = "0.1.0", path = "../common" }
eventsource-client = "0.15.0"
eventsource-stream = "0.2.3"
futures = "0.3.31"
futures-util = "0.3.31"
http-body = "1.0.1"
http-body-util = "0.1.3"
hyper = { version = "1.6.0", features = ["full"] }
hyper-util = "0.1.11"
opentelemetry = "0.29.1"
opentelemetry-http = "0.29.0"
opentelemetry-otlp = {version="0.29.0", features=["trace", "tonic", "grpc-tonic"]}
opentelemetry-stdout = "0.29.0"
opentelemetry_sdk = "0.29.0"
pretty_assertions = "1.4.1"
reqwest = { version = "0.12.15", features = ["stream"] }
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
serde_yaml = "0.9.34"
thiserror = "2.0.12"
tokio = { version = "1.44.2", features = ["full"] }
tokio-stream = "0.1.17"
tracing = "0.1.41"
tracing-opentelemetry = "0.30.0"
tracing-subscriber = { version = "0.3.19", features = ["env-filter", "fmt"] }

View file

@ -0,0 +1,168 @@
use std::sync::Arc;
use bytes::Bytes;
use common::api::open_ai::ChatCompletionsRequest;
use common::consts::ARCH_PROVIDER_HINT_HEADER;
use common::utils::shorten_string;
use http_body_util::combinators::BoxBody;
use http_body_util::{BodyExt, Full, StreamBody};
use hyper::body::Frame;
use hyper::header::{self};
use hyper::{Request, Response, StatusCode};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tokio_stream::StreamExt;
use tracing::{info, warn};
use crate::router::llm_router::RouterService;
fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
Full::new(chunk.into())
.map_err(|never| match never {})
.boxed()
}
pub async fn chat_completions(
request: Request<hyper::body::Incoming>,
router_service: Arc<RouterService>,
llm_provider_endpoint: String,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let mut request_headers = request.headers().clone();
let chat_request_bytes = request.collect().await?.to_bytes();
let chat_completion_request: ChatCompletionsRequest =
match serde_json::from_slice(&chat_request_bytes) {
Ok(request) => request,
Err(err) => {
let err_msg = format!("Failed to parse request body: {}", err);
let mut bad_request = Response::new(full(err_msg));
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
return Ok(bad_request);
}
};
info!(
"request body received: {}",
shorten_string(&serde_json::to_string(&chat_completion_request).unwrap())
);
let trace_parent = request_headers
.iter()
.find(|(ty, _)| ty.as_str() == "traceparent")
.map(|(_, value)| value.to_str().unwrap_or_default().to_string());
let selected_llm = match router_service
.determine_route(&chat_completion_request.messages, trace_parent.clone())
.await
{
Ok(route) => route,
Err(err) => {
let err_msg = format!("Failed to determine route: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return Ok(internal_error);
}
};
info!(
"sending request to llm provider: {} with llm model: {:?}",
llm_provider_endpoint, selected_llm
);
if let Some(trace_parent) = trace_parent {
request_headers.insert(
header::HeaderName::from_static("traceparent"),
header::HeaderValue::from_str(&trace_parent).unwrap(),
);
}
if let Some(selected_llm) = selected_llm {
request_headers.insert(
ARCH_PROVIDER_HINT_HEADER,
header::HeaderValue::from_str(&selected_llm).unwrap(),
);
}
let llm_response = match reqwest::Client::new()
.post(llm_provider_endpoint)
.headers(request_headers)
.body(chat_request_bytes)
.send()
.await
{
Ok(res) => res,
Err(err) => {
let err_msg = format!("Failed to send request: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return Ok(internal_error);
}
};
// copy over the headers from the original response
let response_headers = llm_response.headers().clone();
let mut response = Response::builder();
let headers = response.headers_mut().unwrap();
for (header_name, header_value) in response_headers.iter() {
headers.insert(header_name, header_value.clone());
}
if chat_completion_request.stream {
// channel to create async stream
let (tx, rx) = mpsc::channel::<Bytes>(16);
// Spawn a task to send data as it becomes available
tokio::spawn(async move {
let mut byte_stream = llm_response.bytes_stream();
while let Some(item) = byte_stream.next().await {
let item = match item {
Ok(item) => item,
Err(err) => {
warn!("Error receiving chunk: {:?}", err);
break;
}
};
if tx.send(item).await.is_err() {
warn!("Receiver dropped");
break;
}
}
});
let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
let stream_body = BoxBody::new(StreamBody::new(stream));
match response.body(stream_body) {
Ok(response) => Ok(response),
Err(err) => {
let err_msg = format!("Failed to create response: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
Ok(internal_error)
}
}
} else {
let body = match llm_response.text().await {
Ok(body) => body,
Err(err) => {
let err_msg = format!("Failed to read response: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return Ok(internal_error);
}
};
match response.body(full(body)) {
Ok(response) => Ok(response),
Err(err) => {
let err_msg = format!("Failed to create response: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
Ok(internal_error)
}
}
}
}

View file

@ -0,0 +1 @@
pub mod chat_completions;

View file

@ -0,0 +1,2 @@
pub mod handlers;
pub mod router;

View file

@ -0,0 +1,157 @@
use brightstaff::handlers::chat_completions::chat_completions;
use brightstaff::router::llm_router::RouterService;
use bytes::Bytes;
use common::configuration::Configuration;
use common::utils::shorten_string;
use http_body_util::{combinators::BoxBody, BodyExt, Empty};
use hyper::body::Incoming;
use hyper::server::conn::http1;
use hyper::service::service_fn;
use hyper::{Method, Request, Response, StatusCode};
use hyper_util::rt::TokioIo;
use opentelemetry::global::BoxedTracer;
use opentelemetry::trace::FutureExt;
use opentelemetry::{
global,
trace::{SpanKind, Tracer},
Context,
};
use opentelemetry_http::HeaderExtractor;
use opentelemetry_sdk::{propagation::TraceContextPropagator, trace::SdkTracerProvider};
use opentelemetry_stdout::SpanExporter;
use std::sync::{Arc, OnceLock};
use std::{env, fs};
use tokio::net::TcpListener;
use tracing::info;
use tracing_subscriber::EnvFilter;
pub mod router;
const BIND_ADDRESS: &str = "0.0.0.0:9091";
fn get_tracer() -> &'static BoxedTracer {
static TRACER: OnceLock<BoxedTracer> = OnceLock::new();
TRACER.get_or_init(|| global::tracer("archgw/router"))
}
// Utility function to extract the context from the incoming request headers
fn extract_context_from_request(req: &Request<Incoming>) -> Context {
global::get_text_map_propagator(|propagator| {
propagator.extract(&HeaderExtractor(req.headers()))
})
}
fn init_tracer() -> SdkTracerProvider {
global::set_text_map_propagator(TraceContextPropagator::new());
// Install stdout exporter pipeline to be able to retrieve the collected spans.
// For the demonstration, use `Sampler::AlwaysOn` sampler to sample all traces.
let provider = SdkTracerProvider::builder()
.with_simple_exporter(SpanExporter::default())
.build();
global::set_tracer_provider(provider.clone());
provider
}
fn empty() -> BoxBody<Bytes, hyper::Error> {
Empty::<Bytes>::new()
.map_err(|never| match never {})
.boxed()
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let _tracer_provider = init_tracer();
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
)
.init();
let bind_address = env::var("BIND_ADDRESS").unwrap_or_else(|_| BIND_ADDRESS.to_string());
//loading arch_config.yaml file
let arch_config_path =
env::var("ARCH_CONFIG_PATH").unwrap_or_else(|_| "./arch_config.yaml".to_string());
info!("Loading arch_config.yaml from {}", arch_config_path);
let config_contents =
fs::read_to_string(&arch_config_path).expect("Failed to read arch_config.yaml");
let config: Configuration =
serde_yaml::from_str(&config_contents).expect("Failed to parse arch_config.yaml");
let arch_config = Arc::new(config);
info!(
"arch_config: {:?}",
shorten_string(&serde_json::to_string(arch_config.as_ref()).unwrap())
);
let llm_provider_endpoint = env::var("LLM_PROVIDER_ENDPOINT")
.unwrap_or_else(|_| "http://localhost:12001/v1/chat/completions".to_string());
info!("llm provider endpoint: {}", llm_provider_endpoint);
info!("Listening on http://{}", bind_address);
let listener = TcpListener::bind(bind_address).await?;
// if routing is null then return gpt-4o as model name
let model = arch_config.routing.as_ref().map_or_else(
|| "gpt-4o".to_string(),
|routing| routing.model.clone(),
);
let router_service: Arc<RouterService> = Arc::new(RouterService::new(
arch_config.llm_providers.clone(),
llm_provider_endpoint.clone(),
model,
));
loop {
let (stream, _) = listener.accept().await?;
let peer_addr = stream.peer_addr()?;
let io = TokioIo::new(stream);
let router_service = Arc::clone(&router_service);
let llm_provider_endpoint = llm_provider_endpoint.clone();
let service = service_fn(move |req| {
let router_service = Arc::clone(&router_service);
let parent_cx = extract_context_from_request(&req);
info!("parent_cx: {:?}", parent_cx);
let tracer = get_tracer();
let _span = tracer
.span_builder("request")
.with_kind(SpanKind::Server)
.start_with_context(tracer, &parent_cx);
let llm_provider_endpoint = llm_provider_endpoint.clone();
async move {
match (req.method(), req.uri().path()) {
(&Method::POST, "/v1/chat/completions") => {
chat_completions(req, router_service, llm_provider_endpoint)
.with_context(parent_cx)
.await
}
_ => {
let mut not_found = Response::new(empty());
*not_found.status_mut() = StatusCode::NOT_FOUND;
Ok(not_found)
}
}
}
});
tokio::task::spawn(async move {
info!("Accepted connection from {:?}", peer_addr);
if let Err(err) = http1::Builder::new()
// .serve_connection(io, service_fn(chat_completion))
.serve_connection(io, service)
.await
{
info!("Error serving connection: {:?}", err);
}
});
}
}

View file

@ -0,0 +1,151 @@
use std::sync::Arc;
use common::{
api::open_ai::{ChatCompletionsResponse, Message},
configuration::LlmProvider,
consts::ARCH_PROVIDER_HINT_HEADER,
utils::shorten_string,
};
use hyper::header;
use thiserror::Error;
use tracing::{info, warn};
use super::router_model::RouterModel;
pub struct RouterService {
router_url: String,
client: reqwest::Client,
router_model: Arc<dyn RouterModel>,
routing_model_name: String,
llm_usage_defined: bool,
}
#[derive(Debug, Error)]
pub enum RoutingError {
#[error("Failed to send request: {0}")]
RequestError(#[from] reqwest::Error),
#[error("Failed to parse JSON: {0}, JSON: {1}")]
JsonError(serde_json::Error, String),
#[error("Router model error: {0}")]
RouterModelError(#[from] super::router_model::RoutingModelError),
}
pub type Result<T> = std::result::Result<T, RoutingError>;
impl RouterService {
pub fn new(
providers: Vec<LlmProvider>,
router_url: String,
routing_model_name: String,
) -> Self {
let providers_with_usage = providers
.iter()
.filter(|provider| provider.usage.is_some())
.cloned()
.collect::<Vec<LlmProvider>>();
// convert the llm_providers to yaml string but only include name and usage
let llm_providers_with_usage_yaml = providers_with_usage
.iter()
.map(|provider| {
format!(
"- name: {}\n description: {}",
provider.name,
provider.usage.as_ref().unwrap_or(&"".to_string())
)
})
.collect::<Vec<String>>()
.join("\n");
info!(
"llm_providers from config with usage: {}...",
shorten_string(&llm_providers_with_usage_yaml.replace("\n", "\\n"))
);
let router_model = Arc::new(super::router_model_v1::RouterModelV1::new(
llm_providers_with_usage_yaml.clone(),
routing_model_name.clone(),
));
RouterService {
router_url,
client: reqwest::Client::new(),
router_model,
routing_model_name,
llm_usage_defined: !providers_with_usage.is_empty(),
}
}
pub async fn determine_route(
&self,
messages: &[Message],
trace_parent: Option<String>,
) -> Result<Option<String>> {
if !self.llm_usage_defined {
return Ok(None);
}
let router_request = self.router_model.generate_request(messages);
info!(
"router_request: {}",
shorten_string(&serde_json::to_string(&router_request).unwrap()),
);
let mut llm_route_request_headers = header::HeaderMap::new();
llm_route_request_headers.insert(
header::CONTENT_TYPE,
header::HeaderValue::from_static("application/json"),
);
llm_route_request_headers.insert(
header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
header::HeaderValue::from_str(&self.routing_model_name).unwrap(),
);
if let Some(trace_parent) = trace_parent {
llm_route_request_headers.insert(
header::HeaderName::from_static("traceparent"),
header::HeaderValue::from_str(&trace_parent).unwrap(),
);
}
let res = self
.client
.post(&self.router_url)
.headers(llm_route_request_headers)
.body(serde_json::to_string(&router_request).unwrap())
.send()
.await?;
let body = res.text().await?;
let chat_completion_response: ChatCompletionsResponse = match serde_json::from_str(&body) {
Ok(response) => response,
Err(err) => {
warn!(
"Failed to parse JSON: {}. Body: {}",
err,
&serde_json::to_string(&body).unwrap()
);
return Err(RoutingError::JsonError(
err,
format!("Failed to parse JSON: {}", body),
));
}
};
let selected_llm = self.router_model.parse_response(
chat_completion_response.choices[0]
.message
.content
.as_ref()
.unwrap(),
)?;
Ok(selected_llm)
}
}

View file

@ -0,0 +1,3 @@
pub mod llm_router;
pub mod router_model;
pub mod router_model_v1;

View file

@ -0,0 +1,15 @@
use common::api::open_ai::{ChatCompletionsRequest, Message};
use thiserror::Error;
#[derive(Debug, Error)]
pub enum RoutingModelError {
#[error("Failed to parse JSON: {0}")]
JsonError(#[from] serde_json::Error),
}
pub type Result<T> = std::result::Result<T, RoutingModelError>;
pub trait RouterModel: Send + Sync {
fn generate_request(&self, messages: &[Message]) -> ChatCompletionsRequest;
fn parse_response(&self, content: &str) -> Result<Option<String>>;
}

View file

@ -0,0 +1,251 @@
use common::{
api::open_ai::{ChatCompletionsRequest, Message},
consts::{SYSTEM_ROLE, USER_ROLE},
};
use serde::{Deserialize, Serialize};
use tracing::info;
use super::router_model::{RouterModel, RoutingModelError};
pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
{routes}
</routes>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
<conversation>
{conversation}
</conversation>
"#;
pub type Result<T> = std::result::Result<T, RoutingModelError>;
pub struct RouterModelV1 {
llm_providers_with_usage_yaml: String,
routing_model: String,
}
impl RouterModelV1 {
pub fn new(llm_providers_with_usage_yaml: String, routing_model: String) -> Self {
RouterModelV1 {
llm_providers_with_usage_yaml,
routing_model,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct LlmRouterResponse {
pub route: Option<String>,
}
impl RouterModel for RouterModelV1 {
fn generate_request(&self, messages: &[Message]) -> ChatCompletionsRequest {
let messages_str = messages
.iter()
.filter(|m| m.role != SYSTEM_ROLE)
.map(|m| {
let content_json_str = serde_json::to_string(&m.content).unwrap_or_default();
format!("{}: {}", m.role, content_json_str)
})
.collect::<Vec<String>>()
.join("\n");
let message = ARCH_ROUTER_V1_SYSTEM_PROMPT
.replace("{routes}", &self.llm_providers_with_usage_yaml)
.replace("{conversation}", messages_str.as_str());
ChatCompletionsRequest {
model: self.routing_model.clone(),
messages: vec![Message {
content: Some(message),
role: USER_ROLE.to_string(),
model: None,
tool_calls: None,
tool_call_id: None,
}],
tools: None,
stream: false,
stream_options: None,
metadata: None,
}
}
fn parse_response(&self, content: &str) -> Result<Option<String>> {
if content.is_empty() {
return Ok(None);
}
let router_resp_fixed = fix_json_response(content);
info!(
"router response (fixed): {}",
router_resp_fixed.replace("\n", "\\n")
);
let router_response: LlmRouterResponse = serde_json::from_str(router_resp_fixed.as_str())?;
let selected_llm = router_response.route.unwrap_or_default().to_string();
if selected_llm.is_empty() {
return Ok(None);
}
Ok(Some(selected_llm))
}
}
fn fix_json_response(body: &str) -> String {
let mut updated_body = body.to_string();
updated_body = updated_body.replace("'", "\"");
if updated_body.contains("\\n") {
updated_body = updated_body.replace("\\n", "");
}
if updated_body.starts_with("```json") {
updated_body = updated_body
.strip_prefix("```json")
.unwrap_or(&updated_body)
.to_string();
}
if updated_body.ends_with("```") {
updated_body = updated_body
.strip_suffix("```")
.unwrap_or(&updated_body)
.to_string();
}
updated_body
}
impl std::fmt::Debug for dyn RouterModel {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "RouterModel")
}
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
#[test]
fn test_system_prompt_format() {
let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
route1: description1
route2: description2
</routes>
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
<conversation>
user: "Hello, I want to book a flight."
assistant: "Sure, where would you like to go?"
user: "seattle"
</conversation>
"#;
let routes_yaml = "route1: description1\nroute2: description2";
let routing_model = "test-model".to_string();
let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone());
let messages = vec![
Message {
role: "system".to_string(),
content: Some("You are a helpful assistant.".to_string()),
..Default::default()
},
Message {
role: "user".to_string(),
content: Some("Hello, I want to book a flight.".to_string()),
..Default::default()
},
Message {
role: "assistant".to_string(),
content: Some("Sure, where would you like to go?".to_string()),
..Default::default()
},
Message {
role: "user".to_string(),
content: Some("seattle".to_string()),
..Default::default()
},
];
let req = router.generate_request(&messages);
let prompt = req.messages[0].content.as_ref().unwrap();
println!("Prompt: {}", prompt);
assert_eq!(expected_prompt, prompt);
}
}
#[test]
fn test_parse_response() {
let router = RouterModelV1::new(
"route1: description1\nroute2: description2".to_string(),
"test-model".to_string(),
);
// Case 1: Valid JSON with non-empty route
let input = r#"{"route": "route1"}"#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, Some("route1".to_string()));
// Case 2: Valid JSON with empty route
let input = r#"{"route": ""}"#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 3: Valid JSON with null route
let input = r#"{"route": null}"#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 4: JSON missing route field
let input = r#"{}"#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 4.1: empty string
let input = r#""#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 5: Malformed JSON
let input = r#"{"route": "route1""#; // missing closing }
let result = router.parse_response(input);
assert!(result.is_err());
// Case 6: Single quotes and \n in JSON
let input = "{'route': 'route2'}\\n";
let result = router.parse_response(input).unwrap();
assert_eq!(result, Some("route2".to_string()));
// Case 7: Code block marker
let input = "```json\n{\"route\": \"route1\"}\n```";
let result = router.parse_response(input).unwrap();
assert_eq!(result, Some("route1".to_string()));
}

View file

@ -171,6 +171,18 @@ pub struct Message {
pub tool_call_id: Option<String>,
}
impl Default for Message {
fn default() -> Self {
Message {
role: ASSISTANT_ROLE.to_string(),
content: None,
model: None,
tool_calls: None,
tool_call_id: None,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Choice {
pub finish_reason: Option<String>,

View file

@ -6,6 +6,11 @@ use crate::api::open_ai::{
ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Routing {
pub model: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Configuration {
pub version: String,
@ -19,6 +24,7 @@ pub struct Configuration {
pub ratelimits: Option<Vec<Ratelimit>>,
pub tracing: Option<Tracing>,
pub mode: Option<GatewayMode>,
pub routing: Option<Routing>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -166,6 +172,7 @@ pub struct LlmProvider {
pub endpoint: Option<String>,
pub port: Option<u16>,
pub rate_limits: Option<LlmRatelimit>,
pub usage: Option<String>,
}
impl Display for LlmProvider {

View file

@ -11,7 +11,8 @@ pub const MODEL_SERVER_NAME: &str = "model_server";
pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
pub const MESSAGES_KEY: &str = "messages";
pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
pub const CHAT_COMPLETIONS_PATH: [&str; 2] = ["/v1/chat/completions", "/openai/v1/chat/completions"];
pub const CHAT_COMPLETIONS_PATH: [&str; 2] =
["/v1/chat/completions", "/openai/v1/chat/completions"];
pub const HEALTHZ_PATH: &str = "/healthz";
pub const X_ARCH_STATE_HEADER: &str = "x-arch-state";
pub const X_ARCH_API_RESPONSE: &str = "x-arch-api-response-message";
@ -27,3 +28,4 @@ pub const HALLUCINATION_TEMPLATE: &str =
"It seems I'm missing some information. Could you provide the following details ";
pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
pub const OTEL_POST_PATH: &str = "/v1/traces";
pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";

View file

@ -11,3 +11,4 @@ pub mod routing;
pub mod stats;
pub mod tokenizer;
pub mod tracing;
pub mod utils;

View file

@ -0,0 +1,7 @@
pub fn shorten_string(s: &str) -> String {
if s.len() > 80 {
format!("{}...", &s[..80])
} else {
s.to_string()
}
}

View file

@ -228,6 +228,7 @@ impl HttpContext for StreamContext {
stream: None,
port: None,
rate_limits: None,
usage: None,
}));
} else {
self.select_llm_provider();
@ -316,10 +317,6 @@ impl HttpContext for StreamContext {
}
};
// remove metadata from the request body
//TODO: move this to prompt gateway
// deserialized_body.metadata = None;
// delete model key from message array
for message in deserialized_body.messages.iter_mut() {
message.model = None;
}
@ -342,24 +339,22 @@ impl HttpContext for StreamContext {
};
let model_requested = deserialized_body.model.clone();
if deserialized_body.model.is_empty() || deserialized_body.model.to_lowercase() == "none" {
deserialized_body.model = match model_name {
Some(model_name) => model_name.clone(),
None => {
if use_agent_orchestrator {
"agent_orchestrator".to_string()
} else {
self.send_server_error(
ServerError::BadRequest {
why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
},
Some(StatusCode::BAD_REQUEST),
);
return Action::Continue;
}
deserialized_body.model = match model_name {
Some(model_name) => model_name.clone(),
None => {
if use_agent_orchestrator {
"agent_orchestrator".to_string()
} else {
self.send_server_error(
ServerError::BadRequest {
why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
},
Some(StatusCode::BAD_REQUEST),
);
return Action::Continue;
}
}
}
};
info!(
"on_http_request_body: provider: {}, model requested: {}, model selected: {}",

View file

@ -489,7 +489,6 @@ fn llm_gateway_override_model_name() {
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)

View file

@ -777,18 +777,19 @@ impl StreamContext {
fn check_intent_matched(model_server_response: &ChatCompletionsResponse) -> bool {
let content = model_server_response
.choices.first()
.choices
.first()
.and_then(|choice| choice.message.content.as_ref());
let content_has_value = content.is_some() && !content.unwrap().is_empty();
let tool_calls = model_server_response
.choices.first()
.choices
.first()
.and_then(|choice| choice.message.tool_calls.as_ref());
// intent was matched if content has some value or tool_calls is empty
content_has_value || (tool_calls.is_some() && !tool_calls.unwrap().is_empty())
}

View file

@ -1,13 +1,16 @@
#!/bin/bash
set -eu
echo "docker images"
docker images
# for demo in currency_exchange hr_agent
for demo in currency_exchange
for demo in samples_python/currency_exchange use_cases/preference_based_routing
do
echo "******************************************"
echo "Running tests for $demo ..."
echo "****************************************"
cd ../../samples_python/$demo
cd ../../$demo
echo "starting archgw"
archgw up arch_config.yaml
echo "starting docker containers"

View file

@ -0,0 +1,2 @@
# Usage based LLM Routing
This demo shows how you can use user preferences to route user prompts to appropriate llm. See [arch_config.yaml](arch_config.yaml) for details on how you can define user preferences.

View file

@ -0,0 +1,39 @@
version: "0.1-beta"
routing:
model: gpt-4o
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:
- name: archgw-v1-router-model
provider_interface: openai
model: cotran2/llama-1b-4-26
base_url: http://35.192.87.187:8000/v1
- name: gpt-4o-mini
provider_interface: openai
access_key: $OPENAI_API_KEY
model: gpt-4o-mini
default: true
- name: gpt-4o
provider_interface: openai
access_key: $OPENAI_API_KEY
model: gpt-4o
usage: Generating original content such as scripts, articles, or creative materials.
- name: o4-mini
provider_interface: openai
access_key: $OPENAI_API_KEY
model: o4-mini
usage: Requesting topic ideas specifically related to personal finance and budgeting.
tracing:
random_sampling: 100

View file

@ -0,0 +1,32 @@
services:
chatbot_ui:
build:
context: ../../shared/chatbot_ui
dockerfile: Dockerfile
ports:
- "18080:8080"
environment:
- CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
prometheus:
build:
context: ../../shared/prometheus
grafana:
build:
context: ../../shared/grafana
ports:
- "3000:3000"

View file

@ -0,0 +1,18 @@
POST http://localhost:12000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "I am running under debt, how should I keep a tab on my expenses?"
}
]
}
HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^o4-mini/
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" != null
jsonpath "$.choices[0].message.role" == "assistant"

View file

@ -0,0 +1,16 @@
POST http://localhost:12000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "I am running under debt, how should I keep a tab on my expenses?"
}
],
"stream": true
}
HTTP 200
[Asserts]
header "content-type" matches /text\/event-stream/
body matches /^data: .*?o4-mini.*?\n/

View file

@ -0,0 +1,24 @@
@arch_llm_router_endpoint = http://35.192.87.187:8000
POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
"model": "cotran2/llama-1b-4-26",
"messages": [
{
"role": "user",
"content": "You are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o()\n description: \"complex reasoning problem, require multi step answer\\n\"\n- name: o4-mini()\n description: \"simple requests, basic fact retrieval, easy to answer\\n\"\n\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n\nuser: Hello\nassistant: Hi! How can I assist you today?\nuser: List us presidents who are born in odd years and are still alive. Order them by their age and I also know what is their home city they were born. And what year they became president. Also give me summary of which president was the best for economy of the US.\n\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace."
}
]
}
### test 2
POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{"model":"cotran2/llama-1b-4-26","messages":[{"role":"user","content":"\nYou are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o\n description: simple requests, basic fact retrieval, easy to answer\n- name: o4-mini()\n description: complex reasoning problem, require multi step answer\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n[{\"role\":\"user\",\"content\":\"What is the capital of France?\"}]\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.\n"}],"stream":false}
### get model list
GET http://34.46.85.85:8000/v1/models HTTP/1.1

View file

@ -0,0 +1,77 @@
@llm_endpoint = http://localhost:12000
@openai_endpoint = https://api.openai.com
@access_key = {{$dotenv OPENAI_API_KEY}}
### openai request
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
Authorization: Bearer {{access_key}}
{
"messages": [
{
"role": "user",
"content": "hello"
}
],
"model": "gpt-4o-mini",
"stream": true
}
### openai request (streaming)
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
Authorization: Bearer {{access_key}}
{
"messages": [
{
"role": "user",
"content": "hello"
}
],
"model": "gpt-4o-mini",
"stream": true
}
### llm gateway request
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "hello"
}
]
}
### llm gateway request (streaming)
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "hello"
}
],
"stream": true
}
### llm gateway request (provider hint)
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
x-arch-llm-provider-hint: gpt-3.5-turbo-0125
{
"messages": [
{
"role": "user",
"content": "hello"
}
]
}