remove port 9090 and consolidate onto port 12000 for llm routing

This commit is contained in:
Adil Hafeez 2025-05-16 17:10:37 -07:00
parent ef65527ff0
commit 7feb168a06
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
5 changed files with 32 additions and 69 deletions

View file

@ -30,55 +30,6 @@ stats_config:
static_resources:
listeners:
- name: arch_router
address:
socket_address:
address: 0.0.0.0
port_value: 9090
traffic_direction: INBOUND
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
generate_request_id: true
tracing:
provider:
name: envoy.tracers.opentelemetry
typed_config:
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
grpc_service:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: arch_router
random_sampling:
value: 100
stat_prefix: ingress
codec_type: AUTO
access_log:
- name: envoy.access_loggers.file
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_arch_router.log"
route_config:
name: local_routes
virtual_hosts:
- name: local_service
domains:
- "*"
routes:
- match:
prefix: "/"
route:
auto_host_rewrite: true
cluster: bright_staff
http_filters:
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: ingress_traffic
address:
socket_address:
@ -378,11 +329,15 @@ static_resources:
domains:
- "*"
routes:
- match:
prefix: "/healthz"
direct_response:
status: 200
- match:
prefix: "/"
route:
auto_host_rewrite: true
cluster: arch_listener_llm
cluster: bright_staff
timeout: {{ llm_gateway_listener.timeout }}
http_filters:
- name: envoy.filters.http.router
@ -430,12 +385,6 @@ static_resources:
domains:
- "*"
routes:
- match:
prefix: "/healthz"
route:
auto_host_rewrite: true
cluster: openai
timeout: 60s
{% for provider in arch_llm_providers %}
# if endpoint is set then use custom cluster for upstream llm
{% if provider.endpoint %}

View file

@ -5,7 +5,6 @@ use common::api::open_ai::ChatCompletionsRequest;
use common::consts::ARCH_PROVIDER_HINT_HEADER;
use http_body_util::combinators::BoxBody;
use http_body_util::{BodyExt, Full, StreamBody};
use hyper::body::Body;
use hyper::body::Frame;
use hyper::header::{self};
use hyper::{Request, Response, StatusCode};
@ -22,18 +21,11 @@ fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
.boxed()
}
pub async fn chat_completion(
pub async fn chat_completions(
request: Request<hyper::body::Incoming>,
router_service: Arc<RouterService>,
llm_provider_endpoint: String,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let max = request.body().size_hint().upper().unwrap_or(u64::MAX);
if max > 1024 * 1024 {
let error_msg = format!("Request body too large: {} bytes", max);
let mut too_large = Response::new(full(error_msg));
*too_large.status_mut() = StatusCode::PAYLOAD_TOO_LARGE;
return Ok(too_large);
}
let mut request_headers = request.headers().clone();

View file

@ -1,4 +1,4 @@
use brightstaff::handlers::chat_completions::chat_completion;
use brightstaff::handlers::chat_completions::chat_completions;
use brightstaff::router::llm_router::RouterService;
use bytes::Bytes;
use common::configuration::Configuration;
@ -89,16 +89,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
);
let llm_provider_endpoint = env::var("LLM_PROVIDER_ENDPOINT")
.unwrap_or_else(|_| "http://localhost:12000/v1/chat/completions".to_string());
.unwrap_or_else(|_| "http://localhost:12001/v1/chat/completions".to_string());
info!("llm provider endpoint: {}", llm_provider_endpoint);
info!("Listening on http://{}", bind_address);
let listener = TcpListener::bind(bind_address).await?;
// if routing is not configured, fall back to gpt-4o as the model name
let model = arch_config.routing.as_ref().map_or_else(
|| "gpt-4o".to_string(),
|routing| routing.model.clone(),
);
let router_service: Arc<RouterService> = Arc::new(RouterService::new(
arch_config.llm_providers.clone(),
llm_provider_endpoint.clone(),
arch_config.routing.as_ref().unwrap().model.clone(),
model,
));
loop {
@ -123,7 +130,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
async move {
match (req.method(), req.uri().path()) {
(&Method::POST, "/v1/chat/completions") => {
chat_completion(req, router_service, llm_provider_endpoint)
chat_completions(req, router_service, llm_provider_endpoint)
.with_context(parent_cx)
.await
}

View file

@ -17,6 +17,7 @@ pub struct RouterService {
client: reqwest::Client,
router_model: Arc<dyn RouterModel>,
routing_model_name: String,
llm_usage_defined: bool,
}
#[derive(Debug, Error)]
@ -73,6 +74,7 @@ impl RouterService {
client: reqwest::Client::new(),
router_model,
routing_model_name,
llm_usage_defined: !providers_with_usage.is_empty(),
}
}
@ -81,6 +83,11 @@ impl RouterService {
messages: &[Message],
trace_parent: Option<String>,
) -> Result<Option<String>> {
if !self.llm_usage_defined {
return Ok(None);
}
let router_request = self.router_model.generate_request(messages);
info!(

View file

@ -82,6 +82,9 @@ impl RouterModel for RouterModelV1 {
}
fn parse_response(&self, content: &str) -> Result<Option<String>> {
if content.is_empty() {
return Ok(None);
}
let router_resp_fixed = fix_json_response(content);
info!(
"router response (fixed): {}",
@ -226,6 +229,11 @@ fn test_parse_response() {
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 4.1: empty string
let input = r#""#;
let result = router.parse_response(input).unwrap();
assert_eq!(result, None);
// Case 5: Malformed JSON
let input = r#"{"route": "route1""#; // missing closing }
let result = router.parse_response(input);