From 7feb168a06c5b1ffe1bf010b3845b8a19338189d Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 16 May 2025 17:10:37 -0700 Subject: [PATCH] remove 9090 and consolidate to port 12000 for llm routing --- arch/envoy.template.yaml | 61 ++----------------- .../src/handlers/chat_completions.rs | 10 +-- crates/brightstaff/src/main.rs | 15 +++-- crates/brightstaff/src/router/llm_router.rs | 7 +++ .../brightstaff/src/router/router_model_v1.rs | 8 +++ 5 files changed, 32 insertions(+), 69 deletions(-) diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index f9dd6d4a..3e696e2e 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -30,55 +30,6 @@ stats_config: static_resources: listeners: - - name: arch_router - address: - socket_address: - address: 0.0.0.0 - port_value: 9090 - traffic_direction: INBOUND - filter_chains: - - filters: - - name: envoy.filters.network.http_connection_manager - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager - generate_request_id: true - tracing: - provider: - name: envoy.tracers.opentelemetry - typed_config: - "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig - grpc_service: - envoy_grpc: - cluster_name: opentelemetry_collector - timeout: 0.250s - service_name: arch_router - random_sampling: - value: 100 - stat_prefix: ingress - codec_type: AUTO - access_log: - - name: envoy.access_loggers.file - typed_config: - "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog - path: "/var/log/access_arch_router.log" - route_config: - name: local_routes - virtual_hosts: - - name: local_service - domains: - - "*" - routes: - - match: - prefix: "/" - route: - auto_host_rewrite: true - cluster: bright_staff - http_filters: - - name: envoy.filters.http.router - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - - name: ingress_traffic address: 
socket_address: @@ -378,11 +329,15 @@ static_resources: domains: - "*" routes: + - match: + prefix: "/healthz" + direct_response: + status: 200 - match: prefix: "/" route: auto_host_rewrite: true - cluster: arch_listener_llm + cluster: bright_staff timeout: {{ llm_gateway_listener.timeout }} http_filters: - name: envoy.filters.http.router @@ -430,12 +385,6 @@ static_resources: domains: - "*" routes: - - match: - prefix: "/healthz" - route: - auto_host_rewrite: true - cluster: openai - timeout: 60s {% for provider in arch_llm_providers %} # if endpoint is set then use custom cluster for upstream llm {% if provider.endpoint %} diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs index b4ce7459..08e3fd7e 100644 --- a/crates/brightstaff/src/handlers/chat_completions.rs +++ b/crates/brightstaff/src/handlers/chat_completions.rs @@ -5,7 +5,6 @@ use common::api::open_ai::ChatCompletionsRequest; use common::consts::ARCH_PROVIDER_HINT_HEADER; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full, StreamBody}; -use hyper::body::Body; use hyper::body::Frame; use hyper::header::{self}; use hyper::{Request, Response, StatusCode}; @@ -22,18 +21,11 @@ fn full>(chunk: T) -> BoxBody { .boxed() } -pub async fn chat_completion( +pub async fn chat_completions( request: Request, router_service: Arc, llm_provider_endpoint: String, ) -> Result>, hyper::Error> { - let max = request.body().size_hint().upper().unwrap_or(u64::MAX); - if max > 1024 * 1024 { - let error_msg = format!("Request body too large: {} bytes", max); - let mut too_large = Response::new(full(error_msg)); - *too_large.status_mut() = StatusCode::PAYLOAD_TOO_LARGE; - return Ok(too_large); - } let mut request_headers = request.headers().clone(); diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 02ed0909..963b21f5 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -1,4 +1,4 
@@ -use brightstaff::handlers::chat_completions::chat_completion; +use brightstaff::handlers::chat_completions::chat_completions; use brightstaff::router::llm_router::RouterService; use bytes::Bytes; use common::configuration::Configuration; @@ -89,16 +89,23 @@ async fn main() -> Result<(), Box> { ); let llm_provider_endpoint = env::var("LLM_PROVIDER_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:12000/v1/chat/completions".to_string()); + .unwrap_or_else(|_| "http://localhost:12001/v1/chat/completions".to_string()); info!("llm provider endpoint: {}", llm_provider_endpoint); info!("Listening on http://{}", bind_address); let listener = TcpListener::bind(bind_address).await?; + + // if routing is null then return gpt-4o as model name + let model = arch_config.routing.as_ref().map_or_else( + || "gpt-4o".to_string(), + |routing| routing.model.clone(), + ); + let router_service: Arc = Arc::new(RouterService::new( arch_config.llm_providers.clone(), llm_provider_endpoint.clone(), - arch_config.routing.as_ref().unwrap().model.clone(), + model, )); loop { @@ -123,7 +130,7 @@ async fn main() -> Result<(), Box> { async move { match (req.method(), req.uri().path()) { (&Method::POST, "/v1/chat/completions") => { - chat_completion(req, router_service, llm_provider_endpoint) + chat_completions(req, router_service, llm_provider_endpoint) .with_context(parent_cx) .await } diff --git a/crates/brightstaff/src/router/llm_router.rs b/crates/brightstaff/src/router/llm_router.rs index 8c1d9c23..47f2b41c 100644 --- a/crates/brightstaff/src/router/llm_router.rs +++ b/crates/brightstaff/src/router/llm_router.rs @@ -17,6 +17,7 @@ pub struct RouterService { client: reqwest::Client, router_model: Arc, routing_model_name: String, + llm_usage_defined: bool, } #[derive(Debug, Error)] @@ -73,6 +74,7 @@ impl RouterService { client: reqwest::Client::new(), router_model, routing_model_name, + llm_usage_defined: !providers_with_usage.is_empty(), } } @@ -81,6 +83,11 @@ impl RouterService { 
messages: &[Message], trace_parent: Option, ) -> Result> { + + if !self.llm_usage_defined { + return Ok(None); + } + let router_request = self.router_model.generate_request(messages); info!( diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs index 7bd1a987..836c79e5 100644 --- a/crates/brightstaff/src/router/router_model_v1.rs +++ b/crates/brightstaff/src/router/router_model_v1.rs @@ -82,6 +82,9 @@ impl RouterModel for RouterModelV1 { } fn parse_response(&self, content: &str) -> Result> { + if content.is_empty() { + return Ok(None); + } let router_resp_fixed = fix_json_response(content); info!( "router response (fixed): {}", @@ -226,6 +229,11 @@ fn test_parse_response() { let result = router.parse_response(input).unwrap(); assert_eq!(result, None); + // Case 4.1: empty string + let input = r#""#; + let result = router.parse_response(input).unwrap(); + assert_eq!(result, None); + // Case 5: Malformed JSON let input = r#"{"route": "route1""#; // missing closing } let result = router.parse_response(input);