mirror of
https://github.com/katanemo/plano.git
synced 2026-06-17 15:25:17 +02:00
remove 9090 and consolite to port 12000 for llm routing
This commit is contained in:
parent
ef65527ff0
commit
7feb168a06
5 changed files with 32 additions and 69 deletions
|
|
@ -30,55 +30,6 @@ stats_config:
|
|||
static_resources:
|
||||
listeners:
|
||||
|
||||
- name: arch_router
|
||||
address:
|
||||
socket_address:
|
||||
address: 0.0.0.0
|
||||
port_value: 9090
|
||||
traffic_direction: INBOUND
|
||||
filter_chains:
|
||||
- filters:
|
||||
- name: envoy.filters.network.http_connection_manager
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
|
||||
generate_request_id: true
|
||||
tracing:
|
||||
provider:
|
||||
name: envoy.tracers.opentelemetry
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
|
||||
grpc_service:
|
||||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: arch_router
|
||||
random_sampling:
|
||||
value: 100
|
||||
stat_prefix: ingress
|
||||
codec_type: AUTO
|
||||
access_log:
|
||||
- name: envoy.access_loggers.file
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
|
||||
path: "/var/log/access_arch_router.log"
|
||||
route_config:
|
||||
name: local_routes
|
||||
virtual_hosts:
|
||||
- name: local_service
|
||||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: bright_staff
|
||||
http_filters:
|
||||
- name: envoy.filters.http.router
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
|
||||
|
||||
|
||||
- name: ingress_traffic
|
||||
address:
|
||||
socket_address:
|
||||
|
|
@ -378,11 +329,15 @@ static_resources:
|
|||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/healthz"
|
||||
direct_response:
|
||||
status: 200
|
||||
- match:
|
||||
prefix: "/"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: arch_listener_llm
|
||||
cluster: bright_staff
|
||||
timeout: {{ llm_gateway_listener.timeout }}
|
||||
http_filters:
|
||||
- name: envoy.filters.http.router
|
||||
|
|
@ -430,12 +385,6 @@ static_resources:
|
|||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/healthz"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
{% for provider in arch_llm_providers %}
|
||||
# if endpoint is set then use custom cluster for upstream llm
|
||||
{% if provider.endpoint %}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ use common::api::open_ai::ChatCompletionsRequest;
|
|||
use common::consts::ARCH_PROVIDER_HINT_HEADER;
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::{BodyExt, Full, StreamBody};
|
||||
use hyper::body::Body;
|
||||
use hyper::body::Frame;
|
||||
use hyper::header::{self};
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
|
|
@ -22,18 +21,11 @@ fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
|
|||
.boxed()
|
||||
}
|
||||
|
||||
pub async fn chat_completion(
|
||||
pub async fn chat_completions(
|
||||
request: Request<hyper::body::Incoming>,
|
||||
router_service: Arc<RouterService>,
|
||||
llm_provider_endpoint: String,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
let max = request.body().size_hint().upper().unwrap_or(u64::MAX);
|
||||
if max > 1024 * 1024 {
|
||||
let error_msg = format!("Request body too large: {} bytes", max);
|
||||
let mut too_large = Response::new(full(error_msg));
|
||||
*too_large.status_mut() = StatusCode::PAYLOAD_TOO_LARGE;
|
||||
return Ok(too_large);
|
||||
}
|
||||
|
||||
let mut request_headers = request.headers().clone();
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use brightstaff::handlers::chat_completions::chat_completion;
|
||||
use brightstaff::handlers::chat_completions::chat_completions;
|
||||
use brightstaff::router::llm_router::RouterService;
|
||||
use bytes::Bytes;
|
||||
use common::configuration::Configuration;
|
||||
|
|
@ -89,16 +89,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
);
|
||||
|
||||
let llm_provider_endpoint = env::var("LLM_PROVIDER_ENDPOINT")
|
||||
.unwrap_or_else(|_| "http://localhost:12000/v1/chat/completions".to_string());
|
||||
.unwrap_or_else(|_| "http://localhost:12001/v1/chat/completions".to_string());
|
||||
|
||||
info!("llm provider endpoint: {}", llm_provider_endpoint);
|
||||
info!("Listening on http://{}", bind_address);
|
||||
let listener = TcpListener::bind(bind_address).await?;
|
||||
|
||||
|
||||
// if routing is null then return gpt-4o as model name
|
||||
let model = arch_config.routing.as_ref().map_or_else(
|
||||
|| "gpt-4o".to_string(),
|
||||
|routing| routing.model.clone(),
|
||||
);
|
||||
|
||||
let router_service: Arc<RouterService> = Arc::new(RouterService::new(
|
||||
arch_config.llm_providers.clone(),
|
||||
llm_provider_endpoint.clone(),
|
||||
arch_config.routing.as_ref().unwrap().model.clone(),
|
||||
model,
|
||||
));
|
||||
|
||||
loop {
|
||||
|
|
@ -123,7 +130,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
async move {
|
||||
match (req.method(), req.uri().path()) {
|
||||
(&Method::POST, "/v1/chat/completions") => {
|
||||
chat_completion(req, router_service, llm_provider_endpoint)
|
||||
chat_completions(req, router_service, llm_provider_endpoint)
|
||||
.with_context(parent_cx)
|
||||
.await
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ pub struct RouterService {
|
|||
client: reqwest::Client,
|
||||
router_model: Arc<dyn RouterModel>,
|
||||
routing_model_name: String,
|
||||
llm_usage_defined: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
|
|
@ -73,6 +74,7 @@ impl RouterService {
|
|||
client: reqwest::Client::new(),
|
||||
router_model,
|
||||
routing_model_name,
|
||||
llm_usage_defined: !providers_with_usage.is_empty(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -81,6 +83,11 @@ impl RouterService {
|
|||
messages: &[Message],
|
||||
trace_parent: Option<String>,
|
||||
) -> Result<Option<String>> {
|
||||
|
||||
if !self.llm_usage_defined {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let router_request = self.router_model.generate_request(messages);
|
||||
|
||||
info!(
|
||||
|
|
|
|||
|
|
@ -82,6 +82,9 @@ impl RouterModel for RouterModelV1 {
|
|||
}
|
||||
|
||||
fn parse_response(&self, content: &str) -> Result<Option<String>> {
|
||||
if content.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let router_resp_fixed = fix_json_response(content);
|
||||
info!(
|
||||
"router response (fixed): {}",
|
||||
|
|
@ -226,6 +229,11 @@ fn test_parse_response() {
|
|||
let result = router.parse_response(input).unwrap();
|
||||
assert_eq!(result, None);
|
||||
|
||||
// Case 4.1: empty string
|
||||
let input = r#""#;
|
||||
let result = router.parse_response(input).unwrap();
|
||||
assert_eq!(result, None);
|
||||
|
||||
// Case 5: Malformed JSON
|
||||
let input = r#"{"route": "route1""#; // missing closing }
|
||||
let result = router.parse_response(input);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue