diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index 8144a65d..42f9b17c 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -30,7 +30,7 @@ stats_config: static_resources: listeners: - - name: ingress + - name: arch_router address: socket_address: address: 0.0.0.0 @@ -51,7 +51,7 @@ static_resources: envoy_grpc: cluster_name: opentelemetry_collector timeout: 0.250s - service_name: arch_gateway + service_name: arch_router random_sampling: value: 100 stat_prefix: ingress @@ -60,7 +60,7 @@ static_resources: - name: envoy.access_loggers.file typed_config: "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog - path: "/var/log/access_ingress.log" + path: "/var/log/access_arch_router.log" route_config: name: local_routes virtual_hosts: @@ -68,16 +68,6 @@ static_resources: domains: - "*" routes: - - match: - prefix: "/" - headers: - - name: "host" - string_match: - exact: router_model_host - route: - auto_host_rewrite: true - cluster: router_model_host - - match: prefix: "/" route: diff --git a/crates/common/src/routing.rs b/crates/common/src/routing.rs index f8e6146b..7acd0b08 100644 --- a/crates/common/src/routing.rs +++ b/crates/common/src/routing.rs @@ -30,8 +30,6 @@ pub fn get_llm_provider( ProviderHint::Name(name) => llm_providers.get(&name), }); - info!("selected provider: maybe_provider: {:?}", maybe_provider); - if let Some(provider) = maybe_provider { return provider; } diff --git a/crates/llm_gateway/src/lib.rs b/crates/llm_gateway/src/lib.rs index e3a071a3..f585ba0e 100644 --- a/crates/llm_gateway/src/lib.rs +++ b/crates/llm_gateway/src/lib.rs @@ -3,8 +3,6 @@ use proxy_wasm::traits::*; use proxy_wasm::types::*; mod filter_context; -mod llm_routing; -mod llm_routing_consts; mod metrics; mod stream_context; diff --git a/crates/llm_gateway/src/llm_routing.rs b/crates/llm_gateway/src/llm_routing.rs deleted file mode 100644 index 9f183155..00000000 --- a/crates/llm_gateway/src/llm_routing.rs +++ /dev/null @@ -1,106 +0,0 @@ -// use std::rc::Rc; -// use std::time::Duration; - -// use common::api::open_ai::{ChatCompletionsRequest, Message}; -// use common::configuration::LlmProvider; -// use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, ARCH_UPSTREAM_HOST_HEADER}; -// use common::errors::ServerError; -// use common::http::{CallArgs, Client}; -// use log::{info, warn}; -// use proxy_wasm::traits::HttpContext; -// use proxy_wasm::types::Action; - -// use crate::llm_routing_consts::SYSTEM_PROMPT; -// use crate::stream_context::{CallContext, StreamContext}; - -// pub trait Routing { -// fn route(&self) -> Action; -// } - -// impl Routing for StreamContext { -// fn route(&self) -> Action { -// let usage_based_providers = self -// .llm_providers -// .iter() -// .filter(|(_, provider)| provider.usage.is_some()) -// .map(|(_, provider)| provider.clone()) -// .collect::>>(); - -// info!( -// "usage based providers found: {}", -// usage_based_providers -// .iter() -// .map(|provider| provider.name.clone()) -// .collect::>() -// .join(", ") -// ); - -// if usage_based_providers.is_empty() { -// self.set_http_request_body( -// 0, -// self.request_size.unwrap(), -// self.request_body.as_ref().unwrap().as_bytes(), -// ); -// return Action::Continue; -// } - -// let llm_routes_str = r#"- name: gpt-4o -// description: simple requests, basic fact retrieval, easy to answer -// - name: o4-mini() -// description: complex reasoning problem, require multi step answer"#; - -// let chat_completions_request_messages_str = -// serde_json::to_string(&self.chat_completion_request.as_ref().unwrap().messages) -// .expect("failed to serialize llm routing request messages"); - -// let system_prompt_formatted = SYSTEM_PROMPT -// .replace("{routes}", llm_routes_str) -// .replace("{conversation}", &chat_completions_request_messages_str); - -// let message = Message { -// role: "user".to_string(), -// content: Some(system_prompt_formatted), -// model: None, -// tool_calls: None, -// tool_call_id: None, -// }; - -// let llm_routing_request = ChatCompletionsRequest { -// model: "cotran2/llama-1b-4-26".to_string(), -// messages: vec![message], -// tools: None, -// stream: false, -// stream_options: None, -// metadata: None, -// }; - -// let llm_routing_request_str = serde_json::to_string(&llm_routing_request) -// .expect("failed to serialize llm routing request"); - -// let headers = vec![ -// (":method", "POST"), -// (ARCH_UPSTREAM_HOST_HEADER, "gcp_hosted_outer_llm"), -// (":path", "/v1/chat/completions"), -// (":authority", "gcp_hosted_outer_llm"), -// ("content-type", "application/json"), -// ("x-envoy-max-retries", "3"), -// ("x-envoy-upstream-rq-timeout-ms", "5000"), -// ]; - -// let call_args = CallArgs::new( -// ARCH_INTERNAL_CLUSTER_NAME, -// "/v1/chat/completions", -// headers, -// llm_routing_request_str.as_bytes().into(), -// vec![], -// Duration::from_secs(5), -// ); - -// if let Err(e) = self.http_call(call_args, CallContext {}) { -// warn!("failed to call LLM provider: {}", e); -// self.send_server_error(ServerError::HttpDispatch(e), None); -// } - -// Action::Pause -// } -// } diff --git a/crates/llm_gateway/src/llm_routing_consts.rs b/crates/llm_gateway/src/llm_routing_consts.rs deleted file mode 100644 index 9f4045c5..00000000 --- a/crates/llm_gateway/src/llm_routing_consts.rs +++ /dev/null @@ -1,32 +0,0 @@ -// pub const SYSTEM_PROMPT: &str = r#" -// You are an advanced Routing Assistant designed to select the optimal route based on user requests. -// Your task is to analyze conversations and match them to the most appropriate predefined route. -// Review the available routes config: - -// # ROUTES CONFIG START -// {routes} -// # ROUTES CONFIG END - -// Examine the following conversation between a user and an assistant: - -// # CONVERSATION START -// {conversation} -// # CONVERSATION END - -// Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps: - -// 1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario. -// 2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description). -// 3. Find the route that best matches. -// 4. Use context clues from the entire conversation to determine the best fit. -// 5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config. -// 6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. - -// # OUTPUT FORMAT -// Your final output must follow this JSON format: -// { -// "route": "route_name" # The matched route name, or empty string '' if no match -// } - -// Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace. -// "#; diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 19fa735e..326ccbb8 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -426,7 +426,6 @@ impl HttpContext for StreamContext { self.request_size = Some(body_size); return Action::Continue; - // return self.route(); } fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action { diff --git a/demos/use_cases/preference_based_routing/README.md b/demos/use_cases/preference_based_routing/README.md index 08c06dc4..470f6eaf 100644 --- a/demos/use_cases/preference_based_routing/README.md +++ b/demos/use_cases/preference_based_routing/README.md @@ -1,58 +1 @@ -# LLM Routing -This demo shows how you can arch gateway to manage keys and route to upstream LLM. - -# Starting the demo -1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly -1. Start Arch - ```sh - sh run_demo.sh - ``` -1. Navigate to http://localhost:18080/ - -Following screen shows an example of interaction with arch gateway showing dynamic routing. You can select between different LLMs using "override model" option in the chat UI. - -![LLM Routing Demo](llm_routing_demo.png) - -You can also pass in a header to override model when sending prompt. Following example shows how you can use `x-arch-llm-provider-hint` header to override model selection, - -```bash - -$ curl --header 'Content-Type: application/json' \ - --header 'x-arch-llm-provider-hint: ministral-3b' \ - --data '{"messages": [{"role": "user","content": "hello"}]}' \ - http://localhost:12000/v1/chat/completions 2> /dev/null | jq . -{ - "id": "xxx", - "object": "chat.completion", - "created": 1737760394, - "model": "ministral-3b-latest", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "tool_calls": null, - "content": "Hello! How can I assist you today? Let's chat about anything you'd like. 😊" - }, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 4, - "total_tokens": 25, - "completion_tokens": 21 - } -} - -``` - -# Observability -Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow instructions below, - -1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials) -1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats -1. For tracing you can head over to http://localhost:16686/ to view recent traces. - -Following is a screenshot of tracing UI showing call received by arch gateway and making upstream call to LLM, - -![Jaeger Tracing](jaeger_tracing_llm_routing.png) +# Usage based LLM Routing diff --git a/demos/use_cases/preference_based_routing/convert_system_prompt.py b/demos/use_cases/preference_based_routing/convert_system_prompt.py deleted file mode 100644 index 5178cf41..00000000 --- a/demos/use_cases/preference_based_routing/convert_system_prompt.py +++ /dev/null @@ -1,62 +0,0 @@ -import json -import yaml - -system_prompt = """ -You are an advanced Routing Assistant designed to select the optimal route based on user requests. -Your task is to analyze conversations and match them to the most appropriate predefined route. -Review the available routes config: - -# ROUTES CONFIG START -{routes} -# ROUTES CONFIG END - -Examine the following conversation between a user and an assistant: - -# CONVERSATION START -{conversation} -# CONVERSATION END - -Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps: - -1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario. -2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description). -3. Find the route that best matches. -4. Use context clues from the entire conversation to determine the best fit. -5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config. -6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. -""" - -output_format = """ -# OUTPUT FORMAT -Your final output must follow this JSON format: -{ - "route": "route_name" # The matched route name, or empty string '' if no match -} - -Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace. -""" - - -with open("arch_config.yaml", "r") as file: - data = yaml.safe_load(file) - -llm_provider_routes = "" - -for llm_provider in data.get("llm_providers", []): - llm_provider_routes += f"- name: {llm_provider.get('name')}()\n" - llm_provider_routes += f" description: {json.dumps(llm_provider.get('usage'))}\n" - - -conversation = """ -user: Hello -assistant: Hi! How can I assist you today? -user: I want to know how far is sun from earth. -""" - -system_prompt_formatted = system_prompt.format( - routes=llm_provider_routes, conversation=conversation -) - -system_prompt_2 = f"{system_prompt_formatted}\n{output_format}" -print(system_prompt_2) -print(json.dumps(system_prompt_2, indent=2)) diff --git a/demos/use_cases/preference_based_routing/jaeger_tracing_llm_routing.png b/demos/use_cases/preference_based_routing/jaeger_tracing_llm_routing.png deleted file mode 100644 index e18016d1..00000000 Binary files a/demos/use_cases/preference_based_routing/jaeger_tracing_llm_routing.png and /dev/null differ diff --git a/demos/use_cases/preference_based_routing/llm_routing_demo.png b/demos/use_cases/preference_based_routing/llm_routing_demo.png deleted file mode 100644 index 50f25677..00000000 Binary files a/demos/use_cases/preference_based_routing/llm_routing_demo.png and /dev/null differ diff --git a/demos/use_cases/preference_based_routing/run_demo.sh b/demos/use_cases/preference_based_routing/run_demo.sh deleted file mode 100644 index c0eafee6..00000000 --- a/demos/use_cases/preference_based_routing/run_demo.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -e - -# Function to start the demo -start_demo() { - # Step 1: Check if .env file exists - if [ -f ".env" ]; then - echo ".env file already exists. Skipping creation." - else - # Step 2: Create `.env` file and set OpenAI key - if [ -z "$OPENAI_API_KEY" ]; then - echo "Error: OPENAI_API_KEY environment variable is not set for the demo." - exit 1 - fi - - echo "Creating .env file..." - echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env - echo ".env file created with OPENAI_API_KEY." - fi - - # Step 3: Start Arch - echo "Starting Arch with arch_config.yaml..." - archgw up arch_config.yaml - - # Step 4: Start LLM Routing - echo "Starting LLM Routing using Docker Compose..." - docker compose up -d # Run in detached mode -} - -# Function to stop the demo -stop_demo() { - # Step 1: Stop Docker Compose services - echo "Stopping LLM Routing using Docker Compose..." - docker compose down - - # Step 2: Stop Arch - echo "Stopping Arch..." - archgw down -} - -# Main script logic -if [ "$1" == "down" ]; then - stop_demo -else - # Default action is to bring the demo up - start_demo -fi diff --git a/demos/use_cases/preference_based_routing/staff_req.json b/demos/use_cases/preference_based_routing/staff_req.json deleted file mode 100644 index 47195643..00000000 --- a/demos/use_cases/preference_based_routing/staff_req.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "model": "cotran2/llama-1b-4-26", - "messages": [ - { - "role": "user", - "content": "What is the capital of France?" - } - ], - "metadata": { - "llm_providers": "[]" - } -} diff --git a/demos/use_cases/preference_based_routing/test_sytem_prompt.text b/demos/use_cases/preference_based_routing/test_sytem_prompt.text deleted file mode 100644 index ea5efb6e..00000000 --- a/demos/use_cases/preference_based_routing/test_sytem_prompt.text +++ /dev/null @@ -1,31 +0,0 @@ -You are an advanced Routing Assistant designed to select the optimal route based on user requests. -Your task is to analyze conversations and match them to the most appropriate predefined route. -Review the available routes config: - -# ROUTES CONFIG START -{} -# ROUTES CONFIG END - -Examine the following conversation between a user and an assistant: - -# CONVERSATION START -{} -# CONVERSATION END - -Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps: - -1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario. -2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description). -3. Find the route that best matches. -4. Use context clues from the entire conversation to determine the best fit. -5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config. -6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. -""" -output_prompt = """ -# OUTPUT FORMAT -Your final output must follow this JSON format: -{ - "route": "route_name" # The matched route name, or empty string '' if no match -} - -Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.