adding more changes

This commit is contained in:
Adil Hafeez 2025-05-12 13:09:22 -07:00
parent f13fc76a4a
commit 85ab948b13
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
13 changed files with 4 additions and 366 deletions

View file

@ -30,7 +30,7 @@ stats_config:
static_resources:
listeners:
- name: ingress
- name: arch_router
address:
socket_address:
address: 0.0.0.0
@ -51,7 +51,7 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: arch_gateway
service_name: arch_router
random_sampling:
value: 100
stat_prefix: ingress
@ -60,7 +60,7 @@ static_resources:
- name: envoy.access_loggers.file
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_ingress.log"
path: "/var/log/access_arch_router.log"
route_config:
name: local_routes
virtual_hosts:
@ -68,16 +68,6 @@ static_resources:
domains:
- "*"
routes:
- match:
prefix: "/"
headers:
- name: "host"
string_match:
exact: router_model_host
route:
auto_host_rewrite: true
cluster: router_model_host
- match:
prefix: "/"
route:

View file

@ -30,8 +30,6 @@ pub fn get_llm_provider(
ProviderHint::Name(name) => llm_providers.get(&name),
});
info!("selected provider: maybe_provider: {:?}", maybe_provider);
if let Some(provider) = maybe_provider {
return provider;
}

View file

@ -3,8 +3,6 @@ use proxy_wasm::traits::*;
use proxy_wasm::types::*;
mod filter_context;
mod llm_routing;
mod llm_routing_consts;
mod metrics;
mod stream_context;

View file

@ -1,106 +0,0 @@
// use std::rc::Rc;
// use std::time::Duration;
// use common::api::open_ai::{ChatCompletionsRequest, Message};
// use common::configuration::LlmProvider;
// use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, ARCH_UPSTREAM_HOST_HEADER};
// use common::errors::ServerError;
// use common::http::{CallArgs, Client};
// use log::{info, warn};
// use proxy_wasm::traits::HttpContext;
// use proxy_wasm::types::Action;
// use crate::llm_routing_consts::SYSTEM_PROMPT;
// use crate::stream_context::{CallContext, StreamContext};
// pub trait Routing {
// fn route(&self) -> Action;
// }
// impl Routing for StreamContext {
// fn route(&self) -> Action {
// let usage_based_providers = self
// .llm_providers
// .iter()
// .filter(|(_, provider)| provider.usage.is_some())
// .map(|(_, provider)| provider.clone())
// .collect::<Vec<Rc<LlmProvider>>>();
// info!(
// "usage based providers found: {}",
// usage_based_providers
// .iter()
// .map(|provider| provider.name.clone())
// .collect::<Vec<String>>()
// .join(", ")
// );
// if usage_based_providers.is_empty() {
// self.set_http_request_body(
// 0,
// self.request_size.unwrap(),
// self.request_body.as_ref().unwrap().as_bytes(),
// );
// return Action::Continue;
// }
// let llm_routes_str = r#"- name: gpt-4o
// description: simple requests, basic fact retrieval, easy to answer
// - name: o4-mini()
// description: complex reasoning problem, require multi step answer"#;
// let chat_completions_request_messages_str =
// serde_json::to_string(&self.chat_completion_request.as_ref().unwrap().messages)
// .expect("failed to serialize llm routing request messages");
// let system_prompt_formatted = SYSTEM_PROMPT
// .replace("{routes}", llm_routes_str)
// .replace("{conversation}", &chat_completions_request_messages_str);
// let message = Message {
// role: "user".to_string(),
// content: Some(system_prompt_formatted),
// model: None,
// tool_calls: None,
// tool_call_id: None,
// };
// let llm_routing_request = ChatCompletionsRequest {
// model: "cotran2/llama-1b-4-26".to_string(),
// messages: vec![message],
// tools: None,
// stream: false,
// stream_options: None,
// metadata: None,
// };
// let llm_routing_request_str = serde_json::to_string(&llm_routing_request)
// .expect("failed to serialize llm routing request");
// let headers = vec![
// (":method", "POST"),
// (ARCH_UPSTREAM_HOST_HEADER, "gcp_hosted_outer_llm"),
// (":path", "/v1/chat/completions"),
// (":authority", "gcp_hosted_outer_llm"),
// ("content-type", "application/json"),
// ("x-envoy-max-retries", "3"),
// ("x-envoy-upstream-rq-timeout-ms", "5000"),
// ];
// let call_args = CallArgs::new(
// ARCH_INTERNAL_CLUSTER_NAME,
// "/v1/chat/completions",
// headers,
// llm_routing_request_str.as_bytes().into(),
// vec![],
// Duration::from_secs(5),
// );
// if let Err(e) = self.http_call(call_args, CallContext {}) {
// warn!("failed to call LLM provider: {}", e);
// self.send_server_error(ServerError::HttpDispatch(e), None);
// }
// Action::Pause
// }
// }

View file

@ -1,32 +0,0 @@
// pub const SYSTEM_PROMPT: &str = r#"
// You are an advanced Routing Assistant designed to select the optimal route based on user requests.
// Your task is to analyze conversations and match them to the most appropriate predefined route.
// Review the available routes config:
// # ROUTES CONFIG START
// {routes}
// # ROUTES CONFIG END
// Examine the following conversation between a user and an assistant:
// # CONVERSATION START
// {conversation}
// # CONVERSATION END
// Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
// 1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
// 2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
// 3. Find the route that best matches.
// 4. Use context clues from the entire conversation to determine the best fit.
// 5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
// 6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
// # OUTPUT FORMAT
// Your final output must follow this JSON format:
// {
// "route": "route_name" # The matched route name, or empty string '' if no match
// }
// Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
// "#;

View file

@ -426,7 +426,6 @@ impl HttpContext for StreamContext {
self.request_size = Some(body_size);
return Action::Continue;
// return self.route();
}
fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {

View file

@ -1,58 +1 @@
# LLM Routing
This demo shows how you can use Arch gateway to manage keys and route requests to upstream LLMs.
# Starting the demo
1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
1. Start Arch
```sh
sh run_demo.sh
```
1. Navigate to http://localhost:18080/
The following screenshot shows an example interaction with Arch gateway that demonstrates dynamic routing. You can select between different LLMs using the "override model" option in the chat UI.
![LLM Routing Demo](llm_routing_demo.png)
You can also pass a header to override the model when sending a prompt. The following example shows how to use the `x-arch-llm-provider-hint` header to override model selection:
```bash
$ curl --header 'Content-Type: application/json' \
--header 'x-arch-llm-provider-hint: ministral-3b' \
--data '{"messages": [{"role": "user","content": "hello"}]}' \
http://localhost:12000/v1/chat/completions 2> /dev/null | jq .
{
"id": "xxx",
"object": "chat.completion",
"created": 1737760394,
"model": "ministral-3b-latest",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"tool_calls": null,
"content": "Hello! How can I assist you today? Let's chat about anything you'd like. 😊"
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 4,
"total_tokens": 25,
"completion_tokens": 21
}
}
```
# Observability
Arch gateway publishes a stats endpoint at http://localhost:19901/stats. In this demo we use Prometheus to pull stats from Arch and Grafana to visualize them in a dashboard. To see the Grafana dashboard, follow the instructions below:
1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
1. From the Grafana left navigation, click on Dashboards and select "Intelligent Gateway Overview" to view Arch gateway stats
1. For tracing you can head over to http://localhost:16686/ to view recent traces.
The following screenshot of the tracing UI shows a call received by Arch gateway and the upstream call it makes to the LLM:
![Jaeger Tracing](jaeger_tracing_llm_routing.png)
# Usage based LLM Routing

View file

@ -1,62 +0,0 @@
import json
import yaml
# Routing prompt template. The {routes} and {conversation} placeholders are
# filled in with str.format further down.
system_prompt = """
You are an advanced Routing Assistant designed to select the optimal route based on user requests.
Your task is to analyze conversations and match them to the most appropriate predefined route.
Review the available routes config:
# ROUTES CONFIG START
{routes}
# ROUTES CONFIG END
Examine the following conversation between a user and an assistant:
# CONVERSATION START
{conversation}
# CONVERSATION END
Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
3. Find the route that best matches.
4. Use context clues from the entire conversation to determine the best fit.
5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
"""

# Appended after formatting rather than being part of system_prompt.
# NOTE(review): presumably kept separate because its literal JSON braces would
# be treated as replacement fields by str.format -- confirm before merging them.
output_format = """
# OUTPUT FORMAT
Your final output must follow this JSON format:
{
"route": "route_name" # The matched route name, or empty string '' if no match
}
Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
"""

# An empty YAML document loads as None; fall back to an empty mapping so the
# lookup below does not raise AttributeError.
with open("arch_config.yaml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file) or {}

# One "- name / description" entry per configured provider. A key that is
# present but null (`llm_providers:` with no value) loads as None, so `or []`
# treats it the same as a missing key instead of raising TypeError.
llm_provider_routes = "".join(
    f"- name: {llm_provider.get('name')}()\n"
    f" description: {json.dumps(llm_provider.get('usage'))}\n"
    for llm_provider in data.get("llm_providers") or []
)

# Sample conversation used to preview the rendered prompt.
conversation = """
user: Hello
assistant: Hi! How can I assist you today?
user: I want to know how far is sun from earth.
"""

system_prompt_formatted = system_prompt.format(
    routes=llm_provider_routes, conversation=conversation
)

system_prompt_2 = f"{system_prompt_formatted}\n{output_format}"

# Print the prompt twice: once as plain text, once as a JSON-escaped string.
print(system_prompt_2)
print(json.dumps(system_prompt_2, indent=2))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 273 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 284 KiB

View file

@ -1,47 +0,0 @@
#!/bin/bash
set -e
# Start the demo: make sure a .env file exists, then bring up Arch and the
# Docker Compose services.
# Exits with status 1 when .env is missing and OPENAI_API_KEY is unset or empty.
start_demo() {
  # Step 1: Check if .env file exists
  if [ -f ".env" ]; then
    echo ".env file already exists. Skipping creation."
  else
    # Step 2: Create `.env` file and set OpenAI key
    # ${VAR:-} keeps this check safe even if the script is later run with `set -u`.
    if [ -z "${OPENAI_API_KEY:-}" ]; then
      # Diagnostics belong on stderr so they are not mixed into captured stdout.
      echo "Error: OPENAI_API_KEY environment variable is not set for the demo." >&2
      exit 1
    fi

    echo "Creating .env file..."
    # printf writes the value verbatim regardless of its contents.
    printf 'OPENAI_API_KEY=%s\n' "$OPENAI_API_KEY" > .env
    echo ".env file created with OPENAI_API_KEY."
  fi

  # Step 3: Start Arch
  echo "Starting Arch with arch_config.yaml..."
  archgw up arch_config.yaml

  # Step 4: Start LLM Routing
  echo "Starting LLM Routing using Docker Compose..."
  docker compose up -d # Run in detached mode
}
# Stop the demo: tear down the Docker Compose services, then stop Arch.
# Both teardown steps are always attempted. The function returns the status of
# the last step that failed (0 when both succeed), so a Compose failure no
# longer leaves Arch running under `set -e`.
stop_demo() {
  local status=0

  # Step 1: Stop Docker Compose services
  echo "Stopping LLM Routing using Docker Compose..."
  # `|| status=$?` records the failure without letting `set -e` abort here.
  docker compose down || status=$?

  # Step 2: Stop Arch
  echo "Stopping Arch..."
  archgw down || status=$?

  return "$status"
}
# Entry point: `down` as the first argument tears the demo down;
# anything else (including no argument) brings it up.
case "${1:-}" in
  down)
    stop_demo
    ;;
  *)
    # Default action is to bring the demo up
    start_demo
    ;;
esac

View file

@ -1,12 +0,0 @@
{
"model": "cotran2/llama-1b-4-26",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"metadata": {
"llm_providers": "[]"
}
}

View file

@ -1,31 +0,0 @@
You are an advanced Routing Assistant designed to select the optimal route based on user requests.
Your task is to analyze conversations and match them to the most appropriate predefined route.
Review the available routes config:
# ROUTES CONFIG START
{}
# ROUTES CONFIG END
Examine the following conversation between a user and an assistant:
# CONVERSATION START
{}
# CONVERSATION END
Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
3. Find the route that best matches.
4. Use context clues from the entire conversation to determine the best fit.
5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
"""
output_prompt = """
# OUTPUT FORMAT
Your final output must follow this JSON format:
{
"route": "route_name" # The matched route name, or empty string '' if no match
}
Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.