mirror of
https://github.com/katanemo/plano.git
synced 2026-06-17 15:25:17 +02:00
adding more changes
This commit is contained in:
parent
f13fc76a4a
commit
85ab948b13
13 changed files with 4 additions and 366 deletions
|
|
@ -30,7 +30,7 @@ stats_config:
|
|||
static_resources:
|
||||
listeners:
|
||||
|
||||
- name: ingress
|
||||
- name: arch_router
|
||||
address:
|
||||
socket_address:
|
||||
address: 0.0.0.0
|
||||
|
|
@ -51,7 +51,7 @@ static_resources:
|
|||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: arch_gateway
|
||||
service_name: arch_router
|
||||
random_sampling:
|
||||
value: 100
|
||||
stat_prefix: ingress
|
||||
|
|
@ -60,7 +60,7 @@ static_resources:
|
|||
- name: envoy.access_loggers.file
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
|
||||
path: "/var/log/access_ingress.log"
|
||||
path: "/var/log/access_arch_router.log"
|
||||
route_config:
|
||||
name: local_routes
|
||||
virtual_hosts:
|
||||
|
|
@ -68,16 +68,6 @@ static_resources:
|
|||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/"
|
||||
headers:
|
||||
- name: "host"
|
||||
string_match:
|
||||
exact: router_model_host
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: router_model_host
|
||||
|
||||
- match:
|
||||
prefix: "/"
|
||||
route:
|
||||
|
|
|
|||
|
|
@ -30,8 +30,6 @@ pub fn get_llm_provider(
|
|||
ProviderHint::Name(name) => llm_providers.get(&name),
|
||||
});
|
||||
|
||||
info!("selected provider: maybe_provider: {:?}", maybe_provider);
|
||||
|
||||
if let Some(provider) = maybe_provider {
|
||||
return provider;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@ use proxy_wasm::traits::*;
|
|||
use proxy_wasm::types::*;
|
||||
|
||||
mod filter_context;
|
||||
mod llm_routing;
|
||||
mod llm_routing_consts;
|
||||
mod metrics;
|
||||
mod stream_context;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,106 +0,0 @@
|
|||
// use std::rc::Rc;
|
||||
// use std::time::Duration;
|
||||
|
||||
// use common::api::open_ai::{ChatCompletionsRequest, Message};
|
||||
// use common::configuration::LlmProvider;
|
||||
// use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, ARCH_UPSTREAM_HOST_HEADER};
|
||||
// use common::errors::ServerError;
|
||||
// use common::http::{CallArgs, Client};
|
||||
// use log::{info, warn};
|
||||
// use proxy_wasm::traits::HttpContext;
|
||||
// use proxy_wasm::types::Action;
|
||||
|
||||
// use crate::llm_routing_consts::SYSTEM_PROMPT;
|
||||
// use crate::stream_context::{CallContext, StreamContext};
|
||||
|
||||
// pub trait Routing {
|
||||
// fn route(&self) -> Action;
|
||||
// }
|
||||
|
||||
// impl Routing for StreamContext {
|
||||
// fn route(&self) -> Action {
|
||||
// let usage_based_providers = self
|
||||
// .llm_providers
|
||||
// .iter()
|
||||
// .filter(|(_, provider)| provider.usage.is_some())
|
||||
// .map(|(_, provider)| provider.clone())
|
||||
// .collect::<Vec<Rc<LlmProvider>>>();
|
||||
|
||||
// info!(
|
||||
// "usage based providers found: {}",
|
||||
// usage_based_providers
|
||||
// .iter()
|
||||
// .map(|provider| provider.name.clone())
|
||||
// .collect::<Vec<String>>()
|
||||
// .join(", ")
|
||||
// );
|
||||
|
||||
// if usage_based_providers.is_empty() {
|
||||
// self.set_http_request_body(
|
||||
// 0,
|
||||
// self.request_size.unwrap(),
|
||||
// self.request_body.as_ref().unwrap().as_bytes(),
|
||||
// );
|
||||
// return Action::Continue;
|
||||
// }
|
||||
|
||||
// let llm_routes_str = r#"- name: gpt-4o
|
||||
// description: simple requests, basic fact retrieval, easy to answer
|
||||
// - name: o4-mini()
|
||||
// description: complex reasoning problem, require multi step answer"#;
|
||||
|
||||
// let chat_completions_request_messages_str =
|
||||
// serde_json::to_string(&self.chat_completion_request.as_ref().unwrap().messages)
|
||||
// .expect("failed to serialize llm routing request messages");
|
||||
|
||||
// let system_prompt_formatted = SYSTEM_PROMPT
|
||||
// .replace("{routes}", llm_routes_str)
|
||||
// .replace("{conversation}", &chat_completions_request_messages_str);
|
||||
|
||||
// let message = Message {
|
||||
// role: "user".to_string(),
|
||||
// content: Some(system_prompt_formatted),
|
||||
// model: None,
|
||||
// tool_calls: None,
|
||||
// tool_call_id: None,
|
||||
// };
|
||||
|
||||
// let llm_routing_request = ChatCompletionsRequest {
|
||||
// model: "cotran2/llama-1b-4-26".to_string(),
|
||||
// messages: vec![message],
|
||||
// tools: None,
|
||||
// stream: false,
|
||||
// stream_options: None,
|
||||
// metadata: None,
|
||||
// };
|
||||
|
||||
// let llm_routing_request_str = serde_json::to_string(&llm_routing_request)
|
||||
// .expect("failed to serialize llm routing request");
|
||||
|
||||
// let headers = vec![
|
||||
// (":method", "POST"),
|
||||
// (ARCH_UPSTREAM_HOST_HEADER, "gcp_hosted_outer_llm"),
|
||||
// (":path", "/v1/chat/completions"),
|
||||
// (":authority", "gcp_hosted_outer_llm"),
|
||||
// ("content-type", "application/json"),
|
||||
// ("x-envoy-max-retries", "3"),
|
||||
// ("x-envoy-upstream-rq-timeout-ms", "5000"),
|
||||
// ];
|
||||
|
||||
// let call_args = CallArgs::new(
|
||||
// ARCH_INTERNAL_CLUSTER_NAME,
|
||||
// "/v1/chat/completions",
|
||||
// headers,
|
||||
// llm_routing_request_str.as_bytes().into(),
|
||||
// vec![],
|
||||
// Duration::from_secs(5),
|
||||
// );
|
||||
|
||||
// if let Err(e) = self.http_call(call_args, CallContext {}) {
|
||||
// warn!("failed to call LLM provider: {}", e);
|
||||
// self.send_server_error(ServerError::HttpDispatch(e), None);
|
||||
// }
|
||||
|
||||
// Action::Pause
|
||||
// }
|
||||
// }
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
// pub const SYSTEM_PROMPT: &str = r#"
|
||||
// You are an advanced Routing Assistant designed to select the optimal route based on user requests.
|
||||
// Your task is to analyze conversations and match them to the most appropriate predefined route.
|
||||
// Review the available routes config:
|
||||
|
||||
// # ROUTES CONFIG START
|
||||
// {routes}
|
||||
// # ROUTES CONFIG END
|
||||
|
||||
// Examine the following conversation between a user and an assistant:
|
||||
|
||||
// # CONVERSATION START
|
||||
// {conversation}
|
||||
// # CONVERSATION END
|
||||
|
||||
// Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
|
||||
|
||||
// 1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
|
||||
// 2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
|
||||
// 3. Find the route that best matches.
|
||||
// 4. Use context clues from the entire conversation to determine the best fit.
|
||||
// 5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
|
||||
// 6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
|
||||
|
||||
// # OUTPUT FORMAT
|
||||
// Your final output must follow this JSON format:
|
||||
// {
|
||||
// "route": "route_name" # The matched route name, or empty string '' if no match
|
||||
// }
|
||||
|
||||
// Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
|
||||
// "#;
|
||||
|
|
@ -426,7 +426,6 @@ impl HttpContext for StreamContext {
|
|||
self.request_size = Some(body_size);
|
||||
|
||||
return Action::Continue;
|
||||
// return self.route();
|
||||
}
|
||||
|
||||
fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
|
||||
|
|
|
|||
|
|
@ -1,58 +1 @@
|
|||
# LLM Routing
|
||||
This demo shows how you can arch gateway to manage keys and route to upstream LLM.
|
||||
|
||||
# Starting the demo
|
||||
1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
|
||||
1. Start Arch
|
||||
```sh
|
||||
sh run_demo.sh
|
||||
```
|
||||
1. Navigate to http://localhost:18080/
|
||||
|
||||
Following screen shows an example of interaction with arch gateway showing dynamic routing. You can select between different LLMs using "override model" option in the chat UI.
|
||||
|
||||

|
||||
|
||||
You can also pass in a header to override model when sending prompt. Following example shows how you can use `x-arch-llm-provider-hint` header to override model selection,
|
||||
|
||||
```bash
|
||||
|
||||
$ curl --header 'Content-Type: application/json' \
|
||||
--header 'x-arch-llm-provider-hint: ministral-3b' \
|
||||
--data '{"messages": [{"role": "user","content": "hello"}]}' \
|
||||
http://localhost:12000/v1/chat/completions 2> /dev/null | jq .
|
||||
{
|
||||
"id": "xxx",
|
||||
"object": "chat.completion",
|
||||
"created": 1737760394,
|
||||
"model": "ministral-3b-latest",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"tool_calls": null,
|
||||
"content": "Hello! How can I assist you today? Let's chat about anything you'd like. 😊"
|
||||
},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 4,
|
||||
"total_tokens": 25,
|
||||
"completion_tokens": 21
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
# Observability
|
||||
Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow instructions below,
|
||||
|
||||
1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
|
||||
1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats
|
||||
1. For tracing you can head over to http://localhost:16686/ to view recent traces.
|
||||
|
||||
Following is a screenshot of tracing UI showing call received by arch gateway and making upstream call to LLM,
|
||||
|
||||

|
||||
# Usage based LLM Routing
|
||||
|
|
|
|||
|
|
@ -1,62 +0,0 @@
|
|||
import json
|
||||
import yaml
|
||||
|
||||
system_prompt = """
|
||||
You are an advanced Routing Assistant designed to select the optimal route based on user requests.
|
||||
Your task is to analyze conversations and match them to the most appropriate predefined route.
|
||||
Review the available routes config:
|
||||
|
||||
# ROUTES CONFIG START
|
||||
{routes}
|
||||
# ROUTES CONFIG END
|
||||
|
||||
Examine the following conversation between a user and an assistant:
|
||||
|
||||
# CONVERSATION START
|
||||
{conversation}
|
||||
# CONVERSATION END
|
||||
|
||||
Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
|
||||
|
||||
1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
|
||||
2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
|
||||
3. Find the route that best matches.
|
||||
4. Use context clues from the entire conversation to determine the best fit.
|
||||
5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
|
||||
6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
|
||||
"""
|
||||
|
||||
output_format = """
|
||||
# OUTPUT FORMAT
|
||||
Your final output must follow this JSON format:
|
||||
{
|
||||
"route": "route_name" # The matched route name, or empty string '' if no match
|
||||
}
|
||||
|
||||
Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
|
||||
"""
|
||||
|
||||
|
||||
with open("arch_config.yaml", "r") as file:
|
||||
data = yaml.safe_load(file)
|
||||
|
||||
llm_provider_routes = ""
|
||||
|
||||
for llm_provider in data.get("llm_providers", []):
|
||||
llm_provider_routes += f"- name: {llm_provider.get('name')}()\n"
|
||||
llm_provider_routes += f" description: {json.dumps(llm_provider.get('usage'))}\n"
|
||||
|
||||
|
||||
conversation = """
|
||||
user: Hello
|
||||
assistant: Hi! How can I assist you today?
|
||||
user: I want to know how far is sun from earth.
|
||||
"""
|
||||
|
||||
system_prompt_formatted = system_prompt.format(
|
||||
routes=llm_provider_routes, conversation=conversation
|
||||
)
|
||||
|
||||
system_prompt_2 = f"{system_prompt_formatted}\n{output_format}"
|
||||
print(system_prompt_2)
|
||||
print(json.dumps(system_prompt_2, indent=2))
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 273 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 284 KiB |
|
|
@ -1,47 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Function to start the demo
|
||||
start_demo() {
|
||||
# Step 1: Check if .env file exists
|
||||
if [ -f ".env" ]; then
|
||||
echo ".env file already exists. Skipping creation."
|
||||
else
|
||||
# Step 2: Create `.env` file and set OpenAI key
|
||||
if [ -z "$OPENAI_API_KEY" ]; then
|
||||
echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Creating .env file..."
|
||||
echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
|
||||
echo ".env file created with OPENAI_API_KEY."
|
||||
fi
|
||||
|
||||
# Step 3: Start Arch
|
||||
echo "Starting Arch with arch_config.yaml..."
|
||||
archgw up arch_config.yaml
|
||||
|
||||
# Step 4: Start LLM Routing
|
||||
echo "Starting LLM Routing using Docker Compose..."
|
||||
docker compose up -d # Run in detached mode
|
||||
}
|
||||
|
||||
# Function to stop the demo
|
||||
stop_demo() {
|
||||
# Step 1: Stop Docker Compose services
|
||||
echo "Stopping LLM Routing using Docker Compose..."
|
||||
docker compose down
|
||||
|
||||
# Step 2: Stop Arch
|
||||
echo "Stopping Arch..."
|
||||
archgw down
|
||||
}
|
||||
|
||||
# Main script logic
|
||||
if [ "$1" == "down" ]; then
|
||||
stop_demo
|
||||
else
|
||||
# Default action is to bring the demo up
|
||||
start_demo
|
||||
fi
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
{
|
||||
"model": "cotran2/llama-1b-4-26",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the capital of France?"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"llm_providers": "[]"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
You are an advanced Routing Assistant designed to select the optimal route based on user requests.
|
||||
Your task is to analyze conversations and match them to the most appropriate predefined route.
|
||||
Review the available routes config:
|
||||
|
||||
# ROUTES CONFIG START
|
||||
{}
|
||||
# ROUTES CONFIG END
|
||||
|
||||
Examine the following conversation between a user and an assistant:
|
||||
|
||||
# CONVERSATION START
|
||||
{}
|
||||
# CONVERSATION END
|
||||
|
||||
Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
|
||||
|
||||
1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
|
||||
2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
|
||||
3. Find the route that best matches.
|
||||
4. Use context clues from the entire conversation to determine the best fit.
|
||||
5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
|
||||
6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
|
||||
"""
|
||||
output_prompt = """
|
||||
# OUTPUT FORMAT
|
||||
Your final output must follow this JSON format:
|
||||
{
|
||||
"route": "route_name" # The matched route name, or empty string '' if no match
|
||||
}
|
||||
|
||||
Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
|
||||
Loading…
Add table
Add a link
Reference in a new issue