adding more changes

2026-06-17 15:25:17 +02:00 · 2025-05-12 13:09:22 -07:00 · 2025-05-12 13:09:22 -07:00 · 85ab948b13
commit 85ab948b13
parent f13fc76a4a
13 changed files with 4 additions and 366 deletions
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -30,7 +30,7 @@ stats_config:
 static_resources:
  listeners:

-    - name: ingress
+    - name: arch_router
      address:
        socket_address:
          address: 0.0.0.0
@ -51,7 +51,7 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: arch_gateway
+                      service_name: arch_router
                  random_sampling:
                    value: 100
                stat_prefix: ingress
@ -60,7 +60,7 @@ static_resources:
                - name: envoy.access_loggers.file
                  typed_config:
                    "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
-                    path: "/var/log/access_ingress.log"
+                    path: "/var/log/access_arch_router.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
@ -68,16 +68,6 @@ static_resources:
                      domains:
                        - "*"
                      routes:
-                        - match:
-                            prefix: "/"
-                            headers:
-                              - name: "host"
-                                string_match:
-                                  exact: router_model_host
-                          route:
-                            auto_host_rewrite: true
-                            cluster: router_model_host
-
                        - match:
                            prefix: "/"
                          route:
--- a/crates/common/src/routing.rs
+++ b/crates/common/src/routing.rs
@ -30,8 +30,6 @@ pub fn get_llm_provider(
        ProviderHint::Name(name) => llm_providers.get(&name),
    });

-    info!("selected provider: maybe_provider: {:?}", maybe_provider);
-
    if let Some(provider) = maybe_provider {
        return provider;
    }
--- a/crates/llm_gateway/src/lib.rs
+++ b/crates/llm_gateway/src/lib.rs
@ -3,8 +3,6 @@ use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

 mod filter_context;
-mod llm_routing;
-mod llm_routing_consts;
 mod metrics;
 mod stream_context;

--- a/crates/llm_gateway/src/llm_routing.rs
+++ b/crates/llm_gateway/src/llm_routing.rs
@ -1,106 +0,0 @@
-// use std::rc::Rc;
-// use std::time::Duration;
-
-// use common::api::open_ai::{ChatCompletionsRequest, Message};
-// use common::configuration::LlmProvider;
-// use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, ARCH_UPSTREAM_HOST_HEADER};
-// use common::errors::ServerError;
-// use common::http::{CallArgs, Client};
-// use log::{info, warn};
-// use proxy_wasm::traits::HttpContext;
-// use proxy_wasm::types::Action;
-
-// use crate::llm_routing_consts::SYSTEM_PROMPT;
-// use crate::stream_context::{CallContext, StreamContext};
-
-// pub trait Routing {
-//     fn route(&self) -> Action;
-// }
-
-// impl Routing for StreamContext {
-//     fn route(&self) -> Action {
-//         let usage_based_providers = self
-//             .llm_providers
-//             .iter()
-//             .filter(|(_, provider)| provider.usage.is_some())
-//             .map(|(_, provider)| provider.clone())
-//             .collect::<Vec<Rc<LlmProvider>>>();
-
-//         info!(
-//             "usage based providers found: {}",
-//             usage_based_providers
-//                 .iter()
-//                 .map(|provider| provider.name.clone())
-//                 .collect::<Vec<String>>()
-//                 .join(", ")
-//         );
-
-//         if usage_based_providers.is_empty() {
-//             self.set_http_request_body(
-//                 0,
-//                 self.request_size.unwrap(),
-//                 self.request_body.as_ref().unwrap().as_bytes(),
-//             );
-//             return Action::Continue;
-//         }
-
-//         let llm_routes_str = r#"- name: gpt-4o
-//   description: simple requests, basic fact retrieval, easy to answer
-// - name: o4-mini()
-//   description: complex reasoning problem, require multi step answer"#;
-
-//         let chat_completions_request_messages_str =
-//             serde_json::to_string(&self.chat_completion_request.as_ref().unwrap().messages)
-//                 .expect("failed to serialize llm routing request messages");
-
-//         let system_prompt_formatted = SYSTEM_PROMPT
-//             .replace("{routes}", llm_routes_str)
-//             .replace("{conversation}", &chat_completions_request_messages_str);
-
-//         let message = Message {
-//             role: "user".to_string(),
-//             content: Some(system_prompt_formatted),
-//             model: None,
-//             tool_calls: None,
-//             tool_call_id: None,
-//         };
-
-//         let llm_routing_request = ChatCompletionsRequest {
-//             model: "cotran2/llama-1b-4-26".to_string(),
-//             messages: vec![message],
-//             tools: None,
-//             stream: false,
-//             stream_options: None,
-//             metadata: None,
-//         };
-
-//         let llm_routing_request_str = serde_json::to_string(&llm_routing_request)
-//             .expect("failed to serialize llm routing request");
-
-//         let headers = vec![
-//             (":method", "POST"),
-//             (ARCH_UPSTREAM_HOST_HEADER, "gcp_hosted_outer_llm"),
-//             (":path", "/v1/chat/completions"),
-//             (":authority", "gcp_hosted_outer_llm"),
-//             ("content-type", "application/json"),
-//             ("x-envoy-max-retries", "3"),
-//             ("x-envoy-upstream-rq-timeout-ms", "5000"),
-//         ];
-
-//         let call_args = CallArgs::new(
-//             ARCH_INTERNAL_CLUSTER_NAME,
-//             "/v1/chat/completions",
-//             headers,
-//             llm_routing_request_str.as_bytes().into(),
-//             vec![],
-//             Duration::from_secs(5),
-//         );
-
-//         if let Err(e) = self.http_call(call_args, CallContext {}) {
-//             warn!("failed to call LLM provider: {}", e);
-//             self.send_server_error(ServerError::HttpDispatch(e), None);
-//         }
-
-//         Action::Pause
-//     }
-// }
--- a/crates/llm_gateway/src/llm_routing_consts.rs
+++ b/crates/llm_gateway/src/llm_routing_consts.rs
@ -1,32 +0,0 @@
-// pub const SYSTEM_PROMPT: &str = r#"
-// You are an advanced Routing Assistant designed to select the optimal route based on user requests.
-// Your task is to analyze conversations and match them to the most appropriate predefined route.
-// Review the available routes config:
-
-// # ROUTES CONFIG START
-// {routes}
-// # ROUTES CONFIG END
-
-// Examine the following conversation between a user and an assistant:
-
-// # CONVERSATION START
-// {conversation}
-// # CONVERSATION END
-
-// Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
-
-// 1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
-// 2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
-// 3. Find the route that best matches.
-// 4. Use context clues from the entire conversation to determine the best fit.
-// 5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
-// 6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
-
-// # OUTPUT FORMAT
-// Your final output must follow this JSON format:
-// {
-//   "route": "route_name" # The matched route name, or empty string '' if no match
-// }
-
-// Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
-// "#;
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -426,7 +426,6 @@ impl HttpContext for StreamContext {
        self.request_size = Some(body_size);

        return Action::Continue;
-        // return self.route();
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
--- a/demos/use_cases/preference_based_routing/README.md
+++ b/demos/use_cases/preference_based_routing/README.md
@ -1,58 +1 @@
-# LLM Routing
-This demo shows how you can arch gateway to manage keys and route to upstream LLM.
-
-# Starting the demo
-1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
-1. Start Arch
-   ```sh
-   sh run_demo.sh
-   ```
-1. Navigate to http://localhost:18080/
-
-Following screen shows an example of interaction with arch gateway showing dynamic routing. You can select between different LLMs using "override model" option in the chat UI.
-
-![LLM Routing Demo](llm_routing_demo.png)
-
-You can also pass in a header to override model when sending prompt. Following example shows how you can use `x-arch-llm-provider-hint` header to override model selection,
-
-```bash
-
-$ curl --header 'Content-Type: application/json' \
-  --header 'x-arch-llm-provider-hint: ministral-3b' \
-  --data '{"messages": [{"role": "user","content": "hello"}]}' \
-  http://localhost:12000/v1/chat/completions 2> /dev/null | jq .
-{
-  "id": "xxx",
-  "object": "chat.completion",
-  "created": 1737760394,
-  "model": "ministral-3b-latest",
-  "choices": [
-    {
-      "index": 0,
-      "message": {
-        "role": "assistant",
-        "tool_calls": null,
-        "content": "Hello! How can I assist you today? Let's chat about anything you'd like. 😊"
-      },
-      "finish_reason": "stop"
-    }
-  ],
-  "usage": {
-    "prompt_tokens": 4,
-    "total_tokens": 25,
-    "completion_tokens": 21
-  }
-}
-
-```
-
-# Observability
-Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow instructions below,
-
-1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
-1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats
-1. For tracing you can head over to http://localhost:16686/ to view recent traces.
-
-Following is a screenshot of tracing UI showing call received by arch gateway and making upstream call to LLM,
-
-![Jaeger Tracing](jaeger_tracing_llm_routing.png)
+# Usage based LLM Routing
--- a/demos/use_cases/preference_based_routing/convert_system_prompt.py
+++ b/demos/use_cases/preference_based_routing/convert_system_prompt.py
@ -1,62 +0,0 @@
-import json
-import yaml
-
-system_prompt = """
-You are an advanced Routing Assistant designed to select the optimal route based on user requests.
-Your task is to analyze conversations and match them to the most appropriate predefined route.
-Review the available routes config:
-
-# ROUTES CONFIG START
-{routes}
-# ROUTES CONFIG END
-
-Examine the following conversation between a user and an assistant:
-
-# CONVERSATION START
-{conversation}
-# CONVERSATION END
-
-Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
-
-1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
-2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
-3. Find the route that best matches.
-4. Use context clues from the entire conversation to determine the best fit.
-5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
-6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
-"""
-
-output_format = """
-# OUTPUT FORMAT
-Your final output must follow this JSON format:
-{
-  "route": "route_name" # The matched route name, or empty string '' if no match
-}
-
-Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.
-"""
-
-
-with open("arch_config.yaml", "r") as file:
-    data = yaml.safe_load(file)
-
-llm_provider_routes = ""
-
-for llm_provider in data.get("llm_providers", []):
-    llm_provider_routes += f"- name: {llm_provider.get('name')}()\n"
-    llm_provider_routes += f"  description: {json.dumps(llm_provider.get('usage'))}\n"
-
-
-conversation = """
-user: Hello
-assistant: Hi! How can I assist you today?
-user: I want to know how far is sun from earth.
-"""
-
-system_prompt_formatted = system_prompt.format(
-    routes=llm_provider_routes, conversation=conversation
-)
-
-system_prompt_2 = f"{system_prompt_formatted}\n{output_format}"
-print(system_prompt_2)
-print(json.dumps(system_prompt_2, indent=2))
--- a/demos/use_cases/preference_based_routing/jaeger_tracing_llm_routing.png
+++ b/demos/use_cases/preference_based_routing/jaeger_tracing_llm_routing.png
--- a/demos/use_cases/preference_based_routing/llm_routing_demo.png
+++ b/demos/use_cases/preference_based_routing/llm_routing_demo.png
--- a/demos/use_cases/preference_based_routing/run_demo.sh
+++ b/demos/use_cases/preference_based_routing/run_demo.sh
@ -1,47 +0,0 @@
-#!/bin/bash
-set -e
-
-# Function to start the demo
-start_demo() {
-  # Step 1: Check if .env file exists
-  if [ -f ".env" ]; then
-    echo ".env file already exists. Skipping creation."
-  else
-    # Step 2: Create `.env` file and set OpenAI key
-    if [ -z "$OPENAI_API_KEY" ]; then
-      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
-      exit 1
-    fi
-
-    echo "Creating .env file..."
-    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
-    echo ".env file created with OPENAI_API_KEY."
-  fi
-
-  # Step 3: Start Arch
-  echo "Starting Arch with arch_config.yaml..."
-  archgw up arch_config.yaml
-
-  # Step 4: Start LLM Routing
-  echo "Starting LLM Routing using Docker Compose..."
-  docker compose up -d  # Run in detached mode
-}
-
-# Function to stop the demo
-stop_demo() {
-  # Step 1: Stop Docker Compose services
-  echo "Stopping LLM Routing using Docker Compose..."
-  docker compose down
-
-  # Step 2: Stop Arch
-  echo "Stopping Arch..."
-  archgw down
-}
-
-# Main script logic
-if [ "$1" == "down" ]; then
-  stop_demo
-else
-  # Default action is to bring the demo up
-  start_demo
-fi
--- a/demos/use_cases/preference_based_routing/staff_req.json
+++ b/demos/use_cases/preference_based_routing/staff_req.json
@ -1,12 +0,0 @@
-{
-  "model": "cotran2/llama-1b-4-26",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is the capital of France?"
-    }
-  ],
-  "metadata": {
-    "llm_providers": "[]"
-  }
-}
--- a/demos/use_cases/preference_based_routing/test_sytem_prompt.text
+++ b/demos/use_cases/preference_based_routing/test_sytem_prompt.text
@ -1,31 +0,0 @@
-You are an advanced Routing Assistant designed to select the optimal route based on user requests.
-Your task is to analyze conversations and match them to the most appropriate predefined route.
-Review the available routes config:
-
-# ROUTES CONFIG START
-{}
-# ROUTES CONFIG END
-
-Examine the following conversation between a user and an assistant:
-
-# CONVERSATION START
-{}
-# CONVERSATION END
-
-Your goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:
-
-1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.
-2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).
-3. Find the route that best matches.
-4. Use context clues from the entire conversation to determine the best fit.
-5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.
-6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''.
-"""
-output_prompt = """
-# OUTPUT FORMAT
-Your final output must follow this JSON format:
-{
-  "route": "route_name" # The matched route name, or empty string '' if no match
-}
-
-Based on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.