From 7f90124bd1bcb4b8e462b6ff4cee588337b079ee Mon Sep 17 00:00:00 2001
From: Adil Hafeez <adil.hafeez@gmail.com>
Date: Thu, 10 Jul 2025 15:34:12 -0700
Subject: [PATCH] Replace llm_provider usage/endpoint with routing_preferences and base_url

---
 .gitignore                                    |  3 -
 arch/arch_config_schema.yaml                  | 23 +++--
 arch/tools/.vscode/settings.json              |  5 +
 arch/tools/cli/config_generator.py            | 40 +++++---
 crates/.vscode/launch.json                    | 21 ++++
 crates/.vscode/tasks.json                     | 21 ++++
 .../src/handlers/chat_completions.rs          |  8 +-
 crates/brightstaff/src/main.rs                |  4 +
 crates/brightstaff/src/router/llm_router.rs   | 15 ++-
 .../brightstaff/src/router/router_model_v1.rs | 28 +++---
 crates/common/src/configuration.rs            | 21 +---
 crates/common/src/llm_providers.rs            | 11 ++-
 crates/llm_gateway/src/stream_context.rs      | 19 ++--
 .../.vscode/launch.json                       | 15 +++
 demos/use_cases/ollama/arch_config.yaml       |  2 +-
 .../preference_based_routing/arch_config.yaml | 21 ++--
 .../arch_config_local.yaml                    | 45 ---------
 .../arch_config_rendered.yaml                 | 29 ++++++
 .../hurl_tests/simple.hurl                    |  6 +-
 .../includes/arch_config_full_reference.yaml  |  2 +-
 .../arch_config_full_reference_rendered.yaml  | 95 +++++++++++++++++++
 model_server/.vscode/launch.json              |  1 +
 model_server/.vscode/settings.json            |  7 ++
 tests/archgw/.vscode/launch.json              | 15 +++
 tests/archgw/.vscode/settings.json            |  7 ++
 tests/e2e/.vscode/launch.json                 | 15 +++
 tests/e2e/.vscode/settings.json               |  7 ++
 tests/modelserver/.vscode/launch.json         | 15 +++
 tests/modelserver/.vscode/settings.json       |  7 ++
 29 files changed, 375 insertions(+), 133 deletions(-)
 create mode 100644 arch/tools/.vscode/settings.json
 create mode 100644 crates/.vscode/launch.json
 create mode 100644 crates/.vscode/tasks.json
 create mode 100644 demos/samples_java/weather_forcecast_service/.vscode/launch.json
 delete mode 100644 demos/use_cases/preference_based_routing/arch_config_local.yaml
 create mode 100644 demos/use_cases/preference_based_routing/arch_config_rendered.yaml
 create mode 100644 docs/source/resources/includes/arch_config_full_reference_rendered.yaml
 create mode 100644 model_server/.vscode/settings.json
 create mode 100644 tests/archgw/.vscode/launch.json
 create mode 100644 tests/archgw/.vscode/settings.json
 create mode 100644 tests/e2e/.vscode/launch.json
 create mode 100644 tests/e2e/.vscode/settings.json
 create mode 100644 tests/modelserver/.vscode/launch.json
 create mode 100644 tests/modelserver/.vscode/settings.json

diff --git a/.gitignore b/.gitignore
index b140bbbe..d2f7c6bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,9 +101,6 @@ venv.bak/
 # mypy
 .mypy_cache/
 
-# VSCode stuff:
-.vscode/
-
 # MacOS Metadata
 *.DS_Store
 
diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml
index 411e189f..0ca9d42d 100644
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@@ -72,20 +72,23 @@ properties:
           type: string
         default:
           type: boolean
-        # endpoint field is deprecated, use base_url instead
-        endpoint:
-          type: string
         base_url:
           type: string
-        protocol:
-          type: string
-          enum:
-            - http
-            - https
         http_host:
           type: string
-        usage:
-          type: string
+        routing_preferences:
+          type: array
+          items:
+            type: object
+            properties:
+              name:
+                type: string
+              description:
+                type: string
+            additionalProperties: false  # applies to each preference object, not the array
+            required:
+              - name
+              - description
       additionalProperties: false
       required:
         - model
diff --git a/arch/tools/.vscode/settings.json b/arch/tools/.vscode/settings.json
new file mode 100644
index 00000000..10f9f99d
--- /dev/null
+++ b/arch/tools/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+  "cSpell.words": [
+    "BRIGHTSTAFF"
+  ]
+}
diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py
index 6dea940b..c636813b 100644
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@@ -95,6 +95,8 @@ def validate_and_render_schema():
     updated_llm_providers = []
     llm_provider_name_set = set()
     llms_with_usage = []
+    model_name_keys = set()
+    model_usage_name_keys = set()
     for llm_provider in config_yaml["llm_providers"]:
         if llm_provider.get("usage", None):
             llms_with_usage.append(llm_provider["name"])
@@ -104,6 +106,11 @@ def validate_and_render_schema():
             )
 
         model_name = llm_provider.get("model")
+        if model_name in model_name_keys:
+            raise Exception(
+                f"Duplicate model name {model_name}, please provide unique model name for each llm_provider"
+            )
+        model_name_keys.add(model_name)
         if llm_provider.get("name") is None:
             llm_provider["name"] = model_name
 
@@ -119,6 +126,20 @@ def validate_and_render_schema():
                 f"Unsupported provider {provider} for model {model_name}. Supported providers are: {', '.join(SUPPORTED_PROVIDERS)}"
             )
 
+        if model_id in model_name_keys:
+            raise Exception(
+                f"Duplicate model_id {model_id}, please provide unique model_id for each llm_provider"
+            )
+        model_name_keys.add(model_id)
+
+        for routing_preference in llm_provider.get("routing_preferences", []):
+            if routing_preference.get("name") in model_usage_name_keys:
+                raise Exception(
+                    f"Duplicate routing preference name \"{routing_preference.get('name')}\", please provide unique name for each routing preference"
+                )
+            model_usage_name_keys.add(routing_preference.get("name"))
+
+        llm_provider["model"] = model_id
         llm_provider["provider_interface"] = provider
         llm_provider_name_set.add(llm_provider.get("name"))
         provider = None
@@ -132,21 +153,14 @@ def validate_and_render_schema():
             del llm_provider["provider"]
         updated_llm_providers.append(llm_provider)
 
-        if llm_provider.get("endpoint") and llm_provider.get("base_url"):
-            raise Exception("Please provide either endpoint or base_url, not both")
-
-        if llm_provider.get("endpoint", None):
-            endpoint = llm_provider["endpoint"]
-            protocol = llm_provider.get("protocol", "http")
-            llm_provider["endpoint"], llm_provider["port"] = get_endpoint_and_port(
-                endpoint, protocol
-            )
-            llms_with_endpoint.append(llm_provider)
-        elif llm_provider.get("base_url", None):
+        if llm_provider.get("base_url", None):
             base_url = llm_provider["base_url"]
             urlparse_result = urlparse(base_url)
-            if llm_provider.get("port"):
-                raise Exception("Please provider port in base_url")
+            url_path = urlparse_result.path
+            if url_path and url_path != "/":
+                raise Exception(
+                    f"Please provide base_url without path, got {base_url}. Use base_url like 'http://example.com' instead of 'http://example.com/path'."
+                )
             if urlparse_result.scheme == "" or urlparse_result.scheme not in [
                 "http",
                 "https",
diff --git a/crates/.vscode/launch.json b/crates/.vscode/launch.json
new file mode 100644
index 00000000..56a29b46
--- /dev/null
+++ b/crates/.vscode/launch.json
@@ -0,0 +1,21 @@
+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Debug Brightstaff",
+      "type": "lldb",
+      "request": "launch",
+      "program": "${workspaceFolder}/target/debug/brightstaff",
+      "args": [],
+      "cwd": "${workspaceFolder}",
+      "stopOnEntry": false,
+      "sourceLanguages": ["rust"],
+      "env": {
+        "RUST_LOG": "debug",
+        "RUST_BACKTRACE": "1",
+        "ARCH_CONFIG_PATH_RENDERED": "../demos/use_cases/preference_based_routing/arch_config_rendered.yaml"
+      },
+      "preLaunchTask": "rust: cargo build"
+    }
+  ]
+}
diff --git a/crates/.vscode/tasks.json b/crates/.vscode/tasks.json
new file mode 100644
index 00000000..8d648bc7
--- /dev/null
+++ b/crates/.vscode/tasks.json
@@ -0,0 +1,21 @@
+{
+	"version": "2.0.0",
+	"tasks": [
+    {
+      "type": "cargo",
+      "command": "build",
+      "args": [
+        "--bin",
+        "brightstaff"
+      ],
+      "problemMatcher": [
+        "$rustc"
+      ],
+      "group": {
+        "kind": "build",
+        "isDefault": true
+      },
+      "label": "rust: cargo build"
+    }
+  ]
+}
diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs
index 217897cd..89c9ee13 100644
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@@ -12,7 +12,7 @@ use hyper::{Request, Response, StatusCode};
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_stream::StreamExt;
-use tracing::{debug, info, trace, warn};
+use tracing::{debug, info, warn};
 
 use crate::router::llm_router::RouterService;
 
@@ -81,8 +81,8 @@ pub async fn chat_completions(
         }
     }
 
-    trace!(
-        "arch-router request body: {}",
+    debug!(
+        "arch-router request received: {}",
         &serde_json::to_string(&chat_completion_request).unwrap()
     );
 
@@ -102,7 +102,7 @@ pub async fn chat_completions(
         .as_ref()
         .and_then(|s| serde_yaml::from_str(s).ok());
 
-    debug!("usage preferences: {:?}", usage_preferences);
+    debug!("usage preferences from request: {:?}", usage_preferences);
 
     let mut determined_route = match router_service
         .determine_route(
diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 9128c33b..4e4f18b7 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -44,6 +44,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
     let _tracer_provider = init_tracer();
     let bind_address = env::var("BIND_ADDRESS").unwrap_or_else(|_| BIND_ADDRESS.to_string());
 
+    match env::current_dir() { // diagnostic only: a missing cwd must not abort startup
+        Ok(cwd) => info!("current working directory: {}", cwd.display()),
+        Err(err) => info!("current working directory unavailable: {}", err),
+    }
     // loading arch_config.yaml file
     let arch_config_path = env::var("ARCH_CONFIG_PATH_RENDERED")
         .unwrap_or_else(|_| "./arch_config_rendered.yaml".to_string());
diff --git a/crates/brightstaff/src/router/llm_router.rs b/crates/brightstaff/src/router/llm_router.rs
index a78d34e7..c1320c66 100644
--- a/crates/brightstaff/src/router/llm_router.rs
+++ b/crates/brightstaff/src/router/llm_router.rs
@@ -1,7 +1,7 @@
 use std::sync::Arc;
 
 use common::{
-    configuration::{LlmProvider, LlmRoute, ModelUsagePreference},
+    configuration::{LlmProvider, ModelUsagePreference, RoutingPreference},
     consts::ARCH_PROVIDER_HINT_HEADER,
 };
 use hermesllm::providers::openai::types::{ChatCompletionsResponse, ContentType, Message};
@@ -44,11 +44,14 @@ impl RouterService {
     ) -> Self {
         let providers_with_usage = providers
             .iter()
-            .filter(|provider| provider.usage.is_some())
+            .filter(|provider| provider.routing_preferences.is_some())
             .cloned()
             .collect::<Vec<LlmProvider>>();
 
-        let llm_routes: Vec<LlmRoute> = providers_with_usage.iter().map(LlmRoute::from).collect();
+        let llm_routes: Vec<RoutingPreference> = providers_with_usage
+            .iter()
+            .flat_map(|provider| provider.routing_preferences.clone().unwrap_or_default())
+            .collect();
 
         let router_model = Arc::new(router_model_v1::RouterModelV1::new(
             llm_routes,
@@ -156,6 +159,12 @@ impl RouterService {
                 router_response_time.as_millis()
             );
 
+            if let Some(ref route) = route_name {
+                if route == "other" {
+                    return Ok(None);
+                }
+            }
+
             Ok(route_name)
         } else {
             Ok(None)
diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs
index e6ccd912..0dcefff6 100644
--- a/crates/brightstaff/src/router/router_model_v1.rs
+++ b/crates/brightstaff/src/router/router_model_v1.rs
@@ -1,5 +1,5 @@
 use common::{
-    configuration::{LlmRoute, ModelUsagePreference},
+    configuration::{ModelUsagePreference, RoutingPreference},
     consts::{SYSTEM_ROLE, TOOL_ROLE, USER_ROLE},
 };
 use hermesllm::providers::openai::types::{ChatCompletionsRequest, ContentType, Message};
@@ -36,7 +36,11 @@ pub struct RouterModelV1 {
     max_token_length: usize,
 }
 impl RouterModelV1 {
-    pub fn new(llm_routes: Vec<LlmRoute>, routing_model: String, max_token_length: usize) -> Self {
+    pub fn new(
+        llm_routes: Vec<RoutingPreference>,
+        routing_model: String,
+        max_token_length: usize,
+    ) -> Self {
         let llm_route_json_str =
             serde_json::to_string(&llm_routes).unwrap_or_else(|_| "[]".to_string());
         RouterModelV1 {
@@ -138,9 +142,9 @@ impl RouterModel for RouterModelV1 {
         let llm_route_json = usage_preferences
             .as_ref()
             .map(|prefs| {
-                let llm_route: Vec<LlmRoute> = prefs
+                let llm_route: Vec<RoutingPreference> = prefs
                     .iter()
-                    .map(|pref| LlmRoute {
+                    .map(|pref| RoutingPreference {
                         name: pref.name.clone(),
                         description: pref.usage.clone().unwrap_or_default(),
                     })
@@ -255,7 +259,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
 
@@ -314,7 +318,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
 
@@ -379,7 +383,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), 235);
 
@@ -440,7 +444,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), 200);
 
@@ -501,7 +505,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), 230);
 
@@ -569,7 +573,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
 
@@ -639,7 +643,7 @@ Based on your analysis, provide your response in the following JSON formats if y
               {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
           ]
         "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
         let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
 
@@ -716,7 +720,7 @@ Based on your analysis, provide your response in the following JSON formats if y
     {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
 ]
 "#;
-        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let llm_routes = serde_json::from_str::<Vec<RoutingPreference>>(routes_str).unwrap();
 
         let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000);
 
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index d92f38fb..0693c09b 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -187,24 +187,11 @@ pub struct ModelUsagePreference {
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LlmRoute {
+pub struct RoutingPreference {
     pub name: String,
     pub description: String,
 }
 
-impl From<&LlmProvider> for LlmRoute {
-    fn from(provider: &LlmProvider) -> Self {
-        Self {
-            name: provider.name.to_string(),
-            description: provider
-                .usage
-                .as_ref()
-                .cloned()
-                .unwrap_or_else(|| "No description available".to_string()),
-        }
-    }
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 //TODO: use enum for model, but if there is a new model, we need to update the code
 pub struct LlmProvider {
@@ -218,6 +205,7 @@ pub struct LlmProvider {
     pub port: Option<u16>,
     pub rate_limits: Option<LlmRatelimit>,
     pub usage: Option<String>,
+    pub routing_preferences: Option<Vec<RoutingPreference>>,
 }
 
 pub trait IntoModels {
@@ -256,6 +244,7 @@ impl Default for LlmProvider {
             port: None,
             rate_limits: None,
             usage: None,
+            routing_preferences: None,
         }
     }
 }
@@ -368,7 +357,7 @@ mod test {
     #[test]
     fn test_deserialize_configuration() {
         let ref_config = fs::read_to_string(
-            "../../docs/source/resources/includes/arch_config_full_reference.yaml",
+            "../../docs/source/resources/includes/arch_config_full_reference_rendered.yaml",
         )
         .expect("reference config file not found");
 
@@ -429,7 +418,7 @@ mod test {
     #[test]
     fn test_tool_conversion() {
         let ref_config = fs::read_to_string(
-            "../../docs/source/resources/includes/arch_config_full_reference.yaml",
+            "../../docs/source/resources/includes/arch_config_full_reference_rendered.yaml",
         )
         .expect("reference config file not found");
         let config: super::Configuration = serde_yaml::from_str(&ref_config).unwrap();
diff --git a/crates/common/src/llm_providers.rs b/crates/common/src/llm_providers.rs
index 8214f148..120be691 100644
--- a/crates/common/src/llm_providers.rs
+++ b/crates/common/src/llm_providers.rs
@@ -58,7 +58,16 @@ impl TryFrom<Vec<LlmProvider>> for LlmProviders {
             let name = llm_provider.name.clone();
             if llm_providers
                 .providers
-                .insert(name.clone(), llm_provider)
+                .insert(name.clone(), llm_provider.clone())
+                .is_some()
+            {
+                return Err(LlmProvidersNewError::DuplicateName(name));
+            }
+
+            // also add model_id as key for provider lookup
+            if llm_providers
+                .providers
+                .insert(llm_provider.model.clone().unwrap(), llm_provider)
                 .is_some()
             {
                 return Err(LlmProvidersNewError::DuplicateName(name));
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 2fa29496..d6be1749 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -113,16 +113,10 @@ impl StreamContext {
         }
 
         debug!(
-            "request received: llm provider hint: {}, selected llm: {}, model: {}",
+            "request received: llm provider hint: {}, selected provider: {}",
             self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
                 .unwrap_or_default(),
-            self.llm_provider.as_ref().unwrap().name,
-            self.llm_provider
-                .as_ref()
-                .unwrap()
-                .model
-                .as_ref()
-                .unwrap_or(&String::new())
+            self.llm_provider.as_ref().unwrap().name
         );
     }
 
@@ -313,6 +307,11 @@ impl HttpContext for StreamContext {
             }
         };
 
+        debug!(
+            "on_http_request_body: deserialized body: {}",
+            serde_json::to_string(&deserialized_body).unwrap_or_default()
+        );
+
         self.user_message = deserialized_body
             .messages
             .iter()
@@ -349,8 +348,8 @@ impl HttpContext for StreamContext {
         };
 
         info!(
-            "on_http_request_body: provider: {}, model requested: {}, model selected: {}",
-            self.llm_provider().name,
+            "on_http_request_body: provider: {}, model requested (in body): {}, model selected: {}",
+            self.llm_provider().provider_interface,
             model_requested,
             model_name.unwrap_or(&"None".to_string()),
         );
diff --git a/demos/samples_java/weather_forcecast_service/.vscode/launch.json b/demos/samples_java/weather_forcecast_service/.vscode/launch.json
new file mode 100644
index 00000000..a9232a53
--- /dev/null
+++ b/demos/samples_java/weather_forcecast_service/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "type": "java",
+      "name": "WeatherForecastApplication",
+      "request": "launch",
+      "mainClass": "weather.WeatherForecastApplication",
+      "projectName": "weather-forecast-service"
+    }
+  ]
+}
diff --git a/demos/use_cases/ollama/arch_config.yaml b/demos/use_cases/ollama/arch_config.yaml
index 7d464d68..db824ad7 100644
--- a/demos/use_cases/ollama/arch_config.yaml
+++ b/demos/use_cases/ollama/arch_config.yaml
@@ -10,7 +10,7 @@ listeners:
 llm_providers:
 
   - model: openai/llama3.2
-    endpoint: host.docker.internal:11434
+    base_url: http://host.docker.internal:11434
     default: true
 
 system_prompt: |
diff --git a/demos/use_cases/preference_based_routing/arch_config.yaml b/demos/use_cases/preference_based_routing/arch_config.yaml
index c1047206..33136325 100644
--- a/demos/use_cases/preference_based_routing/arch_config.yaml
+++ b/demos/use_cases/preference_based_routing/arch_config.yaml
@@ -9,22 +9,21 @@ listeners:
 
 llm_providers:
 
-  - access_key: $OPENAI_API_KEY
-    model: openai/gpt-4o-mini
-
-  - access_key: $OPENAI_API_KEY
-    model: openai/gpt-4.1
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
     default: true
 
-  - name: code_generation
+  - model: openai/gpt-4o
     access_key: $OPENAI_API_KEY
-    model: openai/gpt-4.1
-    usage: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+    routing_preferences:
+      - name: code understanding
+        description: understand and explain existing code snippets, functions, or libraries
 
-  - name: code_understanding
+  - model: openai/gpt-4.1
     access_key: $OPENAI_API_KEY
-    model: openai/gpt-4o-mini
-    usage: understand and explain existing code snippets, functions, or libraries
+    routing_preferences:
+      - name: code generation
+        description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
 
 tracing:
   random_sampling: 100
diff --git a/demos/use_cases/preference_based_routing/arch_config_local.yaml b/demos/use_cases/preference_based_routing/arch_config_local.yaml
deleted file mode 100644
index 029918d0..00000000
--- a/demos/use_cases/preference_based_routing/arch_config_local.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-version: v0.1.0
-
-routing:
-  model: Arch-Router
-  llm_provider: arch-router
-
-listeners:
-  egress_traffic:
-    address: 0.0.0.0
-    port: 12000
-    message_format: openai
-    timeout: 30s
-
-llm_providers:
-
-  - name: arch-router
-    provider_interface: arch
-    model: hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
-    endpoint: host.docker.internal:11434
-
-  - name: gpt-4o-mini
-    provider_interface: openai
-    access_key: $OPENAI_API_KEY
-    model: gpt-4o-mini
-
-  - name: gpt-4.1
-    provider_interface: openai
-    access_key: $OPENAI_API_KEY
-    model: gpt-4.1
-    default: true
-
-  - name: code_generation
-    access_key: $OPENAI_API_KEY
-    provider_interface: openai
-    model: gpt-4.1
-    usage: generating new code snippets, functions, or boilerplate based on user prompts or requirements
-
-  - name: code_understanding
-    provider_interface: openai
-    access_key: $OPENAI_API_KEY
-    model: gpt-4.1
-    usage: understand and explain existing code snippets, functions, or libraries
-
-tracing:
-  random_sampling: 100
diff --git a/demos/use_cases/preference_based_routing/arch_config_rendered.yaml b/demos/use_cases/preference_based_routing/arch_config_rendered.yaml
new file mode 100644
index 00000000..bdd85f0d
--- /dev/null
+++ b/demos/use_cases/preference_based_routing/arch_config_rendered.yaml
@@ -0,0 +1,29 @@
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    message_format: openai
+    port: 12000
+    timeout: 30s
+llm_providers:
+- access_key: $OPENAI_API_KEY
+  default: true
+  model: gpt-4o-mini
+  name: openai/gpt-4o-mini
+  provider_interface: openai
+- access_key: $OPENAI_API_KEY
+  model: gpt-4o
+  name: openai/gpt-4o
+  provider_interface: openai
+  routing_preferences:
+  - description: understand and explain existing code snippets, functions, or libraries
+    name: code understanding
+- access_key: $OPENAI_API_KEY
+  model: gpt-4.1
+  name: openai/gpt-4.1
+  provider_interface: openai
+  routing_preferences:
+  - description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+    name: code generation
+tracing:
+  random_sampling: 100
+version: v0.1.0
diff --git a/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl b/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl
index 432f0996..c4ee5d8a 100644
--- a/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl
+++ b/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl
@@ -2,18 +2,18 @@ POST http://localhost:12000/v1/chat/completions
 Content-Type: application/json
 
 {
+  "model": "openai/gpt-4.1",
   "messages": [
     {
       "role": "user",
       "content": "hi"
     }
-  ],
-  "model": "none"
+  ]
 }
 HTTP 200
 [Asserts]
 header "content-type" == "application/json"
-jsonpath "$.model" matches /^gpt-4.1/
+jsonpath "$.model" matches /^gpt-4o-mini/
 jsonpath "$.usage" != null
 jsonpath "$.choices[0].message.content" != null
 jsonpath "$.choices[0].message.role" == "assistant"
diff --git a/docs/source/resources/includes/arch_config_full_reference.yaml b/docs/source/resources/includes/arch_config_full_reference.yaml
index 266ccf33..808baff1 100644
--- a/docs/source/resources/includes/arch_config_full_reference.yaml
+++ b/docs/source/resources/includes/arch_config_full_reference.yaml
@@ -39,7 +39,7 @@ llm_providers:
     model: mistral/mistral-8x7b
 
   - model: mistral/mistral-7b-instruct
-    endpoint: mistral_local
+    base_url: http://mistral_local
 
 # provides a way to override default settings for the arch system
 overrides:
diff --git a/docs/source/resources/includes/arch_config_full_reference_rendered.yaml b/docs/source/resources/includes/arch_config_full_reference_rendered.yaml
new file mode 100644
index 00000000..c567de7f
--- /dev/null
+++ b/docs/source/resources/includes/arch_config_full_reference_rendered.yaml
@@ -0,0 +1,95 @@
+version: v0.1
+
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 5s
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 5s
+
+# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
+endpoints:
+  app_server:
+    # value could be ip address or a hostname with port
+    # this could also be a list of endpoints for load balancing
+    # for example endpoint: [ ip1:port, ip2:port ]
+    endpoint: 127.0.0.1:80
+    # max time to wait for a connection to be established
+    connect_timeout: 0.005s
+
+  mistral_local:
+    endpoint: 127.0.0.1:8001
+
+  error_target:
+    endpoint: error_target_1
+
+# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
+llm_providers:
+  - name: openai/gpt-4o
+    provider_interface: openai
+    access_key: $OPENAI_API_KEY
+    model: gpt-4o
+    default: true
+
+  - name: mistral/mistral-8x7b
+    provider_interface: mistral
+    access_key: $MISTRAL_API_KEY
+    model: mistral-8x7b
+
+  - name: mistral/mistral-7b-instruct
+    provider_interface: mistral
+    model: mistral-7b-instruct
+    base_url: http://mistral_local
+
+# provides a way to override default settings for the arch system
+overrides:
+  # By default Arch uses an NLI + embedding approach to match an incoming prompt to a prompt target.
+  # The intent matching threshold is kept at 0.80, you can override this behavior if you would like
+  prompt_target_intent_matching_threshold: 0.60
+
+# default system prompt used by all prompt targets
+system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions.
+
+prompt_guards:
+  input_guards:
+    jailbreak:
+      on_exception:
+        message: Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters.
+
+prompt_targets:
+  - name: information_extraction
+    default: true
+    description: handel all scenarios that are question and answer in nature. Like summarization, information extraction, etc.
+    endpoint:
+      name: app_server
+      path: /agent/summary
+      http_method: POST
+    # Arch uses the default LLM and treats the response from the endpoint as the prompt to send to the LLM
+    auto_llm_dispatch_on_response: true
+    # override system prompt for this prompt target
+    system_prompt: You are a helpful information extraction assistant. Use the information that is provided to you.
+
+  - name: reboot_network_device
+    description: Reboot a specific network device
+    endpoint:
+      name: app_server
+      path: /agent/action
+    parameters:
+      - name: device_id
+        type: str
+        description: Identifier of the network device to reboot.
+        required: true
+      - name: confirmation
+        type: bool
+        description: Confirmation flag to proceed with reboot.
+        default: false
+        enum: [true, false]
+
+tracing:
+  # sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
+  sampling_rate: 0.1
diff --git a/model_server/.vscode/launch.json b/model_server/.vscode/launch.json
index ca83be87..19ed7342 100644
--- a/model_server/.vscode/launch.json
+++ b/model_server/.vscode/launch.json
@@ -4,6 +4,7 @@
   // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
   "version": "0.2.0",
   "configurations": [
+
     {
       "name": "model server",
       "type": "debugpy",
diff --git a/model_server/.vscode/settings.json b/model_server/.vscode/settings.json
new file mode 100644
index 00000000..98ba633e
--- /dev/null
+++ b/model_server/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true
+}
diff --git a/tests/archgw/.vscode/launch.json b/tests/archgw/.vscode/launch.json
new file mode 100644
index 00000000..6a211d8e
--- /dev/null
+++ b/tests/archgw/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
diff --git a/tests/archgw/.vscode/settings.json b/tests/archgw/.vscode/settings.json
new file mode 100644
index 00000000..98ba633e
--- /dev/null
+++ b/tests/archgw/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true
+}
diff --git a/tests/e2e/.vscode/launch.json b/tests/e2e/.vscode/launch.json
new file mode 100644
index 00000000..6a211d8e
--- /dev/null
+++ b/tests/e2e/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
diff --git a/tests/e2e/.vscode/settings.json b/tests/e2e/.vscode/settings.json
new file mode 100644
index 00000000..98ba633e
--- /dev/null
+++ b/tests/e2e/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true
+}
diff --git a/tests/modelserver/.vscode/launch.json b/tests/modelserver/.vscode/launch.json
new file mode 100644
index 00000000..6a211d8e
--- /dev/null
+++ b/tests/modelserver/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
diff --git a/tests/modelserver/.vscode/settings.json b/tests/modelserver/.vscode/settings.json
new file mode 100644
index 00000000..98ba633e
--- /dev/null
+++ b/tests/modelserver/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true
+}