From 7f90124bd1bcb4b8e462b6ff4cee588337b079ee Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Thu, 10 Jul 2025 15:34:12 -0700 Subject: [PATCH] more updates --- .gitignore | 3 - arch/arch_config_schema.yaml | 23 +++-- arch/tools/.vscode/settings.json | 5 + arch/tools/cli/config_generator.py | 40 +++++--- crates/.vscode/launch.json | 21 ++++ crates/.vscode/tasks.json | 21 ++++ .../src/handlers/chat_completions.rs | 8 +- crates/brightstaff/src/main.rs | 4 + crates/brightstaff/src/router/llm_router.rs | 15 ++- .../brightstaff/src/router/router_model_v1.rs | 28 +++--- crates/common/src/configuration.rs | 21 +--- crates/common/src/llm_providers.rs | 11 ++- crates/llm_gateway/src/stream_context.rs | 19 ++-- .../.vscode/launch.json | 15 +++ demos/use_cases/ollama/arch_config.yaml | 2 +- .../preference_based_routing/arch_config.yaml | 21 ++-- .../arch_config_local.yaml | 45 --------- .../arch_config_rendered.yaml | 29 ++++++ .../hurl_tests/simple.hurl | 6 +- .../includes/arch_config_full_reference.yaml | 2 +- .../arch_config_full_reference_rendered.yaml | 95 +++++++++++++++++++ model_server/.vscode/launch.json | 1 + model_server/.vscode/settings.json | 7 ++ tests/archgw/.vscode/launch.json | 15 +++ tests/archgw/.vscode/settings.json | 7 ++ tests/e2e/.vscode/launch.json | 15 +++ tests/e2e/.vscode/settings.json | 7 ++ tests/modelserver/.vscode/launch.json | 15 +++ tests/modelserver/.vscode/settings.json | 7 ++ 29 files changed, 375 insertions(+), 133 deletions(-) create mode 100644 arch/tools/.vscode/settings.json create mode 100644 crates/.vscode/launch.json create mode 100644 crates/.vscode/tasks.json create mode 100644 demos/samples_java/weather_forcecast_service/.vscode/launch.json delete mode 100644 demos/use_cases/preference_based_routing/arch_config_local.yaml create mode 100644 demos/use_cases/preference_based_routing/arch_config_rendered.yaml create mode 100644 docs/source/resources/includes/arch_config_full_reference_rendered.yaml create mode 100644 model_server/.vscode/settings.json create mode 100644 tests/archgw/.vscode/launch.json create mode 100644 tests/archgw/.vscode/settings.json create mode 100644 tests/e2e/.vscode/launch.json create mode 100644 tests/e2e/.vscode/settings.json create mode 100644 tests/modelserver/.vscode/launch.json create mode 100644 tests/modelserver/.vscode/settings.json diff --git a/.gitignore b/.gitignore index b140bbbe..d2f7c6bc 100644 --- a/.gitignore +++ b/.gitignore @@ -101,9 +101,6 @@ venv.bak/ # mypy .mypy_cache/ -# VSCode stuff: -.vscode/ - # MacOS Metadata *.DS_Store diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml index 411e189f..0ca9d42d 100644 --- a/arch/arch_config_schema.yaml +++ b/arch/arch_config_schema.yaml @@ -72,20 +72,23 @@ properties: type: string default: type: boolean - # endpoint field is deprecated, use base_url instead - endpoint: - type: string base_url: type: string - protocol: - type: string - enum: - - http - - https http_host: type: string - usage: - type: string + routing_preferences: + type: array + items: + type: object + properties: + name: + type: string + description: + type: string + additionalProperties: false + required: + - name + - description additionalProperties: false required: - model diff --git a/arch/tools/.vscode/settings.json b/arch/tools/.vscode/settings.json new file mode 100644 index 00000000..10f9f99d --- /dev/null +++ b/arch/tools/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cSpell.words": [ + "BRIGHTSTAFF" + ] +} diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index 6dea940b..c636813b 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ -95,6 +95,8 @@ def validate_and_render_schema(): updated_llm_providers = [] llm_provider_name_set = set() llms_with_usage = [] + model_name_keys = set() + model_usage_name_keys = set() for llm_provider in config_yaml["llm_providers"]: if llm_provider.get("usage", None): llms_with_usage.append(llm_provider["name"]) @@ -104,6 +106,11 @@ def validate_and_render_schema(): ) model_name = llm_provider.get("model") + if model_name in model_name_keys: + raise Exception( + f"Duplicate model name {model_name}, please provide unique model name for each llm_provider" + ) + model_name_keys.add(model_name) if llm_provider.get("name") is None: llm_provider["name"] = model_name @@ -119,6 +126,20 @@ def validate_and_render_schema(): f"Unsupported provider {provider} for model {model_name}. Supported providers are: {', '.join(SUPPORTED_PROVIDERS)}" ) + if model_id in model_name_keys: + raise Exception( + f"Duplicate model_id {model_id}, please provide unique model_id for each llm_provider" + ) + model_name_keys.add(model_id) + + for routing_preference in llm_provider.get("routing_preferences", []): + if routing_preference.get("name") in model_usage_name_keys: + raise Exception( + f"Duplicate routing preference name \"{routing_preference.get('name')}\", please provide unique name for each routing preference" + ) + model_usage_name_keys.add(routing_preference.get("name")) + + llm_provider["model"] = model_id llm_provider["provider_interface"] = provider llm_provider_name_set.add(llm_provider.get("name")) provider = None @@ -132,21 +153,14 @@ def validate_and_render_schema(): del llm_provider["provider"] updated_llm_providers.append(llm_provider) - if llm_provider.get("endpoint") and llm_provider.get("base_url"): - raise Exception("Please provide either endpoint or base_url, not both") - - if llm_provider.get("endpoint", None): - endpoint = llm_provider["endpoint"] - protocol = llm_provider.get("protocol", "http") - llm_provider["endpoint"], llm_provider["port"] = get_endpoint_and_port( - endpoint, protocol - ) - llms_with_endpoint.append(llm_provider) - elif llm_provider.get("base_url", None): + if llm_provider.get("base_url", None): base_url = llm_provider["base_url"] urlparse_result = urlparse(base_url) - if llm_provider.get("port"): - raise Exception("Please provider port in base_url") + url_path = urlparse_result.path + if url_path and url_path != "/": + raise Exception( + f"Please provide base_url without path, got {base_url}. Use base_url like 'http://example.com' instead of 'http://example.com/path'." + ) if urlparse_result.scheme == "" or urlparse_result.scheme not in [ "http", "https", diff --git a/crates/.vscode/launch.json b/crates/.vscode/launch.json new file mode 100644 index 00000000..56a29b46 --- /dev/null +++ b/crates/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Brightstaff", + "type": "lldb", + "request": "launch", + "program": "${workspaceFolder}/target/debug/brightstaff", + "args": [], + "cwd": "${workspaceFolder}", + "stopOnEntry": false, + "sourceLanguages": ["rust"], + "env": { + "RUST_LOG": "debug", + "RUST_BACKTRACE": "1", + "ARCH_CONFIG_PATH_RENDERED": "../demos/use_cases/preference_based_routing/arch_config_rendered.yaml" + }, + "preLaunchTask": "rust: cargo build" + } + ] +} diff --git a/crates/.vscode/tasks.json b/crates/.vscode/tasks.json new file mode 100644 index 00000000..8d648bc7 --- /dev/null +++ b/crates/.vscode/tasks.json @@ -0,0 +1,21 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cargo", + "command": "build", + "args": [ + "--bin", + "brightstaff" + ], + "problemMatcher": [ + "$rustc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "label": "rust: cargo build" + } + ] +} diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs index 217897cd..89c9ee13 100644 --- a/crates/brightstaff/src/handlers/chat_completions.rs +++ b/crates/brightstaff/src/handlers/chat_completions.rs @@ -12,7 +12,7 @@ use hyper::{Request, Response, StatusCode}; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tokio_stream::StreamExt; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, info, warn}; use crate::router::llm_router::RouterService; @@ -81,8 +81,8 @@ pub async fn chat_completions( } } - trace!( - "arch-router request body: {}", + debug!( + "arch-router request received: {}", &serde_json::to_string(&chat_completion_request).unwrap() ); @@ -102,7 +102,7 @@ pub async fn chat_completions( .as_ref() .and_then(|s| serde_yaml::from_str(s).ok()); - debug!("usage preferences: {:?}", usage_preferences); + debug!("usage preferences from request: {:?}", usage_preferences); let mut determined_route = match router_service .determine_route( diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 9128c33b..4e4f18b7 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -44,6 +44,10 @@ async fn main() -> Result<(), Box> { let _tracer_provider = init_tracer(); let bind_address = env::var("BIND_ADDRESS").unwrap_or_else(|_| BIND_ADDRESS.to_string()); + info!( + "current working directory: {}", + env::current_dir().unwrap().display() + ); // loading arch_config.yaml file let arch_config_path = env::var("ARCH_CONFIG_PATH_RENDERED") .unwrap_or_else(|_| "./arch_config_rendered.yaml".to_string()); diff --git a/crates/brightstaff/src/router/llm_router.rs b/crates/brightstaff/src/router/llm_router.rs index a78d34e7..c1320c66 100644 --- a/crates/brightstaff/src/router/llm_router.rs +++ b/crates/brightstaff/src/router/llm_router.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use common::{ - configuration::{LlmProvider, LlmRoute, ModelUsagePreference}, + configuration::{LlmProvider, ModelUsagePreference, RoutingPreference}, consts::ARCH_PROVIDER_HINT_HEADER, }; use hermesllm::providers::openai::types::{ChatCompletionsResponse, ContentType, Message}; @@ -44,11 +44,14 @@ impl RouterService { ) -> Self { let providers_with_usage = providers .iter() - .filter(|provider| provider.usage.is_some()) + .filter(|provider| provider.routing_preferences.is_some()) .cloned() .collect::>(); - let llm_routes: Vec = providers_with_usage.iter().map(LlmRoute::from).collect(); + let llm_routes: Vec = providers_with_usage + .iter() + .flat_map(|provider| provider.routing_preferences.clone().unwrap_or_default()) + .collect(); let router_model = Arc::new(router_model_v1::RouterModelV1::new( llm_routes, @@ -156,6 +159,12 @@ impl RouterService { router_response_time.as_millis() ); + if let Some(ref route) = route_name { + if route == "other" { + return Ok(None); + } + } + Ok(route_name) } else { Ok(None) diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs index e6ccd912..0dcefff6 100644 --- a/crates/brightstaff/src/router/router_model_v1.rs +++ b/crates/brightstaff/src/router/router_model_v1.rs @@ -1,5 +1,5 @@ use common::{ - configuration::{LlmRoute, ModelUsagePreference}, + configuration::{ModelUsagePreference, RoutingPreference}, consts::{SYSTEM_ROLE, TOOL_ROLE, USER_ROLE}, }; use hermesllm::providers::openai::types::{ChatCompletionsRequest, ContentType, Message}; @@ -36,7 +36,11 @@ pub struct RouterModelV1 { max_token_length: usize, } impl RouterModelV1 { - pub fn new(llm_routes: Vec, routing_model: String, max_token_length: usize) -> Self { + pub fn new( + llm_routes: Vec, + routing_model: String, + max_token_length: usize, + ) -> Self { let llm_route_json_str = serde_json::to_string(&llm_routes).unwrap_or_else(|_| "[]".to_string()); RouterModelV1 { @@ -138,9 +142,9 @@ impl RouterModel for RouterModelV1 { let llm_route_json = usage_preferences .as_ref() .map(|prefs| { - let llm_route: Vec = prefs + let llm_route: Vec = prefs .iter() - .map(|pref| LlmRoute { + .map(|pref| RoutingPreference { name: pref.name.clone(), description: pref.usage.clone().unwrap_or_default(), }) @@ -255,7 +259,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX); @@ -314,7 +318,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX); @@ -379,7 +383,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), 235); @@ -440,7 +444,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), 200); @@ -501,7 +505,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), 230); @@ -569,7 +573,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX); @@ -639,7 +643,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let routing_model = "test-model".to_string(); let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX); @@ -716,7 +720,7 @@ Based on your analysis, provide your response in the following JSON formats if y {"name": "Speech Recognition", "description": "Converting spoken language into written text"} ] "#; - let llm_routes = serde_json::from_str::>(routes_str).unwrap(); + let llm_routes = serde_json::from_str::>(routes_str).unwrap(); let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000); diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index d92f38fb..0693c09b 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -187,24 +187,11 @@ pub struct ModelUsagePreference { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LlmRoute { +pub struct RoutingPreference { pub name: String, pub description: String, } -impl From<&LlmProvider> for LlmRoute { - fn from(provider: &LlmProvider) -> Self { - Self { - name: provider.name.to_string(), - description: provider - .usage - .as_ref() - .cloned() - .unwrap_or_else(|| "No description available".to_string()), - } - } -} - #[derive(Debug, Clone, Serialize, Deserialize)] //TODO: use enum for model, but if there is a new model, we need to update the code pub struct LlmProvider { @@ -218,6 +205,7 @@ pub struct LlmProvider { pub port: Option, pub rate_limits: Option, pub usage: Option, + pub routing_preferences: Option>, } pub trait IntoModels { @@ -256,6 +244,7 @@ impl Default for LlmProvider { port: None, rate_limits: None, usage: None, + routing_preferences: None, } } } @@ -368,7 +357,7 @@ mod test { #[test] fn test_deserialize_configuration() { let ref_config = fs::read_to_string( - "../../docs/source/resources/includes/arch_config_full_reference.yaml", + "../../docs/source/resources/includes/arch_config_full_reference_rendered.yaml", ) .expect("reference config file not found"); @@ -429,7 +418,7 @@ mod test { #[test] fn test_tool_conversion() { let ref_config = fs::read_to_string( - "../../docs/source/resources/includes/arch_config_full_reference.yaml", + "../../docs/source/resources/includes/arch_config_full_reference_rendered.yaml", ) .expect("reference config file not found"); let config: super::Configuration = serde_yaml::from_str(&ref_config).unwrap(); diff --git a/crates/common/src/llm_providers.rs b/crates/common/src/llm_providers.rs index 8214f148..120be691 100644 --- a/crates/common/src/llm_providers.rs +++ b/crates/common/src/llm_providers.rs @@ -58,7 +58,16 @@ impl TryFrom> for LlmProviders { let name = llm_provider.name.clone(); if llm_providers .providers - .insert(name.clone(), llm_provider) + .insert(name.clone(), llm_provider.clone()) + .is_some() + { + return Err(LlmProvidersNewError::DuplicateName(name)); + } + + // also add model_id as key for provider lookup + if llm_providers + .providers + .insert(llm_provider.model.clone().unwrap(), llm_provider) .is_some() { return Err(LlmProvidersNewError::DuplicateName(name)); diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 2fa29496..d6be1749 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -113,16 +113,10 @@ impl StreamContext { } debug!( - "request received: llm provider hint: {}, selected llm: {}, model: {}", + "request received: llm provider hint: {}, selected provider: {}", self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER) .unwrap_or_default(), - self.llm_provider.as_ref().unwrap().name, - self.llm_provider - .as_ref() - .unwrap() - .model - .as_ref() - .unwrap_or(&String::new()) + self.llm_provider.as_ref().unwrap().name ); } @@ -313,6 +307,11 @@ impl HttpContext for StreamContext { } }; + debug!( + "on_http_request_body: deserialized body: {}", + serde_json::to_string(&deserialized_body).unwrap_or_default() + ); + self.user_message = deserialized_body .messages .iter() @@ -349,8 +348,8 @@ impl HttpContext for StreamContext { }; info!( - "on_http_request_body: provider: {}, model requested: {}, model selected: {}", - self.llm_provider().name, + "on_http_request_body: provider: {}, model requested (in body): {}, model selected: {}", + self.llm_provider().provider_interface, model_requested, model_name.unwrap_or(&"None".to_string()), ); diff --git a/demos/samples_java/weather_forcecast_service/.vscode/launch.json b/demos/samples_java/weather_forcecast_service/.vscode/launch.json new file mode 100644 index 00000000..a9232a53 --- /dev/null +++ b/demos/samples_java/weather_forcecast_service/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "java", + "name": "WeatherForecastApplication", + "request": "launch", + "mainClass": "weather.WeatherForecastApplication", + "projectName": "weather-forecast-service" + } + ] +} diff --git a/demos/use_cases/ollama/arch_config.yaml b/demos/use_cases/ollama/arch_config.yaml index 7d464d68..db824ad7 100644 --- a/demos/use_cases/ollama/arch_config.yaml +++ b/demos/use_cases/ollama/arch_config.yaml @@ -10,7 +10,7 @@ listeners: llm_providers: - model: openai/llama3.2 - endpoint: host.docker.internal:11434 + base_url: http://host.docker.internal:11434 default: true system_prompt: | diff --git a/demos/use_cases/preference_based_routing/arch_config.yaml b/demos/use_cases/preference_based_routing/arch_config.yaml index c1047206..33136325 100644 --- a/demos/use_cases/preference_based_routing/arch_config.yaml +++ b/demos/use_cases/preference_based_routing/arch_config.yaml @@ -9,22 +9,21 @@ listeners: llm_providers: - - access_key: $OPENAI_API_KEY - model: openai/gpt-4o-mini - - - access_key: $OPENAI_API_KEY - model: openai/gpt-4.1 + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY default: true - - name: code_generation + - model: openai/gpt-4o access_key: $OPENAI_API_KEY - model: openai/gpt-4.1 - usage: generating new code snippets, functions, or boilerplate based on user prompts or requirements + routing_preferences: + - name: code understanding + description: understand and explain existing code snippets, functions, or libraries - - name: code_understanding + - model: openai/gpt-4.1 access_key: $OPENAI_API_KEY - model: openai/gpt-4o-mini - usage: understand and explain existing code snippets, functions, or libraries + routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements tracing: random_sampling: 100 diff --git a/demos/use_cases/preference_based_routing/arch_config_local.yaml b/demos/use_cases/preference_based_routing/arch_config_local.yaml deleted file mode 100644 index 029918d0..00000000 --- a/demos/use_cases/preference_based_routing/arch_config_local.yaml +++ /dev/null @@ -1,45 +0,0 @@ -version: v0.1.0 - -routing: - model: Arch-Router - llm_provider: arch-router - -listeners: - egress_traffic: - address: 0.0.0.0 - port: 12000 - message_format: openai - timeout: 30s - -llm_providers: - - - name: arch-router - provider_interface: arch - model: hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M - endpoint: host.docker.internal:11434 - - - name: gpt-4o-mini - provider_interface: openai - access_key: $OPENAI_API_KEY - model: gpt-4o-mini - - - name: gpt-4.1 - provider_interface: openai - access_key: $OPENAI_API_KEY - model: gpt-4.1 - default: true - - - name: code_generation - access_key: $OPENAI_API_KEY - provider_interface: openai - model: gpt-4.1 - usage: generating new code snippets, functions, or boilerplate based on user prompts or requirements - - - name: code_understanding - provider_interface: openai - access_key: $OPENAI_API_KEY - model: gpt-4.1 - usage: understand and explain existing code snippets, functions, or libraries - -tracing: - random_sampling: 100 diff --git a/demos/use_cases/preference_based_routing/arch_config_rendered.yaml b/demos/use_cases/preference_based_routing/arch_config_rendered.yaml new file mode 100644 index 00000000..bdd85f0d --- /dev/null +++ b/demos/use_cases/preference_based_routing/arch_config_rendered.yaml @@ -0,0 +1,29 @@ +listeners: + egress_traffic: + address: 0.0.0.0 + message_format: openai + port: 12000 + timeout: 30s +llm_providers: +- access_key: $OPENAI_API_KEY + default: true + model: gpt-4o-mini + name: openai/gpt-4o-mini + provider_interface: openai +- access_key: $OPENAI_API_KEY + model: gpt-4o + name: openai/gpt-4o + provider_interface: openai + routing_preferences: + - description: b + name: code understanding +- access_key: $OPENAI_API_KEY + model: gpt-4.1 + name: openai/gpt-4.1 + provider_interface: openai + routing_preferences: + - description: a + name: code understanding +tracing: + random_sampling: 100 +version: v0.1.0 diff --git a/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl b/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl index 432f0996..c4ee5d8a 100644 --- a/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl +++ b/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl @@ -2,18 +2,18 @@ POST http://localhost:12000/v1/chat/completions Content-Type: application/json { + "model": "openai/gpt-4.1", "messages": [ { "role": "user", "content": "hi" } - ], - "model": "none" + ] } HTTP 200 [Asserts] header "content-type" == "application/json" -jsonpath "$.model" matches /^gpt-4.1/ +jsonpath "$.model" matches /^gpt-4o-mini/ jsonpath "$.usage" != null jsonpath "$.choices[0].message.content" != null jsonpath "$.choices[0].message.role" == "assistant" diff --git a/docs/source/resources/includes/arch_config_full_reference.yaml b/docs/source/resources/includes/arch_config_full_reference.yaml index 266ccf33..808baff1 100644 --- a/docs/source/resources/includes/arch_config_full_reference.yaml +++ b/docs/source/resources/includes/arch_config_full_reference.yaml @@ -39,7 +39,7 @@ llm_providers: model: mistral/mistral-8x7b - model: mistral/mistral-7b-instruct - endpoint: mistral_local + base_url: http://mistral_local # provides a way to override default settings for the arch system overrides: diff --git a/docs/source/resources/includes/arch_config_full_reference_rendered.yaml b/docs/source/resources/includes/arch_config_full_reference_rendered.yaml new file mode 100644 index 00000000..c567de7f --- /dev/null +++ b/docs/source/resources/includes/arch_config_full_reference_rendered.yaml @@ -0,0 +1,95 @@ +version: v0.1 + +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 5s + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 5s + +# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem. +endpoints: + app_server: + # value could be ip address or a hostname with port + # this could also be a list of endpoints for load balancing + # for example endpoint: [ ip1:port, ip2:port ] + endpoint: 127.0.0.1:80 + # max time to wait for a connection to be established + connect_timeout: 0.005s + + mistral_local: + endpoint: 127.0.0.1:8001 + + error_target: + endpoint: error_target_1 + +# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way +llm_providers: + - name: openai/gpt-4o + provider_interface: openai + access_key: $OPENAI_API_KEY + model: gpt-4o + default: true + + - name: mistral/mistral-8x7b + provider_interface: mistral + access_key: $MISTRAL_API_KEY + model: mistral-8x7b + + - name: mistral/mistral-7b-instruct + provider_interface: mistral + model: mistral-7b-instruct + base_url: http://mistral_local + +# provides a way to override default settings for the arch system +overrides: + # By default Arch uses an NLI + embedding approach to match an incoming prompt to a prompt target. + # The intent matching threshold is kept at 0.80, you can override this behavior if you would like + prompt_target_intent_matching_threshold: 0.60 + +# default system prompt used by all prompt targets +system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions. + +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters. + +prompt_targets: + - name: information_extraction + default: true + description: handel all scenarios that are question and answer in nature. Like summarization, information extraction, etc. + endpoint: + name: app_server + path: /agent/summary + http_method: POST + # Arch uses the default LLM and treats the response from the endpoint as the prompt to send to the LLM + auto_llm_dispatch_on_response: true + # override system prompt for this prompt target + system_prompt: You are a helpful information extraction assistant. Use the information that is provided to you. + + - name: reboot_network_device + description: Reboot a specific network device + endpoint: + name: app_server + path: /agent/action + parameters: + - name: device_id + type: str + description: Identifier of the network device to reboot. + required: true + - name: confirmation + type: bool + description: Confirmation flag to proceed with reboot. + default: false + enum: [true, false] + +tracing: + # sampling rate. Note by default Arch works on OpenTelemetry compatible tracing. + sampling_rate: 0.1 diff --git a/model_server/.vscode/launch.json b/model_server/.vscode/launch.json index ca83be87..19ed7342 100644 --- a/model_server/.vscode/launch.json +++ b/model_server/.vscode/launch.json @@ -4,6 +4,7 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { "name": "model server", "type": "debugpy", diff --git a/model_server/.vscode/settings.json b/model_server/.vscode/settings.json new file mode 100644 index 00000000..98ba633e --- /dev/null +++ b/model_server/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/tests/archgw/.vscode/launch.json b/tests/archgw/.vscode/launch.json new file mode 100644 index 00000000..6a211d8e --- /dev/null +++ b/tests/archgw/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} diff --git a/tests/archgw/.vscode/settings.json b/tests/archgw/.vscode/settings.json new file mode 100644 index 00000000..98ba633e --- /dev/null +++ b/tests/archgw/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/tests/e2e/.vscode/launch.json b/tests/e2e/.vscode/launch.json new file mode 100644 index 00000000..6a211d8e --- /dev/null +++ b/tests/e2e/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} diff --git a/tests/e2e/.vscode/settings.json b/tests/e2e/.vscode/settings.json new file mode 100644 index 00000000..98ba633e --- /dev/null +++ b/tests/e2e/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/tests/modelserver/.vscode/launch.json b/tests/modelserver/.vscode/launch.json new file mode 100644 index 00000000..6a211d8e --- /dev/null +++ b/tests/modelserver/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} diff --git a/tests/modelserver/.vscode/settings.json b/tests/modelserver/.vscode/settings.json new file mode 100644 index 00000000..98ba633e --- /dev/null +++ b/tests/modelserver/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +}