adding support for model aliases in archgw (#566)

* adding support for model aliases in archgw * fixed PR based on feedback * removing README. Not relevant for PR --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-136.local>
2026-06-08 14:55:14 +02:00 · 2025-09-16 11:12:08 -07:00 · 2025-09-16 11:12:08 -07:00 · 4eb2b410c5
commit 4eb2b410c5
parent 1e8c81d8f6
12 changed files with 634 additions and 14 deletions
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -7,6 +7,7 @@ properties:
      - v0.1
      - v0.1.0
      - 0.1-beta
+      - 0.2.0
  listeners:
    type: object
    additionalProperties: false
@ -102,6 +103,19 @@ properties:
      additionalProperties: false
      required:
        - model
+
+  model_aliases:
+    type: object
+    patternProperties:
+      "^.*$":
+        type: object
+        properties:
+          target:
+            type: string
+        additionalProperties: false
+        required:
+          - target
+
  overrides:
    type: object
    properties:
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -208,6 +208,16 @@ def validate_and_render_schema():

    config_yaml["llm_providers"] = updated_llm_providers

+    # Validate model aliases if present
+    if "model_aliases" in config_yaml:
+        model_aliases = config_yaml["model_aliases"]
+        for alias_name, alias_config in model_aliases.items():
+            target = alias_config.get("target")
+            if target not in model_name_keys:
+                raise Exception(
+                    f"Model alias '{alias_name}' targets '{target}' which is not defined as a model. Available models: {', '.join(sorted(model_name_keys))}"
+                )
+
    arch_config_string = yaml.dump(config_yaml)
    arch_llm_config_string = yaml.dump(config_yaml)

--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@ -1,7 +1,7 @@
 use std::sync::Arc;
-
+use std::collections::HashMap;
 use bytes::Bytes;
-use common::configuration::ModelUsagePreference;
+use common::configuration::{ModelAlias, ModelUsagePreference};
 use common::consts::ARCH_PROVIDER_HINT_HEADER;
 use hermesllm::apis::openai::ChatCompletionsRequest;
 use hermesllm::clients::SupportedAPIs;
@ -28,6 +28,7 @@ pub async fn chat(
    request: Request<hyper::body::Incoming>,
    router_service: Arc<RouterService>,
    full_qualified_llm_provider_url: String,
+    model_aliases: Arc<Option<HashMap<String, ModelAlias>>>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {

    let request_path = request.uri().path().to_string();
@ -35,6 +36,7 @@ pub async fn chat(
    let chat_request_bytes = request.collect().await?.to_bytes();

    debug!("Received request body (raw utf8): {}", String::from_utf8_lossy(&chat_request_bytes));
+
    let mut client_request = match ProviderRequestType::try_from((&chat_request_bytes[..], &SupportedAPIs::from_endpoint(request_path.as_str()).unwrap())) {
        Ok(request) => request,
        Err(err) => {
@ -46,6 +48,24 @@ pub async fn chat(
        }
    };

+    // Model alias resolution: update model field in client_request immediately
+    // This ensures all downstream objects use the resolved model
+    let model_from_request = client_request.model().to_string();
+    let resolved_model = if let Some(model_aliases) = model_aliases.as_ref() {
+        if let Some(model_alias) = model_aliases.get(&model_from_request) {
+            debug!(
+                "Model Alias: 'From {}' -> 'To{}'",
+                model_from_request, model_alias.target
+            );
+            model_alias.target.clone()
+        } else {
+            model_from_request.clone()
+        }
+    } else {
+        model_from_request.clone()
+    };
+    client_request.set_model(resolved_model.clone());
+
    // Clone metadata for routing and remove archgw_preference_config from original
    let routing_metadata = client_request.metadata().clone();

@ -77,7 +97,7 @@ pub async fn chat(
        };

    debug!(
-        "[BRIGHTSTAFF -> ARCH_ROUTER] REQ: {}",
+        "[ARCH_ROUTER REQ]: {}",
        &serde_json::to_string(&chat_completions_request_for_arch_router).unwrap()
    );

@ -132,11 +152,12 @@ pub async fn chat(
        Ok(route) => match route {
            Some((_, model_name)) => model_name,
            None => {
-                debug!(
+               debug!(
                    "No route determined, using default model from request: {}",
                    chat_completions_request_for_arch_router.model
                );
                chat_completions_request_for_arch_router.model.clone()
+
            }
        },
        Err(err) => {
@ -148,7 +169,7 @@ pub async fn chat(
    };

    debug!(
-        "[BRIGHTSTAFF -> ARCH_ROUTER] URL: {}, Model Hint: {}",
+        "[ARCH_ROUTER] URL: {}, Resolved Model: {}",
        full_qualified_llm_provider_url, model_name
    );

--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -94,12 +94,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        routing_llm_provider,
    ));

+    let model_aliases = Arc::new(arch_config.model_aliases.clone());
+
+
    loop {
        let (stream, _) = listener.accept().await?;
        let peer_addr = stream.peer_addr()?;
        let io = TokioIo::new(stream);

        let router_service: Arc<RouterService> = Arc::clone(&router_service);
+        let model_aliases = Arc::clone(&model_aliases);
        let llm_provider_url = llm_provider_url.clone();

        let llm_providers = llm_providers.clone();
@ -109,12 +113,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            let parent_cx = extract_context_from_request(&req);
            let llm_provider_url = llm_provider_url.clone();
            let llm_providers = llm_providers.clone();
+            let model_aliases = Arc::clone(&model_aliases);

            async move {
                match (req.method(), req.uri().path()) {
                    (&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH) => {
                        let fully_qualified_url = format!("{}{}", llm_provider_url, req.uri().path());
-                        chat(req, router_service, fully_qualified_url)
+                        chat(req, router_service, fully_qualified_url, model_aliases)
                            .with_context(parent_cx)
                            .await
                    }
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -13,11 +13,17 @@ pub struct Routing {
    pub model: Option<String>,
 }

+/// One entry of the `model_aliases` configuration map: the model an alias points at.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelAlias {
+    /// Name of the model that the alias refers to.
+    pub target: String,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Configuration {
    pub version: String,
    pub endpoints: Option<HashMap<String, Endpoint>>,
    pub llm_providers: Vec<LlmProvider>,
+    pub model_aliases: Option<HashMap<String, ModelAlias>>,
    pub overrides: Option<Overrides>,
    pub system_prompt: Option<String>,
    pub prompt_guards: Option<PromptGuards>,
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@ -104,6 +104,20 @@ pub struct ChatCompletionsRequest {
    // pub web_search: Option<bool>, // GOOD FIRST ISSUE: Future support for web search
 }

+impl ChatCompletionsRequest {
+    /// Clears `max_tokens` when `model` names an o3-family model.
+    ///
+    /// A model is treated as o3-family when, after dropping one optional
+    /// `openrouter/` prefix, the name is exactly `o3` or begins with `o3-`.
+    /// Every other model leaves the request untouched.
+    pub fn suppress_max_tokens_if_o3(&mut self) {
+        let full_name = self.model.as_str();
+        let bare_name = full_name.strip_prefix("openrouter/").unwrap_or(full_name);
+        if bare_name == "o3" || bare_name.starts_with("o3-") {
+            self.max_tokens = None;
+        }
+    }
+}
+
 // ============================================================================
 // CHAT COMPLETIONS API TYPES
 // ============================================================================
@ -530,7 +544,10 @@ impl TryFrom<&[u8]> for ChatCompletionsRequest {
    type Error = OpenAIStreamError;

    fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
-        serde_json::from_slice(bytes).map_err(OpenAIStreamError::from)
+       let mut req: ChatCompletionsRequest = serde_json::from_slice(bytes).map_err(OpenAIStreamError::from)?;
+        // Use the centralized suppression logic
+        req.suppress_max_tokens_if_o3();
+        Ok(req)
    }
 }

--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@ -97,7 +97,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
        let openai_tools = req.tools.map(|tools| convert_anthropic_tools(tools));
        let (openai_tool_choice, parallel_tool_calls) = convert_anthropic_tool_choice(req.tool_choice);

-        Ok(ChatCompletionsRequest {
+        let mut _chat_completions_req: ChatCompletionsRequest = ChatCompletionsRequest {
            model: req.model,
            messages: openai_messages,
            temperature: req.temperature,
@ -109,7 +109,9 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            tool_choice: openai_tool_choice,
            parallel_tool_calls,
            ..Default::default()
-        })
+        };
+        _chat_completions_req.suppress_max_tokens_if_o3();
+        Ok(_chat_completions_req)
    }
 }

--- a/demos/use_cases/model_alias_routing/README.md
+++ b/demos/use_cases/model_alias_routing/README.md
@ -0,0 +1,148 @@
+# Model Alias Demo Suite
+
+This directory contains demos for the model alias feature in archgw.
+
+## Overview
+
+Model aliases allow clients to use friendly, semantic names instead of provider-specific model names. For example:
+- `arch.summarize.v1` → `gpt-4o-mini` (fast, cheap model for summaries)
+- `arch.reasoning.v1` → `gpt-4o` (capable model for complex reasoning)
+- `creative-model` → `claude-3-5-sonnet-20241022` (creative tasks)
+
+## Configuration
+
+The `arch_config_with_aliases.yaml` file defines several aliases:
+
+```yaml
+# Model aliases - friendly names that map to actual provider names
+model_aliases:
+  # Alias for summarization tasks -> fast/cheap model
+  arch.summarize.v1:
+    target: gpt-4o-mini
+
+  # Alias for general purpose tasks -> latest model
+  arch.v1:
+    target: o3
+
+  # Alias for reasoning tasks -> capable model
+  arch.reasoning.v1:
+    target: gpt-4o
+
+  # Alias for creative tasks -> Claude model
+  arch.creative.v1:
+    target: claude-3-5-sonnet-20241022
+
+  # Alias for quick responses -> fast model
+  arch.fast.v1:
+    target: claude-3-haiku-20240307
+
+  # Semantic aliases
+  summary-model:
+    target: gpt-4o-mini
+
+  chat-model:
+    target: gpt-4o
+
+  creative-model:
+    target: claude-3-5-sonnet-20241022
+```
+
+## Prerequisites
+- Install all dependencies as described in the main Arch README ([link](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites))
+- Set your API keys in your environment:
+  - `export OPENAI_API_KEY=your-openai-key`
+  - `export ANTHROPIC_API_KEY=your-anthropic-key` (optional, but recommended for Anthropic tests)
+
+## How to Run
+
+1. Start the demo:
+   ```sh
+   sh run_demo.sh
+   ```
+   - This will create a `.env` file with your API keys (if not present).
+   - Starts Arch Gateway with model alias config (`arch_config_with_aliases.yaml`).
+
+2. To stop the demo:
+   ```sh
+   sh run_demo.sh down
+   ```
+   - This will stop Arch Gateway and any related services.
+
+## Example Requests
+
+### OpenAI client with alias `arch.summarize.v1`
+```sh
+curl -sS -X POST "http://localhost:12000/v1/chat/completions" \
+  -H "Authorization: Bearer test-key" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "arch.summarize.v1",
+    "max_tokens": 50,
+    "messages": [
+      { "role": "user",
+        "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1!"
+      }
+    ]
+  }' | jq .
+```
+
+### OpenAI client with alias `arch.v1`
+```sh
+curl -sS -X POST "http://localhost:12000/v1/chat/completions" \
+  -H "Authorization: Bearer test-key" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "arch.v1",
+    "max_tokens": 50,
+    "messages": [
+      { "role": "user",
+        "content": "Hello, please respond with exactly: Hello from alias arch.v1!"
+      }
+    ]
+  }' | jq .
+```
+
+### Anthropic client with alias `arch.summarize.v1`
+```sh
+curl -sS -X POST "http://localhost:12000/v1/messages" \
+  -H "x-api-key: test-key" \
+  -H "anthropic-version: 2023-06-01" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "arch.summarize.v1",
+    "max_tokens": 50,
+    "messages": [
+      { "role": "user",
+        "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1 via Anthropic!"
+      }
+    ]
+  }' | jq .
+```
+
+### Anthropic client with alias `arch.v1`
+```sh
+curl -sS -X POST "http://localhost:12000/v1/messages" \
+  -H "x-api-key: test-key" \
+  -H "anthropic-version: 2023-06-01" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "arch.v1",
+    "max_tokens": 50,
+    "messages": [
+      { "role": "user",
+        "content": "Hello, please respond with exactly: Hello from alias arch.v1 via Anthropic!"
+      }
+    ]
+  }' | jq .
+```
+
+## Notes
+- The `.env` file will be created automatically if missing, with your API keys.
+- If `ANTHROPIC_API_KEY` is not set, Anthropic requests will not work.
+- You can add more aliases in `arch_config_with_aliases.yaml`.
+- All curl examples use `jq .` for pretty-printing JSON responses.
+
+## Troubleshooting
+- Ensure your API keys are set in your environment before running the demo.
+- If you see errors about missing keys, set them and re-run the script.
+- For more details, see the main Arch documentation.
--- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
@ -0,0 +1,59 @@
+version: v0.1
+
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s
+
+llm_providers:
+  # OpenAI Models
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+
+  - model: openai/o3
+    access_key: $OPENAI_API_KEY
+
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+
+  # Anthropic Models
+  - model: anthropic/claude-3-5-sonnet-20241022
+    access_key: $ANTHROPIC_API_KEY
+
+  - model: anthropic/claude-3-haiku-20240307
+    access_key: $ANTHROPIC_API_KEY
+
+# Model aliases - friendly names that map to actual provider names
+model_aliases:
+  # Alias for summarization tasks -> fast/cheap model
+  arch.summarize.v1:
+    target: gpt-4o-mini
+
+  # Alias for general purpose tasks -> latest model
+  arch.v1:
+    target: o3
+
+  # Alias for reasoning tasks -> capable model
+  arch.reasoning.v1:
+    target: gpt-4o
+
+  # Alias for creative tasks -> Claude model
+  arch.creative.v1:
+    target: claude-3-5-sonnet-20241022
+
+  # Alias for quick responses -> fast model
+  arch.fast.v1:
+    target: claude-3-haiku-20240307
+
+  # Semantic aliases
+  summary-model:
+    target: gpt-4o-mini
+
+  chat-model:
+    target: gpt-4o
+
+  creative-model:
+    target: claude-3-5-sonnet-20241022
--- a/demos/use_cases/model_alias_routing/run_demo.sh
+++ b/demos/use_cases/model_alias_routing/run_demo.sh
@ -0,0 +1,60 @@
+#!/bin/bash
+set -e
+
+# Function to start the demo
+# Creates a .env file from the caller's environment when one is missing,
+# starts Arch with the alias config, then prints a sample curl command.
+# Exits with status 1 if .env must be created and OPENAI_API_KEY is unset.
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set API keys
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
+      exit 1
+    fi
+    if [ -z "$ANTHROPIC_API_KEY" ]; then
+      echo "Warning: ANTHROPIC_API_KEY environment variable is not set. Anthropic features may not work."
+    fi
+
+    echo "Creating .env file..."
+    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    # The Anthropic key is optional, so only persist it when it is set.
+    if [ -n "$ANTHROPIC_API_KEY" ]; then
+      echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env
+    fi
+    echo ".env file created with API keys."
+  fi
+
+  # Step 3: Start Arch
+  echo "Starting Arch with arch_config_with_aliases.yaml..."
+  archgw up arch_config_with_aliases.yaml
+
+  # printf is used for the blank lines: bash's builtin echo prints "\n"
+  # literally unless -e is given, while printf behaves the same in every shell.
+  printf '\n\nArch started successfully.\n'
+  printf '%s\n\n' "Please run the following CURL command to test model alias routing. Additional instructions are in the README.md file. "
+  echo "curl -sS -X POST \"http://localhost:12000/v1/chat/completions\" \
+    -H \"Authorization: Bearer test-key\" \
+    -H \"Content-Type: application/json\" \
+    -d '{
+      \"model\": \"arch.summarize.v1\",
+      \"max_tokens\": 50,
+      \"messages\": [
+        { \"role\": \"user\",
+          \"content\": \"Hello, please respond with exactly: Hello from alias arch.summarize.v1!\"
+        }
+      ]
+    }' | jq ."
+}
+
+# Function to stop the demo
+# Shuts down the Arch gateway via `archgw down`.
+stop_demo() {
+  # Stop Arch
+  echo "Stopping Arch..."
+  archgw down
+}
+
+# Main script logic
+# Use the POSIX "=" comparison: the README runs this script via `sh`, and
+# shells such as dash reject "==" inside [ ], which would start the demo
+# instead of stopping it.
+if [ "$1" = "down" ]; then
+  stop_demo
+else
+  # Default action is to bring the demo up
+  start_demo
+fi
--- a/tests/e2e/run_e2e_tests.sh
+++ b/tests/e2e/run_e2e_tests.sh
@ -60,14 +60,23 @@ archgw up demos/samples_python/weather_forecast/arch_config.yaml
 kill $model_server_tail_pid
 cd -

-log running e2e tests
-log =================
-poetry run pytest
+log running e2e tests for prompt gateway
+log ====================================
+poetry run pytest test_prompt_gateway.py

-log shutting down the arch gateway service
-log ======================================
+log shutting down the arch gateway service for prompt_gateway demo
+log ===============================================================
 archgw down

+log startup arch gateway with model alias routing demo
+cd ../../
+archgw up demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+cd -
+
+log running e2e tests for model alias routing
+log ========================================
+poetry run pytest test_model_alias_routing.py
+
 log shutting down the weather_forecast demo
 log =======================================
 cd ../../demos/samples_python/weather_forecast
--- a/tests/e2e/test_model_alias_routing.py
+++ b/tests/e2e/test_model_alias_routing.py
@ -0,0 +1,269 @@
+import anthropic
+import openai
+import os
+import logging
+import pytest
+import sys
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger(__name__)
+
+LLM_GATEWAY_ENDPOINT = os.getenv(
+    "LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions"
+)
+
+# =============================================================================
+# MODEL ALIAS TESTS
+# =============================================================================
+
+
+def test_openai_client_with_alias_arch_summarize_v1():
+    """Test OpenAI client using model alias 'arch.summarize.v1' which should resolve to '4o-mini'"""
+    logger.info("Testing OpenAI client with alias 'arch.summarize.v1' -> '4o-mini'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    completion = client.chat.completions.create(
+        model="arch.summarize.v1",  # This should resolve to 4o-mini
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1!",
+            }
+        ],
+    )
+
+    response_content = completion.choices[0].message.content
+    logger.info(f"Response from arch.summarize.v1 alias: {response_content}")
+    assert response_content == "Hello from alias arch.summarize.v1!"
+
+
+def test_openai_client_with_alias_arch_v1():
+    """Test OpenAI client using model alias 'arch.v1' which should resolve to 'o3'"""
+    logger.info("Testing OpenAI client with alias 'arch.v1' -> 'o3'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    completion = client.chat.completions.create(
+        model="arch.v1",  # This should resolve to gpt-o3
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from alias arch.v1!",
+            }
+        ],
+    )
+
+    response_content = completion.choices[0].message.content
+    logger.info(f"Response from arch.v1 alias: {response_content}")
+    assert response_content == "Hello from alias arch.v1!"
+
+
+def test_anthropic_client_with_alias_arch_summarize_v1():
+    """Test Anthropic client using model alias 'arch.summarize.v1' which should resolve to '4o-mini'"""
+    logger.info("Testing Anthropic client with alias 'arch.summarize.v1' -> '4o-mini'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
+
+    message = client.messages.create(
+        model="arch.summarize.v1",  # This should resolve to 4o-mini
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1 via Anthropic!",
+            }
+        ],
+    )
+
+    response_content = "".join(b.text for b in message.content if b.type == "text")
+    logger.info(
+        f"Response from arch.summarize.v1 alias via Anthropic: {response_content}"
+    )
+    assert response_content == "Hello from alias arch.summarize.v1 via Anthropic!"
+
+
+def test_anthropic_client_with_alias_arch_v1():
+    """Test Anthropic client using model alias 'arch.v1' which should resolve to 'o3'"""
+    logger.info("Testing Anthropic client with alias 'arch.v1' -> 'o3'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
+
+    message = client.messages.create(
+        model="arch.v1",  # This should resolve to o3
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from alias arch.v1 via Anthropic!",
+            }
+        ],
+    )
+
+    response_content = "".join(b.text for b in message.content if b.type == "text")
+    logger.info(f"Response from arch.v1 alias via Anthropic: {response_content}")
+    assert response_content == "Hello from alias arch.v1 via Anthropic!"
+
+
+def test_openai_client_with_alias_streaming():
+    """Test OpenAI client using model alias with streaming"""
+    logger.info(
+        "Testing OpenAI client with alias 'arch.summarize.v1' streaming -> '4o-mini'"
+    )
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    stream = client.chat.completions.create(
+        model="arch.summarize.v1",  # This should resolve to 4o-mini
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from streaming alias!",
+            }
+        ],
+        stream=True,
+    )
+
+    content_chunks = []
+    for chunk in stream:
+        if chunk.choices[0].delta.content:
+            content_chunks.append(chunk.choices[0].delta.content)
+
+    full_content = "".join(content_chunks)
+    logger.info(f"Streaming response from arch.summarize.v1 alias: {full_content}")
+    assert full_content == "Hello from streaming alias!"
+
+
+def test_anthropic_client_with_alias_streaming():
+    """Test Anthropic client using model alias with streaming"""
+    logger.info(
+        "Testing Anthropic client with alias 'arch.summarize.v1' streaming -> '4o-mini'"
+    )
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
+
+    with client.messages.stream(
+        model="arch.summarize.v1",  # This should resolve to 4o-mini
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from streaming alias via Anthropic!",
+            }
+        ],
+    ) as stream:
+        pieces = [t for t in stream.text_stream]
+        full_text = "".join(pieces)
+
+    logger.info(
+        f"Streaming response from arch.summarize.v1 alias via Anthropic: {full_text}"
+    )
+    assert full_text == "Hello from streaming alias via Anthropic!"
+
+
+def test_nonexistent_alias():
+    """Test that using a non-existent alias falls back to treating it as a direct model name"""
+    logger.info(
+        "Testing non-existent alias 'nonexistent.alias' should be treated as direct model"
+    )
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    try:
+        completion = client.chat.completions.create(
+            model="nonexistent.alias",  # This alias doesn't exist
+            max_tokens=50,
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Hello, this should fail or use as direct model name",
+                }
+            ],
+        )
+        logger.info("Non-existent alias was handled gracefully")
+        # If it succeeds, it means the alias was passed through as a direct model name
+        logger.info(f"Response: {completion.choices[0].message.content}")
+    except Exception as e:
+        logger.info(f"Non-existent alias resulted in error (expected): {e}")
+        # This is also acceptable behavior
+
+
+# =============================================================================
+# DIRECT MODEL TESTS (for comparison)
+# =============================================================================
+
+
+def test_direct_model_4o_mini_openai():
+    """Test OpenAI client using direct model name '4o-mini'"""
+    logger.info("Testing OpenAI client with direct model '4o-mini'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    completion = client.chat.completions.create(
+        model="4o-mini",  # Direct model name
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from direct 4o-mini!",
+            }
+        ],
+    )
+
+    response_content = completion.choices[0].message.content
+    logger.info(f"Response from direct 4o-mini: {response_content}")
+    assert response_content == "Hello from direct 4o-mini!"
+
+
+def test_direct_model_4o_mini_anthropic():
+    """Test Anthropic client using direct model name '4o-mini'"""
+    logger.info("Testing Anthropic client with direct model '4o-mini'")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
+
+    message = client.messages.create(
+        model="4o-mini",  # Direct model name
+        max_tokens=50,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, please respond with exactly: Hello from direct 4o-mini via Anthropic!",
+            }
+        ],
+    )
+
+    response_content = "".join(b.text for b in message.content if b.type == "text")
+    logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}")
+    assert response_content == "Hello from direct 4o-mini via Anthropic!"