adding support for claude code routing (#575)

* fixed for claude code routing. first commit * removing redundant enum tags for cache_control * making sure that claude code can run via the archgw cli * fixing broken config * adding a README.md and updated the cli to use more of our defined patterns for params * fixed config.yaml * minor fixes to make sure PR is clean. Ready to ship * adding claude-sonnet-4-5 to the config * fixes based on PR * fixed alias for README * fixed 400 error handling tests, now that we write temperature to 1.0 for GPT-5 --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-257.local> Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-288.local>
2026-07-23 16:51:04 +02:00 · 2025-09-29 19:23:08 -07:00 · 2025-09-29 19:23:08 -07:00 · f00870dccb
commit f00870dccb
parent 03c2cf6f0d
16 changed files with 903 additions and 106 deletions
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -140,7 +140,7 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ llm_cluster_name }}
-                            timeout: 60s
+                            timeout: 300s
                      {% endfor %}

                      {% if agent_orchestrator %}
@ -153,7 +153,7 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ agent_orchestrator }}
-                            timeout: 60s
+                            timeout: 300s
                      {% endif %}
                http_filters:
                  - name: envoy.filters.http.compressor
@ -266,7 +266,7 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ internal_cluster }}
-                            timeout: 60s
+                            timeout: 300s
                        {% endfor %}

                        {% for cluster_name, cluster in arch_clusters.items() %}
@ -279,7 +279,7 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ cluster_name }}
-                            timeout: 60s
+                            timeout: 300s
                        {% endfor %}
                http_filters:
                  - name: envoy.filters.http.router
@ -434,7 +434,7 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ llm_cluster_name }}
-                            timeout: 60s
+                            timeout: 300s
                      {% endfor %}
                        - match:
                            prefix: "/"
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -242,7 +242,7 @@ def validate_and_render_schema():
    if llm_gateway_listener.get("address") == None:
        llm_gateway_listener["address"] = "127.0.0.1"
    if llm_gateway_listener.get("timeout") == None:
-        llm_gateway_listener["timeout"] = "10s"
+        llm_gateway_listener["timeout"] = "300s"

    use_agent_orchestrator = config_yaml.get("overrides", {}).get(
        "use_agent_orchestrator", False
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -1,3 +1,4 @@
+import json
 import subprocess
 import os
 import time
@ -185,3 +186,93 @@ def stop_arch_modelserver():
    except subprocess.CalledProcessError as e:
        log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
        sys.exit(1)
+
+
+def start_cli_agent(arch_config_file=None, settings_json="{}"):
+    """Start a CLI client connected to Arch."""
+
+    with open(arch_config_file, "r") as file:
+        arch_config = file.read()
+        arch_config_yaml = yaml.safe_load(arch_config)
+
+    # Get egress listener configuration
+    egress_config = arch_config_yaml.get("listeners", {}).get("egress_traffic", {})
+    host = egress_config.get("host", "127.0.0.1")
+    port = egress_config.get("port", 12000)
+
+    # Parse additional settings from command line
+    try:
+        additional_settings = json.loads(settings_json) if settings_json else {}
+    except json.JSONDecodeError:
+        log.error("Settings must be valid JSON")
+        sys.exit(1)
+
+    # Set up environment variables
+    env = os.environ.copy()
+    env.update(
+        {
+            "ANTHROPIC_AUTH_TOKEN": "test",  # Use test token for arch
+            "ANTHROPIC_API_KEY": "",
+            "ANTHROPIC_BASE_URL": f"http://{host}:{port}",
+            "NO_PROXY": host,
+            "DISABLE_TELEMETRY": "true",
+            "DISABLE_COST_WARNINGS": "true",
+            "API_TIMEOUT_MS": "600000",
+        }
+    )
+
+    # Set ANTHROPIC_SMALL_FAST_MODEL from additional_settings or model alias
+    if "ANTHROPIC_SMALL_FAST_MODEL" in additional_settings:
+        env["ANTHROPIC_SMALL_FAST_MODEL"] = additional_settings[
+            "ANTHROPIC_SMALL_FAST_MODEL"
+        ]
+    else:
+        # Check if arch.claude.code.small.fast alias exists in model_aliases
+        model_aliases = arch_config_yaml.get("model_aliases", {})
+        if "arch.claude.code.small.fast" in model_aliases:
+            env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast"
+        else:
+            log.info(
+                "Tip: Set an alias 'arch.claude.code.small.fast' in your model_aliases config to set a small fast model Claude Code"
+            )
+            log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON")
+
+    # Non-interactive mode configuration from additional_settings only
+    if additional_settings.get("NON_INTERACTIVE_MODE", False):
+        env.update(
+            {
+                "CI": "true",
+                "FORCE_COLOR": "0",
+                "NODE_NO_READLINE": "1",
+                "TERM": "dumb",
+            }
+        )
+
+    # Build claude command arguments
+    claude_args = []
+
+    # Add settings if provided, excluding those already handled as environment variables
+    if additional_settings:
+        # Filter out settings that are already processed as environment variables
+        claude_settings = {
+            k: v
+            for k, v in additional_settings.items()
+            if k not in ["ANTHROPIC_SMALL_FAST_MODEL", "NON_INTERACTIVE_MODE"]
+        }
+        if claude_settings:
+            claude_args.append(f"--settings={json.dumps(claude_settings)}")
+
+    # Use claude from PATH
+    claude_path = "claude"
+    log.info(f"Connecting Claude Code Agent to Arch at {host}:{port}")
+
+    try:
+        subprocess.run([claude_path] + claude_args, env=env, check=True)
+    except subprocess.CalledProcessError as e:
+        log.error(f"Error starting claude: {e}")
+        sys.exit(1)
+    except FileNotFoundError:
+        log.error(
+            f"{claude_path} not found. Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code"
+        )
+        sys.exit(1)
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -4,13 +4,20 @@ import sys
 import subprocess
 import multiprocessing
 import importlib.metadata
+import json
 from cli import targets
-from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs
+from cli.docker_cli import (
+    docker_validate_archgw_schema,
+    stream_gateway_logs,
+    docker_container_status,
+)
 from cli.utils import (
    getLogger,
    get_llm_provider_access_keys,
+    has_ingress_listener,
    load_env_file_to_dict,
    stream_access_logs,
+    find_config_file,
 )
 from cli.core import (
    start_arch_modelserver,
@ -18,9 +25,11 @@ from cli.core import (
    start_arch,
    stop_docker_container,
    download_models_from_hf,
+    start_cli_agent,
 )
 from cli.consts import (
    ARCHGW_DOCKER_IMAGE,
+    ARCHGW_DOCKER_NAME,
    KATANEMO_DOCKERHUB_REPO,
    SERVICE_NAME_ARCHGW,
    SERVICE_NAME_MODEL_SERVER,
@ -170,12 +179,8 @@ def up(file, path, service, foreground):
        start_arch_modelserver(foreground)
        return

-    if file:
-        # If a file is provided, process that file
-        arch_config_file = os.path.abspath(file)
-    else:
-        # If no file is provided, use the path and look for arch_config.yaml
-        arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+    # Use the utility function to find config file
+    arch_config_file = find_config_file(path, file)

    # Check if the file exists
    if not os.path.exists(arch_config_file):
@ -183,7 +188,6 @@ def up(file, path, service, foreground):
        return

    log.info(f"Validating {arch_config_file}")
-
    (
        validation_return_code,
        validation_stdout,
@ -240,8 +244,15 @@ def up(file, path, service, foreground):
    if service == SERVICE_NAME_ARCHGW:
        start_arch(arch_config_file, env, foreground=foreground)
    else:
-        download_models_from_hf()
-        start_arch_modelserver(foreground)
+        # Check if ingress_traffic listener is configured before starting model_server
+        if has_ingress_listener(arch_config_file):
+            download_models_from_hf()
+            start_arch_modelserver(foreground)
+        else:
+            log.info(
+                "Skipping model_server startup: no ingress_traffic listener configured in arch_config.yaml"
+            )
+
        start_arch(arch_config_file, env, foreground=foreground)


@ -321,10 +332,51 @@ def logs(debug, follow):
            archgw_process.terminate()


+@click.command()
+@click.argument("type", type=click.Choice(["claude"]), required=True)
+@click.argument("file", required=False)  # Optional file argument
+@click.option(
+    "--path", default=".", help="Path to the directory containing arch_config.yaml"
+)
+@click.option(
+    "--settings",
+    default="{}",
+    help="Additional settings as JSON string for the CLI agent.",
+)
+def cli_agent(type, file, path, settings):
+    """Start a CLI agent connected to Arch.
+
+    CLI_AGENT: The type of CLI agent to start (currently only 'claude' is supported)
+    """
+
+    # Check if archgw docker container is running
+    archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
+    if archgw_status != "running":
+        log.error(f"archgw docker container is not running (status: {archgw_status})")
+        log.error("Please start archgw using the 'archgw up' command.")
+        sys.exit(1)
+
+    # Determine arch_config.yaml path
+    arch_config_file = find_config_file(path, file)
+    if not os.path.exists(arch_config_file):
+        log.error(f"Config file not found: {arch_config_file}")
+        sys.exit(1)
+
+    try:
+        start_cli_agent(arch_config_file, settings)
+    except SystemExit:
+        # Re-raise SystemExit to preserve exit codes
+        raise
+    except Exception as e:
+        click.echo(f"Error: {e}")
+        sys.exit(1)
+
+
 main.add_command(up)
 main.add_command(down)
 main.add_command(build)
 main.add_command(logs)
+main.add_command(cli_agent)
 main.add_command(generate_prompt_targets)

 if __name__ == "__main__":
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@ -21,6 +21,22 @@ def getLogger(name="cli"):
 log = getLogger(__name__)


+def has_ingress_listener(arch_config_file):
+    """Check if the arch config file has ingress_traffic listener configured."""
+    try:
+        with open(arch_config_file) as f:
+            arch_config_dict = yaml.safe_load(f)
+
+        ingress_traffic = arch_config_dict.get("listeners", {}).get(
+            "ingress_traffic", {}
+        )
+
+        return bool(ingress_traffic)
+    except Exception as e:
+        log.error(f"Error reading config file {arch_config_file}: {e}")
+        return False
+
+
 def get_llm_provider_access_keys(arch_config_file):
    with open(arch_config_file, "r") as file:
        arch_config = file.read()
@ -72,6 +88,19 @@ def load_env_file_to_dict(file_path):
    return env_dict


+def find_config_file(path=".", file=None):
+    """Find the appropriate config file path."""
+    if file:
+        # If a file is provided, process that file
+        return os.path.abspath(file)
+    else:
+        # If no file is provided, use the path and look for config.yaml first (for convenience), then fall back to arch_config.yaml
+        arch_config_file = os.path.abspath(os.path.join(path, "config.yaml"))
+        if not os.path.exists(arch_config_file):
+            arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+        return arch_config_file
+
+
 def stream_access_logs(follow):
    """
    Get the archgw access logs
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@ -126,8 +126,9 @@ pub async fn chat(
            });

    const MAX_MESSAGE_LENGTH: usize = 50;
-    let latest_message_for_log = if latest_message_for_log.len() > MAX_MESSAGE_LENGTH {
-        format!("{}...", &latest_message_for_log[..MAX_MESSAGE_LENGTH])
+    let latest_message_for_log = if latest_message_for_log.chars().count() > MAX_MESSAGE_LENGTH {
+        let truncated: String = latest_message_for_log.chars().take(MAX_MESSAGE_LENGTH).collect();
+        format!("{}...", truncated)
    } else {
        latest_message_for_log
    };
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@ -88,6 +88,7 @@ pub struct ChatCompletionsRequest {
    pub prediction: Option<StaticContent>,
    // pub reasoning_effect: Option<bool>, // GOOD FIRST ISSUE: Future support for reasoning effects
    pub response_format: Option<Value>,
+    pub reasoning_effort: Option<String>, // e.g., "none", "low", "medium", "high"
    // pub safety_identifier: Option<String>, // GOOD FIRST ISSUE: Future support for safety identifiers
    pub seed: Option<i32>,
    pub service_tier: Option<String>,
@ -116,6 +117,13 @@ impl ChatCompletionsRequest {
            self.max_tokens = None;
        }
    }
+
+    pub fn fix_temperature_if_gpt5(&mut self) {
+        let model = self.model.as_str();
+        if model.starts_with("gpt-5") {
+            self.temperature = Some(1.0);
+        }
+    }
 }

 // ============================================================================
@ -598,6 +606,7 @@ impl TryFrom<&[u8]> for ChatCompletionsRequest {
       let mut req: ChatCompletionsRequest = serde_json::from_slice(bytes).map_err(OpenAIStreamError::from)?;
        // Use the centralized suppression logic
        req.suppress_max_tokens_if_o3();
+        req.fix_temperature_if_gpt5();
        Ok(req)
    }
 }
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@ -111,6 +111,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            ..Default::default()
        };
        _chat_completions_req.suppress_max_tokens_if_o3();
+        _chat_completions_req.fix_temperature_if_gpt5();
        Ok(_chat_completions_req)
    }
 }
@ -352,6 +353,7 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
        let choice = &resp.choices[0];

        // Handle final chunk with usage
+        let has_usage = resp.usage.is_some();
        if let Some(usage) = resp.usage {
            if let Some(finish_reason) = &choice.finish_reason {
                let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
@ -403,11 +405,27 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
            return convert_tool_call_deltas(tool_calls.clone());
        }

-        // Handle finish reason
+        // Handle finish reason - generate MessageDelta only (MessageStop comes later)
        if let Some(finish_reason) = &choice.finish_reason {
-            if *finish_reason == FinishReason::Stop {
-                return Ok(MessagesStreamEvent::MessageStop);
+            // If we have usage data, it was already handled above
+            // If not, we need to generate MessageDelta with default usage
+            if !has_usage {
+                let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
+                return Ok(MessagesStreamEvent::MessageDelta {
+                    delta: MessagesMessageDelta {
+                        stop_reason: anthropic_stop_reason,
+                        stop_sequence: None,
+                    },
+                    usage: MessagesUsage {
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        cache_creation_input_tokens: None,
+                        cache_read_input_tokens: None,
+                    },
+                });
            }
+            // If usage was already handled above, we don't need to do anything more here
+            // MessageStop will be handled when [DONE] is encountered
        }

        // Default to ping for unhandled cases
@ -468,18 +486,6 @@ impl TryFrom<MessagesMessage> for Vec<Message> {
            }
            MessagesMessageContent::Blocks(blocks) => {
                let (content_parts, tool_calls, tool_results) = blocks.split_for_openai()?;
-
-                // Create main message
-                let content = build_openai_content(content_parts, &tool_calls);
-                let main_message = Message {
-                    role: message.role.into(),
-                    content,
-                    name: None,
-                    tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
-                    tool_call_id: None,
-                };
-                result.push(main_message);
-
                // Add tool result messages
                for (tool_use_id, result_text, _is_error) in tool_results {
                    result.push(Message {
@ -490,6 +496,20 @@ impl TryFrom<MessagesMessage> for Vec<Message> {
                        tool_call_id: Some(tool_use_id),
                    });
                }
+
+                // Only create main message if there's actual content or tool calls
+                // Skip creating empty content messages (e.g., when message only contains tool_result blocks)
+                if !content_parts.is_empty() || !tool_calls.is_empty() {
+                    let content = build_openai_content(content_parts, &tool_calls);
+                    let main_message = Message {
+                        role: message.role.into(),
+                        content,
+                        name: None,
+                        tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
+                        tool_call_id: None,
+                    };
+                    result.push(main_message);
+                }
            }
        }

@ -515,9 +535,11 @@ impl TryFrom<Message> for MessagesMessage {
                        MessagesContentBlock::ToolResult {
                            tool_use_id: tool_call_id,
                            is_error: None,
-                            content: vec![MessagesContentBlock::Text {
+                            content: ToolResultContent::Blocks(vec![MessagesContentBlock::Text {
                                text: message.content.extract_text(),
-                            }],
+                                cache_control: None,
+                            }]),
+                            cache_control: None,
                        },
                    ]),
                });
@ -551,7 +573,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {

        for block in self {
            match block {
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                MessagesContentBlock::ServerToolUse { id, name, input } |
                MessagesContentBlock::McpToolUse { id, name, input } => {
                    let arguments = serde_json::to_string(&input)?;
@ -575,7 +597,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {

        for block in self {
            match block {
-                MessagesContentBlock::Text { text } => {
+                MessagesContentBlock::Text { text, .. } => {
                    content_parts.push(ContentPart::Text { text: text.clone() });
                }
                MessagesContentBlock::Image { source } => {
@ -587,7 +609,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {
                        },
                    });
                }
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                MessagesContentBlock::ServerToolUse { id, name, input } |
                MessagesContentBlock::McpToolUse { id, name, input } => {
                    let arguments = serde_json::to_string(&input)?;
@ -597,7 +619,10 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {
                        function: FunctionCall { name: name.clone(), arguments },
                    });
                }
-                MessagesContentBlock::ToolResult { tool_use_id, content, is_error } |
+                MessagesContentBlock::ToolResult { tool_use_id, content, is_error, .. } => {
+                    let result_text = content.extract_text();
+                    tool_results.push((tool_use_id.clone(), result_text, is_error.unwrap_or(false)));
+                }
                MessagesContentBlock::WebSearchToolResult { tool_use_id, content, is_error } |
                MessagesContentBlock::CodeExecutionToolResult { tool_use_id, content, is_error } |
                MessagesContentBlock::McpToolResult { tool_use_id, content, is_error } => {
@ -819,7 +844,7 @@ fn build_openai_content(content_parts: Vec<ContentPart>, tool_calls: &[ToolCall]
 fn build_anthropic_content(content_blocks: Vec<MessagesContentBlock>) -> MessagesMessageContent {
    if content_blocks.len() == 1 {
        match &content_blocks[0] {
-            MessagesContentBlock::Text { text } => MessagesMessageContent::Single(text.clone()),
+            MessagesContentBlock::Text { text, .. } => MessagesMessageContent::Single(text.clone()),
            _ => MessagesMessageContent::Blocks(content_blocks),
        }
    } else if content_blocks.is_empty() {
@ -835,12 +860,11 @@ fn convert_anthropic_content_to_openai(content: &[MessagesContentBlock]) -> Resu

    for block in content {
        match block {
-            MessagesContentBlock::Text { text } => {
+            MessagesContentBlock::Text { text, .. } => {
                text_parts.push(text.clone());
            }
-            MessagesContentBlock::Thinking { text } => {
-                // Include thinking as regular text for OpenAI
-                text_parts.push(format!("[Thinking: {}]", text));
+            MessagesContentBlock::Thinking { thinking, .. } => {
+                text_parts.push(format!("thinking: {}", thinking));
            }
            _ => {
                // Skip other content types for basic text conversion
@ -860,14 +884,14 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result<Vec<
    match &message.content {
        MessageContent::Text(text) => {
            if !text.is_empty() {
-                blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
            }
        }
        MessageContent::Parts(parts) => {
            for part in parts {
                match part {
                    ContentPart::Text { text } => {
-                        blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                        blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
                    }
                    ContentPart::ImageUrl { image_url } => {
                        let source = convert_image_url_to_source(image_url);
@ -886,6 +910,7 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result<Vec<
                id: tool_call.id.clone(),
                name: tool_call.function.name.clone(),
                input,
+                cache_control: None,
            });
        }
    }
@ -984,6 +1009,21 @@ fn convert_content_delta(delta: MessagesContentDelta) -> Result<ChatCompletionsS
                None,
            ))
        }
+        MessagesContentDelta::ThinkingDelta { thinking } => {
+            Ok(create_openai_chunk(
+                "stream",
+                "unknown",
+                MessageDelta {
+                    role: None,
+                    content: Some(format!("thinking: {}", thinking)),
+                    refusal: None,
+                    function_call: None,
+                    tool_calls: None,
+                },
+                None,
+                None,
+            ))
+        }
        MessagesContentDelta::InputJsonDelta { partial_json } => {
            Ok(create_openai_chunk(
                "stream",
@ -1023,6 +1063,7 @@ fn convert_tool_call_deltas(tool_calls: Vec<ToolCallDelta>) -> Result<MessagesSt
                            id: id.clone(),
                            name: name.clone(),
                            input: Value::Object(serde_json::Map::new()),
+                            cache_control: None,
                        },
                    });
                }
@ -1254,6 +1295,7 @@ mod tests {
                id: "call_123".to_string(),
                name: "get_weather".to_string(),
                input: json!({}),
+                cache_control: None,
            },
        };

@ -1566,6 +1608,7 @@ mod tests {
                id: "call_weather".to_string(),
                name: "get_weather".to_string(),
                input: json!({}),
+                cache_control: None,
            },
        };

--- a/crates/hermesllm/src/providers/response.rs
+++ b/crates/hermesllm/src/providers/response.rs
@ -269,6 +269,13 @@ impl TryFrom<(&[u8], &SupportedAPIs, &SupportedAPIs)> for ProviderStreamResponse
                Ok(ProviderStreamResponseType::ChatCompletionsStreamResponse(chat_resp))
            }
            (SupportedAPIs::OpenAIChatCompletions(_), SupportedAPIs::AnthropicMessagesAPI(_)) => {
+                // Special case: Handle [DONE] marker for OpenAI -> Anthropic conversion
+                if bytes == b"[DONE]" {
+                    return Ok(ProviderStreamResponseType::MessagesStreamEvent(
+                        crate::apis::anthropic::MessagesStreamEvent::MessageStop
+                    ));
+                }
+
                let openai_resp: crate::apis::openai::ChatCompletionsStreamResponse = serde_json::from_slice(bytes)?;

                // Transform to Anthropic Messages stream format using the transformer
@ -287,8 +294,8 @@ impl TryFrom<(SseEvent, &SupportedAPIs, &SupportedAPIs)> for SseEvent {
        // Create a new transformed event based on the original
        let mut transformed_event = sse_event;

-        // If not [DONE] and has data, parse the data as a provider stream response (business logic layer)
-        if !transformed_event.is_done() && transformed_event.data.is_some() {
+        // If has data, parse the data as a provider stream response (business logic layer)
+        if transformed_event.data.is_some() {
            let data_str = transformed_event.data.as_ref().unwrap();
            let data_bytes = data_str.as_bytes();
            let transformed_response = ProviderStreamResponseType::try_from((data_bytes, client_api, upstream_api))?;
@ -380,6 +387,7 @@ where
    I::Item: AsRef<str>,
 {
    pub lines: I,
+    pub done_seen: bool,
 }

 impl<I> SseStreamIter<I>
@ -388,7 +396,7 @@ where
    I::Item: AsRef<str>,
 {
    pub fn new(lines: I) -> Self {
-        Self { lines }
+        Self { lines, done_seen: false }
    }
 }

@ -411,14 +419,20 @@ where
    type Item = SseEvent;

    fn next(&mut self) -> Option<Self::Item> {
+        // If we already returned [DONE], terminate the stream
+        if self.done_seen {
+            return None;
+        }
+
        for line in &mut self.lines {
            let line_str = line.as_ref();

            // Try to parse as either data: or event: line
            if let Ok(event) = line_str.parse::<SseEvent>() {
-                // For data: lines, check if this is the [DONE] marker - if so, end the stream
+                // For data: lines, check if this is the [DONE] marker
                if event.data.is_some() && event.is_done() {
-                    return None;
+                    self.done_seen = true;
+                    return Some(event); // Return [DONE] event for transformation
                }
                // For data: lines, skip events that should be filtered at the transport layer
                if event.data.is_some() && event.should_skip() {
@ -706,7 +720,11 @@ mod tests {
        assert!(event2.data.as_ref().unwrap().contains("msg2"));
        assert!(!event2.should_skip());

-        // Iterator should end at [DONE] (no more events)
+        // Third event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
        assert!(iter.next().is_none());
    }

@ -745,7 +763,11 @@ mod tests {
        assert!(!event4.is_event_only());
        assert!(event4.data.as_ref().unwrap().contains("Hello"));

-        // Iterator should end at [DONE]
+        // Fifth event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
        assert!(iter.next().is_none());
    }

@ -776,4 +798,25 @@ mod tests {
        let provider_type = ProviderStreamResponseType::ChatCompletionsStreamResponse(openai_event);
        assert_eq!(provider_type.event_type(), None);
    }
+
+    #[test]
+    fn test_done_marker_handled_in_stream_response_transformation() {
+        use crate::apis::anthropic::AnthropicApi;
+
+        // Test that [DONE] marker is properly converted to MessageStop in the transformation layer
+        let done_bytes = b"[DONE]";
+        let client_api = SupportedAPIs::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedAPIs::OpenAIChatCompletions(crate::apis::openai::OpenAIApi::ChatCompletions);
+
+        let result = ProviderStreamResponseType::try_from((done_bytes.as_slice(), &client_api, &upstream_api));
+        assert!(result.is_ok());
+
+        if let Ok(ProviderStreamResponseType::MessagesStreamEvent(event)) = result {
+            // Verify it's a MessageStop event
+            assert_eq!(event.event_type(), Some("message_stop"));
+            assert!(matches!(event, crate::apis::anthropic::MessagesStreamEvent::MessageStop));
+        } else {
+            panic!("Expected MessagesStreamEvent::MessageStop");
+        }
+    }
 }
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -395,23 +395,15 @@ impl StreamContext {
        }
    }

-    fn debug_log_body(&self, body: &[u8]) {
-        debug!(
-            "[ARCHGW_REQ_ID:{}] UPSTREAM_RAW_RESPONSE: body_size={} content={}",
-            self.request_identifier(),
-            body.len(),
-            String::from_utf8_lossy(body)
-        );
-    }
-
    fn handle_streaming_response(
        &mut self,
        body: &[u8],
        provider_id: ProviderId,
    ) -> Result<Vec<u8>, Action> {
        debug!(
-            "[ARCHGW_REQ_ID:{}] STREAMING_PROCESS: provider_id={:?} chunk_size={}",
+            "[ARCHGW_REQ_ID:{}] STREAMING_PROCESS: client={:?} provider_id={:?} chunk_size={}",
            self.request_identifier(),
+            self.client_api,
            provider_id,
            body.len()
        );
@ -958,7 +950,12 @@ impl HttpContext for StreamContext {
            Err(action) => return action,
        };

-        self.debug_log_body(&body);
+        debug!(
+            "[ARCHGW_REQ_ID:{}] UPSTREAM_RAW_RESPONSE: body_size={} content={}",
+            self.request_identifier(),
+            body.len(),
+            String::from_utf8_lossy(&body)
+        );

        let provider_id = self.get_provider_id();
        if self.streaming_response {
--- a/demos/use_cases/claude_code/README.md
+++ b/demos/use_cases/claude_code/README.md
@ -0,0 +1,133 @@
+# Claude Code Routing with (Preference-aligned) Intelligence
+
+## Why This Matters
+
+**Claude Code is powerful, but what if you could access the best of ALL AI models through one familiar interface?**
+
+Instead of being locked into a set of LLMs from one provider, imagine:
+- Using **DeepSeek's coding expertise** for complex algorithms
+- Leveraging **GPT-5's reasoning** for architecture decisions
+- Tapping **Claude's analysis** for code reviews
+- Accessing **Grok's speed** for quick iterations
+
+**All through the same Claude Code interface you already love.**
+
+## The Solution: Intelligent Multi-LLM Routing
+
+Arch Gateway transforms Claude Code into a **universal AI development interface** that:
+
+### 🌐 **Connects to Any LLM Provider**
+- **OpenAI**: GPT-4.1, GPT-5, etc.
+- **Anthropic**: Claude 3.5 Sonnet, Claude 3 Haiku, Claude 4.5
+- **DeepSeek**: DeepSeek-V3, DeepSeek-Coder-V2
+- **Grok**: Grok-2, Grok-2-mini
+- **Others**: Gemini, Llama, Mistral, local models via Ollama
+
+### 🧠 **Routes Intelligently Based on Task**
+Our research-backed routing system automatically selects the optimal model by analyzing:
+- **Task complexity** (simple refactoring vs. architectural design)
+- **Content type** (code generation vs. debugging vs. documentation)
+
+
+## Quick Start
+
+### Prerequisites
+- Claude Code installed: `npm install -g @anthropic-ai/claude-code`
+- Docker running on your system
+- A Python virtual environment in your current working directory
+
+### 1. Get the Configuration File
+Download the demo configuration file using one of these methods:
+
+**Option A: Direct download**
+```bash
+curl -O https://raw.githubusercontent.com/katanemo/arch/main/demos/use_cases/claude_code/config.yaml
+```
+
+**Option B: Clone the repository**
+```bash
+git clone https://github.com/katanemo/arch.git
+cd arch/demos/use_cases/claude_code
+
+```
+
+### 2. Set Up Your API Keys
+Set up your environment variables with your actual API keys:
+```bash
+export OPENAI_API_KEY="your-openai-api-key"
+export ANTHROPIC_API_KEY="your-anthropic-api-key"
+export AZURE_API_KEY="your-azure-api-key"  # Optional
+```
+
+Alternatively, create a `.env` file in your working directory:
+```bash
+echo "OPENAI_API_KEY=your-openai-api-key" > .env
+echo "ANTHROPIC_API_KEY=your-anthropic-api-key" >> .env
+```
+
+### 3. Install and Start Arch Gateway
+```bash
+pip install archgw
+archgw up
+```
+
+### 4. Launch Claude Code with Multi-LLM Support
+```bash
+archgw cli-agent claude
+```
+
+That's it! Claude Code now has access to multiple LLM providers with intelligent routing.
+
+## What You'll Experience
+
+### Screenshot Placeholder
+![Claude Code with Multi-LLM Routing](screenshot-placeholder.png)
+*Claude Code interface enhanced with intelligent model routing and multi-provider access*
+
+### Real-Time Model Selection
+When you interact with Claude Code, you'll get:
+- **Automatic model selection** based on your query type
+- **Transparent routing decisions** showing which model was chosen and why
+- **Seamless failover** if a model becomes unavailable
+
+## Configuration
+
+The setup uses the included `config.yaml` file which defines:
+
+### Multi-Provider Access
+```yaml
+llm_providers:
+  - model: openai/gpt-4.1-2025-04-14
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: code generation
+        description: generating new code snippets and functions
+  - model: anthropic/claude-3-5-sonnet-20241022
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: code understanding
+        description: explaining and analyzing existing code
+```
+## Advanced Usage
+
+### Custom Model Selection
+```bash
+# Force a specific model for this session
+archgw cli-agent claude --settings='{"ANTHROPIC_SMALL_FAST_MODEL": "deepseek-coder-v2"}'
+
+# Enable detailed routing information
+archgw cli-agent claude --settings='{"statusLine": {"type": "command", "command": "ccr statusline"}}'
+```
+
+### Environment Variables
+The system automatically configures:
+```bash
+ANTHROPIC_BASE_URL=http://127.0.0.1:12000  # Routes through Arch Gateway
+ANTHROPIC_SMALL_FAST_MODEL=arch.claude.code.small.fast    # Uses intelligent alias
+```
+
+## Real Developer Workflows
+
+This intelligent routing is powered by our research in preference-aligned LLM routing:
+- **Research Paper**: [Preference-Aligned LLM Router](https://arxiv.org/abs/2506.16655)
+- **Technical Docs**: [docs.archgw.com](https://docs.archgw.com)
--- a/demos/use_cases/claude_code/config.yaml
+++ b/demos/use_cases/claude_code/config.yaml
@ -0,0 +1,41 @@
+version: v0.1
+
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s
+
+llm_providers:
+  # OpenAI Models
+  - model: openai/gpt-5-2025-08-07
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: code generation
+        description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+
+  - model: openai/gpt-4.1-2025-04-14
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: code understanding
+        description: understand and explain existing code snippets, functions, or libraries
+
+  # Anthropic Models
+  - model: anthropic/claude-sonnet-4-5
+    default: true
+    access_key: $ANTHROPIC_API_KEY
+
+  - model: anthropic/claude-3-haiku-20240307
+    access_key: $ANTHROPIC_API_KEY
+
+  # Ollama Models
+  - model: ollama/llama3.1
+    base_url: http://host.docker.internal:11434
+
+
+# Model aliases - friendly names that map to actual provider names
+model_aliases:
+  # Alias for a small faster Claude model
+  arch.claude.code.small.fast:
+    target: claude-3-haiku-20240307
--- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
@ -24,7 +24,7 @@ llm_providers:
    access_key: $OPENAI_API_KEY

  # Anthropic Models
-  - model: anthropic/claude-3-5-sonnet-20241022
+  - model: anthropic/claude-sonnet-4-20250514
    access_key: $ANTHROPIC_API_KEY

  - model: anthropic/claude-3-haiku-20240307
@ -56,7 +56,7 @@ model_aliases:

  # Alias for creative tasks -> Claude model
  arch.creative.v1:
-    target: claude-3-5-sonnet-20241022
+    target: claude-sonnet-4-20250514

  # Alias for quick responses -> fast model
  arch.fast.v1:
@ -67,7 +67,7 @@ model_aliases:
    target: gpt-5-mini-2025-08-07

  chat-model:
-    target: llama3.1
+    target: gpt-5-mini-2025-08-07

  creative-model:
-    target: claude-3-5-sonnet-20241022
+    target: claude-sonnet-4-20250514
--- a/tests/e2e/test_model_alias_routing.py
+++ b/tests/e2e/test_model_alias_routing.py
@ -199,8 +199,7 @@ def test_400_error_handling_with_alias():
    try:
        completion = client.chat.completions.create(
            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
-            max_completion_tokens=50,
-            temperature=0.7,  # This is a typo - should be "temperature", which should trigger a 400 error
+            max_tokens=50,
            messages=[
                {
                    "role": "user",
@ -350,3 +349,57 @@ def test_direct_model_4o_mini_anthropic():
    response_content = "".join(b.text for b in message.content if b.type == "text")
    logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}")
    assert response_content == "Hello from direct 4o-mini via Anthropic!"
+
+
+def test_anthropic_thinking_mode_streaming():
+    # Anthropic base_url should be the root, not /v1/chat/completions
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+
+    client = anthropic.Anthropic(
+        api_key=os.environ.get("ANTHROPIC_API_KEY", "test-key"),
+        base_url=base_url,
+    )
+
+    thinking_block_started = False
+    thinking_delta_seen = False
+    text_delta_seen = False
+
+    with client.messages.stream(
+        model="claude-sonnet-4-20250514",
+        max_tokens=2048,
+        thinking={"type": "enabled", "budget_tokens": 1024},  # <- idiomatic
+        messages=[{"role": "user", "content": "Explain briefly what 2+2 equals"}],
+    ) as stream:
+        for event in stream:
+            # 1) detect when a thinking block starts
+            if event.type == "content_block_start" and getattr(
+                event, "content_block", None
+            ):
+                if getattr(event.content_block, "type", None) == "thinking":
+                    thinking_block_started = True
+
+            # 2) collect text vs thinking deltas
+            if event.type == "content_block_delta" and getattr(event, "delta", None):
+                if event.delta.type == "text_delta":
+                    text_delta_seen = True
+                elif event.delta.type == "thinking_delta":
+                    # some SDKs expose .thinking, others .text for this delta; not needed here
+                    thinking_delta_seen = True
+
+        final = stream.get_final_message()
+
+    # Basic integrity
+    assert final is not None
+    assert final.content and len(final.content) > 0
+
+    # Normal text should have streamed
+    assert text_delta_seen, "Expected normal text deltas in stream"
+
+    # With thinking enabled, we expect a thinking block and at least one thinking delta
+    assert thinking_block_started, "No thinking block started"
+    assert thinking_delta_seen, "No thinking deltas observed"
+
+    # Optional: double-check on the assembled message
+    final_block_types = [blk.type for blk in final.content]
+    assert "text" in final_block_types
+    assert "thinking" in final_block_types
--- a/tests/e2e/test_prompt_gateway.py
+++ b/tests/e2e/test_prompt_gateway.py
@ -417,12 +417,12 @@ def test_anthropic_client_with_openai_model_streaming():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    with client.messages.stream(
-        model="gpt-4o-mini",  # OpenAI model via Anthropic client
-        max_tokens=50,
+        model="gpt-5-mini-2025-08-07",  # OpenAI model via Anthropic client
+        max_tokens=500,
        messages=[
            {
                "role": "user",
-                "content": "Hello, please respond with exactly: Hello from GPT-4o-mini via Anthropic!",
+                "content": "Hello, please respond with exactly: Hello from ChatGPT!",
            }
        ],
    ) as stream:
@ -435,8 +435,8 @@ def test_anthropic_client_with_openai_model_streaming():
        # A safe way to reassemble text from the content blocks:
        final_text = "".join(b.text for b in final.content if b.type == "text")

-    assert full_text == "Hello from GPT-4o-mini via Anthropic!"
-    assert final_text == "Hello from GPT-4o-mini via Anthropic!"
+    assert full_text == "Hello from ChatGPT!"
+    assert final_text == "Hello from ChatGPT!"


 def test_openai_gpt4o_mini_v1_messages_api():