making sure that claude code can run via the archgw cli

2026-06-17 15:25:17 +02:00 · 2025-09-28 14:15:42 -07:00 · 2025-09-28 14:15:42 -07:00 · 2f611f74a8
commit 2f611f74a8
parent 1b7f9e43e7
9 changed files with 422 additions and 30 deletions
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -4,7 +4,7 @@ import time
 import sys

 import yaml
-from cli.utils import getLogger
+from cli.utils import getLogger, read_config_file
 from cli.consts import (
    ARCHGW_DOCKER_IMAGE,
    ARCHGW_DOCKER_NAME,
@ -185,3 +185,106 @@ def stop_arch_modelserver():
    except subprocess.CalledProcessError as e:
        log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
        sys.exit(1)
+
+
+def start_cli_agent(arch_config_file=None, settings_json="{}"):
+    """Start a CLI client connected to Arch."""
+    import json
+
+    # Use current directory for config if not specified
+    if arch_config_file is None:
+        config_path = "."
+    else:
+        config_path = (
+            os.path.dirname(arch_config_file)
+            if os.path.dirname(arch_config_file)
+            else "."
+        )
+
+    # Get port and host from arch_config.yaml listeners > egress
+    arch_config = read_config_file(config_path)
+    if not arch_config:
+        log.error(f"Config file not found in {config_path}")
+        sys.exit(1)
+
+    # Get egress listener configuration
+    egress_config = arch_config.get("listeners", {}).get("egress_traffic", {})
+    host = egress_config.get("host", "127.0.0.1")
+    port = egress_config.get("port", 12000)
+
+    # Parse additional settings from command line
+    try:
+        additional_settings = json.loads(settings_json) if settings_json else {}
+    except json.JSONDecodeError:
+        log.error("Settings must be valid JSON")
+        sys.exit(1)
+
+    # Set up environment variables
+    env = os.environ.copy()
+    env.update(
+        {
+            "ANTHROPIC_AUTH_TOKEN": "test",  # Use test token for arch
+            "ANTHROPIC_API_KEY": "",
+            "ANTHROPIC_BASE_URL": f"http://{host}:{port}",
+            "NO_PROXY": host,
+            "DISABLE_TELEMETRY": "true",
+            "DISABLE_COST_WARNINGS": "true",
+            "API_TIMEOUT_MS": "600000",
+        }
+    )
+
+    # Set ANTHROPIC_SMALL_FAST_MODEL from additional_settings or model alias
+    if "ANTHROPIC_SMALL_FAST_MODEL" in additional_settings:
+        env["ANTHROPIC_SMALL_FAST_MODEL"] = additional_settings[
+            "ANTHROPIC_SMALL_FAST_MODEL"
+        ]
+    else:
+        # Check if arch.claude.code.small.fast alias exists in model_aliases
+        model_aliases = arch_config.get("model_aliases", {})
+        if "arch.claude.code.small.fast" in model_aliases:
+            env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast"
+        else:
+            log.info(
+                "Tip: Set an alias 'arch.claude.code.small.fast' in your model_aliases config to set a small fast model Claude Code"
+            )
+            log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON")
+
+    # Non-interactive mode configuration from additional_settings only
+    if additional_settings.get("NON_INTERACTIVE_MODE", False):
+        env.update(
+            {
+                "CI": "true",
+                "FORCE_COLOR": "0",
+                "NODE_NO_READLINE": "1",
+                "TERM": "dumb",
+            }
+        )
+
+    # Build claude command arguments
+    claude_args = []
+
+    # Add settings if provided, excluding those already handled as environment variables
+    if additional_settings:
+        # Filter out settings that are already processed as environment variables
+        claude_settings = {
+            k: v
+            for k, v in additional_settings.items()
+            if k not in ["ANTHROPIC_SMALL_FAST_MODEL", "NON_INTERACTIVE_MODE"]
+        }
+        if claude_settings:
+            claude_args.append(f"--settings={json.dumps(claude_settings)}")
+
+    # Use claude from PATH
+    claude_path = "claude"
+    log.info(f"Starting Claude CLI Agent to Arch at {host}:{port}")
+
+    try:
+        subprocess.run([claude_path] + claude_args, env=env, check=True)
+    except subprocess.CalledProcessError as e:
+        log.error(f"Error starting claude: {e}")
+        sys.exit(1)
+    except FileNotFoundError:
+        log.error(
+            f"{claude_path} not found. Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code"
+        )
+        sys.exit(1)
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -4,14 +4,21 @@ import sys
 import subprocess
 import multiprocessing
 import importlib.metadata
+import json
 from cli import targets
-from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs
+from cli.docker_cli import (
+    docker_validate_archgw_schema,
+    stream_gateway_logs,
+    docker_container_status,
+)
 from cli.utils import (
    getLogger,
    get_llm_provider_access_keys,
    has_ingress_listener,
    load_env_file_to_dict,
    stream_access_logs,
+    read_config_file,
+    find_config_file,
 )
 from cli.core import (
    start_arch_modelserver,
@ -19,9 +26,11 @@ from cli.core import (
    start_arch,
    stop_docker_container,
    download_models_from_hf,
+    start_cli_agent,
 )
 from cli.consts import (
    ARCHGW_DOCKER_IMAGE,
+    ARCHGW_DOCKER_NAME,
    KATANEMO_DOCKERHUB_REPO,
    SERVICE_NAME_ARCHGW,
    SERVICE_NAME_MODEL_SERVER,
@ -171,12 +180,8 @@ def up(file, path, service, foreground):
        start_arch_modelserver(foreground)
        return

-    if file:
-        # If a file is provided, process that file
-        arch_config_file = os.path.abspath(file)
-    else:
-        # If no file is provided, use the path and look for arch_config.yaml
-        arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+    # Use the utility function to find config file
+    arch_config_file = find_config_file(path, file)

    # Check if the file exists
    if not os.path.exists(arch_config_file):
@ -329,10 +334,52 @@ def logs(debug, follow):
            archgw_process.terminate()


+@click.command()
+@click.argument("cli_type", type=click.Choice(["claude"]), required=True)
+@click.option(
+    "--path",
+    default=None,
+    help="Path to the directory containing arch_config.yaml (defaults to current directory)",
+)
+@click.option(
+    "--settings",
+    default="{}",
+    help="Additional settings as JSON string for the CLI agent.",
+)
+def cli_agent(cli_type, path, settings):
+    """Start a CLI agent connected to Arch.
+
+    CLI_TYPE: The type of CLI agent to start (currently only 'claude' is supported)
+    """
+    # Determine arch_config.yaml path
+    arch_config_file = None
+    if path:
+        arch_config_file = os.path.join(path, "arch_config.yaml")
+    else:
+        arch_config_file = "arch_config.yaml"  # Current directory
+
+    # Check if archgw docker container is running
+    archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
+    if archgw_status != "running":
+        log.error(f"archgw docker container is not running (status: {archgw_status})")
+        log.error("Please start archgw using the 'archgw up' command.")
+        sys.exit(1)
+
+    try:
+        start_cli_agent(arch_config_file, settings)
+    except SystemExit:
+        # Re-raise SystemExit to preserve exit codes
+        raise
+    except Exception as e:
+        click.echo(f"Error: {e}")
+        sys.exit(1)
+
+
 main.add_command(up)
 main.add_command(down)
 main.add_command(build)
 main.add_command(logs)
+main.add_command(cli_agent)
 main.add_command(generate_prompt_targets)

 if __name__ == "__main__":
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@ -88,6 +88,36 @@ def load_env_file_to_dict(file_path):
    return env_dict


+def read_config_file(path="."):
+    """Read configuration from arch_config.yaml or config.yaml in the specified path."""
+    config_files = ["arch_config.yaml", "config.yaml"]
+
+    for config_file in config_files:
+        config_path = os.path.abspath(os.path.join(path, config_file))
+        if os.path.exists(config_path):
+            try:
+                with open(config_path, "r") as f:
+                    return yaml.safe_load(f)
+            except Exception as e:
+                log.warning(f"Error reading {config_path}: {e}")
+                continue
+
+    return {}
+
+
+def find_config_file(path=".", file=None):
+    """Find the appropriate config file path."""
+    if file:
+        # If a file is provided, process that file
+        return os.path.abspath(file)
+    else:
+        # If no file is provided, use the path and look for arch_config.yaml first, then config.yaml for convenience
+        arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+        if not os.path.exists(arch_config_file):
+            arch_config_file = os.path.abspath(os.path.join(path, "config.yaml"))
+        return arch_config_file
+
+
 def stream_access_logs(follow):
    """
    Get the archgw access logs
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
@ -70,7 +70,9 @@ pub enum ServiceTier {
 #[skip_serializing_none]
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct ThinkingConfig {
-    pub enabled: bool,
+    #[serde(rename = "type")]
+    pub thinking_type: String,
+    pub budget_tokens: Option<u32>,
 }

 // MCP Server types
@ -166,7 +168,8 @@ pub enum MessagesContentBlock {
        cache_control: Option<MessagesCacheControl>,
    },
    Thinking {
-        text: String,
+        thinking: String,
+        signature: Option<String>,
        cache_control: Option<MessagesCacheControl>,
    },
    Image {
@ -235,6 +238,7 @@ impl ExtractText for Vec<MessagesContentBlock> {

 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
 pub enum MessagesImageSource {
    Base64 {
        media_type: String,
@ -247,6 +251,7 @@ pub enum MessagesImageSource {

 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
 pub enum MessagesDocumentSource {
    Base64 {
        media_type: String,
@ -409,6 +414,8 @@ pub enum MessagesContentDelta {
    TextDelta { text: String },
    #[serde(rename = "input_json_delta")]
    InputJsonDelta { partial_json: String },
+    #[serde(rename = "thinking_delta")]
+    ThinkingDelta { thinking: String },
 }

 #[skip_serializing_none]
@ -566,10 +573,10 @@ impl ProviderStreamResponse for MessagesStreamEvent {
    fn content_delta(&self) -> Option<&str> {
        match self {
            MessagesStreamEvent::ContentBlockDelta { delta, .. } => {
-                if let MessagesContentDelta::TextDelta { text } = delta {
-                    Some(text)
-                } else {
-                    None
+                match delta {
+                    MessagesContentDelta::TextDelta { text } => Some(text),
+                    MessagesContentDelta::ThinkingDelta { thinking } => Some(thinking),
+                    _ => None,
                }
            }
            _ => None,
@ -672,7 +679,7 @@ mod tests {
            "system": "You are a helpful assistant",
            "service_tier": "auto",
            "thinking": {
-                "enabled": true
+                "type": "enabled"
            },
            "metadata": {
                "user_id": "123"
@ -699,7 +706,7 @@ mod tests {
        }

        if let Some(thinking) = &deserialized_request.thinking {
-            assert_eq!(thinking.enabled, true);
+            assert_eq!(thinking.thinking_type, "enabled");
        } else {
            panic!("Expected thinking config");
        }
@ -754,10 +761,9 @@ mod tests {
                        {
                            "type": "image",
                            "source": {
-                                "base64": {
-                                    "media_type": "image/jpeg",
-                                    "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
-                                }
+                                "type": "base64",
+                                "media_type": "image/jpeg",
+                                "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
                            }
                        }
                    ]
@ -767,7 +773,7 @@ mod tests {
                    "content": [
                        {
                            "type": "thinking",
-                            "text": "Let me analyze the image and then check the weather..."
+                            "thinking": "Let me analyze the image and then check the weather..."
                        },
                        {
                            "type": "text",
@ -854,8 +860,8 @@ mod tests {
            assert_eq!(content_blocks.len(), 3);

            // Validate thinking content block
-            if let MessagesContentBlock::Thinking { text, .. } = &content_blocks[0] {
-                assert_eq!(text, "Let me analyze the image and then check the weather...");
+            if let MessagesContentBlock::Thinking { thinking, .. } = &content_blocks[0] {
+                assert_eq!(thinking, "Let me analyze the image and then check the weather...");
            } else {
                panic!("Expected thinking content block");
            }
@ -1320,4 +1326,68 @@ mod tests {
        assert_eq!(all_variants.len(), 1);
        assert_eq!(all_variants[0], AnthropicApi::Messages);
    }
+
+    #[test]
+    fn test_anthropic_thinking_streaming() {
+        // Test thinking delta stream event
+        let thinking_delta_json = json!({
+            "type": "content_block_delta",
+            "index": 0,
+            "delta": {
+                "type": "thinking_delta",
+                "thinking": ".\n\nI need to consider:\n1. Current"
+            }
+        });
+
+        let deserialized_event: MessagesStreamEvent = serde_json::from_value(thinking_delta_json.clone()).unwrap();
+        if let MessagesStreamEvent::ContentBlockDelta { index, ref delta } = deserialized_event {
+            assert_eq!(index, 0);
+            if let MessagesContentDelta::ThinkingDelta { thinking } = delta {
+                assert_eq!(thinking, ".\n\nI need to consider:\n1. Current");
+            } else {
+                panic!("Expected thinking delta");
+            }
+        } else {
+            panic!("Expected content block delta event");
+        }
+
+        // Test that thinking delta is returned by content_delta()
+        assert_eq!(deserialized_event.content_delta(), Some(".\n\nI need to consider:\n1. Current"));
+
+        let serialized_event_json = serde_json::to_value(&deserialized_event).unwrap();
+        assert_eq!(thinking_delta_json, serialized_event_json);
+    }
+
+    #[test]
+    fn test_anthropic_thinking_request_config() {
+        // Test thinking config with budget_tokens
+        let request_json = json!({
+            "model": "claude-sonnet-4-20250514",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "Test message"
+                }
+            ],
+            "max_tokens": 2048,
+            "thinking": {
+                "type": "enabled",
+                "budget_tokens": 1024
+            }
+        });
+
+        let deserialized_request: MessagesRequest = serde_json::from_value(request_json.clone()).unwrap();
+        assert_eq!(deserialized_request.model, "claude-sonnet-4-20250514");
+        assert_eq!(deserialized_request.max_tokens, 2048);
+
+        if let Some(thinking) = &deserialized_request.thinking {
+            assert_eq!(thinking.thinking_type, "enabled");
+            assert_eq!(thinking.budget_tokens, Some(1024));
+        } else {
+            panic!("Expected thinking config");
+        }
+
+        let serialized_json = serde_json::to_value(&deserialized_request).unwrap();
+        assert_eq!(request_json, serialized_json);
+    }
 }
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@ -88,6 +88,7 @@ pub struct ChatCompletionsRequest {
    pub prediction: Option<StaticContent>,
    // pub reasoning_effect: Option<bool>, // GOOD FIRST ISSUE: Future support for reasoning effects
    pub response_format: Option<Value>,
+    pub reasoning_effort: Option<String>, // e.g., "none", "low", "medium", "high"
    // pub safety_identifier: Option<String>, // GOOD FIRST ISSUE: Future support for safety identifiers
    pub seed: Option<i32>,
    pub service_tier: Option<String>,
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@ -862,9 +862,8 @@ fn convert_anthropic_content_to_openai(content: &[MessagesContentBlock]) -> Resu
            MessagesContentBlock::Text { text, .. } => {
                text_parts.push(text.clone());
            }
-            MessagesContentBlock::Thinking { text, .. } => {
-                // Include thinking as regular text for OpenAI
-                text_parts.push(format!("[Thinking: {}]", text));
+            MessagesContentBlock::Thinking { thinking, .. } => {
+                text_parts.push(format!("thinking: {}", thinking));
            }
            _ => {
                // Skip other content types for basic text conversion
@ -1009,6 +1008,21 @@ fn convert_content_delta(delta: MessagesContentDelta) -> Result<ChatCompletionsS
                None,
            ))
        }
+        MessagesContentDelta::ThinkingDelta { thinking } => {
+            Ok(create_openai_chunk(
+                "stream",
+                "unknown",
+                MessageDelta {
+                    role: None,
+                    content: Some(format!("[Thinking: {}]", thinking)),
+                    refusal: None,
+                    function_call: None,
+                    tool_calls: None,
+                },
+                None,
+                None,
+            ))
+        }
        MessagesContentDelta::InputJsonDelta { partial_json } => {
            Ok(create_openai_chunk(
                "stream",
--- a/demos/use_cases/claude_code/config.yaml
+++ b/demos/use_cases/claude_code/config.yaml
@ -0,0 +1,73 @@
+version: v0.1
+
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s
+
+llm_providers:
+
+  # OpenAI Models
+  - model: openai/gpt-5-mini-2025-08-07
+    access_key: $OPENAI_API_KEY
+    default: true
+
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+
+  - model: openai/o3
+    access_key: $OPENAI_API_KEY
+
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+
+  # Anthropic Models
+  - model: anthropic/claude-3-5-sonnet-20241022
+    access_key: $ANTHROPIC_API_KEY
+
+  - model: anthropic/claude-3-haiku-20240307
+    access_key: $ANTHROPIC_API_KEY
+
+  # Azure OpenAI Models
+  - model: azure_openai/gpt-5-mini
+    access_key: $AZURE_API_KEY
+    base_url: https://katanemo.openai.azure.com
+
+  # Ollama Models
+  - model: ollama/llama3.1
+    base_url: http://host.docker.internal:11434
+
+
+# Model aliases - friendly names that map to actual provider names
+model_aliases:
+  # Alias for summarization tasks -> fast/cheap model
+  arch.summarize.v1:
+    target: gpt-5-mini-2025-08-07
+
+  # Alias for general purpose tasks -> latest model
+  arch.v1:
+    target: o3
+
+  # Alias for reasoning tasks -> capable model
+  arch.reasoning.v1:
+    target: gpt-4o
+
+  # Alias for creative tasks -> Claude model
+  arch.creative.v1:
+    target: claude-3-5-sonnet-20241022
+
+  # Alias for quick responses -> fast model
+  arch.fast.v1:
+    target: claude-3-haiku-20240307
+
+  # Semantic aliases
+  summary-model:
+    target: gpt-5-mini-2025-08-07
+
+  chat-model:
+    target: claude-3-5-sonnet-20241022
+
+  creative-model:
+    target: claude-3-5-sonnet-20241022
--- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
@ -24,7 +24,7 @@ llm_providers:
    access_key: $OPENAI_API_KEY

  # Anthropic Models
-  - model: anthropic/claude-3-5-sonnet-20241022
+  - model: anthropic/claude-sonnet-4-20250514
    access_key: $ANTHROPIC_API_KEY

  - model: anthropic/claude-3-haiku-20240307
@ -56,7 +56,7 @@ model_aliases:

  # Alias for creative tasks -> Claude model
  arch.creative.v1:
-    target: claude-3-5-sonnet-20241022
+    target: claude-sonnet-4-20250514

  # Alias for quick responses -> fast model
  arch.fast.v1:
@ -67,7 +67,7 @@ model_aliases:
    target: gpt-5-mini-2025-08-07

  chat-model:
-    target: llama3.1
+    target:

  creative-model:
-    target: claude-3-5-sonnet-20241022
+    target: claude-sonnet-4-20250514
--- a/tests/e2e/test_model_alias_routing.py
+++ b/tests/e2e/test_model_alias_routing.py
@ -350,3 +350,57 @@ def test_direct_model_4o_mini_anthropic():
    response_content = "".join(b.text for b in message.content if b.type == "text")
    logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}")
    assert response_content == "Hello from direct 4o-mini via Anthropic!"
+
+
+def test_anthropic_thinking_mode_streaming():
+    # Anthropic base_url should be the root, not /v1/chat/completions
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+
+    client = anthropic.Anthropic(
+        api_key=os.environ.get("ANTHROPIC_API_KEY", "test-key"),
+        base_url=base_url,
+    )
+
+    thinking_block_started = False
+    thinking_delta_seen = False
+    text_delta_seen = False
+
+    with client.messages.stream(
+        model="claude-sonnet-4-20250514",
+        max_tokens=2048,
+        thinking={"type": "enabled", "budget_tokens": 1024},  # <- idiomatic
+        messages=[{"role": "user", "content": "Explain briefly what 2+2 equals"}],
+    ) as stream:
+        for event in stream:
+            # 1) detect when a thinking block starts
+            if event.type == "content_block_start" and getattr(
+                event, "content_block", None
+            ):
+                if getattr(event.content_block, "type", None) == "thinking":
+                    thinking_block_started = True
+
+            # 2) collect text vs thinking deltas
+            if event.type == "content_block_delta" and getattr(event, "delta", None):
+                if event.delta.type == "text_delta":
+                    text_delta_seen = True
+                elif event.delta.type == "thinking_delta":
+                    # some SDKs expose .thinking, others .text for this delta; not needed here
+                    thinking_delta_seen = True
+
+        final = stream.get_final_message()
+
+    # Basic integrity
+    assert final is not None
+    assert final.content and len(final.content) > 0
+
+    # Normal text should have streamed
+    assert text_delta_seen, "Expected normal text deltas in stream"
+
+    # With thinking enabled, we expect a thinking block and at least one thinking delta
+    assert thinking_block_started, "No thinking block started"
+    assert thinking_delta_seen, "No thinking deltas observed"
+
+    # Optional: double-check on the assembled message
+    final_block_types = [blk.type for blk in final.content]
+    assert "text" in final_block_types
+    assert "thinking" in final_block_types