diff --git a/arch/tools/cli/core.py b/arch/tools/cli/core.py index 59d42ab4..fda8ba88 100644 --- a/arch/tools/cli/core.py +++ b/arch/tools/cli/core.py @@ -4,7 +4,7 @@ import time import sys import yaml -from cli.utils import getLogger +from cli.utils import getLogger, read_config_file from cli.consts import ( ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME, @@ -185,3 +185,106 @@ def stop_arch_modelserver(): except subprocess.CalledProcessError as e: log.info(f"Failed to start model_server. Please check archgw_modelserver logs") sys.exit(1) + + +def start_cli_agent(arch_config_file=None, settings_json="{}"): + """Start a CLI client connected to Arch.""" + import json + + # Use current directory for config if not specified + if arch_config_file is None: + config_path = "." + else: + config_path = ( + os.path.dirname(arch_config_file) + if os.path.dirname(arch_config_file) + else "." + ) + + # Get port and host from arch_config.yaml listeners > egress + arch_config = read_config_file(config_path) + if not arch_config: + log.error(f"Config file not found in {config_path}") + sys.exit(1) + + # Get egress listener configuration + egress_config = arch_config.get("listeners", {}).get("egress_traffic", {}) + host = egress_config.get("host", "127.0.0.1") + port = egress_config.get("port", 12000) + + # Parse additional settings from command line + try: + additional_settings = json.loads(settings_json) if settings_json else {} + except json.JSONDecodeError: + log.error("Settings must be valid JSON") + sys.exit(1) + + # Set up environment variables + env = os.environ.copy() + env.update( + { + "ANTHROPIC_AUTH_TOKEN": "test", # Use test token for arch + "ANTHROPIC_API_KEY": "", + "ANTHROPIC_BASE_URL": f"http://{host}:{port}", + "NO_PROXY": host, + "DISABLE_TELEMETRY": "true", + "DISABLE_COST_WARNINGS": "true", + "API_TIMEOUT_MS": "600000", + } + ) + + # Set ANTHROPIC_SMALL_FAST_MODEL from additional_settings or model alias + if "ANTHROPIC_SMALL_FAST_MODEL" in additional_settings: + env["ANTHROPIC_SMALL_FAST_MODEL"] = additional_settings[ + "ANTHROPIC_SMALL_FAST_MODEL" + ] + else: + # Check if arch.claude.code.small.fast alias exists in model_aliases + model_aliases = arch_config.get("model_aliases", {}) + if "arch.claude.code.small.fast" in model_aliases: + env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast" + else: + log.info( + "Tip: Set an alias 'arch.claude.code.small.fast' in your model_aliases config to set a small fast model Claude Code" + ) + log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON") + + # Non-interactive mode configuration from additional_settings only + if additional_settings.get("NON_INTERACTIVE_MODE", False): + env.update( + { + "CI": "true", + "FORCE_COLOR": "0", + "NODE_NO_READLINE": "1", + "TERM": "dumb", + } + ) + + # Build claude command arguments + claude_args = [] + + # Add settings if provided, excluding those already handled as environment variables + if additional_settings: + # Filter out settings that are already processed as environment variables + claude_settings = { + k: v + for k, v in additional_settings.items() + if k not in ["ANTHROPIC_SMALL_FAST_MODEL", "NON_INTERACTIVE_MODE"] + } + if claude_settings: + claude_args.append(f"--settings={json.dumps(claude_settings)}") + + # Use claude from PATH + claude_path = "claude" + log.info(f"Starting Claude CLI Agent to Arch at {host}:{port}") + + try: + subprocess.run([claude_path] + claude_args, env=env, check=True) + except subprocess.CalledProcessError as e: + log.error(f"Error starting claude: {e}") + sys.exit(1) + except FileNotFoundError: + log.error( + f"{claude_path} not found. Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code" + ) + sys.exit(1) diff --git a/arch/tools/cli/main.py b/arch/tools/cli/main.py index 3d67f30b..df4a25cf 100644 --- a/arch/tools/cli/main.py +++ b/arch/tools/cli/main.py @@ -4,14 +4,21 @@ import sys import subprocess import multiprocessing import importlib.metadata +import json from cli import targets -from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs +from cli.docker_cli import ( + docker_validate_archgw_schema, + stream_gateway_logs, + docker_container_status, +) from cli.utils import ( getLogger, get_llm_provider_access_keys, has_ingress_listener, load_env_file_to_dict, stream_access_logs, + read_config_file, + find_config_file, ) from cli.core import ( start_arch_modelserver, @@ -19,9 +26,11 @@ from cli.core import ( start_arch, stop_docker_container, download_models_from_hf, + start_cli_agent, ) from cli.consts import ( ARCHGW_DOCKER_IMAGE, + ARCHGW_DOCKER_NAME, KATANEMO_DOCKERHUB_REPO, SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, @@ -171,12 +180,8 @@ def up(file, path, service, foreground): start_arch_modelserver(foreground) return - if file: - # If a file is provided, process that file - arch_config_file = os.path.abspath(file) - else: - # If no file is provided, use the path and look for arch_config.yaml - arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml")) + # Use the utility function to find config file + arch_config_file = find_config_file(path, file) # Check if the file exists if not os.path.exists(arch_config_file): @@ -329,10 +334,52 @@ def logs(debug, follow): archgw_process.terminate() +@click.command() +@click.argument("cli_type", type=click.Choice(["claude"]), required=True) +@click.option( + "--path", + default=None, + help="Path to the directory containing arch_config.yaml (defaults to current directory)", +) +@click.option( + "--settings", + default="{}", + help="Additional settings as JSON string for the CLI agent.", +) +def cli_agent(cli_type, path, settings): + """Start a CLI agent connected to Arch. + + CLI_TYPE: The type of CLI agent to start (currently only 'claude' is supported) + """ + # Determine arch_config.yaml path + arch_config_file = None + if path: + arch_config_file = os.path.join(path, "arch_config.yaml") + else: + arch_config_file = "arch_config.yaml" # Current directory + + # Check if archgw docker container is running + archgw_status = docker_container_status(ARCHGW_DOCKER_NAME) + if archgw_status != "running": + log.error(f"archgw docker container is not running (status: {archgw_status})") + log.error("Please start archgw using the 'archgw up' command.") + sys.exit(1) + + try: + start_cli_agent(arch_config_file, settings) + except SystemExit: + # Re-raise SystemExit to preserve exit codes + raise + except Exception as e: + click.echo(f"Error: {e}") + sys.exit(1) + + main.add_command(up) main.add_command(down) main.add_command(build) main.add_command(logs) +main.add_command(cli_agent) main.add_command(generate_prompt_targets) if __name__ == "__main__": diff --git a/arch/tools/cli/utils.py b/arch/tools/cli/utils.py index dc731d81..97a8428f 100644 --- a/arch/tools/cli/utils.py +++ b/arch/tools/cli/utils.py @@ -88,6 +88,36 @@ def load_env_file_to_dict(file_path): return env_dict +def read_config_file(path="."): + """Read configuration from arch_config.yaml or config.yaml in the specified path.""" + config_files = ["arch_config.yaml", "config.yaml"] + + for config_file in config_files: + config_path = os.path.abspath(os.path.join(path, config_file)) + if os.path.exists(config_path): + try: + with open(config_path, "r") as f: + return yaml.safe_load(f) + except Exception as e: + log.warning(f"Error reading {config_path}: {e}") + continue + + return {} + + +def find_config_file(path=".", file=None): + """Find the appropriate config file path.""" + if file: + # If a file is provided, process that file + return os.path.abspath(file) + else: + # If no file is provided, use the path and look for arch_config.yaml first, then config.yaml for convenience + arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml")) + if not os.path.exists(arch_config_file): + arch_config_file = os.path.abspath(os.path.join(path, "config.yaml")) + return arch_config_file + + def stream_access_logs(follow): """ Get the archgw access logs diff --git a/crates/hermesllm/src/apis/anthropic.rs b/crates/hermesllm/src/apis/anthropic.rs index c707e7b1..abfde5b7 100644 --- a/crates/hermesllm/src/apis/anthropic.rs +++ b/crates/hermesllm/src/apis/anthropic.rs @@ -70,7 +70,9 @@ pub enum ServiceTier { #[skip_serializing_none] #[derive(Serialize, Deserialize, Debug, Clone)] pub struct ThinkingConfig { - pub enabled: bool, + #[serde(rename = "type")] + pub thinking_type: String, + pub budget_tokens: Option, } // MCP Server types @@ -166,7 +168,8 @@ pub enum MessagesContentBlock { cache_control: Option, }, Thinking { - text: String, + thinking: String, + signature: Option, cache_control: Option, }, Image { @@ -235,6 +238,7 @@ impl ExtractText for Vec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "snake_case")] +#[serde(tag = "type")] pub enum MessagesImageSource { Base64 { media_type: String, @@ -247,6 +251,7 @@ pub enum MessagesImageSource { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "snake_case")] +#[serde(tag = "type")] pub enum MessagesDocumentSource { Base64 { media_type: String, @@ -409,6 +414,8 @@ pub enum MessagesContentDelta { TextDelta { text: String }, #[serde(rename = "input_json_delta")] InputJsonDelta { partial_json: String }, + #[serde(rename = "thinking_delta")] + ThinkingDelta { thinking: String }, } #[skip_serializing_none] @@ -566,10 +573,10 @@ impl ProviderStreamResponse for MessagesStreamEvent { fn content_delta(&self) -> Option<&str> { match self { MessagesStreamEvent::ContentBlockDelta { delta, .. } => { - if let MessagesContentDelta::TextDelta { text } = delta { - Some(text) - } else { - None + match delta { + MessagesContentDelta::TextDelta { text } => Some(text), + MessagesContentDelta::ThinkingDelta { thinking } => Some(thinking), + _ => None, } } _ => None, @@ -672,7 +679,7 @@ mod tests { "system": "You are a helpful assistant", "service_tier": "auto", "thinking": { - "enabled": true + "type": "enabled" }, "metadata": { "user_id": "123" @@ -699,7 +706,7 @@ mod tests { } if let Some(thinking) = &deserialized_request.thinking { - assert_eq!(thinking.enabled, true); + assert_eq!(thinking.thinking_type, "enabled"); } else { panic!("Expected thinking config"); } @@ -754,10 +761,9 @@ mod tests { { "type": "image", "source": { - "base64": { - "media_type": "image/jpeg", - "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" - } + "type": "base64", + "media_type": "image/jpeg", + "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" } } ] @@ -767,7 +773,7 @@ mod tests { "content": [ { "type": "thinking", - "text": "Let me analyze the image and then check the weather..." + "thinking": "Let me analyze the image and then check the weather..." }, { "type": "text", @@ -854,8 +860,8 @@ mod tests { assert_eq!(content_blocks.len(), 3); // Validate thinking content block - if let MessagesContentBlock::Thinking { text, .. } = &content_blocks[0] { - assert_eq!(text, "Let me analyze the image and then check the weather..."); + if let MessagesContentBlock::Thinking { thinking, .. } = &content_blocks[0] { + assert_eq!(thinking, "Let me analyze the image and then check the weather..."); } else { panic!("Expected thinking content block"); } @@ -1320,4 +1326,68 @@ mod tests { assert_eq!(all_variants.len(), 1); assert_eq!(all_variants[0], AnthropicApi::Messages); } + + #[test] + fn test_anthropic_thinking_streaming() { + // Test thinking delta stream event + let thinking_delta_json = json!({ + "type": "content_block_delta", + "index": 0, + "delta": { + "type": "thinking_delta", + "thinking": ".\n\nI need to consider:\n1. Current" + } + }); + + let deserialized_event: MessagesStreamEvent = serde_json::from_value(thinking_delta_json.clone()).unwrap(); + if let MessagesStreamEvent::ContentBlockDelta { index, ref delta } = deserialized_event { + assert_eq!(index, 0); + if let MessagesContentDelta::ThinkingDelta { thinking } = delta { + assert_eq!(thinking, ".\n\nI need to consider:\n1. Current"); + } else { + panic!("Expected thinking delta"); + } + } else { + panic!("Expected content block delta event"); + } + + // Test that thinking delta is returned by content_delta() + assert_eq!(deserialized_event.content_delta(), Some(".\n\nI need to consider:\n1. Current")); + + let serialized_event_json = serde_json::to_value(&deserialized_event).unwrap(); + assert_eq!(thinking_delta_json, serialized_event_json); + } + + #[test] + fn test_anthropic_thinking_request_config() { + // Test thinking config with budget_tokens + let request_json = json!({ + "model": "claude-sonnet-4-20250514", + "messages": [ + { + "role": "user", + "content": "Test message" + } + ], + "max_tokens": 2048, + "thinking": { + "type": "enabled", + "budget_tokens": 1024 + } + }); + + let deserialized_request: MessagesRequest = serde_json::from_value(request_json.clone()).unwrap(); + assert_eq!(deserialized_request.model, "claude-sonnet-4-20250514"); + assert_eq!(deserialized_request.max_tokens, 2048); + + if let Some(thinking) = &deserialized_request.thinking { + assert_eq!(thinking.thinking_type, "enabled"); + assert_eq!(thinking.budget_tokens, Some(1024)); + } else { + panic!("Expected thinking config"); + } + + let serialized_json = serde_json::to_value(&deserialized_request).unwrap(); + assert_eq!(request_json, serialized_json); + } } diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index ead6e7d2..c4181768 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -88,6 +88,7 @@ pub struct ChatCompletionsRequest { pub prediction: Option, // pub reasoning_effect: Option, // GOOD FIRST ISSUE: Future support for reasoning effects pub response_format: Option, + pub reasoning_effort: Option, // e.g., "none", "low", "medium", "high" // pub safety_identifier: Option, // GOOD FIRST ISSUE: Future support for safety identifiers pub seed: Option, pub service_tier: Option, diff --git a/crates/hermesllm/src/clients/transformer.rs b/crates/hermesllm/src/clients/transformer.rs index cf2a48e3..33d2b8c1 100644 --- a/crates/hermesllm/src/clients/transformer.rs +++ b/crates/hermesllm/src/clients/transformer.rs @@ -862,9 +862,8 @@ fn convert_anthropic_content_to_openai(content: &[MessagesContentBlock]) -> Resu MessagesContentBlock::Text { text, .. } => { text_parts.push(text.clone()); } - MessagesContentBlock::Thinking { text, .. } => { - // Include thinking as regular text for OpenAI - text_parts.push(format!("[Thinking: {}]", text)); + MessagesContentBlock::Thinking { thinking, .. } => { + text_parts.push(format!("thinking: {}", thinking)); } _ => { // Skip other content types for basic text conversion @@ -1009,6 +1008,21 @@ fn convert_content_delta(delta: MessagesContentDelta) -> Result { + Ok(create_openai_chunk( + "stream", + "unknown", + MessageDelta { + role: None, + content: Some(format!("[Thinking: {}]", thinking)), + refusal: None, + function_call: None, + tool_calls: None, + }, + None, + None, + )) + } MessagesContentDelta::InputJsonDelta { partial_json } => { Ok(create_openai_chunk( "stream", diff --git a/demos/use_cases/claude_code/config.yaml b/demos/use_cases/claude_code/config.yaml new file mode 100644 index 00000000..e1e89d1e --- /dev/null +++ b/demos/use_cases/claude_code/config.yaml @@ -0,0 +1,73 @@ +version: v0.1 + +listeners: + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 30s + +llm_providers: + + # OpenAI Models + - model: openai/gpt-5-mini-2025-08-07 + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + + - model: openai/o3 + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + + # Anthropic Models + - model: anthropic/claude-3-5-sonnet-20241022 + access_key: $ANTHROPIC_API_KEY + + - model: anthropic/claude-3-haiku-20240307 + access_key: $ANTHROPIC_API_KEY + + # Azure OpenAI Models + - model: azure_openai/gpt-5-mini + access_key: $AZURE_API_KEY + base_url: https://katanemo.openai.azure.com + + # Ollama Models + - model: ollama/llama3.1 + base_url: http://host.docker.internal:11434 + + +# Model aliases - friendly names that map to actual provider names +model_aliases: + # Alias for summarization tasks -> fast/cheap model + arch.summarize.v1: + target: gpt-5-mini-2025-08-07 + + # Alias for general purpose tasks -> latest model + arch.v1: + target: o3 + + # Alias for reasoning tasks -> capable model + arch.reasoning.v1: + target: gpt-4o + + # Alias for creative tasks -> Claude model + arch.creative.v1: + target: claude-3-5-sonnet-20241022 + + # Alias for quick responses -> fast model + arch.fast.v1: + target: claude-3-haiku-20240307 + + # Semantic aliases + summary-model: + target: gpt-5-mini-2025-08-07 + + chat-model: + target: claude-3-5-sonnet-20241022 + + creative-model: + target: claude-3-5-sonnet-20241022 diff --git a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml index c829b395..4cda86f9 100644 --- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml +++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml @@ -24,7 +24,7 @@ llm_providers: access_key: $OPENAI_API_KEY # Anthropic Models - - model: anthropic/claude-3-5-sonnet-20241022 + - model: anthropic/claude-sonnet-4-20250514 access_key: $ANTHROPIC_API_KEY - model: anthropic/claude-3-haiku-20240307 @@ -56,7 +56,7 @@ model_aliases: # Alias for creative tasks -> Claude model arch.creative.v1: - target: claude-3-5-sonnet-20241022 + target: claude-sonnet-4-20250514 # Alias for quick responses -> fast model arch.fast.v1: @@ -67,7 +67,7 @@ model_aliases: target: gpt-5-mini-2025-08-07 chat-model: - target: llama3.1 + target: creative-model: - target: claude-3-5-sonnet-20241022 + target: claude-sonnet-4-20250514 diff --git a/tests/e2e/test_model_alias_routing.py b/tests/e2e/test_model_alias_routing.py index 696507f2..74b4ff36 100644 --- a/tests/e2e/test_model_alias_routing.py +++ b/tests/e2e/test_model_alias_routing.py @@ -350,3 +350,57 @@ def test_direct_model_4o_mini_anthropic(): response_content = "".join(b.text for b in message.content if b.type == "text") logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}") assert response_content == "Hello from direct 4o-mini via Anthropic!" + + +def test_anthropic_thinking_mode_streaming(): + # Anthropic base_url should be the root, not /v1/chat/completions + base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") + + client = anthropic.Anthropic( + api_key=os.environ.get("ANTHROPIC_API_KEY", "test-key"), + base_url=base_url, + ) + + thinking_block_started = False + thinking_delta_seen = False + text_delta_seen = False + + with client.messages.stream( + model="claude-sonnet-4-20250514", + max_tokens=2048, + thinking={"type": "enabled", "budget_tokens": 1024}, # <- idiomatic + messages=[{"role": "user", "content": "Explain briefly what 2+2 equals"}], + ) as stream: + for event in stream: + # 1) detect when a thinking block starts + if event.type == "content_block_start" and getattr( + event, "content_block", None + ): + if getattr(event.content_block, "type", None) == "thinking": + thinking_block_started = True + + # 2) collect text vs thinking deltas + if event.type == "content_block_delta" and getattr(event, "delta", None): + if event.delta.type == "text_delta": + text_delta_seen = True + elif event.delta.type == "thinking_delta": + # some SDKs expose .thinking, others .text for this delta; not needed here + thinking_delta_seen = True + + final = stream.get_final_message() + + # Basic integrity + assert final is not None + assert final.content and len(final.content) > 0 + + # Normal text should have streamed + assert text_delta_seen, "Expected normal text deltas in stream" + + # With thinking enabled, we expect a thinking block and at least one thinking delta + assert thinking_block_started, "No thinking block started" + assert thinking_delta_seen, "No thinking deltas observed" + + # Optional: double-check on the assembled message + final_block_types = [blk.type for blk in final.content] + assert "text" in final_block_types + assert "thinking" in final_block_types