diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml
index 5ee4c899..16b14343 100644
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@@ -140,7 +140,7 @@ static_resources:
               route:
                 auto_host_rewrite: true
                 cluster: {{ llm_cluster_name }}
-                timeout: 60s
+                timeout: 300s
           {% endfor %}

           {% if agent_orchestrator %}
@@ -153,7 +153,7 @@ static_resources:
               route:
                 auto_host_rewrite: true
                 cluster: {{ agent_orchestrator }}
-                timeout: 60s
+                timeout: 300s
           {% endif %}
           http_filters:
           - name: envoy.filters.http.compressor
@@ -266,7 +266,7 @@ static_resources:
               route:
                 auto_host_rewrite: true
                 cluster: {{ internal_cluster }}
-                timeout: 60s
+                timeout: 300s
           {% endfor %}

           {% for cluster_name, cluster in arch_clusters.items() %}
@@ -279,7 +279,7 @@ static_resources:
               route:
                 auto_host_rewrite: true
                 cluster: {{ cluster_name }}
-                timeout: 60s
+                timeout: 300s
           {% endfor %}
           http_filters:
           - name: envoy.filters.http.router
@@ -434,7 +434,7 @@ static_resources:
               route:
                 auto_host_rewrite: true
                 cluster: {{ llm_cluster_name }}
-                timeout: 60s
+                timeout: 300s
           {% endfor %}
           - match:
               prefix: "/"
diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py
index 8f0dcefd..965bf040 100644
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@@ -242,7 +242,7 @@ def validate_and_render_schema():
     if llm_gateway_listener.get("address") == None:
         llm_gateway_listener["address"] = "127.0.0.1"
     if llm_gateway_listener.get("timeout") == None:
-        llm_gateway_listener["timeout"] = "10s"
+        llm_gateway_listener["timeout"] = "300s"

     use_agent_orchestrator = config_yaml.get("overrides", {}).get(
         "use_agent_orchestrator", False
diff --git a/arch/tools/cli/core.py b/arch/tools/cli/core.py
index 59d42ab4..0846a51a 100644
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@@ -1,3 +1,4 @@
+import json
 import subprocess
 import os
 import time
@@ -185,3 +186,93 @@ def stop_arch_modelserver():
     except subprocess.CalledProcessError as e:
         log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
         sys.exit(1)
+
+
+def start_cli_agent(arch_config_file=None, settings_json="{}"):
+    """Start a CLI client connected to Arch."""
+
+    with open(arch_config_file, "r") as file:
+        arch_config = file.read()
+        arch_config_yaml = yaml.safe_load(arch_config)
+
+    # Get egress listener configuration
+    egress_config = arch_config_yaml.get("listeners", {}).get("egress_traffic", {})
+    host = egress_config.get("host", "127.0.0.1")
+    port = egress_config.get("port", 12000)
+
+    # Parse additional settings from command line
+    try:
+        additional_settings = json.loads(settings_json) if settings_json else {}
+    except json.JSONDecodeError:
+        log.error("Settings must be valid JSON")
+        sys.exit(1)
+
+    # Set up environment variables
+    env = os.environ.copy()
+    env.update(
+        {
+            "ANTHROPIC_AUTH_TOKEN": "test",  # Use test token for arch
+            "ANTHROPIC_API_KEY": "",
+            "ANTHROPIC_BASE_URL": f"http://{host}:{port}",
+            "NO_PROXY": host,
+            "DISABLE_TELEMETRY": "true",
+            "DISABLE_COST_WARNINGS": "true",
+            "API_TIMEOUT_MS": "600000",
+        }
+    )
+
+    # Set ANTHROPIC_SMALL_FAST_MODEL from additional_settings or model alias
+    if "ANTHROPIC_SMALL_FAST_MODEL" in additional_settings:
+        env["ANTHROPIC_SMALL_FAST_MODEL"] = additional_settings[
+            "ANTHROPIC_SMALL_FAST_MODEL"
+        ]
+    else:
+        # Check if arch.claude.code.small.fast alias exists in model_aliases
+        model_aliases = arch_config_yaml.get("model_aliases", {})
+        if "arch.claude.code.small.fast" in model_aliases:
+            env["ANTHROPIC_SMALL_FAST_MODEL"] = "arch.claude.code.small.fast"
+        else:
+            log.info(
+                "Tip: Set an alias 'arch.claude.code.small.fast' in your model_aliases config to set a small, fast model for Claude Code"
+            )
+            log.info("Or provide ANTHROPIC_SMALL_FAST_MODEL in --settings JSON")
+
+    # Non-interactive mode configuration from additional_settings only
+    if additional_settings.get("NON_INTERACTIVE_MODE", False):
+        env.update(
+            {
+                "CI": "true",
+                "FORCE_COLOR": "0",
+                "NODE_NO_READLINE": "1",
+                "TERM": "dumb",
+            }
+        )
+
+    # Build claude command arguments
+    claude_args = []
+
+    # Add settings if provided, excluding those already handled as environment variables
+    if additional_settings:
+        # Filter out settings that are already processed as environment variables
+        claude_settings = {
+            k: v
+            for k, v in additional_settings.items()
+            if k not in ["ANTHROPIC_SMALL_FAST_MODEL", "NON_INTERACTIVE_MODE"]
+        }
+        if claude_settings:
+            claude_args.append(f"--settings={json.dumps(claude_settings)}")
+
+    # Use claude from PATH
+    claude_path = "claude"
+    log.info(f"Connecting Claude Code Agent to Arch at {host}:{port}")
+
+    try:
+        subprocess.run([claude_path] + claude_args, env=env, check=True)
+    except subprocess.CalledProcessError as e:
+        log.error(f"Error starting claude: {e}")
+        sys.exit(1)
+    except FileNotFoundError:
+        log.error(
+            f"{claude_path} not found. Make sure Claude Code is installed: npm install -g @anthropic-ai/claude-code"
+        )
+        sys.exit(1)
diff --git a/arch/tools/cli/main.py b/arch/tools/cli/main.py
index 001f3d9c..25c00404 100644
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@@ -4,13 +4,20 @@ import sys
 import subprocess
 import multiprocessing
 import importlib.metadata
+import json
 from cli import targets
-from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs
+from cli.docker_cli import (
+    docker_validate_archgw_schema,
+    stream_gateway_logs,
+    docker_container_status,
+)
 from cli.utils import (
     getLogger,
     get_llm_provider_access_keys,
+    has_ingress_listener,
     load_env_file_to_dict,
     stream_access_logs,
+    find_config_file,
 )
 from cli.core import (
     start_arch_modelserver,
@@ -18,9 +25,11 @@ from cli.core import (
     start_arch,
     stop_docker_container,
     download_models_from_hf,
+    start_cli_agent,
 )
 from cli.consts import (
     ARCHGW_DOCKER_IMAGE,
+    ARCHGW_DOCKER_NAME,
     KATANEMO_DOCKERHUB_REPO,
     SERVICE_NAME_ARCHGW,
     SERVICE_NAME_MODEL_SERVER,
@@ -170,12 +179,8 @@ def up(file, path, service, foreground):
         start_arch_modelserver(foreground)
         return

-    if file:
-        # If a file is provided, process that file
-        arch_config_file = os.path.abspath(file)
-    else:
-        # If no file is provided, use the path and look for arch_config.yaml
-        arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+    # Use the utility function to find config file
+    arch_config_file = find_config_file(path, file)

     # Check if the file exists
     if not os.path.exists(arch_config_file):
@@ -183,7 +188,6 @@ def up(file, path, service, foreground):
         return

     log.info(f"Validating {arch_config_file}")
-
     (
         validation_return_code,
         validation_stdout,
@@ -240,8 +244,15 @@ def up(file, path, service, foreground):
     if service == SERVICE_NAME_ARCHGW:
         start_arch(arch_config_file, env, foreground=foreground)
     else:
-        download_models_from_hf()
-        start_arch_modelserver(foreground)
+        # Check if ingress_traffic listener is configured before starting model_server
+        if has_ingress_listener(arch_config_file):
+            download_models_from_hf()
+            start_arch_modelserver(foreground)
+        else:
+            log.info(
+                "Skipping model_server startup: no ingress_traffic listener configured in arch_config.yaml"
+            )
+            start_arch(arch_config_file, env, foreground=foreground)


@@ -321,10 +332,51 @@ def logs(debug, follow):
         archgw_process.terminate()


+@click.command()
+@click.argument("type", type=click.Choice(["claude"]), required=True)
+@click.argument("file", required=False)  # Optional file argument
+@click.option(
+    "--path", default=".", help="Path to the directory containing arch_config.yaml"
+)
+@click.option(
+    "--settings",
+    default="{}",
+    help="Additional settings as JSON string for the CLI agent.",
+)
+def cli_agent(type, file, path, settings):
+    """Start a CLI agent connected to Arch.
+
+    TYPE: The type of CLI agent to start (currently only 'claude' is supported)
+    """
+
+    # Check if archgw docker container is running
+    archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
+    if archgw_status != "running":
+        log.error(f"archgw docker container is not running (status: {archgw_status})")
+        log.error("Please start archgw using the 'archgw up' command.")
+        sys.exit(1)
+
+    # Determine arch_config.yaml path
+    arch_config_file = find_config_file(path, file)
+    if not os.path.exists(arch_config_file):
+        log.error(f"Config file not found: {arch_config_file}")
+        sys.exit(1)
+
+    try:
+        start_cli_agent(arch_config_file, settings)
+    except SystemExit:
+        # Re-raise SystemExit to preserve exit codes
+        raise
+    except Exception as e:
+        click.echo(f"Error: {e}")
+        sys.exit(1)
+
+
 main.add_command(up)
 main.add_command(down)
 main.add_command(build)
 main.add_command(logs)
+main.add_command(cli_agent)
 main.add_command(generate_prompt_targets)

 if __name__ == "__main__":
diff --git a/arch/tools/cli/utils.py b/arch/tools/cli/utils.py
index 019e181c..c7d39d66 100644
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@@ -21,6 +21,22 @@ def getLogger(name="cli"):
 log = getLogger(__name__)


+def has_ingress_listener(arch_config_file):
+    """Check if the arch config file has an ingress_traffic listener configured."""
+    try:
+        with open(arch_config_file) as f:
+            arch_config_dict = yaml.safe_load(f)
+
+        ingress_traffic = arch_config_dict.get("listeners", {}).get(
+            "ingress_traffic", {}
+        )
+
+        return bool(ingress_traffic)
+    except Exception as e:
+        log.error(f"Error reading config file {arch_config_file}: {e}")
+        return False
+
+
 def get_llm_provider_access_keys(arch_config_file):
     with open(arch_config_file, "r") as file:
         arch_config = file.read()
@@ -72,6 +88,19 @@ def load_env_file_to_dict(file_path):
     return env_dict


+def find_config_file(path=".", file=None):
+    """Find the appropriate config file path."""
+    if file:
+        # If a file is provided, process that file
+        return os.path.abspath(file)
+    else:
+        # If no file is provided, look for config.yaml first, then fall back to arch_config.yaml
+        arch_config_file = os.path.abspath(os.path.join(path, "config.yaml"))
+        if not os.path.exists(arch_config_file):
+            arch_config_file = os.path.abspath(os.path.join(path, "arch_config.yaml"))
+        return arch_config_file
+
+
 def stream_access_logs(follow):
     """
     Get the archgw access logs
diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs
index 3b95b15f..2989fec5 100644
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@@ -126,8 +126,9 @@ pub async fn chat(
     });

     const MAX_MESSAGE_LENGTH: usize = 50;
-    let latest_message_for_log = if latest_message_for_log.len() > MAX_MESSAGE_LENGTH {
-        format!("{}...", &latest_message_for_log[..MAX_MESSAGE_LENGTH])
+    let latest_message_for_log = if latest_message_for_log.chars().count() > MAX_MESSAGE_LENGTH {
+        let truncated: String = latest_message_for_log.chars().take(MAX_MESSAGE_LENGTH).collect();
+        format!("{}...", truncated)
     } else {
         latest_message_for_log
     };
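Reviewer note: the brightstaff hunk above replaces byte-indexed slicing with char-based truncation, because `&s[..50]` panics when byte 50 falls inside a multi-byte UTF-8 character. A minimal standalone sketch of the same idea; the function name and `main` harness are illustrative, not from this repo:

```rust
fn truncate_for_log(message: &str, max_chars: usize) -> String {
    // chars().count() walks scalar values, so the cut can never land
    // inside a multi-byte UTF-8 sequence the way byte slicing can.
    if message.chars().count() > max_chars {
        let truncated: String = message.chars().take(max_chars).collect();
        format!("{}...", truncated)
    } else {
        message.to_string()
    }
}

fn main() {
    // "é" is two bytes in UTF-8; a byte-based cut like &"héllo"[..2] would panic.
    assert_eq!(truncate_for_log("héllo", 1), "h...");
    assert_eq!(truncate_for_log("hi", 50), "hi");
}
```

`chars().count()` is O(n), but for a 50-character log preview that cost is negligible.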
diff --git a/crates/hermesllm/src/apis/anthropic.rs b/crates/hermesllm/src/apis/anthropic.rs
index ae61e2fe..abfde5b7 100644
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
@@ -70,7 +70,9 @@ pub enum ServiceTier {
 #[skip_serializing_none]
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct ThinkingConfig {
-    pub enabled: bool,
+    #[serde(rename = "type")]
+    pub thinking_type: String,
+    pub budget_tokens: Option<u32>,
 }

 // MCP Server types
@@ -131,6 +133,31 @@ pub enum MessagesRole {
     Assistant,
 }

+/// Cache control types for content blocks
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
+pub enum MessagesCacheControl {
+    Ephemeral,
+}
+
+/// Tool result content can be either a string or array of content blocks
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(untagged)]
+pub enum ToolResultContent {
+    Text(String),
+    Blocks(Vec<MessagesContentBlock>),
+}
+
+impl ExtractText for ToolResultContent {
+    fn extract_text(&self) -> String {
+        match self {
+            ToolResultContent::Text(text) => text.clone(),
+            ToolResultContent::Blocks(blocks) => blocks.extract_text(),
+        }
+    }
+}
+
 #[skip_serializing_none]
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "snake_case")]
@@ -138,9 +165,12 @@ pub enum MessagesRole {
 pub enum MessagesContentBlock {
     Text {
         text: String,
+        cache_control: Option<MessagesCacheControl>,
     },
     Thinking {
-        text: String,
+        thinking: String,
+        signature: Option<String>,
+        cache_control: Option<MessagesCacheControl>,
     },
     Image {
         source: MessagesImageSource,
@@ -152,11 +182,13 @@ pub enum MessagesContentBlock {
         id: String,
         name: String,
         input: Value,
+        cache_control: Option<MessagesCacheControl>,
     },
     ToolResult {
         tool_use_id: String,
         is_error: Option<bool>,
-        content: Vec<MessagesContentBlock>,
+        content: ToolResultContent,
+        cache_control: Option<MessagesCacheControl>,
     },
     ServerToolUse {
         id: String,
@@ -195,7 +227,7 @@ impl ExtractText for Vec<MessagesContentBlock> {
     fn extract_text(&self) -> String {
         self.iter()
             .filter_map(|block| match block {
-                MessagesContentBlock::Text { text } => Some(text.as_str()),
+                MessagesContentBlock::Text { text, .. } => Some(text.as_str()),
                 _ => None,
             })
             .collect::<Vec<&str>>()
@@ -206,6 +238,7 @@ impl ExtractText for Vec<MessagesContentBlock> {

 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
 pub enum MessagesImageSource {
     Base64 {
         media_type: String,
@@ -218,6 +251,7 @@ pub enum MessagesImageSource {

 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
 pub enum MessagesDocumentSource {
     Base64 {
         media_type: String,
@@ -380,6 +414,8 @@ pub enum MessagesContentDelta {
     TextDelta { text: String },
     #[serde(rename = "input_json_delta")]
     InputJsonDelta { partial_json: String },
+    #[serde(rename = "thinking_delta")]
+    ThinkingDelta { thinking: String },
 }

 #[skip_serializing_none]
@@ -447,7 +483,7 @@ impl ProviderRequest for MessagesRequest {
                 MessagesSystemPrompt::Single(s) => text_parts.push(s.clone()),
                 MessagesSystemPrompt::Blocks(blocks) => {
                     for block in blocks {
-                        if let MessagesContentBlock::Text { text } = block {
+                        if let MessagesContentBlock::Text { text, .. } = block {
                             text_parts.push(text.clone());
                         }
                     }
@@ -461,7 +497,7 @@ impl ProviderRequest for MessagesRequest {
                 MessagesMessageContent::Single(text) => text_parts.push(text.clone()),
                 MessagesMessageContent::Blocks(blocks) => {
                     for block in blocks {
-                        if let MessagesContentBlock::Text { text } = block {
+                        if let MessagesContentBlock::Text { text, .. } = block {
                             text_parts.push(text.clone());
                         }
                     }
@@ -480,7 +516,7 @@ impl ProviderRequest for MessagesRequest {
                 MessagesMessageContent::Single(text) => return Some(text.clone()),
                 MessagesMessageContent::Blocks(blocks) => {
                     for block in blocks {
-                        if let MessagesContentBlock::Text { text } = block {
+                        if let MessagesContentBlock::Text { text, .. } = block {
                             return Some(text.clone());
                         }
                     }
@@ -537,10 +573,10 @@ impl ProviderStreamResponse for MessagesStreamEvent {
     fn content_delta(&self) -> Option<&str> {
         match self {
             MessagesStreamEvent::ContentBlockDelta { delta, .. } => {
-                if let MessagesContentDelta::TextDelta { text } = delta {
-                    Some(text)
-                } else {
-                    None
+                match delta {
+                    MessagesContentDelta::TextDelta { text } => Some(text),
+                    MessagesContentDelta::ThinkingDelta { thinking } => Some(thinking),
+                    _ => None,
                 }
             }
             _ => None,
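Reviewer note: the new types above lean on two serde patterns worth calling out. `#[serde(tag = "type")]` makes `MessagesCacheControl` round-trip as `{"type": "ephemeral"}`, and `#[serde(untagged)]` lets a tool result deserialize from either a bare string or an array of blocks. A self-contained sketch of both patterns, with the block type simplified to `String` (the real enum carries full content blocks):

```rust
use serde::{Deserialize, Serialize};

// Internally tagged: the variant name is carried in a "type" field.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "type")]
enum CacheControl {
    Ephemeral, // serializes as {"type": "ephemeral"}
}

// Untagged: serde tries each variant shape in declaration order.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
#[serde(untagged)]
enum ToolResultContent {
    Text(String),
    Blocks(Vec<String>),
}

fn main() {
    let cc: CacheControl = serde_json::from_str(r#"{"type": "ephemeral"}"#).unwrap();
    assert_eq!(cc, CacheControl::Ephemeral);

    // Both wire shapes deserialize into the same field type.
    let s: ToolResultContent = serde_json::from_str(r#""plain text result""#).unwrap();
    let b: ToolResultContent = serde_json::from_str(r#"["block one", "block two"]"#).unwrap();
    assert_eq!(s, ToolResultContent::Text("plain text result".into()));
    assert_eq!(b, ToolResultContent::Blocks(vec!["block one".into(), "block two".into()]));
}
```

Declaration order matters for untagged enums: overlapping shapes resolve to whichever variant matches first, so the string variant is tried before the block list.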
@@ -643,7 +679,7 @@ mod tests {
             "system": "You are a helpful assistant",
             "service_tier": "auto",
             "thinking": {
-                "enabled": true
+                "type": "enabled"
             },
             "metadata": {
                 "user_id": "123"
@@ -670,7 +706,7 @@ mod tests {
         }

         if let Some(thinking) = &deserialized_request.thinking {
-            assert_eq!(thinking.enabled, true);
+            assert_eq!(thinking.thinking_type, "enabled");
         } else {
             panic!("Expected thinking config");
         }
@@ -725,10 +761,9 @@ mod tests {
                     {
                         "type": "image",
                         "source": {
-                            "base64": {
-                                "media_type": "image/jpeg",
-                                "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
-                            }
+                            "type": "base64",
+                            "media_type": "image/jpeg",
+                            "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
                         }
                     }
                 ]
@@ -738,7 +773,7 @@ mod tests {
                 "content": [
                     {
                         "type": "thinking",
-                        "text": "Let me analyze the image and then check the weather..."
+                        "thinking": "Let me analyze the image and then check the weather..."
                     },
                     {
                         "type": "text",
@@ -797,7 +832,7 @@ mod tests {
         assert_eq!(content_blocks.len(), 2);

         // Validate text content block
-        if let MessagesContentBlock::Text { text } = &content_blocks[0] {
+        if let MessagesContentBlock::Text { text, .. } = &content_blocks[0] {
             assert_eq!(text, "What can you see in this image and what's the weather like?");
         } else {
             panic!("Expected text content block");
@@ -825,21 +860,21 @@ mod tests {
         assert_eq!(content_blocks.len(), 3);

         // Validate thinking content block
-        if let MessagesContentBlock::Thinking { text } = &content_blocks[0] {
-            assert_eq!(text, "Let me analyze the image and then check the weather...");
+        if let MessagesContentBlock::Thinking { thinking, .. } = &content_blocks[0] {
+            assert_eq!(thinking, "Let me analyze the image and then check the weather...");
         } else {
             panic!("Expected thinking content block");
         }

         // Validate text content block
-        if let MessagesContentBlock::Text { text } = &content_blocks[1] {
+        if let MessagesContentBlock::Text { text, .. } = &content_blocks[1] {
             assert_eq!(text, "I can see the image. Let me check the weather for you.");
         } else {
             panic!("Expected text content block");
         }

         // Validate tool use content block
-        if let MessagesContentBlock::ToolUse { ref id, ref name, ref input } = content_blocks[2] {
+        if let MessagesContentBlock::ToolUse { ref id, ref name, ref input, .. } = content_blocks[2] {
             assert_eq!(id, "toolu_weather123");
             assert_eq!(name, "get_weather");
             assert_eq!(input["location"], "San Francisco, CA");
@@ -871,7 +906,7 @@ mod tests {
         assert!(deserialized_request.system.is_some());
         if let Some(MessagesSystemPrompt::Blocks(ref system_blocks)) = deserialized_request.system {
             assert_eq!(system_blocks.len(), 1);
-            if let MessagesContentBlock::Text { text } = &system_blocks[0] {
+            if let MessagesContentBlock::Text { text, .. } = &system_blocks[0] {
                 assert_eq!(text, "You are a helpful assistant that can analyze images and provide weather information.");
             } else {
                 panic!("Expected text content block in system prompt");
@@ -967,7 +1002,7 @@ mod tests {
         // Check content
         assert_eq!(deserialized_response.content.len(), 1);
-        if let MessagesContentBlock::Text { text } = &deserialized_response.content[0] {
+        if let MessagesContentBlock::Text { text, .. } = &deserialized_response.content[0] {
             assert_eq!(text, "Hello! How can I help you today?");
         } else {
             panic!("Expected text content block");
@@ -1021,7 +1056,7 @@ mod tests {
         });

         let deserialized_tool_use: MessagesContentBlock = serde_json::from_value(tool_use_json.clone()).unwrap();
-        if let MessagesContentBlock::ToolUse { ref id, ref name, ref input } = deserialized_tool_use {
+        if let MessagesContentBlock::ToolUse { ref id, ref name, ref input, .. } = deserialized_tool_use {
             assert_eq!(id, "toolu_01ABC123");
             assert_eq!(name, "get_weather");
             assert_eq!(input["location"], "San Francisco, CA");
@@ -1045,14 +1080,18 @@ mod tests {
         });

         let deserialized_tool_result: MessagesContentBlock = serde_json::from_value(tool_result_json.clone()).unwrap();
-        if let MessagesContentBlock::ToolResult { ref tool_use_id, ref is_error, ref content } = deserialized_tool_result {
+        if let MessagesContentBlock::ToolResult { ref tool_use_id, ref is_error, ref content, .. } = deserialized_tool_result {
             assert_eq!(tool_use_id, "toolu_01ABC123");
             assert!(is_error.is_none());
-            assert_eq!(content.len(), 1);
-            if let MessagesContentBlock::Text { text } = &content[0] {
-                assert_eq!(text, "The weather in San Francisco is sunny, 72°F");
+            if let ToolResultContent::Blocks(blocks) = content {
+                assert_eq!(blocks.len(), 1);
+                if let MessagesContentBlock::Text { text, .. } = &blocks[0] {
+                    assert_eq!(text, "The weather in San Francisco is sunny, 72°F");
+                } else {
+                    panic!("Expected text content in tool result");
+                }
             } else {
-                panic!("Expected text content in tool result");
+                panic!("Expected blocks content in tool result");
             }
         } else {
             panic!("Expected tool result content block");
@@ -1062,6 +1101,208 @@ mod tests {
         assert_eq!(tool_result_json, serialized_tool_result_json);
     }

+    #[test]
+    fn test_anthropic_nested_types_with_cache_control() {
+        // Test complete MessagesRequest with cache_control fields and various content types
+        let complex_request_json = json!({
+            "model": "claude-sonnet-4-20250514",
+            "max_tokens": 4096,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "<system-reminder>\nThis is a reminder that your todo list is currently empty. DO NOT mention this to the user explicitly because they are already aware. If you are working on tasks that would benefit from a todo list please use the TodoWrite tool to create one. If not, please feel free to ignore. Again do not mention this message to the user.\n</system-reminder>"
+                        },
+                        {
+                            "type": "text",
+                            "text": "<system-reminder>\nAs you answer the user's questions, you can use the following context:\n# important-instruction-reminders\nDo what has been asked; nothing more, nothing less.\nNEVER create files unless they're absolutely necessary for achieving your goal.\nALWAYS prefer editing an existing file to creating a new one.\nNEVER proactively create documentation files (*.md) or README files. Only create documentation files if explicitly requested by the User.\n\n      IMPORTANT: this context may or may not be relevant to your tasks. You should not respond to this context unless it is highly relevant to your task.\n</system-reminder>"
+                        },
+                        {
+                            "type": "text",
+                            "text": "Do we need to add more tests to transformers.rs?"
+                        }
+                    ]
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_use",
+                            "id": "call_kV50LtJQKHvvzZui5TW56DUl",
+                            "name": "TodoWrite",
+                            "input": {
+                                "todos": [
+                                    {
+                                        "activeForm": "Locating and inspecting transformers.rs tests",
+                                        "content": "Locate transformers.rs and inspect existing tests",
+                                        "status": "pending"
+                                    },
+                                    {
+                                        "activeForm": "Running tests and checking failures",
+                                        "content": "Run the test suite and check for failures related to transformers.rs",
+                                        "status": "pending"
+                                    },
+                                    {
+                                        "activeForm": "Adding/updating tests for transformers.rs",
+                                        "content": "Add or update unit/integration tests for transformers.rs if coverage is insufficient",
+                                        "status": "pending"
+                                    }
+                                ]
+                            },
+                            "cache_control": {
+                                "type": "ephemeral"
+                            }
+                        }
+                    ]
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "tool_use_id": "call_kV50LtJQKHvvzZui5TW56DUl",
+                            "type": "tool_result",
+                            "content": "Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable\n\n<system-reminder>\nYour todo list has changed. DO NOT mention this explicitly to the user. Here are the latest contents of your todo list:\n\n[{\"content\":\"Locate transformers.rs and inspect existing tests\",\"status\":\"pending\",\"activeForm\":\"Locating and inspecting transformers.rs tests\"},{\"content\":\"Run the test suite and check for failures related to transformers.rs\",\"status\":\"pending\",\"activeForm\":\"Running tests and checking failures\"},{\"content\":\"Add or update unit/integration tests for transformers.rs if coverage is insufficient\",\"status\":\"pending\",\"activeForm\":\"Adding/updating tests for transformers.rs\"}]. Continue on with the tasks at hand if applicable.\n</system-reminder>"
+                        },
+                        {
+                            "type": "text",
+                            "text": "should I add more tests to transformers.rs?"
+                        },
+                        {
+                            "type": "text",
+                            "text": "try again",
+                            "cache_control": {
+                                "type": "ephemeral"
+                            }
+                        }
+                    ]
+                }
+            ],
+            "temperature": 1,
+            "system": [
+                {
+                    "type": "text",
+                    "text": "You are Claude Code, Anthropic's official CLI for Claude.",
+                    "cache_control": {
+                        "type": "ephemeral"
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "\nYou are an interactive CLI tool that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user.\n\nIMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. Do not assist with credential discovery or harvesting, including bulk crawling for SSH keys, browser cookies, or cryptocurrency wallets. Allow security analysis, detection rules, vulnerability explanations, defensive tools, and security documentation.\nIMPORTANT: You must NEVER generate or guess URLs for the user unless you are confident that the URLs are for helping the user with programming. You may use URLs provided by the user in their messages or local files.\n\nIf the user asks for help or wants to give feedback inform them of the following: \n- /help: Get help with using Claude Code\n- To give feedback, users should report the issue at https://github.com/anthropics/claude-code/issues\n\nWhen the user directly asks about Claude Code (eg. \"can Claude Code do...\", \"does Claude Code have...\"), or asks in second person (eg. \"are you able...\", \"can you do...\"), or asks how to use a specific Claude Code feature (eg. implement a hook, or write a slash command), use the WebFetch tool to gather information to answer the question from Claude Code docs. The list of available docs is available at https://docs.claude.com/en/docs/claude-code/claude_code_docs_map.md.\n\n# Tone and style\nYou should be concise, direct, and to the point.\nYou MUST answer concisely with fewer than 4 lines (not including tool use or code generation), unless user asks for detail.\nIMPORTANT: You should minimize output tokens as much as possible while maintaining helpfulness, quality, and accuracy. Only address the specific task at hand, avoiding tangential information unless absolutely critical for completing the request. If you can answer in 1-3 sentences or a short paragraph, please do.\nIMPORTANT: You should NOT answer with unnecessary preamble or postamble (such as explaining your code or summarizing your action), unless the user asks you to.\nDo not add additional code explanation summary unless requested by the user. After working on a file, just stop, rather than providing an explanation of what you did.\nAnswer the user's question directly, avoiding any elaboration, explanation, introduction, conclusion, or excessive details. One word answers are best. You MUST avoid text before/after your response, such as \"The answer is <answer>.\", \"Here is the content of the file...\" or \"Based on the information provided, the answer is...\" or \"Here is what I will do next...\".\n\nHere are some examples to demonstrate appropriate verbosity:\n<example>\nuser: 2 + 2\nassistant: 4\n</example>\n\n<example>\nuser: what is 2+2?\nassistant: 4\n</example>\n\n<example>\nuser: is 11 a prime number?\nassistant: Yes\n</example>\n\n<example>\nuser: what command should I run to list files in the current directory?\nassistant: ls\n</example>\n\n<example>\nuser: what command should I run to watch files in the current directory?\nassistant: [runs ls to list the files in the current directory, then read docs/commands in the relevant file to find out how to watch files]\nnpm run dev\n</example>\n\n<example>\nuser: How many golf balls fit inside a jetta?\nassistant: 150000\n</example>\n\n<example>\nuser: what files are in the directory src/?\nassistant: [runs ls and sees foo.c, bar.c, baz.c]\nuser: which file contains the implementation of foo?\nassistant: src/foo.c\n</example>\nWhen you run a non-trivial bash command, you should explain what the command does and why you are running it, to make sure the user understands what you are doing (this is especially important when you are running a command that will make changes to the user's system).\nRemember that your output will be displayed on a command line interface. Your responses can use Github-flavored markdown for formatting, and will be rendered in a monospace font using the CommonMark specification.\nOutput text to communicate with the user; all text you output outside of tool use is displayed to the user. Only use tools to complete tasks. Never use tools like Bash or code comments as means to communicate with the user during the session.\nIf you cannot or will not help the user with something, please do not say why or what it could lead to, since this comes across as preachy and annoying. Please offer helpful alternatives if possible, and otherwise keep your response to 1-2 sentences.\nOnly use emojis if the user explicitly requests it. Avoid using emojis in all communication unless asked.\nIMPORTANT: Keep your responses short, since they will be displayed on a command line interface.\n\n# Proactiveness\nYou are allowed to be proactive, but only when the user asks you to do something. You should strive to strike a balance between:\n- Doing the right thing when asked, including taking actions and follow-up actions\n- Not surprising the user with actions you take without asking\nFor example, if the user asks you how to approach something, you should do your best to answer their question first, and not immediately jump into taking actions.\n\n# Professional objectivity\nPrioritize technical accuracy and truthfulness over validating the user's beliefs. Focus on facts and problem-solving, providing direct, objective technical info without any unnecessary superlatives, praise, or emotional validation. It is best for the user if Claude honestly applies the same rigorous standards to all ideas and disagrees when necessary, even if it may not be what the user wants to hear. Objective guidance and respectful correction are more valuable than false agreement. Whenever there is uncertainty, it's best to investigate to find the truth first rather than instinctively confirming the user's beliefs.\n\n# Following conventions\nWhen making changes to files, first understand the file's code conventions. Mimic code style, use existing libraries and utilities, and follow existing patterns.\n- NEVER assume that a given library is available, even if it is well known. Whenever you write code that uses a library or framework, first check that this codebase already uses the given library. For example, you might look at neighboring files, or check the package.json (or cargo.toml, and so on depending on the language).\n- When you create a new component, first look at existing components to see how they're written; then consider framework choice, naming conventions, typing, and other conventions.\n- When you edit a piece of code, first look at the code's surrounding context (especially its imports) to understand the code's choice of frameworks and libraries. Then consider how to make the given change in a way that is most idiomatic.\n- Always follow security best practices. Never introduce code that exposes or logs secrets and keys. Never commit secrets or keys to the repository.\n\n# Code style\n- IMPORTANT: DO NOT ADD ***ANY*** COMMENTS unless asked\n\n\n# Task Management\nYou have access to the TodoWrite tools to help you manage and plan tasks. Use these tools VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.\nThese tools are also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.\n\nIt is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.\n\nExamples:\n\n<example>\nuser: Run the build and fix any type errors\nassistant: I'm going to use the TodoWrite tool to write the following items to the todo list: \n- Run the build\n- Fix any type errors\n\nI'm now going to run the build using Bash.\n\nLooks like I found 10 type errors. I'm going to use the TodoWrite tool to write 10 items to the todo list.\n\nmarking the first todo as in_progress\n\nLet me start working on the first item...\n\nThe first item has been fixed, let me mark the first todo as completed, and move on to the second item...\n..\n..\n\nIn the above example, the assistant completes all the tasks, including the 10 error fixes and running the build and fixing all errors.\n</example>\n\n<example>\nuser: Help me write a new feature that allows users to track their usage metrics and export them to various formats\n\nassistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the TodoWrite tool to plan this task.\nAdding the following todos to the todo list:\n1. Research existing metrics tracking in the codebase\n2. Design the metrics collection system\n3. Implement core metrics tracking functionality\n4. Create export functionality for different formats\n\nLet me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.\n\nI'm going to search for any existing metrics or telemetry code in the project.\n\nI've found some existing telemetry code. Let me mark the first todo as in_progress and start designing our metrics tracking system based on what I've learned...\n\n[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]\n</example>\n\n\nUsers may configure 'hooks', shell commands that execute in response to events like tool calls, in settings. Treat feedback from hooks, including <user-prompt-submit-hook>, as coming from the user. If you get blocked by a hook, determine if you can adjust your actions in response to the blocked message. If not, ask the user to check their hooks configuration.\n\n# Doing tasks\nThe user will primarily request you perform software engineering tasks. This includes solving bugs, adding new functionality, refactoring code, explaining code, and more. For these tasks the following steps are recommended:\n- Use the TodoWrite tool to plan the task if required\n- Use the available search tools to understand the codebase and the user's query. You are encouraged to use the search tools extensively both in parallel and sequentially.\n- Implement the solution using all tools available to you\n- Verify the solution if possible with tests. NEVER assume specific test framework or test script. Check the README or search codebase to determine the testing approach.\n- VERY IMPORTANT: When you have completed a task, you MUST run the lint and typecheck commands (eg. npm run lint, npm run typecheck, ruff, etc.) with Bash if they were provided to you to ensure your code is correct. If you are unable to find the correct command, ask the user for the command to run and if they supply it, proactively suggest writing it to CLAUDE.md so that you will know to run it next time.\nNEVER commit changes unless the user explicitly asks you to. It is VERY IMPORTANT to only commit when explicitly asked, otherwise the user will feel that you are being too proactive.\n\n- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are automatically added by the system, and bear no direct relation to the specific tool results or user messages in which they appear.\n\n\n# Tool usage policy\n- When doing file search, prefer to use the Task tool in order to reduce context usage.\n- You should proactively use the Task tool with specialized agents when the task at hand matches the agent's description.\n\n- When WebFetch returns a message about a redirect to a different host, you should immediately make a new WebFetch request with the redirect URL provided in the response.\n- You have the capability to call multiple tools in a single response. When multiple independent pieces of information are requested, batch your tool calls together for optimal performance. When making multiple bash tool calls, you MUST send a single message with multiple tools calls to run the calls in parallel. For example, if you need to run \"git status\" and \"git diff\", send a single message with two tool calls to run the calls in parallel.\n- If the user specifies that they want you to run tools \"in parallel\", you MUST send a single message with multiple tool use content blocks. For example, if you need to launch multiple agents in parallel, send a single message with multiple Task tool calls.\n\n\n\n\nHere is useful information about the environment you are running in:\n<env>\nWorking directory: /Users/salmanparacha/arch/crates/llm_gateway\nIs directory a git repo: Yes\nPlatform: darwin\nOS Version: Darwin 25.0.0\nToday's date: 2025-09-25\n</env>\nYou are powered by the model named Sonnet 4. The exact model ID is claude-sonnet-4-20250514.\n\nAssistant knowledge cutoff is January 2025.\n\n\nIMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. Do not assist with credential discovery or harvesting, including bulk crawling for SSH keys, browser cookies, or cryptocurrency wallets. Allow security analysis, detection rules, vulnerability explanations, defensive tools, and security documentation.\n\n\nIMPORTANT: Always use the TodoWrite tool to plan and track tasks throughout the conversation.\n\n# Code References\n\nWhen referencing specific functions or pieces of code include the pattern `file_path:line_number` to allow the user to easily navigate to the source code location.\n\n<example>\nuser: Where are errors from the client handled?\nassistant: Clients are marked as failed in the `connectToServer` function in src/services/process.ts:712.\n</example>\n",
+                    "cache_control": {
+                        "type": "ephemeral"
+                    }
+                }
+            ],
+            "tools": [
+                {
+                    "name": "Task",
+                    "description": "Launch a new agent to handle complex, multi-step tasks autonomously. \n\nAvailable agent types and the tools they have access to:\n- general-purpose: General-purpose agent for researching complex questions, searching for code, and executing multi-step tasks. When you are searching for a keyword or file and are not confident that you will find the right match in the first few tries use this agent to perform the search for you. (Tools: *)\n- statusline-setup: Use this agent to configure the user's Claude Code status line setting. (Tools: Read, Edit)\n- output-style-setup: Use this agent to create a Claude Code output style. (Tools: Read, Write, Edit, Glob, Grep)\n\nWhen using the Task tool, you must specify a subagent_type parameter to select which agent type to use.\n\nWhen NOT to use the Agent tool:\n- If you want to read a specific file path, use the Read or Glob tool instead of the Agent tool, to find the match more quickly\n- If you are searching for a specific class definition like \"class Foo\", use the Glob tool instead, to find the match more quickly\n- If you are searching for code within a specific file or set of 2-3 files, use the Read tool instead of the Agent tool, to find the match more quickly\n- Other tasks that are not related to the agent descriptions above\n\n\nUsage notes:\n1. Launch multiple agents concurrently whenever possible, to maximize performance; to do that, use a single message with multiple tool uses\n2. When the agent is done, it will return a single message back to you. The result returned by the agent is not visible to the user. To show the user the result, you should send a text message back to the user with a concise summary of the result.\n3. Each agent invocation is stateless. You will not be able to send additional messages to the agent, nor will the agent be able to communicate with you outside of its final report. Therefore, your prompt should contain a highly detailed task description for the agent to perform autonomously and you should specify exactly what information the agent should return back to you in its final and only message to you.\n4. The agent's outputs should generally be trusted\n5. Clearly tell the agent whether you expect it to write code or just to do research (search, file reads, web fetches, etc.), since it is not aware of the user's intent\n6. If the agent description mentions that it should be used proactively, then you should try your best to use it without the user having to ask for it first. Use your judgement.\n7. If the user specifies that they want you to run agents \"in parallel\", you MUST send a single message with multiple Task tool use content blocks. For example, if you need to launch both a code-reviewer agent and a test-runner agent in parallel, send a single message with both tool calls.\n\nExample usage:\n\n<example_agent_descriptions>\n\"code-reviewer\": use this agent after you are done writing a signficant piece of code\n\"greeting-responder\": use this agent when to respond to user greetings with a friendly joke\n</example_agent_descriptions>\n\n<example>\nuser: \"Please write a function that checks if a number is prime\"\nassistant: Sure let me write a function that checks if a number is prime\nassistant: First let me use the Write tool to write a function that checks if a number is prime\nassistant: I'm going to use the Write tool to write the following code:\n<code>\nfunction isPrime(n) {\n  if (n <= 1) return false\n  for (let i = 2; i * i <= n; i++) {\n    if (n % i === 0) return false\n  }\n  return true\n}\n</code>\n<commentary>\nSince a signficant piece of code was written and the task was completed, now use the code-reviewer agent to review the code\n</commentary>\nassistant: Now let me use the code-reviewer agent to review the code\nassistant: Uses the Task tool to launch the with the code-reviewer agent \n</example>\n\n<example>\nuser: \"Hello\"\n<commentary>\nSince the user is greeting, use the greeting-responder agent to respond with a friendly joke\n</commentary>\nassistant: \"I'm going to use the Task tool to launch the with the greeting-responder agent\"\n</example>\n",
+                    "input_schema": {
+                        "type": "object",
+                        "properties": {
+                            "description": {
+                                "type": "string",
+                                "description": "A short (3-5 word) description of the task"
+                            },
+                            "prompt": {
+                                "type": "string",
+                                "description": "The task for the agent to perform"
+                            },
+                            "subagent_type": {
+                                "type": "string",
+                                "description": "The type of specialized agent to use for this task"
+                            }
+                        },
+                        "required": [
+                            "description",
+                            "prompt",
+                            "subagent_type"
+                        ],
+                        "additionalProperties": false,
+                        "$schema": "http://json-schema.org/draft-07/schema#"
+                    }
+                }
+            ]
+        });
+
+        // Deserialize the complex MessagesRequest
+        let deserialized_request: MessagesRequest = serde_json::from_value(complex_request_json.clone()).unwrap();
+
+        // Verify basic fields
+        assert_eq!(deserialized_request.model, "claude-sonnet-4-20250514");
+        assert_eq!(deserialized_request.temperature, Some(1.0));
+        assert_eq!(deserialized_request.messages.len(), 3);
+
+        // Verify system message with cache_control
+        if let Some(MessagesSystemPrompt::Blocks(ref system_blocks)) = deserialized_request.system {
+            assert_eq!(system_blocks.len(), 2);
+            if let MessagesContentBlock::Text { text, cache_control } = &system_blocks[0] {
+                assert_eq!(text, "You are Claude Code, Anthropic's official CLI for Claude.");
+                assert_eq!(cache_control, &Some(MessagesCacheControl::Ephemeral));
+            } else {
+                panic!("Expected text system message with cache_control");
+            }
+        } else {
+            panic!("Expected system blocks");
+        }
+
+        // Verify tool_use message with cache_control
+        let assistant_message = &deserialized_request.messages[1];
+        assert_eq!(assistant_message.role, MessagesRole::Assistant);
+        if let MessagesMessageContent::Blocks(ref content_blocks) = assistant_message.content {
+            if let MessagesContentBlock::ToolUse { id, name, input, cache_control } = &content_blocks[0] {
+                assert_eq!(id, "call_kV50LtJQKHvvzZui5TW56DUl");
+                assert_eq!(name, "TodoWrite");
+                assert_eq!(cache_control, &Some(MessagesCacheControl::Ephemeral));
+                // Verify the complex input structure
+                assert!(input.get("todos").is_some());
+                let todos = input.get("todos").unwrap().as_array().unwrap();
+                assert_eq!(todos.len(), 3);
+            } else {
+                panic!("Expected tool_use message with cache_control");
+            }
+        } else {
+            panic!("Expected content blocks in assistant message");
+        }
+
+        // Verify tool_result with string content
+        let user_message = &deserialized_request.messages[2];
+        assert_eq!(user_message.role, MessagesRole::User);
+        if let MessagesMessageContent::Blocks(ref content_blocks) = user_message.content {
+            if let MessagesContentBlock::ToolResult { tool_use_id, content, .. } = &content_blocks[0] {
+                assert_eq!(tool_use_id, "call_kV50LtJQKHvvzZui5TW56DUl");
+                if let ToolResultContent::Text(text) = content {
+                    assert!(text.contains("Todos have been modified successfully"));
+                } else {
+                    panic!("Expected string content in tool result");
+                }
+            } else {
+                panic!("Expected tool_result message");
+            }
+
+            // Verify text content with cache_control
+            if let MessagesContentBlock::Text { text, cache_control } = &content_blocks[2] {
+                assert_eq!(text, "try again");
+                assert_eq!(cache_control, &Some(MessagesCacheControl::Ephemeral));
+            } else {
+                panic!("Expected text message with cache_control");
+            }
+        } else {
+            panic!("Expected content blocks in user message");
+        }
+
+        // Test serialization round-trip
+        let serialized_request = serde_json::to_value(&deserialized_request).unwrap();
+        let re_deserialized_request: MessagesRequest = serde_json::from_value(serialized_request).unwrap();
+
+        // Verify round-trip consistency
+        assert_eq!(deserialized_request.model, re_deserialized_request.model);
+        assert_eq!(deserialized_request.messages.len(), re_deserialized_request.messages.len());
+    }
+
     #[test]
     fn test_anthropic_api_provider_trait_implementation() {
         // Test that AnthropicApi implements ApiDefinition trait correctly
@@ -1085,4 +1326,68 @@ mod tests {
         assert_eq!(all_variants.len(), 1);
         assert_eq!(all_variants[0], AnthropicApi::Messages);
     }
+
+    #[test]
+    fn test_anthropic_thinking_streaming() {
+        // Test thinking delta stream event
+        let thinking_delta_json = json!({
+            "type": "content_block_delta",
+            "index": 0,
+            "delta": {
+                "type": "thinking_delta",
+                "thinking": ".\n\nI need to consider:\n1. Current"
+            }
+        });
+
+        let deserialized_event: MessagesStreamEvent = serde_json::from_value(thinking_delta_json.clone()).unwrap();
+        if let MessagesStreamEvent::ContentBlockDelta { index, ref delta } = deserialized_event {
+            assert_eq!(index, 0);
+            if let MessagesContentDelta::ThinkingDelta { thinking } = delta {
+                assert_eq!(thinking, ".\n\nI need to consider:\n1. Current");
+            } else {
+                panic!("Expected thinking delta");
+            }
+        } else {
+            panic!("Expected content block delta event");
+        }
+
+        // Test that thinking delta is returned by content_delta()
+        assert_eq!(deserialized_event.content_delta(), Some(".\n\nI need to consider:\n1. Current"));
+
+        let serialized_event_json = serde_json::to_value(&deserialized_event).unwrap();
+        assert_eq!(thinking_delta_json, serialized_event_json);
+    }
+
+    #[test]
+    fn test_anthropic_thinking_request_config() {
+        // Test thinking config with budget_tokens
+        let request_json = json!({
+            "model": "claude-sonnet-4-20250514",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "Test message"
+                }
+            ],
+            "max_tokens": 2048,
+            "thinking": {
+                "type": "enabled",
+                "budget_tokens": 1024
+            }
+        });
+
+        let deserialized_request: MessagesRequest = serde_json::from_value(request_json.clone()).unwrap();
+        assert_eq!(deserialized_request.model, "claude-sonnet-4-20250514");
+        assert_eq!(deserialized_request.max_tokens, 2048);
+
+        if let Some(thinking) = &deserialized_request.thinking {
+            assert_eq!(thinking.thinking_type, "enabled");
+            assert_eq!(thinking.budget_tokens, Some(1024));
+        } else {
+            panic!("Expected thinking config");
+        }
+
+        let serialized_json = serde_json::to_value(&deserialized_request).unwrap();
+        assert_eq!(request_json, serialized_json);
+    }
 }
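Reviewer note: the thinking tests above pin the wire shape `{"type": "enabled", "budget_tokens": 1024}`. A hedged sketch of the struct those tests exercise, reduced to its two fields; the `u32` width for `budget_tokens` is an assumption here, and per-field `skip_serializing_if` stands in for the crate's `#[skip_serializing_none]`:

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug)]
struct ThinkingConfig {
    // "type" is a reserved word in Rust, hence the field rename.
    #[serde(rename = "type")]
    thinking_type: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    budget_tokens: Option<u32>,
}

fn main() {
    let cfg: ThinkingConfig =
        serde_json::from_str(r#"{"type": "enabled", "budget_tokens": 1024}"#).unwrap();
    assert_eq!(cfg.thinking_type, "enabled");
    assert_eq!(cfg.budget_tokens, Some(1024));

    // Omitting budget_tokens still parses, and skipping None on output keeps
    // the round-trip equality assertions in the tests above intact.
    let cfg2: ThinkingConfig = serde_json::from_str(r#"{"type": "enabled"}"#).unwrap();
    assert!(cfg2.budget_tokens.is_none());
}
```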
Current")); + + let serialized_event_json = serde_json::to_value(&deserialized_event).unwrap(); + assert_eq!(thinking_delta_json, serialized_event_json); + } + + #[test] + fn test_anthropic_thinking_request_config() { + // Test thinking config with budget_tokens + let request_json = json!({ + "model": "claude-sonnet-4-20250514", + "messages": [ + { + "role": "user", + "content": "Test message" + } + ], + "max_tokens": 2048, + "thinking": { + "type": "enabled", + "budget_tokens": 1024 + } + }); + + let deserialized_request: MessagesRequest = serde_json::from_value(request_json.clone()).unwrap(); + assert_eq!(deserialized_request.model, "claude-sonnet-4-20250514"); + assert_eq!(deserialized_request.max_tokens, 2048); + + if let Some(thinking) = &deserialized_request.thinking { + assert_eq!(thinking.thinking_type, "enabled"); + assert_eq!(thinking.budget_tokens, Some(1024)); + } else { + panic!("Expected thinking config"); + } + + let serialized_json = serde_json::to_value(&deserialized_request).unwrap(); + assert_eq!(request_json, serialized_json); + } } diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs index ead6e7d2..d7d6ea70 100644 --- a/crates/hermesllm/src/apis/openai.rs +++ b/crates/hermesllm/src/apis/openai.rs @@ -88,6 +88,7 @@ pub struct ChatCompletionsRequest { pub prediction: Option, // pub reasoning_effect: Option, // GOOD FIRST ISSUE: Future support for reasoning effects pub response_format: Option, + pub reasoning_effort: Option, // e.g., "none", "low", "medium", "high" // pub safety_identifier: Option, // GOOD FIRST ISSUE: Future support for safety identifiers pub seed: Option, pub service_tier: Option, @@ -116,6 +117,13 @@ impl ChatCompletionsRequest { self.max_tokens = None; } } + + pub fn fix_temperature_if_gpt5(&mut self) { + let model = self.model.as_str(); + if model.starts_with("gpt-5") { + self.temperature = Some(1.0); + } + } } // ============================================================================ @@ -598,6 +606,7 @@ impl TryFrom<&[u8]> for ChatCompletionsRequest { let mut req: ChatCompletionsRequest = serde_json::from_slice(bytes).map_err(OpenAIStreamError::from)?; // Use the centralized suppression logic req.suppress_max_tokens_if_o3(); + req.fix_temperature_if_gpt5(); Ok(req) } } diff --git a/crates/hermesllm/src/clients/transformer.rs b/crates/hermesllm/src/clients/transformer.rs index 0ec06847..0856c359 100644 --- a/crates/hermesllm/src/clients/transformer.rs +++ b/crates/hermesllm/src/clients/transformer.rs @@ -111,6 +111,7 @@ impl TryFrom for ChatCompletionsRequest { ..Default::default() }; _chat_completions_req.suppress_max_tokens_if_o3(); + _chat_completions_req.fix_temperature_if_gpt5(); Ok(_chat_completions_req) } } @@ -352,6 +353,7 @@ impl TryFrom for MessagesStreamEvent { let choice = &resp.choices[0]; // Handle final chunk with usage + let has_usage = resp.usage.is_some(); if let Some(usage) = resp.usage { if let Some(finish_reason) = &choice.finish_reason { let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into(); @@ -403,11 +405,27 @@ impl TryFrom for MessagesStreamEvent { return convert_tool_call_deltas(tool_calls.clone()); } - // Handle finish reason + // Handle finish reason - generate MessageDelta only (MessageStop comes later) if let Some(finish_reason) = &choice.finish_reason { - if *finish_reason == FinishReason::Stop { - return Ok(MessagesStreamEvent::MessageStop); + // If we have usage data, it was already handled above + // If not, we need to generate MessageDelta with default 
diff --git a/crates/hermesllm/src/clients/transformer.rs b/crates/hermesllm/src/clients/transformer.rs
index 0ec06847..0856c359 100644
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@@ -111,6 +111,7 @@ impl TryFrom<MessagesRequest> for ChatCompletionsRequest {
             ..Default::default()
         };
         _chat_completions_req.suppress_max_tokens_if_o3();
+        _chat_completions_req.fix_temperature_if_gpt5();
         Ok(_chat_completions_req)
     }
 }
@@ -352,6 +353,7 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
         let choice = &resp.choices[0];

         // Handle final chunk with usage
+        let has_usage = resp.usage.is_some();
         if let Some(usage) = resp.usage {
             if let Some(finish_reason) = &choice.finish_reason {
                 let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
@@ -403,11 +405,27 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
             return convert_tool_call_deltas(tool_calls.clone());
         }

-        // Handle finish reason
+        // Handle finish reason - generate MessageDelta only (MessageStop comes later)
         if let Some(finish_reason) = &choice.finish_reason {
-            if *finish_reason == FinishReason::Stop {
-                return Ok(MessagesStreamEvent::MessageStop);
+            // If we have usage data, it was already handled above
+            // If not, we need to generate MessageDelta with default usage
+            if !has_usage {
+                let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
+                return Ok(MessagesStreamEvent::MessageDelta {
+                    delta: MessagesMessageDelta {
+                        stop_reason: anthropic_stop_reason,
+                        stop_sequence: None,
+                    },
+                    usage: MessagesUsage {
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        cache_creation_input_tokens: None,
+                        cache_read_input_tokens: None,
+                    },
+                });
             }
+            // If usage was already handled above, we don't need to do anything more here
+            // MessageStop will be handled when [DONE] is encountered
         }

         // Default to ping for unhandled cases
@@ -468,18 +486,6 @@ impl TryFrom<MessagesRequest> for Vec<Message> {
                 }
                 MessagesMessageContent::Blocks(blocks) => {
                     let (content_parts, tool_calls, tool_results) = blocks.split_for_openai()?;
-
-                    // Create main message
-                    let content = build_openai_content(content_parts, &tool_calls);
-                    let main_message = Message {
-                        role: message.role.into(),
-                        content,
-                        name: None,
-                        tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
-                        tool_call_id: None,
-                    };
-                    result.push(main_message);
-
                     // Add tool result messages
                     for (tool_use_id, result_text, _is_error) in tool_results {
                         result.push(Message {
@@ -490,6 +496,20 @@ impl TryFrom<MessagesRequest> for Vec<Message> {
                             tool_call_id: Some(tool_use_id),
                         });
                     }
+
+                    // Only create main message if there's actual content or tool calls
+                    // Skip creating empty content messages (e.g., when message only contains tool_result blocks)
+                    if !content_parts.is_empty() || !tool_calls.is_empty() {
+                        let content = build_openai_content(content_parts, &tool_calls);
+                        let main_message = Message {
+                            role: message.role.into(),
+                            content,
+                            name: None,
+                            tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
+                            tool_call_id: None,
+                        };
+                        result.push(main_message);
+                    }
                 }
             }

@@ -515,9 +535,11 @@ impl TryFrom<Message> for MessagesMessage {
                     MessagesContentBlock::ToolResult {
                         tool_use_id: tool_call_id,
                         is_error: None,
-                        content: vec![MessagesContentBlock::Text {
+                        content: ToolResultContent::Blocks(vec![MessagesContentBlock::Text {
                             text: message.content.extract_text(),
-                        }],
+                            cache_control: None,
+                        }]),
+                        cache_control: None,
                     },
                 ]),
             });
@@ -551,7 +573,7 @@ impl ContentUtils for Vec<MessagesContentBlock> {

         for block in self {
             match block {
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                 MessagesContentBlock::ServerToolUse { id, name, input } |
                 MessagesContentBlock::McpToolUse { id, name, input } => {
                     let arguments = serde_json::to_string(&input)?;
@@ -575,7 +597,7 @@ impl ContentUtils for Vec<MessagesContentBlock> {

         for block in self {
             match block {
-                MessagesContentBlock::Text { text } => {
+                MessagesContentBlock::Text { text, .. } => {
                     content_parts.push(ContentPart::Text { text: text.clone() });
                 }
                 MessagesContentBlock::Image { source } => {
@@ -587,7 +609,7 @@ impl ContentUtils for Vec<MessagesContentBlock> {
                         },
                     });
                 }
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                 MessagesContentBlock::ServerToolUse { id, name, input } |
                 MessagesContentBlock::McpToolUse { id, name, input } => {
                     let arguments = serde_json::to_string(&input)?;
@@ -597,7 +619,10 @@ impl ContentUtils for Vec<MessagesContentBlock> {
                         function: FunctionCall { name: name.clone(), arguments },
                     });
                 }
-                MessagesContentBlock::ToolResult { tool_use_id, content, is_error } |
+                MessagesContentBlock::ToolResult { tool_use_id, content, is_error, .. } => {
+                    let result_text = content.extract_text();
+                    tool_results.push((tool_use_id.clone(), result_text, is_error.unwrap_or(false)));
+                }
                 MessagesContentBlock::WebSearchToolResult { tool_use_id, content, is_error } |
                 MessagesContentBlock::CodeExecutionToolResult { tool_use_id, content, is_error } |
                 MessagesContentBlock::McpToolResult { tool_use_id, content, is_error } => {
@@ -819,7 +844,7 @@ fn build_openai_content(content_parts: Vec<ContentPart>, tool_calls: &[ToolCall]
 fn build_anthropic_content(content_blocks: Vec<MessagesContentBlock>) -> MessagesMessageContent {
     if content_blocks.len() == 1 {
         match &content_blocks[0] {
-            MessagesContentBlock::Text { text } => MessagesMessageContent::Single(text.clone()),
+            MessagesContentBlock::Text { text, .. } => MessagesMessageContent::Single(text.clone()),
             _ => MessagesMessageContent::Blocks(content_blocks),
         }
     } else if content_blocks.is_empty() {
@@ -835,12 +860,11 @@ fn convert_anthropic_content_to_openai(content: &[MessagesContentBlock]) -> Resu

     for block in content {
         match block {
-            MessagesContentBlock::Text { text } => {
+            MessagesContentBlock::Text { text, .. } => {
                 text_parts.push(text.clone());
             }
-            MessagesContentBlock::Thinking { text } => {
-                // Include thinking as regular text for OpenAI
-                text_parts.push(format!("[Thinking: {}]", text));
+            MessagesContentBlock::Thinking { thinking, .. } => {
+                text_parts.push(format!("thinking: {}", thinking));
             }
             _ => {
                 // Skip other content types for basic text conversion
@@ -860,14 +884,14 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result
         MessageContent::Text(text) => {
             if !text.is_empty() {
-                blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
             }
         }
         MessageContent::Parts(parts) => {
             for part in parts {
                 match part {
                     ContentPart::Text { text } => {
-                        blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                        blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
                     }
                     ContentPart::ImageUrl { image_url } => {
                         let source = convert_image_url_to_source(image_url);
@@ -886,6 +910,7 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result
+        MessagesContentDelta::ThinkingDelta { thinking } => {
+            Ok(create_openai_chunk(
+                "stream",
+                "unknown",
+                MessageDelta {
+                    role: None,
+                    content: Some(format!("thinking: {}", thinking)),
+                    refusal: None,
+                    function_call: None,
+                    tool_calls: None,
+                },
+                None,
+                None,
+            ))
+        }
         MessagesContentDelta::InputJsonDelta { partial_json } => {
             Ok(create_openai_chunk(
                 "stream",
@@ -1023,6 +1063,7 @@ fn convert_tool_call_deltas(tool_calls: Vec<ToolCallDelta>) -> Result
 impl TryFrom<(&[u8], &SupportedAPIs, &SupportedAPIs)> for ProviderStreamResponseType {
             Ok(ProviderStreamResponseType::ChatCompletionsStreamResponse(chat_resp))
         }
         (SupportedAPIs::OpenAIChatCompletions(_), SupportedAPIs::AnthropicMessagesAPI(_)) => {
+            // Special case: Handle [DONE] marker for OpenAI -> Anthropic conversion
+            if bytes == b"[DONE]" {
+                return Ok(ProviderStreamResponseType::MessagesStreamEvent(
+                    crate::apis::anthropic::MessagesStreamEvent::MessageStop
+                ));
+            }
+
             let openai_resp: crate::apis::openai::ChatCompletionsStreamResponse =
                 serde_json::from_slice(bytes)?;

             // Transform to Anthropic Messages stream format using the transformer
@@ -287,8 +294,8 @@ impl TryFrom<(SseEvent, &SupportedAPIs, &SupportedAPIs)> for SseEvent {
         // Create a new transformed event based on the original
         let mut transformed_event = sse_event;

-        // If not [DONE] and has data, parse the data as a provider stream response (business logic layer)
-        if !transformed_event.is_done() && transformed_event.data.is_some() {
+        // If has data, parse the data as a provider stream response (business logic layer)
+        if transformed_event.data.is_some() {
             let data_str = transformed_event.data.as_ref().unwrap();
             let data_bytes = data_str.as_bytes();
             let transformed_response = ProviderStreamResponseType::try_from((data_bytes, client_api, upstream_api))?;
@@ -380,6 +387,7 @@ where
     I::Item: AsRef<str>,
 {
     pub lines: I,
+    pub done_seen: bool,
 }

 impl<I> SseStreamIter<I>
@@ -388,7 +396,7 @@ where
     I::Item: AsRef<str>,
 {
     pub fn new(lines: I) -> Self {
-        Self { lines }
+        Self { lines, done_seen: false }
     }
 }
@@ -411,14 +419,20 @@ where
     type Item = SseEvent;

     fn next(&mut self) -> Option<Self::Item> {
+        // If we already returned [DONE], terminate the stream
+        if self.done_seen {
+            return None;
+        }
+
         for line in &mut self.lines {
             let line_str = line.as_ref();

             // Try to parse as either data: or event: line
             if let Ok(event) = line_str.parse::<SseEvent>() {
-                // For data: lines, check if this is the [DONE] marker - if so, end the stream
+                // For data: lines, check if this is the [DONE] marker
                 if event.data.is_some() && event.is_done() {
-                    return None;
+                    self.done_seen = true;
+                    return Some(event); // Return [DONE] event for transformation
                 }
                 // For data: lines, skip events that should be filtered at the transport layer
                 if event.data.is_some() && event.should_skip() {
@@ -706,7 +720,11 @@ mod tests {
         assert!(event2.data.as_ref().unwrap().contains("msg2"));
         assert!(!event2.should_skip());

-        // Iterator should end at [DONE] (no more events)
+        // Third event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
         assert!(iter.next().is_none());
     }

@@ -745,7 +763,11 @@ mod tests {
         assert!(!event4.is_event_only());
         assert!(event4.data.as_ref().unwrap().contains("Hello"));

-        // Iterator should end at [DONE]
+        // Fifth event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
         assert!(iter.next().is_none());
     }

@@ -776,4 +798,25 @@ mod tests {
         let provider_type = ProviderStreamResponseType::ChatCompletionsStreamResponse(openai_event);
         assert_eq!(provider_type.event_type(), None);
     }
+
+    #[test]
+    fn test_done_marker_handled_in_stream_response_transformation() {
+        use crate::apis::anthropic::AnthropicApi;
+
+        // Test that [DONE] marker is properly converted to MessageStop in the transformation layer
+        let done_bytes = b"[DONE]";
+        let client_api = SupportedAPIs::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedAPIs::OpenAIChatCompletions(crate::apis::openai::OpenAIApi::ChatCompletions);
+
+        let result = ProviderStreamResponseType::try_from((done_bytes.as_slice(), &client_api, &upstream_api));
+        assert!(result.is_ok());
+
+        if let Ok(ProviderStreamResponseType::MessagesStreamEvent(event)) = result {
+            // Verify it's a MessageStop event
+            assert_eq!(event.event_type(), Some("message_stop"));
+            assert!(matches!(event, crate::apis::anthropic::MessagesStreamEvent::MessageStop));
+        } else {
+            panic!("Expected MessagesStreamEvent::MessageStop");
+        }
+    }
 }
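Reviewer note: the iterator changes invert the old contract. Previously `[DONE]` silently terminated the stream; now it is yielded exactly once, so the transformation layer can turn it into Anthropic's `message_stop`, and the `done_seen` flag fuses the iterator afterwards. A minimal sketch of that fuse behavior over plain strings, with the `SseEvent` parsing left out:

```rust
struct DoneAwareLines<I: Iterator<Item = String>> {
    lines: I,
    done_seen: bool,
}

impl<I: Iterator<Item = String>> Iterator for DoneAwareLines<I> {
    type Item = String;

    fn next(&mut self) -> Option<String> {
        // Once [DONE] has been yielded, the stream is over for good.
        if self.done_seen {
            return None;
        }
        for line in &mut self.lines {
            if line == "data: [DONE]" {
                self.done_seen = true;
                return Some(line); // surfaced so a transformer can map it to message_stop
            }
            if !line.is_empty() {
                return Some(line); // ordinary data line
            }
            // blank keep-alive lines are skipped
        }
        None
    }
}

fn main() {
    let input = vec![
        r#"data: {"id":1}"#.to_string(),
        "data: [DONE]".to_string(),
        "data: straggler".to_string(),
    ];
    let mut it = DoneAwareLines { lines: input.into_iter(), done_seen: false };
    assert_eq!(it.next().as_deref(), Some(r#"data: {"id":1}"#));
    assert_eq!(it.next().as_deref(), Some("data: [DONE]"));
    assert_eq!(it.next(), None); // anything after [DONE] is never surfaced
}
```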
handle_streaming_response( &mut self, body: &[u8], provider_id: ProviderId, ) -> Result, Action> { debug!( - "[ARCHGW_REQ_ID:{}] STREAMING_PROCESS: provider_id={:?} chunk_size={}", + "[ARCHGW_REQ_ID:{}] STREAMING_PROCESS: client={:?} provider_id={:?} chunk_size={}", self.request_identifier(), + self.client_api, provider_id, body.len() ); @@ -958,7 +950,12 @@ impl HttpContext for StreamContext { Err(action) => return action, }; - self.debug_log_body(&body); + debug!( + "[ARCHGW_REQ_ID:{}] UPSTREAM_RAW_RESPONSE: body_size={} content={}", + self.request_identifier(), + body.len(), + String::from_utf8_lossy(&body) + ); let provider_id = self.get_provider_id(); if self.streaming_response { diff --git a/demos/use_cases/claude_code/README.md b/demos/use_cases/claude_code/README.md new file mode 100644 index 00000000..b84e95db --- /dev/null +++ b/demos/use_cases/claude_code/README.md @@ -0,0 +1,133 @@ +# Claude Code Routing with (Preference-aligned) Intelligence + +## Why This Matters + +**Claude Code is powerful, but what if you could access the best of ALL AI models through one familiar interface?** + +Instead of being locked into a set of LLMs from one provider, imagine: +- Using **DeepSeek's coding expertise** for complex algorithms +- Leveraging **GPT-5's reasoning** for architecture decisions +- Tapping **Claude's analysis** for code reviews +- Accessing **Grok's speed** for quick iterations + +**All through the same Claude Code interface you already love.** + +## The Solution: Intelligent Multi-LLM Routing + +Arch Gateway transforms Claude Code into a **universal AI development interface** that: + +### 🌐 **Connects to Any LLM Provider** +- **OpenAI**: GPT-4.1, GPT-5, etc. +- **Anthropic**: Claude 3.5 Sonnet, Claude 3 Haiku, Claude Sonnet 4.5 +- **DeepSeek**: DeepSeek-V3, DeepSeek-Coder-V2 +- **Grok**: Grok-2, Grok-2-mini +- **Others**: Gemini, Llama, Mistral, local models via Ollama + +### 🧠 **Routes Intelligently Based on Task** +Our research-backed routing system automatically selects the optimal model by analyzing: +- **Task complexity** (simple refactoring vs. architectural design) +- **Content type** (code generation vs. debugging vs. documentation) + +## Quick Start + +### Prerequisites +- Claude Code installed: `npm install -g @anthropic-ai/claude-code` +- Docker running on your system +- A Python virtual environment created in your current working directory + +### 1. Get the Configuration File +Download the demo configuration file using one of these methods: + +**Option A: Direct download** +```bash +curl -O https://raw.githubusercontent.com/katanemo/arch/main/demos/use_cases/claude_code/config.yaml +``` + +**Option B: Clone the repository** +```bash +git clone https://github.com/katanemo/arch.git +cd arch/demos/use_cases/claude_code +``` + +### 2. Set Up Your API Keys +Set your environment variables with your actual API keys: +```bash +export OPENAI_API_KEY="your-openai-api-key" +export ANTHROPIC_API_KEY="your-anthropic-api-key" +export AZURE_API_KEY="your-azure-api-key" # Optional +``` + +Alternatively, create a `.env` file in your working directory: +```bash +echo "OPENAI_API_KEY=your-openai-api-key" > .env +echo "ANTHROPIC_API_KEY=your-anthropic-api-key" >> .env +``` + +### 3. Install and Start Arch Gateway +```bash +pip install archgw +archgw up +``` + +### 4. Launch Claude Code with Multi-LLM Support +```bash +archgw cli-agent claude +``` + +That's it! Claude Code now has access to multiple LLM providers with intelligent routing. 
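+To sanity-check the gateway before launching Claude Code, you can send a single request straight to the egress listener. The snippet below is a minimal sketch, assuming the default listener from `config.yaml` (`127.0.0.1:12000`) and the `arch.claude.code.small.fast` alias defined there; the auth token is a placeholder, since Arch injects the real provider keys upstream.
+
+```python
+# Minimal routing check against the Arch egress listener.
+import anthropic
+
+# Point the Anthropic SDK at Arch instead of api.anthropic.com;
+# this is the same path Claude Code takes via ANTHROPIC_BASE_URL.
+client = anthropic.Anthropic(
+    api_key="test",  # placeholder; Arch supplies real provider keys
+    base_url="http://127.0.0.1:12000",
+)
+
+message = client.messages.create(
+    model="arch.claude.code.small.fast",  # alias from config.yaml
+    max_tokens=100,
+    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
+)
+
+# Reassemble the text blocks from the response.
+print("".join(block.text for block in message.content if block.type == "text"))
+```
+
+If this prints a greeting, Claude Code traffic will flow through the same route.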
+ +## What You'll Experience + +### Screenshot Placeholder +![Claude Code with Multi-LLM Routing](screenshot-placeholder.png) +*Claude Code interface enhanced with intelligent model routing and multi-provider access* + +### Real-Time Model Selection +When you interact with Claude Code, you'll get: +- **Automatic model selection** based on your query type +- **Transparent routing decisions** showing which model was chosen and why +- **Seamless failover** if a model becomes unavailable + +## Configuration + +The setup uses the included `config.yaml` file, which defines: + +### Multi-Provider Access +```yaml +llm_providers: + - model: openai/gpt-4.1-2025-04-14 + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code generation + description: generating new code snippets and functions + - model: anthropic/claude-3-5-sonnet-20241022 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code understanding + description: explaining and analyzing existing code +``` + +## Advanced Usage + +### Custom Model Selection +```bash +# Force a specific model for this session +archgw cli-agent claude --settings='{"ANTHROPIC_SMALL_FAST_MODEL": "deepseek-coder-v2"}' + +# Enable detailed routing information +archgw cli-agent claude --settings='{"statusLine": {"type": "command", "command": "ccr statusline"}}' +``` + +### Environment Variables +The system automatically configures: +```bash +ANTHROPIC_BASE_URL=http://127.0.0.1:12000 # Routes through Arch Gateway +ANTHROPIC_SMALL_FAST_MODEL=arch.claude.code.small.fast # Uses intelligent alias +``` + +## Learn More + +This intelligent routing is powered by our research on preference-aligned LLM routing: +- **Research Paper**: [Preference-Aligned LLM Router](https://arxiv.org/abs/2506.16655) +- **Technical Docs**: [docs.archgw.com](https://docs.archgw.com) diff --git a/demos/use_cases/claude_code/config.yaml b/demos/use_cases/claude_code/config.yaml new file mode 100644 index 00000000..11a98c07 --- /dev/null +++ b/demos/use_cases/claude_code/config.yaml @@ -0,0 +1,41 @@ +version: v0.1 + +listeners: + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 30s + +llm_providers: + # OpenAI Models + - model: openai/gpt-5-2025-08-07 + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + + - model: openai/gpt-4.1-2025-04-14 + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code understanding + description: understanding and explaining existing code snippets, functions, or libraries + + # Anthropic Models + - model: anthropic/claude-sonnet-4-5 + default: true + access_key: $ANTHROPIC_API_KEY + + - model: anthropic/claude-3-haiku-20240307 + access_key: $ANTHROPIC_API_KEY + + # Ollama Models + - model: ollama/llama3.1 + base_url: http://host.docker.internal:11434 + + +# Model aliases - friendly names that map to actual provider names +model_aliases: + # Alias for a smaller, faster Claude model + arch.claude.code.small.fast: + target: claude-3-haiku-20240307 diff --git a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml index c829b395..794ed117 100644 --- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml +++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml @@ -24,7 +24,7 @@ llm_providers: access_key: $OPENAI_API_KEY # Anthropic Models - - model: 
anthropic/claude-3-5-sonnet-20241022 + - model: anthropic/claude-sonnet-4-20250514 access_key: $ANTHROPIC_API_KEY - model: anthropic/claude-3-haiku-20240307 @@ -56,7 +56,7 @@ model_aliases: # Alias for creative tasks -> Claude model arch.creative.v1: - target: claude-3-5-sonnet-20241022 + target: claude-sonnet-4-20250514 # Alias for quick responses -> fast model arch.fast.v1: @@ -67,7 +67,7 @@ model_aliases: target: gpt-5-mini-2025-08-07 chat-model: - target: llama3.1 + target: gpt-5-mini-2025-08-07 creative-model: - target: claude-3-5-sonnet-20241022 + target: claude-sonnet-4-20250514 diff --git a/tests/e2e/test_model_alias_routing.py b/tests/e2e/test_model_alias_routing.py index 696507f2..d5a289a6 100644 --- a/tests/e2e/test_model_alias_routing.py +++ b/tests/e2e/test_model_alias_routing.py @@ -199,8 +199,7 @@ def test_400_error_handling_with_alias(): try: completion = client.chat.completions.create( model="arch.summarize.v1", # This should resolve to gpt-5-mini-2025-08-07 - max_completion_tokens=50, - temperature=0.7, # This is a typo - should be "temperature", which should trigger a 400 error + max_tokens=50, messages=[ { "role": "user", @@ -350,3 +349,57 @@ def test_direct_model_4o_mini_anthropic(): response_content = "".join(b.text for b in message.content if b.type == "text") logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}") assert response_content == "Hello from direct 4o-mini via Anthropic!" + + +def test_anthropic_thinking_mode_streaming(): + # Anthropic base_url should be the root, not /v1/chat/completions + base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") + + client = anthropic.Anthropic( + api_key=os.environ.get("ANTHROPIC_API_KEY", "test-key"), + base_url=base_url, + ) + + thinking_block_started = False + thinking_delta_seen = False + text_delta_seen = False + + with client.messages.stream( + model="claude-sonnet-4-20250514", + max_tokens=2048, + thinking={"type": "enabled", "budget_tokens": 1024}, # <- idiomatic + messages=[{"role": "user", "content": "Explain briefly what 2+2 equals"}], + ) as stream: + for event in stream: + # 1) detect when a thinking block starts + if event.type == "content_block_start" and getattr( + event, "content_block", None + ): + if getattr(event.content_block, "type", None) == "thinking": + thinking_block_started = True + + # 2) collect text vs thinking deltas + if event.type == "content_block_delta" and getattr(event, "delta", None): + if event.delta.type == "text_delta": + text_delta_seen = True + elif event.delta.type == "thinking_delta": + # some SDKs expose .thinking, others .text for this delta; not needed here + thinking_delta_seen = True + + final = stream.get_final_message() + + # Basic integrity + assert final is not None + assert final.content and len(final.content) > 0 + + # Normal text should have streamed + assert text_delta_seen, "Expected normal text deltas in stream" + + # With thinking enabled, we expect a thinking block and at least one thinking delta + assert thinking_block_started, "No thinking block started" + assert thinking_delta_seen, "No thinking deltas observed" + + # Optional: double-check on the assembled message + final_block_types = [blk.type for blk in final.content] + assert "text" in final_block_types + assert "thinking" in final_block_types diff --git a/tests/e2e/test_prompt_gateway.py b/tests/e2e/test_prompt_gateway.py index 362be227..2edab55d 100644 --- a/tests/e2e/test_prompt_gateway.py +++ b/tests/e2e/test_prompt_gateway.py @@ -417,12 +417,12 @@ def 
test_anthropic_client_with_openai_model_streaming(): client = anthropic.Anthropic(api_key="test-key", base_url=base_url) with client.messages.stream( - model="gpt-4o-mini", # OpenAI model via Anthropic client - max_tokens=50, + model="gpt-5-mini-2025-08-07", # OpenAI model via Anthropic client + max_tokens=500, messages=[ { "role": "user", - "content": "Hello, please respond with exactly: Hello from GPT-4o-mini via Anthropic!", + "content": "Hello, please respond with exactly: Hello from ChatGPT!", } ], ) as stream: @@ -435,8 +435,8 @@ def test_anthropic_client_with_openai_model_streaming(): # A safe way to reassemble text from the content blocks: final_text = "".join(b.text for b in final.content if b.type == "text") - assert full_text == "Hello from GPT-4o-mini via Anthropic!" - assert final_text == "Hello from GPT-4o-mini via Anthropic!" + assert full_text == "Hello from ChatGPT!" + assert final_text == "Hello from ChatGPT!" def test_openai_gpt4o_mini_v1_messages_api():