adding support for claude code routing (#575)

* fixed for claude code routing. first commit
* removing redundant enum tags for cache_control
* making sure that claude code can run via the archgw cli
* fixing broken config
* adding a README.md and updated the cli to use more of our defined patterns for params
* fixed config.yaml
* minor fixes to make sure PR is clean. Ready to ship
* adding claude-sonnet-4-5 to the config
* fixes based on PR
* fixed alias for README
* fixed 400 error handling tests, now that we write temperature to 1.0 for GPT-5

---------

Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-257.local>
Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-288.local>
2026-07-14 16:22:12 +02:00 · 2025-09-29 19:23:08 -07:00 · 2025-09-29 19:23:08 -07:00 · f00870dccb
commit f00870dccb
parent 03c2cf6f0d
16 changed files with 903 additions and 106 deletions
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@ -88,6 +88,7 @@ pub struct ChatCompletionsRequest {
    pub prediction: Option<StaticContent>,
    // pub reasoning_effect: Option<bool>, // GOOD FIRST ISSUE: Future support for reasoning effects
    pub response_format: Option<Value>,
+    pub reasoning_effort: Option<String>, // e.g., "none", "low", "medium", "high"
    // pub safety_identifier: Option<String>, // GOOD FIRST ISSUE: Future support for safety identifiers
    pub seed: Option<i32>,
    pub service_tier: Option<String>,
@ -116,6 +117,13 @@ impl ChatCompletionsRequest {
            self.max_tokens = None;
        }
    }
+
+    pub fn fix_temperature_if_gpt5(&mut self) {
+        let model = self.model.as_str();
+        if model.starts_with("gpt-5") {
+            self.temperature = Some(1.0);
+        }
+    }
 }

 // ============================================================================
@ -598,6 +606,7 @@ impl TryFrom<&[u8]> for ChatCompletionsRequest {
       let mut req: ChatCompletionsRequest = serde_json::from_slice(bytes).map_err(OpenAIStreamError::from)?;
        // Use the centralized suppression logic
        req.suppress_max_tokens_if_o3();
+        req.fix_temperature_if_gpt5();
        Ok(req)
    }
 }
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@ -111,6 +111,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            ..Default::default()
        };
        _chat_completions_req.suppress_max_tokens_if_o3();
+        _chat_completions_req.fix_temperature_if_gpt5();
        Ok(_chat_completions_req)
    }
 }
@ -352,6 +353,7 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
        let choice = &resp.choices[0];

        // Handle final chunk with usage
+        let has_usage = resp.usage.is_some();
        if let Some(usage) = resp.usage {
            if let Some(finish_reason) = &choice.finish_reason {
                let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
@ -403,11 +405,27 @@ impl TryFrom<ChatCompletionsStreamResponse> for MessagesStreamEvent {
            return convert_tool_call_deltas(tool_calls.clone());
        }

-        // Handle finish reason
+        // Handle finish reason - generate MessageDelta only (MessageStop comes later)
        if let Some(finish_reason) = &choice.finish_reason {
-            if *finish_reason == FinishReason::Stop {
-                return Ok(MessagesStreamEvent::MessageStop);
+            // If we have usage data, it was already handled above
+            // If not, we need to generate MessageDelta with default usage
+            if !has_usage {
+                let anthropic_stop_reason: MessagesStopReason = finish_reason.clone().into();
+                return Ok(MessagesStreamEvent::MessageDelta {
+                    delta: MessagesMessageDelta {
+                        stop_reason: anthropic_stop_reason,
+                        stop_sequence: None,
+                    },
+                    usage: MessagesUsage {
+                        input_tokens: 0,
+                        output_tokens: 0,
+                        cache_creation_input_tokens: None,
+                        cache_read_input_tokens: None,
+                    },
+                });
            }
+            // If usage was already handled above, we don't need to do anything more here
+            // MessageStop will be handled when [DONE] is encountered
        }

        // Default to ping for unhandled cases
@ -468,18 +486,6 @@ impl TryFrom<MessagesMessage> for Vec<Message> {
            }
            MessagesMessageContent::Blocks(blocks) => {
                let (content_parts, tool_calls, tool_results) = blocks.split_for_openai()?;
-
-                // Create main message
-                let content = build_openai_content(content_parts, &tool_calls);
-                let main_message = Message {
-                    role: message.role.into(),
-                    content,
-                    name: None,
-                    tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
-                    tool_call_id: None,
-                };
-                result.push(main_message);
-
                // Add tool result messages
                for (tool_use_id, result_text, _is_error) in tool_results {
                    result.push(Message {
@ -490,6 +496,20 @@ impl TryFrom<MessagesMessage> for Vec<Message> {
                        tool_call_id: Some(tool_use_id),
                    });
                }
+
+                // Only create main message if there's actual content or tool calls
+                // Skip creating empty content messages (e.g., when message only contains tool_result blocks)
+                if !content_parts.is_empty() || !tool_calls.is_empty() {
+                    let content = build_openai_content(content_parts, &tool_calls);
+                    let main_message = Message {
+                        role: message.role.into(),
+                        content,
+                        name: None,
+                        tool_calls: if tool_calls.is_empty() { None } else { Some(tool_calls) },
+                        tool_call_id: None,
+                    };
+                    result.push(main_message);
+                }
            }
        }

@ -515,9 +535,11 @@ impl TryFrom<Message> for MessagesMessage {
                        MessagesContentBlock::ToolResult {
                            tool_use_id: tool_call_id,
                            is_error: None,
-                            content: vec![MessagesContentBlock::Text {
+                            content: ToolResultContent::Blocks(vec![MessagesContentBlock::Text {
                                text: message.content.extract_text(),
-                            }],
+                                cache_control: None,
+                            }]),
+                            cache_control: None,
                        },
                    ]),
                });
@ -551,7 +573,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {

        for block in self {
            match block {
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                MessagesContentBlock::ServerToolUse { id, name, input } |
                MessagesContentBlock::McpToolUse { id, name, input } => {
                    let arguments = serde_json::to_string(&input)?;
@ -575,7 +597,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {

        for block in self {
            match block {
-                MessagesContentBlock::Text { text } => {
+                MessagesContentBlock::Text { text, .. } => {
                    content_parts.push(ContentPart::Text { text: text.clone() });
                }
                MessagesContentBlock::Image { source } => {
@ -587,7 +609,7 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {
                        },
                    });
                }
-                MessagesContentBlock::ToolUse { id, name, input } |
+                MessagesContentBlock::ToolUse { id, name, input, .. } |
                MessagesContentBlock::ServerToolUse { id, name, input } |
                MessagesContentBlock::McpToolUse { id, name, input } => {
                    let arguments = serde_json::to_string(&input)?;
@ -597,7 +619,10 @@ impl ContentUtils<ToolCall> for Vec<MessagesContentBlock> {
                        function: FunctionCall { name: name.clone(), arguments },
                    });
                }
-                MessagesContentBlock::ToolResult { tool_use_id, content, is_error } |
+                MessagesContentBlock::ToolResult { tool_use_id, content, is_error, .. } => {
+                    let result_text = content.extract_text();
+                    tool_results.push((tool_use_id.clone(), result_text, is_error.unwrap_or(false)));
+                }
                MessagesContentBlock::WebSearchToolResult { tool_use_id, content, is_error } |
                MessagesContentBlock::CodeExecutionToolResult { tool_use_id, content, is_error } |
                MessagesContentBlock::McpToolResult { tool_use_id, content, is_error } => {
@ -819,7 +844,7 @@ fn build_openai_content(content_parts: Vec<ContentPart>, tool_calls: &[ToolCall]
 fn build_anthropic_content(content_blocks: Vec<MessagesContentBlock>) -> MessagesMessageContent {
    if content_blocks.len() == 1 {
        match &content_blocks[0] {
-            MessagesContentBlock::Text { text } => MessagesMessageContent::Single(text.clone()),
+            MessagesContentBlock::Text { text, .. } => MessagesMessageContent::Single(text.clone()),
            _ => MessagesMessageContent::Blocks(content_blocks),
        }
    } else if content_blocks.is_empty() {
@ -835,12 +860,11 @@ fn convert_anthropic_content_to_openai(content: &[MessagesContentBlock]) -> Resu

    for block in content {
        match block {
-            MessagesContentBlock::Text { text } => {
+            MessagesContentBlock::Text { text, .. } => {
                text_parts.push(text.clone());
            }
-            MessagesContentBlock::Thinking { text } => {
-                // Include thinking as regular text for OpenAI
-                text_parts.push(format!("[Thinking: {}]", text));
+            MessagesContentBlock::Thinking { thinking, .. } => {
+                text_parts.push(format!("thinking: {}", thinking));
            }
            _ => {
                // Skip other content types for basic text conversion
@ -860,14 +884,14 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result<Vec<
    match &message.content {
        MessageContent::Text(text) => {
            if !text.is_empty() {
-                blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
            }
        }
        MessageContent::Parts(parts) => {
            for part in parts {
                match part {
                    ContentPart::Text { text } => {
-                        blocks.push(MessagesContentBlock::Text { text: text.clone() });
+                        blocks.push(MessagesContentBlock::Text { text: text.clone(), cache_control: None });
                    }
                    ContentPart::ImageUrl { image_url } => {
                        let source = convert_image_url_to_source(image_url);
@ -886,6 +910,7 @@ fn convert_openai_message_to_anthropic_content(message: &Message) -> Result<Vec<
                id: tool_call.id.clone(),
                name: tool_call.function.name.clone(),
                input,
+                cache_control: None,
            });
        }
    }
@ -984,6 +1009,21 @@ fn convert_content_delta(delta: MessagesContentDelta) -> Result<ChatCompletionsS
                None,
            ))
        }
+        MessagesContentDelta::ThinkingDelta { thinking } => {
+            Ok(create_openai_chunk(
+                "stream",
+                "unknown",
+                MessageDelta {
+                    role: None,
+                    content: Some(format!("thinking: {}", thinking)),
+                    refusal: None,
+                    function_call: None,
+                    tool_calls: None,
+                },
+                None,
+                None,
+            ))
+        }
        MessagesContentDelta::InputJsonDelta { partial_json } => {
            Ok(create_openai_chunk(
                "stream",
@ -1023,6 +1063,7 @@ fn convert_tool_call_deltas(tool_calls: Vec<ToolCallDelta>) -> Result<MessagesSt
                            id: id.clone(),
                            name: name.clone(),
                            input: Value::Object(serde_json::Map::new()),
+                            cache_control: None,
                        },
                    });
                }
@ -1254,6 +1295,7 @@ mod tests {
                id: "call_123".to_string(),
                name: "get_weather".to_string(),
                input: json!({}),
+                cache_control: None,
            },
        };

@ -1566,6 +1608,7 @@ mod tests {
                id: "call_weather".to_string(),
                name: "get_weather".to_string(),
                input: json!({}),
+                cache_control: None,
            },
        };

--- a/crates/hermesllm/src/providers/response.rs
+++ b/crates/hermesllm/src/providers/response.rs
@ -269,6 +269,13 @@ impl TryFrom<(&[u8], &SupportedAPIs, &SupportedAPIs)> for ProviderStreamResponse
                Ok(ProviderStreamResponseType::ChatCompletionsStreamResponse(chat_resp))
            }
            (SupportedAPIs::OpenAIChatCompletions(_), SupportedAPIs::AnthropicMessagesAPI(_)) => {
+                // Special case: Handle [DONE] marker for OpenAI -> Anthropic conversion
+                if bytes == b"[DONE]" {
+                    return Ok(ProviderStreamResponseType::MessagesStreamEvent(
+                        crate::apis::anthropic::MessagesStreamEvent::MessageStop
+                    ));
+                }
+
                let openai_resp: crate::apis::openai::ChatCompletionsStreamResponse = serde_json::from_slice(bytes)?;

                // Transform to Anthropic Messages stream format using the transformer
@ -287,8 +294,8 @@ impl TryFrom<(SseEvent, &SupportedAPIs, &SupportedAPIs)> for SseEvent {
        // Create a new transformed event based on the original
        let mut transformed_event = sse_event;

-        // If not [DONE] and has data, parse the data as a provider stream response (business logic layer)
-        if !transformed_event.is_done() && transformed_event.data.is_some() {
+        // If has data, parse the data as a provider stream response (business logic layer)
+        if transformed_event.data.is_some() {
            let data_str = transformed_event.data.as_ref().unwrap();
            let data_bytes = data_str.as_bytes();
            let transformed_response = ProviderStreamResponseType::try_from((data_bytes, client_api, upstream_api))?;
@ -380,6 +387,7 @@ where
    I::Item: AsRef<str>,
 {
    pub lines: I,
+    pub done_seen: bool,
 }

 impl<I> SseStreamIter<I>
@ -388,7 +396,7 @@ where
    I::Item: AsRef<str>,
 {
    pub fn new(lines: I) -> Self {
-        Self { lines }
+        Self { lines, done_seen: false }
    }
 }

@ -411,14 +419,20 @@ where
    type Item = SseEvent;

    fn next(&mut self) -> Option<Self::Item> {
+        // If we already returned [DONE], terminate the stream
+        if self.done_seen {
+            return None;
+        }
+
        for line in &mut self.lines {
            let line_str = line.as_ref();

            // Try to parse as either data: or event: line
            if let Ok(event) = line_str.parse::<SseEvent>() {
-                // For data: lines, check if this is the [DONE] marker - if so, end the stream
+                // For data: lines, check if this is the [DONE] marker
                if event.data.is_some() && event.is_done() {
-                    return None;
+                    self.done_seen = true;
+                    return Some(event); // Return [DONE] event for transformation
                }
                // For data: lines, skip events that should be filtered at the transport layer
                if event.data.is_some() && event.should_skip() {
@ -706,7 +720,11 @@ mod tests {
        assert!(event2.data.as_ref().unwrap().contains("msg2"));
        assert!(!event2.should_skip());

-        // Iterator should end at [DONE] (no more events)
+        // Third event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
        assert!(iter.next().is_none());
    }

@ -745,7 +763,11 @@ mod tests {
        assert!(!event4.is_event_only());
        assert!(event4.data.as_ref().unwrap().contains("Hello"));

-        // Iterator should end at [DONE]
+        // Fifth event should be [DONE]
+        let done_event = iter.next().unwrap();
+        assert!(done_event.is_done());
+
+        // Iterator should end after [DONE]
        assert!(iter.next().is_none());
    }

@ -776,4 +798,25 @@ mod tests {
        let provider_type = ProviderStreamResponseType::ChatCompletionsStreamResponse(openai_event);
        assert_eq!(provider_type.event_type(), None);
    }
+
+    #[test]
+    fn test_done_marker_handled_in_stream_response_transformation() {
+        use crate::apis::anthropic::AnthropicApi;
+
+        // Test that [DONE] marker is properly converted to MessageStop in the transformation layer
+        let done_bytes = b"[DONE]";
+        let client_api = SupportedAPIs::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedAPIs::OpenAIChatCompletions(crate::apis::openai::OpenAIApi::ChatCompletions);
+
+        let result = ProviderStreamResponseType::try_from((done_bytes.as_slice(), &client_api, &upstream_api));
+        assert!(result.is_ok());
+
+        if let Ok(ProviderStreamResponseType::MessagesStreamEvent(event)) = result {
+            // Verify it's a MessageStop event
+            assert_eq!(event.event_type(), Some("message_stop"));
+            assert!(matches!(event, crate::apis::anthropic::MessagesStreamEvent::MessageStop));
+        } else {
+            panic!("Expected MessagesStreamEvent::MessageStop");
+        }
+    }
 }