diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index 91c4b0cc..33dea44a 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -113,16 +113,29 @@ pub struct ResponsesAPIRequest { pub enum InputParam { /// Simple text input Text(String), - /// Array of input items + /// Array of input items (messages, references, outputs, etc.) Items(Vec), } -/// Input item discriminated by type +/// Input item - can be a message, item reference, function call output, etc. #[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] +#[serde(untagged)] pub enum InputItem { - /// Input message + /// Input message (role + content) Message(InputMessage), + /// Item reference + ItemReference { + #[serde(rename = "type")] + item_type: String, + id: String, + }, + /// Function call output + FunctionCallOutput { + #[serde(rename = "type")] + item_type: String, + call_id: String, + output: String, + }, } /// Input message with role and content @@ -130,8 +143,18 @@ pub enum InputItem { pub struct InputMessage { /// Message role pub role: MessageRole, - /// Message content - pub content: Vec, + /// Message content - can be a string or array of InputContent + pub content: MessageContent, +} + +/// Message content - can be either a simple string or array of content items +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum MessageContent { + /// Simple text content + Text(String), + /// Array of content items + Items(Vec), } /// Message roles @@ -1025,16 +1048,23 @@ impl ProviderRequest for ResponsesAPIRequest { items.iter().fold(String::new(), |acc, item| { match item { InputItem::Message(msg) => { - let content_text = msg.content.iter().fold(String::new(), |acc, content| { - acc + " " + &match content { - InputContent::InputText { text } => text.clone(), - InputContent::InputImage { .. } => "[Image]".to_string(), - InputContent::InputFile { .. } => "[File]".to_string(), - InputContent::InputAudio { .. } => "[Audio]".to_string(), + let content_text = match &msg.content { + MessageContent::Text(text) => text.clone(), + MessageContent::Items(content_items) => { + content_items.iter().fold(String::new(), |acc, content| { + acc + " " + &match content { + InputContent::InputText { text } => text.clone(), + InputContent::InputImage { .. } => "[Image]".to_string(), + InputContent::InputFile { .. } => "[File]".to_string(), + InputContent::InputAudio { .. } => "[Audio]".to_string(), + } + }) } - }); + }; acc + " " + &content_text } + // Skip non-message items (references, outputs, etc.) + _ => acc, } }) } @@ -1048,14 +1078,20 @@ impl ProviderRequest for ResponsesAPIRequest { items.iter().rev().find_map(|item| { match item { InputItem::Message(msg) if matches!(msg.role, MessageRole::User) => { - // Extract text from the first text content - msg.content.iter().find_map(|content| { - match content { - InputContent::InputText { text } => Some(text.clone()), - _ => None, + // Extract text from content + match &msg.content { + MessageContent::Text(text) => Some(text.clone()), + MessageContent::Items(content_items) => { + content_items.iter().find_map(|content| { + match content { + InputContent::InputText { text } => Some(text.clone()), + _ => None, + } + }) } - }) + } } + // Skip non-message items _ => None, } }) diff --git a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index 83f13fe8..27366f4d 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -291,15 +291,43 @@ impl TryFrom for ChatCompletionsRequest { MessageRole::Developer => Role::System, // Map developer to system }; - // Convert content blocks - let content = if input_msg.content.len() == 1 { - // Single content item - check if it's simple text - match &input_msg.content[0] { - InputContent::InputText { text } => MessageContent::Text(text.clone()), - _ => { - // Convert to parts for non-text content + // Convert content based on MessageContent type + let content = match &input_msg.content { + crate::apis::openai_responses::MessageContent::Text(text) => { + // Simple text content + MessageContent::Text(text.clone()) + } + crate::apis::openai_responses::MessageContent::Items(content_items) => { + // Check if it's a single text item (can use simple text format) + if content_items.len() == 1 { + if let InputContent::InputText { text } = &content_items[0] { + MessageContent::Text(text.clone()) + } else { + // Single non-text item - use parts format + MessageContent::Parts( + content_items.iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + } + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect() + ) + } + } else { + // Multiple content items - convert to parts MessageContent::Parts( - input_msg.content.iter() + content_items.iter() .filter_map(|c| match c { InputContent::InputText { text } => { Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) @@ -319,27 +347,6 @@ impl TryFrom for ChatCompletionsRequest { ) } } - } else { - // Multiple content items - convert to parts - MessageContent::Parts( - input_msg.content.iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) - } - InputContent::InputImage { image_url, .. } => { - Some(crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - } - }) - } - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect() - ) }; converted_messages.push(Message { @@ -350,6 +357,9 @@ impl TryFrom for ChatCompletionsRequest { tool_calls: None, }); } + // Skip non-message items (references, outputs) for now + // These would need special handling in chat completions format + _ => {} } } diff --git a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml index 0aaaa537..00b72a6d 100644 --- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml +++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml @@ -89,6 +89,3 @@ model_aliases: # Alias for grok testing arch.grok.v1: target: grok-4-0709 - -tracing: - random_sampling: 100 diff --git a/tests/e2e/test_openai_responses_api_client.py b/tests/e2e/test_openai_responses_api_client.py index 800db93d..e1fa8da8 100644 --- a/tests/e2e/test_openai_responses_api_client.py +++ b/tests/e2e/test_openai_responses_api_client.py @@ -628,3 +628,44 @@ def test_openai_responses_api_streaming_with_tools_upstream_anthropic(): assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" + + +def test_openai_responses_api_mixed_content_types(): + """Test Responses API with mixed content types (string and array) in input messages""" + base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") + client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") + + # This test mimics the request that was failing: + # One message with string content, another with array content + resp = client.responses.create( + model="arch.title.v1", + input=[ + { + "role": "developer", + "content": "Generate a very short chat title (2-5 words max) based on the user's message.\n" + "Rules:\n" + "- Maximum 30 characters\n" + "- No quotes, colons, hashtags, or markdown\n" + "- Just the topic/intent, not a full sentence\n" + '- If the message is a greeting like "hi" or "hello", respond with just "New conversation"\n' + '- Be concise: "Weather in NYC" not "User asking about the weather in New York City"', + }, + { + "role": "user", + "content": [ + {"type": "input_text", "text": "What is the weather in Seattle"} + ], + }, + ], + ) + + # Print the response + print(f"\n{'='*80}") + print(f"Model: {resp.model}") + print(f"Output: {resp.output_text}") + print(f"{'='*80}\n") + + assert resp is not None + assert resp.id is not None + # Verify we got a reasonable title + assert len(resp.output_text) > 0