From 26531ec889c09b31d2655a4be28bcf4f15cc5891 Mon Sep 17 00:00:00 2001 From: Salman Paracha Date: Mon, 15 Dec 2025 14:31:26 -0800 Subject: [PATCH] fixed mixed inputs from openai v1/responses api --- crates/hermesllm/src/apis/openai_responses.rs | 83 ++++++++++--------- .../src/transforms/request/from_openai.rs | 77 ++++++++--------- tests/e2e/test_openai_responses_api_client.py | 41 +++++++++ 3 files changed, 126 insertions(+), 75 deletions(-) diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index 91c4b0cc..a0b9f46a 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -107,22 +107,14 @@ pub struct ResponsesAPIRequest { pub top_logprobs: Option, } -/// Input parameter - can be a simple string or array of input items +/// Input parameter - can be a simple string or array of input messages #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] pub enum InputParam { /// Simple text input Text(String), - /// Array of input items - Items(Vec), -} - -/// Input item discriminated by type -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum InputItem { - /// Input message - Message(InputMessage), + /// Array of input messages + Items(Vec), } /// Input message with role and content @@ -130,8 +122,18 @@ pub enum InputItem { pub struct InputMessage { /// Message role pub role: MessageRole, - /// Message content - pub content: Vec, + /// Message content - can be a string or array of InputContent + pub content: MessageContent, +} + +/// Message content - can be either a simple string or array of content items +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum MessageContent { + /// Simple text content + Text(String), + /// Array of content items + Items(Vec), } /// Message roles @@ -991,8 +993,8 @@ pub struct ListInputItemsRequest { pub struct ListInputItemsResponse { /// Object type - always "list" pub object: String, - /// Array of input items - pub data: Vec, + /// Array of input messages + pub data: Vec, /// First ID in the list pub first_id: Option, /// Last ID in the list @@ -1022,20 +1024,21 @@ impl ProviderRequest for ResponsesAPIRequest { match &self.input { InputParam::Text(text) => text.clone(), InputParam::Items(items) => { - items.iter().fold(String::new(), |acc, item| { - match item { - InputItem::Message(msg) => { - let content_text = msg.content.iter().fold(String::new(), |acc, content| { - acc + " " + &match content { - InputContent::InputText { text } => text.clone(), - InputContent::InputImage { .. } => "[Image]".to_string(), - InputContent::InputFile { .. } => "[File]".to_string(), - InputContent::InputAudio { .. } => "[Audio]".to_string(), + items.iter().fold(String::new(), |acc, msg| { + let content_text = match &msg.content { + MessageContent::Text(text) => text.clone(), + MessageContent::Items(content_items) => { + content_items.iter().fold(String::new(), |acc, content| { + acc + " " + &match content { + InputContent::InputText { text } => text.clone(), + InputContent::InputImage { .. } => "[Image]".to_string(), + InputContent::InputFile { .. } => "[File]".to_string(), + InputContent::InputAudio { .. } => "[Audio]".to_string(), + } + }) } - }); + }; acc + " " + &content_text - } - } }) } } @@ -1045,18 +1048,22 @@ impl ProviderRequest for ResponsesAPIRequest { match &self.input { InputParam::Text(text) => Some(text.clone()), InputParam::Items(items) => { - items.iter().rev().find_map(|item| { - match item { - InputItem::Message(msg) if matches!(msg.role, MessageRole::User) => { - // Extract text from the first text content - msg.content.iter().find_map(|content| { - match content { - InputContent::InputText { text } => Some(text.clone()), - _ => None, + items.iter().rev().find_map(|msg| { + if matches!(msg.role, MessageRole::User) { + // Extract text from content + match &msg.content { + MessageContent::Text(text) => Some(text.clone()), + MessageContent::Items(content_items) => { + content_items.iter().find_map(|content| { + match content { + InputContent::InputText { text } => Some(text.clone()), + _ => None, + } + }) } - }) - } - _ => None, + } + } else { + None } }) } diff --git a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index 83f13fe8..9607565f 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -14,7 +14,7 @@ use crate::apis::openai::{ }; use crate::apis::openai_responses::{ - ResponsesAPIRequest, InputContent, InputItem, InputParam, MessageRole, Modality, ReasoningEffort, Tool as ResponsesTool, ToolChoice as ResponsesToolChoice + ResponsesAPIRequest, InputContent, InputParam, MessageRole, Modality, ReasoningEffort, Tool as ResponsesTool, ToolChoice as ResponsesToolChoice }; use crate::clients::TransformError; use crate::transforms::lib::ExtractText; @@ -280,26 +280,52 @@ impl TryFrom for ChatCompletionsRequest { }); } - // Convert each input item - for item in items { - match item { - InputItem::Message(input_msg) => { - let role = match input_msg.role { + // Convert each input message + for input_msg in items { + let role = match input_msg.role { MessageRole::User => Role::User, MessageRole::Assistant => Role::Assistant, MessageRole::System => Role::System, MessageRole::Developer => Role::System, // Map developer to system }; - // Convert content blocks - let content = if input_msg.content.len() == 1 { - // Single content item - check if it's simple text - match &input_msg.content[0] { - InputContent::InputText { text } => MessageContent::Text(text.clone()), - _ => { - // Convert to parts for non-text content + // Convert content based on MessageContent type + let content = match &input_msg.content { + crate::apis::openai_responses::MessageContent::Text(text) => { + // Simple text content + MessageContent::Text(text.clone()) + } + crate::apis::openai_responses::MessageContent::Items(content_items) => { + // Check if it's a single text item (can use simple text format) + if content_items.len() == 1 { + if let InputContent::InputText { text } = &content_items[0] { + MessageContent::Text(text.clone()) + } else { + // Single non-text item - use parts format + MessageContent::Parts( + content_items.iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + } + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect() + ) + } + } else { + // Multiple content items - convert to parts MessageContent::Parts( - input_msg.content.iter() + content_items.iter() .filter_map(|c| match c { InputContent::InputText { text } => { Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) @@ -319,27 +345,6 @@ impl TryFrom for ChatCompletionsRequest { ) } } - } else { - // Multiple content items - convert to parts - MessageContent::Parts( - input_msg.content.iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) - } - InputContent::InputImage { image_url, .. } => { - Some(crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - } - }) - } - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect() - ) }; converted_messages.push(Message { @@ -349,8 +354,6 @@ impl TryFrom for ChatCompletionsRequest { tool_call_id: None, tool_calls: None, }); - } - } } converted_messages diff --git a/tests/e2e/test_openai_responses_api_client.py b/tests/e2e/test_openai_responses_api_client.py index 800db93d..e1fa8da8 100644 --- a/tests/e2e/test_openai_responses_api_client.py +++ b/tests/e2e/test_openai_responses_api_client.py @@ -628,3 +628,44 @@ def test_openai_responses_api_streaming_with_tools_upstream_anthropic(): assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" + + +def test_openai_responses_api_mixed_content_types(): + """Test Responses API with mixed content types (string and array) in input messages""" + base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") + client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") + + # This test mimics the request that was failing: + # One message with string content, another with array content + resp = client.responses.create( + model="arch.title.v1", + input=[ + { + "role": "developer", + "content": "Generate a very short chat title (2-5 words max) based on the user's message.\n" + "Rules:\n" + "- Maximum 30 characters\n" + "- No quotes, colons, hashtags, or markdown\n" + "- Just the topic/intent, not a full sentence\n" + '- If the message is a greeting like "hi" or "hello", respond with just "New conversation"\n' + '- Be concise: "Weather in NYC" not "User asking about the weather in New York City"', + }, + { + "role": "user", + "content": [ + {"type": "input_text", "text": "What is the weather in Seattle"} + ], + }, + ], + ) + + # Print the response + print(f"\n{'='*80}") + print(f"Model: {resp.model}") + print(f"Output: {resp.output_text}") + print(f"{'='*80}\n") + + assert resp is not None + assert resp.id is not None + # Verify we got a reasonable title + assert len(resp.output_text) > 0