From 474b74aa18262780716b4eb7d5d51094006850cd Mon Sep 17 00:00:00 2001 From: Musa Date: Mon, 29 Jun 2026 13:58:35 -0700 Subject: [PATCH] fix(hermesllm): preserve output_text for Responses API multi-turn (#978) --- crates/hermesllm/src/apis/openai_responses.rs | 65 ++++++++++++++++++- .../src/transforms/request/from_openai.rs | 61 +++++++++-------- .../transforms/response/output_to_input.rs | 14 ++-- 3 files changed, 104 insertions(+), 36 deletions(-) diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index 92d362b2..af37688e 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -183,9 +183,13 @@ pub enum MessageRole { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum InputContent { - /// Text input - #[serde(rename = "input_text", alias = "text", alias = "output_text")] + /// Text input (input-role message content) + #[serde(rename = "input_text", alias = "text")] InputText { text: String }, + /// Text produced by the model in a prior turn. This must round-trip as + /// `output_text` because the Responses API rejects `input_text` for + /// output-role (assistant) message content. + OutputText { text: String }, /// Image input via URL InputImage { image_url: String, @@ -1051,6 +1055,7 @@ pub struct ListInputItemsResponse { fn append_input_content_text(buffer: &mut String, content: &InputContent) { match content { InputContent::InputText { text } => buffer.push_str(text), + InputContent::OutputText { text } => buffer.push_str(text), InputContent::InputImage { .. } => buffer.push_str("[Image]"), InputContent::InputFile { .. } => buffer.push_str("[File]"), InputContent::InputAudio { .. } => buffer.push_str("[Audio]"), @@ -1642,6 +1647,62 @@ mod tests { } } + #[test] + fn test_input_content_preserves_output_text_round_trip() { + // Multi-turn request: a user turn carrying input_text and a prior + // assistant turn carrying output_text. The Responses API rejects + // input_text for output-role content, so the assistant turn must + // survive a serialize round-trip as output_text (not be rewritten). + let request = json!({ + "model": "gpt-5.3-codex", + "input": [ + { + "role": "user", + "content": [ + { "type": "input_text", "text": "hello" } + ] + }, + { + "role": "assistant", + "content": [ + { "type": "output_text", "text": "hi there" } + ] + } + ] + }); + + let bytes = serde_json::to_vec(&request).unwrap(); + let parsed = ResponsesAPIRequest::try_from(bytes.as_slice()).unwrap(); + + let items = match &parsed.input { + InputParam::Items(items) => items, + _ => panic!("expected array input"), + }; + assert_eq!(items.len(), 2); + + // Assistant output_text must deserialize into the OutputText variant. + let assistant = items + .iter() + .find_map(|item| match item { + InputItem::Message(msg) if matches!(msg.role, MessageRole::Assistant) => Some(msg), + _ => None, + }) + .expect("assistant message present"); + match &assistant.content { + MessageContent::Items(contents) => { + assert!(matches!(contents[0], InputContent::OutputText { .. })); + } + _ => panic!("expected array content"), + } + + // Round-trip serialize and assert the type tags are preserved: + // user content stays input_text, assistant content stays output_text. + let serialized = serde_json::to_value(&parsed).unwrap(); + let input = &serialized["input"]; + assert_eq!(input[0]["content"][0]["type"], "input_text"); + assert_eq!(input[1]["content"][0]["type"], "output_text"); + } + #[test] fn test_request_deserializes_text_config_without_format() { let request = json!({ diff --git a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index b673af38..0514c039 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -112,33 +112,37 @@ impl TryFrom for Vec { ) => { // Check if it's a single text item (can use simple text format) if content_items.len() == 1 { - if let InputContent::InputText { text } = &content_items[0] - { - MessageContent::Text(text.clone()) - } else { - // Single non-text item - use parts format - MessageContent::Parts( - content_items - .iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { - text: text.clone(), - }) - } - InputContent::InputImage { image_url, .. } => { - Some(crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - }, - }) - } - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect(), - ) + match &content_items[0] { + InputContent::InputText { text } + | InputContent::OutputText { text } => { + MessageContent::Text(text.clone()) + } + _ => { + // Single non-text item - use parts format + MessageContent::Parts( + content_items + .iter() + .filter_map(|c| match c { + InputContent::InputText { text } + | InputContent::OutputText { text } => { + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + }, + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect(), + ) + } } } else { // Multiple content items - convert to parts @@ -146,7 +150,8 @@ impl TryFrom for Vec { content_items .iter() .filter_map(|c| match c { - InputContent::InputText { text } => { + InputContent::InputText { text } + | InputContent::OutputText { text } => { Some(crate::apis::openai::ContentPart::Text { text: text.clone(), }) diff --git a/crates/hermesllm/src/transforms/response/output_to_input.rs b/crates/hermesllm/src/transforms/response/output_to_input.rs index e62f32b8..e323ccf7 100644 --- a/crates/hermesllm/src/transforms/response/output_to_input.rs +++ b/crates/hermesllm/src/transforms/response/output_to_input.rs @@ -18,7 +18,9 @@ pub fn convert_responses_output_to_input_items(output: &OutputItem) -> Option { - Some(InputContent::InputText { text: text.clone() }) + // Assistant (output-role) content must round-trip as + // output_text; the Responses API rejects input_text here. + Some(InputContent::OutputText { text: text.clone() }) } OutputContent::OutputAudio { data, .. } => Some(InputContent::InputAudio { data: data.clone(), @@ -59,7 +61,7 @@ pub fn convert_responses_output_to_input_items(output: &OutputItem) -> Option { assert_eq!(items.len(), 1); match &items[0] { - InputContent::InputText { text } => assert_eq!(text, "Hello!"), - _ => panic!("Expected InputText"), + InputContent::OutputText { text } => assert_eq!(text, "Hello!"), + _ => panic!("Expected OutputText"), } } _ => panic!("Expected MessageContent::Items"), @@ -132,10 +134,10 @@ mod tests { assert!(matches!(msg.role, MessageRole::Assistant)); match &msg.content { MessageContent::Items(items) => match &items[0] { - InputContent::InputText { text } => { + InputContent::OutputText { text } => { assert!(text.contains("get_weather")); } - _ => panic!("Expected InputText"), + _ => panic!("Expected OutputText"), }, _ => panic!("Expected MessageContent::Items"), }