fix: truncate oversized user messages in orchestrator routing prompt

The orchestrator trimmer had a bypass that kept the latest user message whole even when it alone exceeded the configured token budget. This caused brightstaff to send a ~500KB prompt to the Plano-Orchestrator model, which rejected it with a 400 "context length exceeded" from the upstream 32K-token window. Brightstaff then surfaced a confusing "missing field id" parse error instead of the real upstream message. Fix the bypass by trimming the overflowing user message from the end toward the beginning until it fits in the remaining token budget. The beginning of the message (where user intent usually lives) is preserved and the tail is dropped. Added a UTF-8-safe byte-truncation helper and a regression test that mirrors the production payload (a single ~500KB user message with a small budget).
2026-06-02 14:35:14 +02:00 · 2026-04-17 18:00:02 -07:00 · 2026-04-17 18:00:02 -07:00 · 321c28da37
commit 321c28da37
parent 37600fd07a
1 changed files with 129 additions and 28 deletions
--- a/crates/brightstaff/src/router/orchestrator_model_v1.rs
+++ b/crates/brightstaff/src/router/orchestrator_model_v1.rs
@ -192,32 +192,57 @@ impl OrchestratorModel for OrchestratorModelV1 {
        // Following code is to ensure that the conversation does not exceed max token length
        // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance
        let mut token_count = ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR;
-        let mut selected_messages_list_reversed: Vec<&Message> = vec![];
+        let mut selected_messages_list_reversed: Vec<Message> = vec![];
        for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() {
-            let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR;
-            token_count += message_token_count;
-            if token_count > self.max_token_length {
+            let message_text = message.content.extract_text();
+            let message_token_count = message_text.len() / TOKEN_LENGTH_DIVISOR;
+            if token_count + message_token_count > self.max_token_length {
+                let remaining_tokens = self.max_token_length.saturating_sub(token_count);
                debug!(
-                    token_count = token_count,
+                    attempted_total_tokens = token_count + message_token_count,
                    max_tokens = self.max_token_length,
+                    remaining_tokens,
                    selected = selected_messsage_count,
                    total = messages_vec.len(),
                    "token count exceeds max, truncating conversation"
                );
-                if message.role == Role::User {
-                    // If message that exceeds max token length is from user, we need to keep it
-                    selected_messages_list_reversed.push(message);
+                // If the message that overflows the budget is from the user we need
+                // to keep at least some of it so the orchestrator sees the latest
+                // user intent. Trim from the end of the message toward the
+                // beginning until we fit in the remaining token budget.
+                if message.role == Role::User && remaining_tokens > 0 {
+                    let max_bytes = remaining_tokens.saturating_mul(TOKEN_LENGTH_DIVISOR);
+                    let truncated = truncate_to_utf8_boundary(&message_text, max_bytes);
+                    selected_messages_list_reversed.push(Message {
+                        role: Role::User,
+                        content: Some(MessageContent::Text(truncated.to_string())),
+                        name: None,
+                        tool_calls: None,
+                        tool_call_id: None,
+                    });
                }
                break;
            }
-            // If we are here, it means that the message is within the max token length
-            selected_messages_list_reversed.push(message);
+            token_count += message_token_count;
+            selected_messages_list_reversed.push(Message {
+                role: message.role.clone(),
+                content: Some(MessageContent::Text(message_text)),
+                name: None,
+                tool_calls: None,
+                tool_call_id: None,
+            });
        }

        if selected_messages_list_reversed.is_empty() {
            debug!("no messages selected, using last message");
            if let Some(last_message) = messages_vec.last() {
-                selected_messages_list_reversed.push(last_message);
+                selected_messages_list_reversed.push(Message {
+                    role: last_message.role.clone(),
+                    content: Some(MessageContent::Text(last_message.content.extract_text())),
+                    name: None,
+                    tool_calls: None,
+                    tool_call_id: None,
+                });
            }
        }

@ -237,22 +262,8 @@ impl OrchestratorModel for OrchestratorModelV1 {
        }

        // Reverse the selected messages to maintain the conversation order
-        let selected_conversation_list = selected_messages_list_reversed
-            .iter()
-            .rev()
-            .map(|message| Message {
-                role: message.role.clone(),
-                content: Some(MessageContent::Text(
-                    message
-                        .content
-                        .as_ref()
-                        .map_or(String::new(), |c| c.to_string()),
-                )),
-                name: None,
-                tool_calls: None,
-                tool_call_id: None,
-            })
-            .collect::<Vec<Message>>();
+        let selected_conversation_list: Vec<Message> =
+            selected_messages_list_reversed.into_iter().rev().collect();

        // Generate the orchestrator request message based on the usage preferences.
        // If preferences are passed in request then we use them;
@ -405,6 +416,20 @@ fn fix_json_response(body: &str) -> String {
    body.replace("'", "\"").replace("\\n", "")
 }

+/// Truncate `s` so that the returned slice is at most `max_bytes` bytes long
+/// and ends on a UTF-8 character boundary. Keeps the beginning of the string
+/// and drops characters from the end.
+fn truncate_to_utf8_boundary(s: &str, max_bytes: usize) -> &str {
+    if s.len() <= max_bytes {
+        return s;
+    }
+    let mut end = max_bytes;
+    while end > 0 && !s.is_char_boundary(end) {
+        end -= 1;
+    }
+    &s[..end]
+}
+
 impl std::fmt::Debug for dyn OrchestratorModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "OrchestratorModel")
@ -777,6 +802,10 @@ If no routes are needed, return an empty list for `route`.

    #[test]
    fn test_conversation_trim_upto_user_message() {
+        // With max_token_length=230 the older user message "given the image In
+        // style of Andy Warhol" overflows the remaining budget and is
+        // truncated from the end (chars dropped from the end of the string)
+        // until it fits. The newer assistant/user turns are preserved in full.
        let expected_prompt = r#"
 You are a helpful assistant that selects the most suitable routes based on user intent.
 You are provided with a list of available routes enclosed within <routes></routes> XML tags:
@ -789,7 +818,7 @@ You are also given the conversation context enclosed within <conversation></conv
 [
    {
        "role": "user",
-        "content": "given the image In style of Andy Warhol"
+        "content": "given the im"
    },
    {
        "role": "assistant",
@ -862,6 +891,78 @@ If no routes are needed, return an empty list for `route`.
        assert_eq!(expected_prompt, prompt);
    }

+    #[test]
+    fn test_huge_single_user_message_is_truncated() {
+        // Regression test for the case where a single, extremely large user
+        // message was being passed through to the orchestrator verbatim,
+        // blowing past the upstream model's context window. The trimmer must
+        // now truncate the oversized user message from the end until it fits
+        // within the configured budget.
+        let orchestrations_str = r#"
+          {
+            "gpt-4o": [
+              {"name": "Image generation", "description": "generating image"}
+            ]
+        }
+        "#;
+        let agent_orchestrations = serde_json::from_str::<
+            HashMap<String, Vec<OrchestrationPreference>>,
+        >(orchestrations_str)
+        .unwrap();
+
+        let max_token_length = 2048;
+        let orchestrator = OrchestratorModelV1::new(
+            agent_orchestrations,
+            "test-model".to_string(),
+            max_token_length,
+        );
+
+        // ~500KB of content — similar in scale to the real payload that
+        // triggered the upstream 400 "context length exceeded".
+        let huge_user_content = "A".repeat(500_000);
+        let conversation = vec![Message {
+            role: Role::User,
+            content: Some(MessageContent::Text(huge_user_content.clone())),
+            name: None,
+            tool_calls: None,
+            tool_call_id: None,
+        }];
+
+        let req = orchestrator.generate_request(&conversation, &None);
+        let prompt = req.messages[0].content.extract_text();
+
+        // Final prompt must be bounded. Use a generous ceiling: the configured
+        // budget converted to bytes (tokens * divisor) plus the system prompt
+        // and routes JSON overhead. In practice the result should be well
+        // under this ceiling.
+        let byte_ceiling = max_token_length * TOKEN_LENGTH_DIVISOR
+            + ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len()
+            + 512;
+        assert!(
+            prompt.len() < byte_ceiling,
+            "prompt length {} exceeded ceiling {} — truncation did not apply",
+            prompt.len(),
+            byte_ceiling,
+        );
+
+        // The oversized user message must have been truncated — i.e. not all
+        // 500k "A" characters made it through.
+        let a_count = prompt.chars().filter(|c| *c == 'A').count();
+        assert!(
+            a_count < huge_user_content.len(),
+            "expected user message to be truncated, but all {} 'A' chars survived",
+            a_count
+        );
+        assert!(
+            a_count > 0,
+            "expected at least some of the user message to survive truncation"
+        );
+
+        // Sanity: the prompt still includes the routing prompt scaffolding.
+        assert!(prompt.contains("<conversation>"));
+        assert!(prompt.contains("<routes>"));
+    }
+
    #[test]
    fn test_non_text_input() {
        let expected_prompt = r#"