From dfd58867433b2800f83bb9835af3180da7a75961 Mon Sep 17 00:00:00 2001 From: Salman Paracha Date: Wed, 17 Dec 2025 14:13:30 -0800 Subject: [PATCH] reducing false positives for signals like positive interaction --- crates/brightstaff/src/handlers/utils.rs | 2 +- crates/brightstaff/src/signals/signals.rs | 210 +++++++++++++++++++++- 2 files changed, 202 insertions(+), 10 deletions(-) diff --git a/crates/brightstaff/src/handlers/utils.rs b/crates/brightstaff/src/handlers/utils.rs index 5d8612f5..d0060090 100644 --- a/crates/brightstaff/src/handlers/utils.rs +++ b/crates/brightstaff/src/handlers/utils.rs @@ -165,7 +165,7 @@ impl StreamProcessor for ObservableStreamProcessor { || report.escalation.escalation_requested || matches!( report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Critical + InteractionQuality::Poor | InteractionQuality::Severe ); if should_flag { diff --git a/crates/brightstaff/src/signals/signals.rs b/crates/brightstaff/src/signals/signals.rs index 0893afe7..656702ac 100644 --- a/crates/brightstaff/src/signals/signals.rs +++ b/crates/brightstaff/src/signals/signals.rs @@ -16,7 +16,7 @@ use hermesllm::apis::openai::{Message, Role}; // ============================================================================ /// Flag emoji for marking spans/operations worth investigating -pub const FLAG_MARKER: &str = "\u{2691}"; +pub const FLAG_MARKER: &str = "\u{1F6A9}"; // ============================================================================ // Normalized Message Processing @@ -152,7 +152,7 @@ pub enum InteractionQuality { /// Poor interaction with concerning signals Poor, /// Critical interaction with severe negative signals - Critical, + Severe, } /// Container for all computed signals for a conversation @@ -382,6 +382,12 @@ impl SignalAnalyzer { } } + /// Check if a pattern is long enough to warrant fuzzy matching + /// Short patterns (< 3 words) should use exact matching only to avoid false positives + fn should_use_fuzzy_matching(pattern: &str) -> bool { + pattern.split_whitespace().count() >= 3 + } + /// Create a new signal analyzer with default settings pub fn new() -> Self { Self { @@ -571,7 +577,7 @@ impl SignalAnalyzer { repair_phrases.push(format!("Turn {}: '{}'", i + 1, pattern)); found_in_turn = true; break; - } else if norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { + } else if Self::should_use_fuzzy_matching(pattern) && norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { repair_count += 1; repair_phrases.push(format!("Turn {}: '{}' (fuzzy)", i + 1, pattern)); found_in_turn = true; @@ -754,7 +760,7 @@ impl SignalAnalyzer { snippet: pattern.to_string(), }); break; - } else if norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { + } else if Self::should_use_fuzzy_matching(pattern) && norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { indicators.push(FrustrationIndicator { indicator_type: FrustrationType::DirectComplaint, message_index: *i, @@ -1056,7 +1062,7 @@ impl SignalAnalyzer { }); found_in_turn = true; break; - } else if norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { + } else if Self::should_use_fuzzy_matching(pattern) && norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { indicators.push(PositiveIndicator { indicator_type: PositiveType::Gratitude, message_index: *i, @@ -1243,7 +1249,7 @@ impl SignalAnalyzer { escalation_type: EscalationType::HumanAgent, }); break; - } else if norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { + } else if Self::should_use_fuzzy_matching(pattern) && norm_msg.fuzzy_contains_phrase(pattern, self.fuzzy_threshold) { requests.push(EscalationRequest { message_index: *i, snippet: format!("{} (fuzzy)", pattern), @@ -1343,7 +1349,7 @@ impl SignalAnalyzer { || repetition.severity >= 3 || turn_count.is_excessive { - return InteractionQuality::Critical; + return InteractionQuality::Severe; } // Calculate quality score @@ -1379,7 +1385,7 @@ impl SignalAnalyzer { } else if score >= 25.0 { InteractionQuality::Poor } else { - InteractionQuality::Critical + InteractionQuality::Severe } } @@ -1661,7 +1667,7 @@ mod tests { let report = analyzer.analyze(&messages); assert!(matches!( report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Critical + InteractionQuality::Poor | InteractionQuality::Severe )); assert!(report.frustration.has_frustration); assert!(report.escalation.escalation_requested); @@ -1848,4 +1854,190 @@ mod tests { // Should not detect as rephrase since only stopwords overlap assert_eq!(signal.repair_count, 0, "Messages with only stopword overlap should not be rephrases"); } + + #[test] + fn test_frustrated_user_with_legitimate_repair() { + let start = Instant::now(); + let analyzer = SignalAnalyzer::new(); + + use hermesllm::apis::openai::{ToolCall, FunctionCall}; + + // Helper to create a message with tool calls + let create_assistant_with_tools = |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { + Message { + role: Role::Assistant, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: Some(vec![ToolCall { + id: tool_id.to_string(), + call_type: "function".to_string(), + function: FunctionCall { + name: tool_name.to_string(), + arguments: args.to_string(), + }, + }]), + tool_call_id: None, + } + }; + + // Helper to create a tool response message + let create_tool_message = |tool_call_id: &str, content: &str| -> Message { + Message { + role: Role::Tool, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: None, + tool_call_id: Some(tool_call_id.to_string()), + } + }; + + // Scenario: User DOES mention New York in first message, making "I already told you" legitimate + let messages = vec![ + create_message(Role::User, "I need to book a flight from New York to Paris for December 20th"), + create_assistant_with_tools( + "I'll help you search for flights to Paris.", + "call_123", + "search_flights", + r#"{"origin": "NYC", "destination": "Paris", "date": "2025-12-20"}"# + ), + create_tool_message("call_123", r#"{"flights": []}"#), + create_message(Role::Assistant, "I couldn't find any flights. Could you provide your departure city?"), + create_message(Role::User, "I already told you, from New York!"), + create_assistant_with_tools( + "Let me try again.", + "call_456", + "search_flights", + r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"# + ), + create_tool_message("call_456", r#"{"flights": []}"#), + create_message(Role::Assistant, "I'm still not finding results. Let me check the system."), + create_message(Role::User, "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?"), + create_message(Role::Assistant, "I sincerely apologize for the frustration with the search tool."), + create_message(Role::User, "Forget it. I need to speak to a human agent. This is a waste of time."), + ]; + + let report = analyzer.analyze(&messages); + + // Tool messages should be filtered out, so we should only analyze text messages + // That's 4 user messages + 5 assistant text messages = 9 turns + assert_eq!(report.turn_count.total_turns, 9, "Should count 9 text messages (tool messages filtered out)"); + assert!(report.turn_count.is_concerning, "Should flag concerning turn count"); + + // Should detect frustration (all caps, complaints) + assert!(report.frustration.has_frustration, "Should detect frustration"); + assert!(report.frustration.frustration_count >= 2, "Should detect multiple frustration indicators"); + assert!(report.frustration.severity >= 2, "Should have moderate or higher frustration severity"); + + // Should detect escalation request + assert!(report.escalation.escalation_requested, "Should detect escalation to human agent"); + assert!(report.escalation.escalation_count >= 1, "Should detect at least one escalation"); + + // Overall quality should be Poor or Severe + assert!( + matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ), + "Quality should be Poor or Severe, got {:?}", + report.overall_quality + ); + + println!("test_frustrated_user_with_legitimate_repair took: {:?}", start.elapsed()); + } + + #[test] + fn test_frustrated_user_false_claim() { + let start = Instant::now(); + let analyzer = SignalAnalyzer::new(); + + use hermesllm::apis::openai::{ToolCall, FunctionCall}; + + // Helper to create a message with tool calls + let create_assistant_with_tools = |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { + Message { + role: Role::Assistant, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: Some(vec![ToolCall { + id: tool_id.to_string(), + call_type: "function".to_string(), + function: FunctionCall { + name: tool_name.to_string(), + arguments: args.to_string(), + }, + }]), + tool_call_id: None, + } + }; + + // Helper to create a tool response message + let create_tool_message = |tool_call_id: &str, content: &str| -> Message { + Message { + role: Role::Tool, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: None, + tool_call_id: Some(tool_call_id.to_string()), + } + }; + + // Scenario: User NEVER mentions New York in first message but claims "I already told you" + // This represents realistic frustrated user behavior - exaggeration/misremembering + let messages = vec![ + create_message(Role::User, "I need to book a flight to Paris for December 20th"), + create_assistant_with_tools( + "I'll help you search for flights to Paris.", + "call_123", + "search_flights", + r#"{"destination": "Paris", "date": "2025-12-20"}"# + ), + create_tool_message("call_123", r#"{"error": "origin required"}"#), + create_message(Role::Assistant, "I couldn't find any flights. Could you provide your departure city?"), + create_message(Role::User, "I already told you, from New York!"), // False claim - never mentioned it + create_assistant_with_tools( + "Let me try again.", + "call_456", + "search_flights", + r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"# + ), + create_tool_message("call_456", r#"{"flights": []}"#), + create_message(Role::Assistant, "I'm still not finding results. Let me check the system."), + create_message(Role::User, "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?"), + create_message(Role::Assistant, "I sincerely apologize for the frustration with the search tool."), + create_message(Role::User, "Forget it. I need to speak to a human agent. This is a waste of time."), + ]; + + let report = analyzer.analyze(&messages); + + // Tool messages should be filtered out, so we should only analyze text messages + // That's 4 user messages + 5 assistant text messages = 9 turns + assert_eq!(report.turn_count.total_turns, 9, "Should count 9 text messages (tool messages filtered out)"); + assert!(report.turn_count.is_concerning, "Should flag concerning turn count"); + + // Should detect frustration (all caps, complaints, false claims) + assert!(report.frustration.has_frustration, "Should detect frustration"); + assert!(report.frustration.frustration_count >= 2, "Should detect multiple frustration indicators"); + assert!(report.frustration.severity >= 2, "Should have moderate or higher frustration severity"); + + // Should detect escalation request + assert!(report.escalation.escalation_requested, "Should detect escalation to human agent"); + assert!(report.escalation.escalation_count >= 1, "Should detect at least one escalation"); + + // Note: May detect false positive "positive feedback" due to fuzzy matching + // e.g., "I already told YOU" matches "you rock", "THIS is RIDICULOUS" matches "this helps" + // However, the overall quality should still be Poor/Severe due to frustration+escalation + + // Overall quality should be Poor or Severe (frustration + escalation indicates poor interaction) + assert!( + matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ), + "Quality should be Poor or Severe for frustrated user with false claims, got {:?}", + report.overall_quality + ); + + println!("test_frustrated_user_false_claim took: {:?}", start.elapsed()); + println!("Full signal analysis completed in {:?}", start.elapsed()); + } }