diff --git a/cli/planoai/main.py b/cli/planoai/main.py index 2130dce1..a2737883 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -81,7 +81,7 @@ def main(ctx, version): @click.command() def build(): - """Build Arch from source. Works from any directory within the repo.""" + """Build Plano from source. Works from any directory within the repo.""" # Find the repo root repo_root = find_repo_root() @@ -112,7 +112,7 @@ def build(): ], check=True, ) - click.echo("archgw image built successfully.") + click.echo("plano image built successfully.") except subprocess.CalledProcessError as e: click.echo(f"Error building plano image: {e}") sys.exit(1) diff --git a/crates/Cargo.lock b/crates/Cargo.lock index 4dbcae49..c1ac4497 100644 --- a/crates/Cargo.lock +++ b/crates/Cargo.lock @@ -335,6 +335,7 @@ dependencies = [ "serde_json", "serde_with", "serde_yaml", + "strsim", "thiserror 2.0.12", "time", "tokio", diff --git a/crates/brightstaff/Cargo.toml b/crates/brightstaff/Cargo.toml index 233a4da3..f48a8d68 100644 --- a/crates/brightstaff/Cargo.toml +++ b/crates/brightstaff/Cargo.toml @@ -30,6 +30,7 @@ reqwest = { version = "0.12.15", features = ["stream"] } serde = { version = "1.0.219", features = ["derive"] } serde_json = "1.0.140" serde_with = "3.13.0" +strsim = "0.11" serde_yaml = "0.9.34" thiserror = "2.0.12" tokio = { version = "1.44.2", features = ["full"] } diff --git a/crates/brightstaff/src/handlers/llm.rs b/crates/brightstaff/src/handlers/llm.rs index b311976a..b3cea5e1 100644 --- a/crates/brightstaff/src/handlers/llm.rs +++ b/crates/brightstaff/src/handlers/llm.rs @@ -111,6 +111,9 @@ pub async fn llm_chat( .get_recent_user_message() .map(|msg| truncate_message(&msg, 50)); + // Extract messages for signal analysis (clone before moving client_request) + let messages_for_signals = client_request.get_messages(); + client_request.set_model(resolved_model.clone()); if client_request.remove_metadata_key("archgw_preference_config") { debug!( @@ -292,6 +295,7 @@ pub 
async fn llm_chat( operation_component::LLM, llm_span, request_start_time, + Some(messages_for_signals), ); // === v1/responses state management: Wrap with ResponsesStateProcessor === diff --git a/crates/brightstaff/src/handlers/utils.rs b/crates/brightstaff/src/handlers/utils.rs index 1529ef3e..f2a318a4 100644 --- a/crates/brightstaff/src/handlers/utils.rs +++ b/crates/brightstaff/src/handlers/utils.rs @@ -10,8 +10,10 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_stream::StreamExt; use tracing::warn; -// Import tracing constants -use crate::tracing::{error, llm}; +// Import tracing constants and signals +use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER}; +use crate::tracing::{error, llm, signals as signal_constants}; +use hermesllm::apis::openai::Message; /// Trait for processing streaming chunks /// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging) @@ -38,6 +40,7 @@ pub struct ObservableStreamProcessor { chunk_count: usize, start_time: Instant, time_to_first_token: Option, + messages: Option>, } impl ObservableStreamProcessor { @@ -48,11 +51,13 @@ impl ObservableStreamProcessor { /// * `service_name` - The service name for this span (e.g., "archgw(llm)") /// * `span` - The span to finalize after streaming completes /// * `start_time` - When the request started (for duration calculation) + /// * `messages` - Optional conversation messages for signal analysis pub fn new( collector: Arc, service_name: impl Into, span: Span, start_time: Instant, + messages: Option>, ) -> Self { Self { collector, @@ -62,6 +67,7 @@ impl ObservableStreamProcessor { chunk_count: 0, start_time, time_to_first_token: None, + messages, } } } @@ -133,6 +139,94 @@ impl StreamProcessor for ObservableStreamProcessor { } } + // Analyze signals if messages are available and add to span attributes + if let Some(ref messages) = self.messages { + let analyzer: Box = 
Box::new(TextBasedSignalAnalyzer::new()); + let report = analyzer.analyze(messages); + + // Add overall quality + self.span.attributes.push(Attribute { + key: signal_constants::QUALITY.to_string(), + value: AttributeValue { + string_value: Some(format!("{:?}", report.overall_quality)), + }, + }); + + // Add repair/follow-up metrics if concerning + if report.follow_up.is_concerning || report.follow_up.repair_count > 0 { + self.span.attributes.push(Attribute { + key: signal_constants::REPAIR_COUNT.to_string(), + value: AttributeValue { + string_value: Some(report.follow_up.repair_count.to_string()), + }, + }); + + self.span.attributes.push(Attribute { + key: signal_constants::REPAIR_RATIO.to_string(), + value: AttributeValue { + string_value: Some(format!("{:.3}", report.follow_up.repair_ratio)), + }, + }); + } + + // Add flag marker to operation name if any concerning signal is detected + let should_flag = report.frustration.has_frustration + || report.repetition.has_looping + || report.escalation.escalation_requested + || matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ); + + if should_flag { + // Prepend flag marker to the operation name + self.span.name = format!("{} {}", self.span.name, FLAG_MARKER); + } + + // Add key signal metrics + if report.frustration.has_frustration { + self.span.attributes.push(Attribute { + key: signal_constants::FRUSTRATION_COUNT.to_string(), + value: AttributeValue { + string_value: Some(report.frustration.frustration_count.to_string()), + }, + }); + self.span.attributes.push(Attribute { + key: signal_constants::FRUSTRATION_SEVERITY.to_string(), + value: AttributeValue { + string_value: Some(report.frustration.severity.to_string()), + }, + }); + } + + if report.repetition.has_looping { + self.span.attributes.push(Attribute { + key: signal_constants::REPETITION_COUNT.to_string(), + value: AttributeValue { + string_value: Some(report.repetition.repetition_count.to_string()), + }, + }); + } + 
+ if report.escalation.escalation_requested { + self.span.attributes.push(Attribute { + key: signal_constants::ESCALATION_REQUESTED.to_string(), + value: AttributeValue { + string_value: Some("true".to_string()), + }, + }); + } + + if report.positive_feedback.has_positive_feedback { + self.span.attributes.push(Attribute { + key: signal_constants::POSITIVE_FEEDBACK_COUNT.to_string(), + value: AttributeValue { + string_value: Some(report.positive_feedback.positive_count.to_string()), + }, + }); + } + } + // Record the finalized span self.collector .record_span(&self.service_name, self.span.clone()); diff --git a/crates/brightstaff/src/lib.rs b/crates/brightstaff/src/lib.rs index 36fc902f..47da64aa 100644 --- a/crates/brightstaff/src/lib.rs +++ b/crates/brightstaff/src/lib.rs @@ -1,5 +1,6 @@ pub mod handlers; pub mod router; +pub mod signals; pub mod state; pub mod tracing; pub mod utils; diff --git a/crates/brightstaff/src/signals/analyzer.rs b/crates/brightstaff/src/signals/analyzer.rs new file mode 100644 index 00000000..9880bf2c --- /dev/null +++ b/crates/brightstaff/src/signals/analyzer.rs @@ -0,0 +1,3189 @@ +//! Agentic Signals - Behavioral quality indicators for agent interactions +//! +//! This module implements various signals that serve as early warning indicators +//! of brilliant successes or failures in agentic interactions. These signals are +//! derived from conversation patterns and can be computed algorithmically from +//! message arrays. 
+ +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::sync::LazyLock; + +use hermesllm::apis::openai::{Message, Role}; + +// ============================================================================ +// Constants +// ============================================================================ + +/// Flag emoji for marking spans/operations worth investigating +pub const FLAG_MARKER: &str = "\u{1F6A9}"; + +/// Size of character n-grams for similarity matching (3 = trigrams) +const NGRAM_SIZE: usize = 3; + +// ============================================================================ +// Normalized Message Processing +// ============================================================================ + +/// Pre-processed message with normalized text and tokens for efficient matching +#[derive(Debug, Clone)] +struct NormalizedMessage { + /// Original raw text + raw: String, + /// Tokens (words) extracted from the message + tokens: Vec, + /// Token set for fast lookup + token_set: HashSet, + /// Bigram set for fast similarity computation + bigram_set: HashSet, + /// Character ngram set for robust similarity matching + char_ngram_set: HashSet, + /// Token frequency map for multiset cosine similarity + token_frequency: HashMap, +} + +impl NormalizedMessage { + #[allow(dead_code)] // Used in tests for algorithm validation + fn from_text(text: &str) -> Self { + Self::from_text_with_limit(text, usize::MAX) + } + + fn from_text_with_limit(text: &str, max_length: usize) -> Self { + // Truncate to max_length characters to prevent unbounded computation + // Keep head (20%) + tail (80%) to preserve both context and intent + + let char_count = text.chars().count(); + + let raw = if char_count <= max_length { + text.to_string() + } else { + // Split: 20% head, 79% tail, 1 char space delimiter + let head_len = max_length / 5; + let tail_len = max_length - head_len - 1; + + let head: String = text.chars().take(head_len).collect(); + let tail: 
String = text.chars().skip(char_count - tail_len).collect(); + + format!("{} {}", head, tail) + }; + + // Normalize unicode punctuation to ASCII equivalents + let normalized_unicode = raw + .replace(['\u{2019}', '\u{2018}'], "'") // U+2019/U+2018 SINGLE QUOTATION MARKs + .replace(['\u{201C}', '\u{201D}'], "\"") // U+201C/U+201D DOUBLE QUOTATION MARKs + .replace(['\u{2013}', '\u{2014}'], "-"); // U+2013/U+2014 EN/EM DASHes + + // Normalize: lowercase, collapse whitespace + let normalized = normalized_unicode + .to_lowercase() + .split_whitespace() + .collect::>() + .join(" "); + + // Tokenize: split on whitespace and strip punctuation from boundaries + let tokens: Vec = normalized + .split_whitespace() + .map(|word| { + // Strip leading/trailing punctuation but keep internal punctuation + word.trim_matches(|c: char| c.is_ascii_punctuation()) + .to_string() + }) + .filter(|w| !w.is_empty()) + .collect(); + + let token_set: HashSet = tokens.iter().cloned().collect(); + + // Generate bigram set directly for similarity matching + let bigram_set: HashSet = tokens + .windows(2) + .map(|w| format!("{} {}", w[0], w[1])) + .collect(); + + // Generate character ngram set for robust similarity matching + // Uses tokens (with punctuation stripped) for consistency with pattern matching + let tokens_text = tokens.join(" "); + let char_ngram_set: HashSet = tokens_text + .chars() + .collect::>() + .windows(NGRAM_SIZE) + .map(|w| w.iter().collect::()) + .collect(); + + // Compute token frequency map for cosine similarity + let mut token_frequency: HashMap = HashMap::new(); + for token in &tokens { + *token_frequency.entry(token.clone()).or_insert(0) += 1; + } + + Self { + raw, + tokens, + token_set, + bigram_set, + char_ngram_set, + token_frequency, + } + } + + /// Check if a single token exists in the message (word boundary aware) + fn contains_token(&self, token: &str) -> bool { + self.token_set.contains(token) + } + + /// Check if a phrase (sequence of tokens) exists in the 
message + fn contains_phrase(&self, phrase: &str) -> bool { + let phrase_tokens: Vec<&str> = phrase.split_whitespace().collect(); + if phrase_tokens.is_empty() { + return false; + } + + if phrase_tokens.len() == 1 { + return self.contains_token(phrase_tokens[0]); + } + + // Multi-word phrase: check for sequence in tokens + self.tokens.windows(phrase_tokens.len()).any(|window| { + window + .iter() + .zip(phrase_tokens.iter()) + .all(|(token, phrase_token)| token == phrase_token) + }) + } + + /// Calculate character ngram similarity between this message and a pattern + /// Returns a similarity score between 0.0 and 1.0 + /// This is robust to typos, small edits, and word insertions + #[allow(dead_code)] // Used in tests for algorithm validation + fn char_ngram_similarity(&self, pattern: &str) -> f64 { + // Normalize the pattern: lowercase and remove ALL punctuation + // This makes "doesn't" → "doesnt" for robust typo matching + let normalized_pattern = pattern + .to_lowercase() + .chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace()) + .collect::() + .split_whitespace() + .collect::>() + .join(" "); + + // Generate ngrams for the pattern + let pattern_ngrams: HashSet = normalized_pattern + .chars() + .collect::>() + .windows(NGRAM_SIZE) + .map(|w| w.iter().collect::()) + .collect(); + + if self.char_ngram_set.is_empty() && pattern_ngrams.is_empty() { + return 1.0; // Both empty = identical + } + + if self.char_ngram_set.is_empty() || pattern_ngrams.is_empty() { + return 0.0; + } + + // Compute Jaccard similarity (intersection / union) + let intersection = self.char_ngram_set.intersection(&pattern_ngrams).count(); + let union = self.char_ngram_set.union(&pattern_ngrams).count(); + + if union == 0 { + return 0.0; + } + + intersection as f64 / union as f64 + } + + /// Calculate token-based cosine similarity using term frequencies + /// Returns a similarity score between 0.0 and 1.0 + /// This handles word frequency and is stable for longer messages + 
#[allow(dead_code)] // Used in tests for algorithm validation + fn token_cosine_similarity(&self, pattern: &str) -> f64 { + // Tokenize and compute frequencies for the pattern + let pattern_tokens: Vec = pattern + .to_lowercase() + .split_whitespace() + .map(|word| { + word.trim_matches(|c: char| c.is_ascii_punctuation()) + .to_string() + }) + .filter(|w| !w.is_empty()) + .collect(); + + let mut pattern_frequency: HashMap = HashMap::new(); + for token in &pattern_tokens { + *pattern_frequency.entry(token.clone()).or_insert(0) += 1; + } + + if self.token_frequency.is_empty() && pattern_frequency.is_empty() { + return 1.0; + } + + if self.token_frequency.is_empty() || pattern_frequency.is_empty() { + return 0.0; + } + + // Compute cosine similarity + // cosine_sim = dot_product / (norm1 * norm2) + + let mut dot_product = 0.0; + let mut norm1_squared = 0.0; + let mut norm2_squared = 0.0; + + // Collect all unique tokens from both sets + let all_tokens: HashSet = self + .token_frequency + .keys() + .chain(pattern_frequency.keys()) + .cloned() + .collect(); + + for token in all_tokens { + let freq1 = *self.token_frequency.get(&token).unwrap_or(&0) as f64; + let freq2 = *pattern_frequency.get(&token).unwrap_or(&0) as f64; + + dot_product += freq1 * freq2; + norm1_squared += freq1 * freq1; + norm2_squared += freq2 * freq2; + } + + let norm1 = norm1_squared.sqrt(); + let norm2 = norm2_squared.sqrt(); + + if norm1 == 0.0 || norm2 == 0.0 { + return 0.0; + } + + dot_product / (norm1 * norm2) + } + + /// Layered phrase matching: exact → character ngram → token cosine + /// Returns true if the pattern matches using any layer + #[allow(dead_code)] // Kept for reference; production uses matches_normalized_pattern + fn layered_contains_phrase( + &self, + pattern: &str, + char_ngram_threshold: f64, + token_cosine_threshold: f64, + ) -> bool { + // Layer 0: Exact phrase match (fastest) + if self.contains_phrase(pattern) { + return true; + } + + // Layer 1: Character ngram similarity 
(typo/edit robustness) + // Check whole message first (for short messages) + if self.char_ngram_similarity(pattern) >= char_ngram_threshold { + return true; + } + + // ngram containment check for patterns buried in longer messages + // If ALL of the pattern's ngrams exist in the message, the pattern must be + // present (possibly with minor variations like missing apostrophes). + // This is O(pattern_ngrams) lookups vs expensive window sliding. + if self.char_ngram_containment(pattern) >= 1.0 { + return true; + } + + // Layer 2: Token cosine similarity (semantic stability for long messages) + if self.token_cosine_similarity(pattern) >= token_cosine_threshold { + return true; + } + + false + } + + fn char_ngram_containment(&self, pattern: &str) -> f64 { + // Normalize the pattern the same way as char_ngram_similarity + let normalized_pattern = pattern + .to_lowercase() + .chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace()) + .collect::() + .split_whitespace() + .collect::>() + .join(" "); + + // Generate ngrams for the pattern + let pattern_ngrams: HashSet = normalized_pattern + .chars() + .collect::>() + .windows(NGRAM_SIZE) + .map(|w| w.iter().collect::()) + .collect(); + + if pattern_ngrams.is_empty() { + return 0.0; + } + + // Count how many pattern ngrams exist in the message + let contained = pattern_ngrams + .iter() + .filter(|t| self.char_ngram_set.contains(*t)) + .count(); + + contained as f64 / pattern_ngrams.len() as f64 + } + + /// Fast matching against a pre-normalized pattern + /// This avoids re-normalizing and re-computing ngrams for each pattern + fn matches_normalized_pattern( + &self, + pattern: &NormalizedPattern, + char_ngram_threshold: f64, + token_cosine_threshold: f64, + ) -> bool { + // Layer 0: Exact phrase match (fastest) + if self.contains_phrase(&pattern.raw) { + return true; + } + + // Layer 1: Character ngram similarity using pre-computed ngrams + if !self.char_ngram_set.is_empty() && !pattern.char_ngram_set.is_empty() { + 
let intersection = self + .char_ngram_set + .intersection(&pattern.char_ngram_set) + .count(); + let union = self.char_ngram_set.union(&pattern.char_ngram_set).count(); + if union > 0 { + let similarity = intersection as f64 / union as f64; + if similarity >= char_ngram_threshold { + return true; + } + } + } + + // Ngram containment check using pre-computed ngrams + if !pattern.char_ngram_set.is_empty() { + let contained = pattern + .char_ngram_set + .iter() + .filter(|t| self.char_ngram_set.contains(*t)) + .count(); + let containment = contained as f64 / pattern.char_ngram_set.len() as f64; + if containment >= 1.0 { + return true; + } + } + + // Layer 2: Token cosine similarity using pre-computed frequencies + if !self.token_frequency.is_empty() && !pattern.token_frequency.is_empty() { + let mut dot_product = 0.0; + let mut norm1_squared = 0.0; + let mut norm2_squared = 0.0; + + // Iterate over pattern tokens (usually smaller set) + for (token, &freq2) in &pattern.token_frequency { + let freq1 = *self.token_frequency.get(token).unwrap_or(&0) as f64; + let freq2 = freq2 as f64; + dot_product += freq1 * freq2; + norm2_squared += freq2 * freq2; + } + + // Add self tokens not in pattern for norm1 + for &freq1 in self.token_frequency.values() { + norm1_squared += (freq1 as f64) * (freq1 as f64); + } + + let norm1 = norm1_squared.sqrt(); + let norm2 = norm2_squared.sqrt(); + + if norm1 > 0.0 && norm2 > 0.0 { + let similarity = dot_product / (norm1 * norm2); + if similarity >= token_cosine_threshold { + return true; + } + } + } + + false + } +} + +// ============================================================================ +// Normalized Pattern (pre-computed for performance) +// ============================================================================ + +/// Pre-processed pattern with normalized text and pre-computed ngrams/tokens +/// This avoids redundant computation when matching against many messages +#[derive(Debug, Clone)] +struct NormalizedPattern { + /// 
Original raw pattern text + raw: String, + /// Character ngram set for similarity matching + char_ngram_set: HashSet, + /// Token frequency map for cosine similarity + token_frequency: HashMap, +} + +impl NormalizedPattern { + fn new(pattern: &str) -> Self { + // Normalize: lowercase and remove ALL punctuation + let normalized = pattern + .to_lowercase() + .chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace()) + .collect::() + .split_whitespace() + .collect::>() + .join(" "); + + // Generate ngrams + let char_ngram_set: HashSet = normalized + .chars() + .collect::>() + .windows(NGRAM_SIZE) + .map(|w| w.iter().collect::()) + .collect(); + + // Compute token frequency map + let tokens: Vec = normalized + .split_whitespace() + .map(|s| s.to_string()) + .collect(); + let mut token_frequency: HashMap = HashMap::new(); + for token in tokens { + *token_frequency.entry(token).or_insert(0) += 1; + } + + Self { + raw: pattern.to_string(), + char_ngram_set, + token_frequency, + } + } +} + +/// Helper to create a static slice of normalized patterns +fn normalize_patterns(patterns: &[&str]) -> Vec { + patterns.iter().map(|p| NormalizedPattern::new(p)).collect() +} + +// ============================================================================ +// Pre-computed Pattern Caches (initialized once at startup) +// ============================================================================ + +static REPAIR_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Explicit corrections + "i meant", + "i mean", + "sorry, i meant", + "what i meant was", + "what i actually meant", + "i was trying to say", + "let me correct that", + "correction", + "i misspoke", + // Negations and disagreements + "no, i", + "no i", + "nah i", + "nope i", + "not what i", + "that's not", + "that's not what", + "that isn't what", + "not quite", + "not exactly", + // Rephrasing indicators + "let me rephrase", + "let me try again", + "let me clarify", + "to clarify", + "to be clear", + 
"let me explain", + "what i'm trying to", + "what i'm saying", + "in other words", + // Actual/really emphasis + "actually i", + "actually no", + "what i actually", + "i actually", + "i really meant", + // Mistake acknowledgment + "i was wrong", + "my mistake", + "my bad", + "i should have said", + "i should clarify", + // Wait/hold indicators + "wait, i", + "wait no", + "hold on", + "hang on", + ]) +}); + +static COMPLAINT_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Useless/unhelpful (multi-word only) + "this is useless", + "not helpful", + "doesn't help", + "not helping", + "you're not helping", + "no help", + "unhelpful", + // Not working + "this doesn't work", + "doesn't work", + "not working", + "isn't working", + "won't work", + "still doesn't work", + "still not working", + // Not fixing/solving + "doesn't fix", + "not fixing", + "doesn't solve", + "doesn't seem to work", + "doesn't seem to fix", + "not resolving", + // Waste/pointless + "waste of time", + "wasting my time", + // Ridiculous/absurd + "this is ridiculous", + "ridiculous", + "this is absurd", + "absurd", + "this is insane", + "insane", + // Stupid/dumb (as adjectives, not as standalone tokens) + "this is stupid", + "this is dumb", + // Quality complaints (multi-word) + "this sucks", + "not good enough", + // Capability questions + "why can't you", + "can't you", + // Frustration + "this is frustrating", + "frustrated", + "incomplete", + "overwhelm", + "overwhelmed", + "overwhelming", + "exhausted", + "struggled", + // same issue + "same issue", + // polite dissatisfaction + "i'm disappointed", + "thanks, but", + "appreciate it, but", + "good, but", + // Fed up/done + "i give up", + "give up", + "fed up", + "had enough", + "can't take", + // Bot-specific complaints + "useless bot", + "dumb bot", + "stupid bot", + ]) +}); + +static CONFUSION_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Don't understand + "i don't understand", + "don't 
understand", + "not understanding", + "can't understand", + "don't get it", + "don't follow", + // Confused state + "i'm confused", + "so confused", + // Makes no sense + "makes no sense", + "doesn't make sense", + "not making sense", + // What do you mean (keep multi-word) + "what do you mean", + "what does that mean", + "what are you saying", + // Lost/unclear + "i'm lost", + "totally lost", + "lost me", + // No clue + "no clue", + "no idea", + // Come again + "come again", + "say that again", + "repeat that", + ]) +}); + +static GRATITUDE_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Standard gratitude + "thank you", + "thanks", + "thank u", + "thankyou", + "thx", + "ty", + "tyvm", + "tysm", + "thnx", + "thnks", + // Strong gratitude + "thanks so much", + "thank you so much", + "thanks a lot", + "thanks a bunch", + "much appreciated", + "really appreciate", + "greatly appreciate", + "appreciate it", + "appreciate that", + "i appreciate", + "grateful", + "so grateful", + // Helpfulness acknowledgment + "that's helpful", + "very helpful", + "super helpful", + "really helpful", + "that helps", + "this helps", + "helpful", + // Perfection expressions + "perfect", + "that's perfect", + "just perfect", + "exactly what i needed", + "exactly right", + "just what i needed", + "that's exactly", + // Informal positive + "you're the best", + "you rock", + "you're awesome", + "awesome sauce", + "legend", + ]) +}); + +static SATISFACTION_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Works/functions + "that works", + "this works", + "works great", + "works perfectly", + "works for me", + // Great variations + "that's great", + "that's amazing", + "this is great", + "sounds great", + "looks great", + "great job", + // Excellent/perfect + "excellent", + "outstanding", + "superb", + "spectacular", + // Awesome/amazing + "awesome", + "that's awesome", + "amazing", + "incredible", + // Love expressions + "love it", + "love this", + "i 
love", + "loving it", + "love that", + // Brilliant/wonderful + "brilliant", + "wonderful", + "fantastic", + "fabulous", + "marvelous", + ]) +}); + +static SUCCESS_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Understanding confirmation + "got it", + "i got it", + "understand", + "understood", + "i understand", + "makes sense", + "clear now", + "i see", + // Success/completion + "success", + "successful", + "it worked", + "that worked", + "this worked", + "worked", + // Problem resolution + "solved", + "resolved", + "fixed", + "fixed it", + "issue resolved", + "problem solved", + // Working state + "working now", + "it's working", + "works now", + "working fine", + "working great", + // Completion + "all set", + "all good", + "we're good", + "i'm good", + "all done", + "done", + "complete", + "finished", + // Perfect fit + "spot on", + "nailed it", + "bingo", + "exactly", + "just right", + ]) +}); + +static HUMAN_AGENT_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Speak to human + "speak to a human", + "speak to human", + "speak with a human", + "speak with human", + "talk to a human", + "talk to human", + "talk to a person", + "talk to person", + "talk to someone", + // Human/real agent + "human agent", + "real agent", + "actual agent", + "live agent", + "human support", + // Real/actual person + "real person", + "actual person", + "real human", + "actual human", + "someone real", + // Need/want human + "need a human", + "need human", + "want a human", + "want human", + "get me a human", + "get me human", + "get me someone", + // Transfer/connect + "transfer me", + "connect me", + "escalate this", + // Representative (removed standalone "rep" - too many false positives) + "representative", + "customer service rep", + "customer service representative", + // Not a bot + "not a bot", + "not talking to a bot", + "tired of bots", + ]) +}); + +static SUPPORT_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ 
+ // Contact support + "contact support", + "call support", + "reach support", + "get support", + // Customer support + "customer support", + "customer service", + "tech support", + "technical support", + // Help desk + "help desk", + "helpdesk", + "support desk", + // Talk to support + "talk to support", + "speak to support", + "speak with support", + "chat with support", + // Need help + "need real help", + "need actual help", + "help me now", + ]) +}); + +static QUIT_PATTERNS: LazyLock> = LazyLock::new(|| { + normalize_patterns(&[ + // Give up + "i give up", + "give up", + "giving up", + // Quit/leaving + "i'm going to quit", + "i quit", + "quitting", + "i'm leaving", + "i'm done", + "i'm out", + // Forget it + "forget it", + "forget this", + "screw it", + "screw this", + // Never mind + "never mind", + "nevermind", + "don't bother", + "not worth it", + // Hopeless + "this is hopeless", + // Going elsewhere + "going elsewhere", + "try somewhere else", + "look elsewhere", + "find another", + ]) +}); + +// ============================================================================ +// Core Signal Types +// ============================================================================ + +/// Overall quality assessment for an agent interaction session +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum InteractionQuality { + /// Excellent interaction with strong positive signals + Excellent, + /// Good interaction with mostly positive signals + Good, + /// Neutral interaction with mixed signals + Neutral, + /// Poor interaction with concerning signals + Poor, + /// Critical interaction with severe negative signals + Severe, +} + +/// Container for all computed signals for a conversation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SignalReport { + /// Turn count and efficiency metrics + pub turn_count: TurnCountSignal, + /// Follow-up and repair frequency + pub follow_up: FollowUpSignal, + /// User frustration indicators + pub 
frustration: FrustrationSignal, + /// Repetition and looping behavior + pub repetition: RepetitionSignal, + /// Positive feedback indicators + pub positive_feedback: PositiveFeedbackSignal, + /// User escalation requests + pub escalation: EscalationSignal, + /// Overall quality assessment + pub overall_quality: InteractionQuality, + /// Human-readable summary + pub summary: String, +} + +// ============================================================================ +// Individual Signal Types +// ============================================================================ + +/// Turn count and efficiency metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TurnCountSignal { + /// Total number of turns (user-agent exchanges) + pub total_turns: usize, + /// Number of user messages + pub user_turns: usize, + /// Number of assistant messages + pub assistant_turns: usize, + /// Whether the turn count is concerning (> 7) + pub is_concerning: bool, + /// Whether the turn count is excessive (> 12) + pub is_excessive: bool, + /// Efficiency score (0.0-1.0, lower turns = higher score) + pub efficiency_score: f64, +} + +/// Follow-up and repair frequency signal +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FollowUpSignal { + /// Number of detected repair attempts + pub repair_count: usize, + /// Ratio of repairs to total user turns + pub repair_ratio: f64, + /// Whether repair ratio is concerning (> 0.3) + pub is_concerning: bool, + /// List of detected repair phrases + pub repair_phrases: Vec, +} + +/// User frustration indicators +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FrustrationSignal { + /// Number of frustration indicators detected + pub frustration_count: usize, + /// Whether frustration is detected + pub has_frustration: bool, + /// Severity level (0-3: none, mild, moderate, severe) + pub severity: u8, + /// List of detected frustration indicators + pub indicators: Vec, +} + +/// Individual frustration indicator 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FrustrationIndicator { + /// Type of frustration detected + pub indicator_type: FrustrationType, + /// Message index where detected + pub message_index: usize, + /// Relevant text snippet + pub snippet: String, +} + +/// Types of frustration indicators +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum FrustrationType { + /// Negative sentiment detected + NegativeSentiment, + /// All caps typing + AllCaps, + /// Excessive punctuation + ExcessivePunctuation, + /// Profanity detected + Profanity, + /// Direct complaint + DirectComplaint, + /// Expression of confusion + Confusion, +} + +/// Repetition and looping behavior signal +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RepetitionSignal { + /// Number of repetitions detected + pub repetition_count: usize, + /// Whether significant looping detected (> 2 repetitions) + pub has_looping: bool, + /// Severity level (0-3: none, mild, moderate, severe) + pub severity: u8, + /// List of detected repetitions + pub repetitions: Vec, +} + +/// Individual repetition instance +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RepetitionInstance { + /// Message indices involved in repetition + pub message_indices: Vec, + /// Similarity score (0.0-1.0) + pub similarity: f64, + /// Type of repetition + pub repetition_type: RepetitionType, +} + +/// Types of repetition +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum RepetitionType { + /// Exact repetition + Exact, + /// Near-duplicate (high similarity) + NearDuplicate, + /// Semantic repetition (similar meaning) + Semantic, +} + +/// Positive feedback indicators +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PositiveFeedbackSignal { + /// Number of positive indicators detected + pub positive_count: usize, + /// Whether positive feedback is present + pub has_positive_feedback: bool, + /// Confidence score (0.0-1.0) + pub confidence: f64, + /// 
List of detected positive indicators + pub indicators: Vec, +} + +/// Individual positive indicator +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PositiveIndicator { + /// Type of positive feedback + pub indicator_type: PositiveType, + /// Message index where detected + pub message_index: usize, + /// Relevant text snippet + pub snippet: String, +} + +/// Types of positive indicators +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum PositiveType { + /// Expression of gratitude + Gratitude, + /// Explicit satisfaction + Satisfaction, + /// Confirmation of success + Success, + /// Positive sentiment + PositiveSentiment, + /// Natural topic transition + TopicTransition, +} + +/// User escalation signal +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EscalationSignal { + /// Whether escalation was requested + pub escalation_requested: bool, + /// Number of escalation requests + pub escalation_count: usize, + /// List of detected escalation requests + pub requests: Vec, +} + +/// Individual escalation request +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EscalationRequest { + /// Message index where detected + pub message_index: usize, + /// Relevant text snippet + pub snippet: String, + /// Type of escalation + pub escalation_type: EscalationType, +} + +/// Types of escalation +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum EscalationType { + /// Request for human agent + HumanAgent, + /// Request for support + Support, + /// Threat to quit/leave + ThreatToQuit, + /// General help request + HelpRequest, +} + +// ============================================================================ +// Signal Analyzer +// ============================================================================ + +/// Trait for analyzing conversation signals +pub trait SignalAnalyzer { + /// Analyze a conversation and generate a complete signal report + fn analyze(&self, messages: &[Message]) -> SignalReport; 
}

/// Text-based implementation of signal analyzer that computes all signals from a message array
pub struct TextBasedSignalAnalyzer {
    /// Baseline expected turns for normal interactions
    baseline_turns: usize,
    /// Threshold for character ngram similarity (0.0-1.0)
    char_ngram_threshold: f64,
    /// Threshold for token cosine similarity (0.0-1.0)
    token_cosine_threshold: f64,
    /// Maximum message length in characters (prevents unbounded computation)
    max_message_length: usize,
    /// Maximum number of messages to process (prevents unbounded computation)
    max_messages: usize,
    /// Maximum window size for repetition detection (prevents O(n²) explosion)
    max_repetition_window: usize,
}

impl TextBasedSignalAnalyzer {
    /// Extract text content from MessageContent, skipping non-text content.
    // Return type parameter restored (`Option<String>`): the match arm returns
    // `Some(text.clone())` where `text` is the owned `String` payload.
    fn extract_text(content: &hermesllm::apis::openai::MessageContent) -> Option<String> {
        match content {
            hermesllm::apis::openai::MessageContent::Text(text) => Some(text.clone()),
            // Tool calls and other structured content are skipped
            _ => None,
        }
    }

    /// Create a new signal analyzer with default settings.
    ///
    /// Defaults: char-ngram threshold 0.50 (lowered to handle typos and small
    /// edits realistically), token-cosine threshold 0.60 (lowered for better
    /// semantic match in varied contexts), baseline of 5 turns.
    pub fn new() -> Self {
        Self::with_settings(5, 0.50, 0.60)
    }

    /// Create a new signal analyzer with custom baseline
    pub fn with_baseline(baseline_turns: usize) -> Self {
        Self::with_settings(baseline_turns, 0.50, 0.60)
    }

    /// Create a new signal analyzer with custom settings
    ///
    /// # Arguments
    /// * `baseline_turns` - Expected baseline turns for normal interactions
    /// * `char_ngram_threshold` - Threshold for character ngram similarity (0.0-1.0)
    /// * `token_cosine_threshold` - Threshold for token cosine similarity (0.0-1.0)
    pub fn with_settings(
        baseline_turns: usize,
        char_ngram_threshold: f64,
        token_cosine_threshold: f64,
    ) -> Self {
        // Delegate so the default computation limits live in one place:
        // max_message_length 2000 (prevents unbounded ngram generation),
        // max_messages 100, max_repetition_window 20.
        Self::with_full_settings(
            baseline_turns,
            char_ngram_threshold,
            token_cosine_threshold,
            2000,
            100,
            20,
        )
    }

    /// Create a new signal analyzer with full custom settings including computation limits
    ///
    /// # Arguments
    /// * `baseline_turns` - Expected baseline turns for normal interactions
    /// * `char_ngram_threshold` - Threshold for character ngram similarity (0.0-1.0)
    /// * `token_cosine_threshold` - Threshold for token cosine similarity (0.0-1.0)
    /// * `max_message_length` - Maximum characters per message to process
    /// * `max_messages` - Maximum number of messages to process
    /// * `max_repetition_window` - Maximum messages to compare for repetition detection
    pub fn with_full_settings(
        baseline_turns: usize,
        char_ngram_threshold: f64,
        token_cosine_threshold: f64,
        max_message_length: usize,
        max_messages: usize,
        max_repetition_window: usize,
    ) -> Self {
        Self {
            baseline_turns,
            char_ngram_threshold,
            token_cosine_threshold,
            max_message_length,
            max_messages,
            max_repetition_window,
        }
    }

    // ========================================================================
    // Individual Signal Analyzers
    // ========================================================================

    /// Analyze turn count and efficiency
    fn analyze_turn_count(&self, messages: &[Message]) -> TurnCountSignal {
        let mut user_turns = 0;
        let mut assistant_turns = 0;

        for message in messages {
            match message.role {
                Role::User => user_turns += 1,
                Role::Assistant => assistant_turns += 1,
                _ => {}
            }
        }

        let total_turns = user_turns + assistant_turns;
        // NOTE(review): 7/12 are fixed thresholds, independent of
        // `baseline_turns` — confirm that is intentional.
        let is_concerning = total_turns > 7;
        let is_excessive = total_turns > 12;

        // Calculate efficiency score (exponential decay after baseline)
        let efficiency_score = if total_turns == 0 || total_turns <= self.baseline_turns {
            1.0
        } else {
            let excess = total_turns - self.baseline_turns;
            1.0 / (1.0 + (excess as f64 * 0.3))
        };

        TurnCountSignal {
            total_turns,
            user_turns,
            assistant_turns,
            is_concerning,
            is_excessive,
            efficiency_score,
        }
    }

    /// Analyze follow-up and repair frequency
    fn analyze_follow_up(
        &self,
        normalized_messages: &[(usize, Role, NormalizedMessage)],
    ) -> FollowUpSignal {
        let mut repair_count = 0;
        let mut repair_phrases = Vec::new();
        let mut user_turn_count = 0;

        // `pos` is the position within `normalized_messages`; the stored `i`
        // is the original message index (the vector can be shorter than the
        // original conversation because non-text messages are filtered out).
        for (pos, (i, role, norm_msg)) in normalized_messages.iter().enumerate() {
            if *role != Role::User {
                continue;
            }

            user_turn_count += 1;

            // Use per-turn boolean to prevent double-counting
            let mut found_in_turn = false;

            // Use pre-computed patterns for fast matching
            for pattern in REPAIR_PATTERNS.iter() {
                if norm_msg.matches_normalized_pattern(
                    pattern,
                    self.char_ngram_threshold,
                    self.token_cosine_threshold,
                ) {
                    repair_count += 1;
                    repair_phrases.push(format!("Turn {}: '{}'", i + 1, pattern.raw));
                    found_in_turn = true;
                    break;
                }
            }

            // Only check for semantic similarity if no pattern matched
            if !found_in_turn && *i >= 2 {
                // Find previous user message.
                // Fix: walk backwards by vector position `pos`, not by the
                // original index `*i` — indexing by `*i` could read out of
                // bounds (panic) once non-text messages have been filtered.
                for j in (0..pos).rev() {
                    let (_, prev_role, prev_norm_msg) = &normalized_messages[j];
                    if *prev_role == Role::User {
                        if self.is_similar_rephrase(norm_msg, prev_norm_msg) {
                            repair_count += 1;
                            repair_phrases
                                .push(format!("Turn {}: Similar rephrase detected", i + 1));
                        }
                        break;
                    }
                }
            }
        }

        let repair_ratio = if user_turn_count == 0 {
            0.0
        } else {
            repair_count as f64 / user_turn_count as f64
        };

        let is_concerning = repair_ratio > 0.3;

        FollowUpSignal {
            repair_count,
            repair_ratio,
            is_concerning,
            repair_phrases,
        }
    }

    /// Analyze user frustration indicators
indicators + fn analyze_frustration( + &self, + normalized_messages: &[(usize, Role, NormalizedMessage)], + ) -> FrustrationSignal { + let mut indicators = Vec::new(); + + // Profanity list - only as standalone tokens, not substrings + let profanity_tokens = [ + "damn", "damnit", "crap", "wtf", "ffs", "bullshit", "shit", "fuck", "fucking", + ]; + + for (i, role, norm_msg) in normalized_messages { + if *role != Role::User { + continue; + } + + let text = &norm_msg.raw; + + // Check for all caps (at least 10 chars and 80% uppercase) + let alpha_chars: String = text.chars().filter(|c| c.is_alphabetic()).collect(); + if alpha_chars.len() >= 10 { + let upper_count = alpha_chars.chars().filter(|c| c.is_uppercase()).count(); + let upper_ratio = upper_count as f64 / alpha_chars.len() as f64; + if upper_ratio >= 0.8 { + indicators.push(FrustrationIndicator { + indicator_type: FrustrationType::AllCaps, + message_index: *i, + snippet: text.chars().take(50).collect(), + }); + } + } + + // Check for excessive punctuation + let question_marks = text.matches('?').count(); + let exclamation_marks = text.matches('!').count(); + if question_marks >= 3 || exclamation_marks >= 3 { + indicators.push(FrustrationIndicator { + indicator_type: FrustrationType::ExcessivePunctuation, + message_index: *i, + snippet: text.chars().take(50).collect(), + }); + } + + // Check for complaint patterns using pre-computed patterns + for pattern in COMPLAINT_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + indicators.push(FrustrationIndicator { + indicator_type: FrustrationType::DirectComplaint, + message_index: *i, + snippet: pattern.raw.clone(), + }); + break; + } + } + + // Check for confusion patterns using pre-computed patterns + for pattern in CONFUSION_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + 
indicators.push(FrustrationIndicator { + indicator_type: FrustrationType::Confusion, + message_index: *i, + snippet: pattern.raw.clone(), + }); + break; + } + } + + // Check for profanity (token-based, not substring) + for token in &profanity_tokens { + if norm_msg.contains_token(token) { + indicators.push(FrustrationIndicator { + indicator_type: FrustrationType::Profanity, + message_index: *i, + snippet: token.to_string(), + }); + break; + } + } + } + + let frustration_count = indicators.len(); + let has_frustration = frustration_count > 0; + + // Calculate severity + let severity = if frustration_count == 0 { + 0 + } else if frustration_count <= 2 { + 1 + } else if frustration_count <= 4 { + 2 + } else { + 3 + }; + + FrustrationSignal { + frustration_count, + has_frustration, + severity, + indicators, + } + } + + /// Analyze repetition and looping behavior + fn analyze_repetition( + &self, + normalized_messages: &[(usize, Role, NormalizedMessage)], + ) -> RepetitionSignal { + let mut repetitions = Vec::new(); + + // Collect assistant messages with normalized content + let assistant_messages: Vec<(usize, &NormalizedMessage)> = normalized_messages + .iter() + .filter(|(_, role, _)| *role == Role::Assistant) + .map(|(i, _, norm_msg)| (*i, norm_msg)) + .collect(); + + // Limit the window size to prevent O(n²) explosion + // Only compare messages within the max_repetition_window + let window_size = self.max_repetition_window.min(assistant_messages.len()); + + // Check for exact or near-duplicate responses using bigram similarity + // Only compare within the sliding window + for i in 0..assistant_messages.len() { + let window_start = i + 1; + let window_end = (i + 1 + window_size).min(assistant_messages.len()); + + for j in window_start..window_end { + let (idx_i, norm_msg_i) = &assistant_messages[i]; + let (idx_j, norm_msg_j) = &assistant_messages[j]; + + // Skip if messages are too short + if norm_msg_i.tokens.len() < 5 || norm_msg_j.tokens.len() < 5 { + continue; + 
} + + // Calculate bigram-based similarity (more accurate for near-duplicates) + let similarity = self.calculate_bigram_similarity(norm_msg_i, norm_msg_j); + + // Exact match - lowered from 0.95 to 0.85 for bigram similarity + if similarity >= 0.85 { + repetitions.push(RepetitionInstance { + message_indices: vec![*idx_i, *idx_j], + similarity, + repetition_type: RepetitionType::Exact, + }); + } + // Near duplicate - lowered from 0.75 to 0.50 to catch subtle repetitions + else if similarity >= 0.50 { + repetitions.push(RepetitionInstance { + message_indices: vec![*idx_i, *idx_j], + similarity, + repetition_type: RepetitionType::NearDuplicate, + }); + } + } + } + + let repetition_count = repetitions.len(); + let has_looping = repetition_count > 2; + + let severity = if repetition_count == 0 { + 0 + } else if repetition_count <= 2 { + 1 + } else if repetition_count <= 4 { + 2 + } else { + 3 + }; + + RepetitionSignal { + repetition_count, + has_looping, + severity, + repetitions, + } + } + + /// Calculate bigram similarity using cached bigram sets + fn calculate_bigram_similarity( + &self, + norm_msg1: &NormalizedMessage, + norm_msg2: &NormalizedMessage, + ) -> f64 { + // Use pre-cached bigram sets for O(1) lookups + let set1 = &norm_msg1.bigram_set; + let set2 = &norm_msg2.bigram_set; + + if set1.is_empty() && set2.is_empty() { + return 1.0; // Both empty = identical + } + + if set1.is_empty() || set2.is_empty() { + return 0.0; + } + + let intersection = set1.intersection(set2).count(); + let union = set1.union(set2).count(); + + if union == 0 { + return 0.0; + } + + intersection as f64 / union as f64 + } + + /// Analyze positive feedback indicators + fn analyze_positive_feedback( + &self, + normalized_messages: &[(usize, Role, NormalizedMessage)], + ) -> PositiveFeedbackSignal { + let mut indicators = Vec::new(); + + for (i, role, norm_msg) in normalized_messages { + if *role != Role::User { + continue; + } + + // Use per-turn boolean to prevent double-counting + let 
mut found_in_turn = false; + + // Check gratitude using pre-computed patterns + for pattern in GRATITUDE_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + indicators.push(PositiveIndicator { + indicator_type: PositiveType::Gratitude, + message_index: *i, + snippet: pattern.raw.clone(), + }); + found_in_turn = true; + break; + } + } + + if found_in_turn { + continue; + } + + // Check satisfaction using pre-computed patterns + for pattern in SATISFACTION_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + indicators.push(PositiveIndicator { + indicator_type: PositiveType::Satisfaction, + message_index: *i, + snippet: pattern.raw.clone(), + }); + found_in_turn = true; + break; + } + } + + if found_in_turn { + continue; + } + + // Check success confirmation using pre-computed patterns + for pattern in SUCCESS_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + indicators.push(PositiveIndicator { + indicator_type: PositiveType::Success, + message_index: *i, + snippet: pattern.raw.clone(), + }); + break; + } + } + } + + let positive_count = indicators.len(); + let has_positive_feedback = positive_count > 0; + + // Calculate confidence based on number and diversity of indicators + let confidence = if positive_count == 0 { + 0.0 + } else if positive_count == 1 { + 0.6 + } else if positive_count == 2 { + 0.8 + } else { + 0.95 + }; + + PositiveFeedbackSignal { + positive_count, + has_positive_feedback, + confidence, + indicators, + } + } + + /// Analyze user escalation requests + fn analyze_escalation( + &self, + normalized_messages: &[(usize, Role, NormalizedMessage)], + ) -> EscalationSignal { + let mut requests = Vec::new(); + + for (i, role, norm_msg) in normalized_messages { + if *role != Role::User { + 
continue; + } + + let mut found_human_agent = false; + + // Check for human agent request using pre-computed patterns + for pattern in HUMAN_AGENT_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + requests.push(EscalationRequest { + message_index: *i, + snippet: pattern.raw.clone(), + escalation_type: EscalationType::HumanAgent, + }); + found_human_agent = true; + break; + } + } + + // Check for support request (only if no human agent request found) + // HumanAgent and Support are too similar and often match the same phrase + if !found_human_agent { + for pattern in SUPPORT_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + requests.push(EscalationRequest { + message_index: *i, + snippet: pattern.raw.clone(), + escalation_type: EscalationType::Support, + }); + break; + } + } + } + + // Check for quit threats (independent of HumanAgent/Support) + // A message can contain both "give up" (quit) and "speak to human" (escalation) + for pattern in QUIT_PATTERNS.iter() { + if norm_msg.matches_normalized_pattern( + pattern, + self.char_ngram_threshold, + self.token_cosine_threshold, + ) { + requests.push(EscalationRequest { + message_index: *i, + snippet: pattern.raw.clone(), + escalation_type: EscalationType::ThreatToQuit, + }); + break; + } + } + } + + let escalation_count = requests.len(); + let escalation_requested = escalation_count > 0; + + EscalationSignal { + escalation_requested, + escalation_count, + requests, + } + } + + // ======================================================================== + // Helper Methods + // ======================================================================== + + /// Check if two messages are similar rephrases + fn is_similar_rephrase( + &self, + norm_msg1: &NormalizedMessage, + norm_msg2: &NormalizedMessage, + ) -> bool { + // Skip if too short + if 
norm_msg1.tokens.len() < 3 || norm_msg2.tokens.len() < 3 { + return false; + } + + // Common stopwords to downweight + let stopwords: HashSet<&str> = [ + "i", "me", "my", "you", "the", "a", "an", "is", "are", "was", "were", "to", "with", + "for", "of", "at", "by", "in", "on", "it", "this", "that", "can", "could", "do", + "does", "did", "will", "would", "should", "be", + ] + .iter() + .cloned() + .collect(); + + // Filter out stopwords for meaningful overlap + let tokens1: HashSet<_> = norm_msg1 + .tokens + .iter() + .filter(|t| !stopwords.contains(t.as_str())) + .collect(); + let tokens2: HashSet<_> = norm_msg2 + .tokens + .iter() + .filter(|t| !stopwords.contains(t.as_str())) + .collect(); + + // Need at least 2 non-stopword tokens + if tokens1.len() < 2 || tokens2.len() < 2 { + return false; + } + + let intersection = tokens1.intersection(&tokens2).count(); + let min_size = tokens1.len().min(tokens2.len()); + + // High overlap suggests rephrase + let overlap_ratio = intersection as f64 / min_size as f64; + overlap_ratio >= 0.6 + } + + /// Assess overall interaction quality based on all signals + fn assess_overall_quality( + &self, + turn_count: &TurnCountSignal, + follow_up: &FollowUpSignal, + frustration: &FrustrationSignal, + repetition: &RepetitionSignal, + positive: &PositiveFeedbackSignal, + escalation: &EscalationSignal, + ) -> InteractionQuality { + // Critical conditions - immediate fail + if escalation.escalation_requested + || frustration.severity >= 3 + || repetition.severity >= 3 + || turn_count.is_excessive + { + return InteractionQuality::Severe; + } + + // Calculate quality score + let mut score = 50.0; // Start at neutral + + // Positive factors + if positive.has_positive_feedback { + score += 20.0 * positive.confidence; + } + score += turn_count.efficiency_score * 10.0; + + // Negative factors + if frustration.has_frustration { + score -= frustration.severity as f64 * 10.00; + } + if follow_up.is_concerning { + score -= 15.0; + } + if 
repetition.has_looping { + score -= repetition.severity as f64 * 8.0; + } + if turn_count.is_concerning { + score -= 10.0; + } + + // Map score to quality level + if score >= 75.0 { + InteractionQuality::Excellent + } else if score >= 60.0 { + InteractionQuality::Good + } else if score >= 40.0 { + InteractionQuality::Neutral + } else if score >= 25.0 { + InteractionQuality::Poor + } else { + InteractionQuality::Severe + } + } + + /// Generate human-readable summary + #[allow(clippy::too_many_arguments)] + fn generate_summary( + &self, + turn_count: &TurnCountSignal, + follow_up: &FollowUpSignal, + frustration: &FrustrationSignal, + repetition: &RepetitionSignal, + positive: &PositiveFeedbackSignal, + escalation: &EscalationSignal, + quality: &InteractionQuality, + ) -> String { + let mut summary_parts = Vec::new(); + + summary_parts.push(format!("Overall Quality: {:?}", quality)); + + summary_parts.push(format!( + "Turn Count: {} turns (efficiency: {:.1}%)", + turn_count.total_turns, + turn_count.efficiency_score * 100.0 + )); + + if follow_up.is_concerning { + summary_parts.push(format!( + "⚠️ High repair rate: {:.1}% of user turns", + follow_up.repair_ratio * 100.0 + )); + } + + if frustration.has_frustration { + summary_parts.push(format!( + "⚠️ Frustration detected: {} indicators (severity: {})", + frustration.frustration_count, frustration.severity + )); + } + + if repetition.has_looping { + summary_parts.push(format!( + "⚠️ Looping detected: {} repetitions", + repetition.repetition_count + )); + } + + if positive.has_positive_feedback { + summary_parts.push(format!( + "✓ Positive feedback: {} indicators", + positive.positive_count + )); + } + + if escalation.escalation_requested { + summary_parts.push(format!( + "⚠️ Escalation requested: {} requests", + escalation.escalation_count + )); + } + + summary_parts.join(" | ") + } +} + +impl SignalAnalyzer for TextBasedSignalAnalyzer { + fn analyze(&self, messages: &[Message]) -> SignalReport { + // Limit the number 
of messages to process (take most recent messages) + let messages_to_process = if messages.len() > self.max_messages { + &messages[messages.len() - self.max_messages..] + } else { + messages + }; + + // Preprocess all messages once, filtering out non-text content (tool calls, etc.) + // and truncating long messages + let normalized_messages: Vec<(usize, Role, NormalizedMessage)> = messages_to_process + .iter() + .enumerate() + .filter_map(|(i, msg)| { + Self::extract_text(&msg.content).map(|text| { + ( + i, + msg.role.clone(), + NormalizedMessage::from_text_with_limit(&text, self.max_message_length), + ) + }) + }) + .collect(); + + let turn_count = self.analyze_turn_count(messages_to_process); + let follow_up = self.analyze_follow_up(&normalized_messages); + let frustration = self.analyze_frustration(&normalized_messages); + let repetition = self.analyze_repetition(&normalized_messages); + let positive_feedback = self.analyze_positive_feedback(&normalized_messages); + let escalation = self.analyze_escalation(&normalized_messages); + + let overall_quality = self.assess_overall_quality( + &turn_count, + &follow_up, + &frustration, + &repetition, + &positive_feedback, + &escalation, + ); + + let summary = self.generate_summary( + &turn_count, + &follow_up, + &frustration, + &repetition, + &positive_feedback, + &escalation, + &overall_quality, + ); + + SignalReport { + turn_count, + follow_up, + frustration, + repetition, + positive_feedback, + escalation, + overall_quality, + summary, + } + } +} + +impl Default for TextBasedSignalAnalyzer { + fn default() -> Self { + Self::new() + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use hermesllm::apis::openai::MessageContent; + use std::time::Instant; + + fn create_message(role: Role, content: &str) -> Message { + Message { + role, + content: 
MessageContent::Text(content.to_string()), + name: None, + tool_calls: None, + tool_call_id: None, + } + } + + // ======================================================================== + // Tests for New Similarity Methods + // ======================================================================== + + #[test] + fn test_char_ngram_similarity_exact_match() { + let msg = NormalizedMessage::from_text("thank you very much"); + let similarity = msg.char_ngram_similarity("thank you very much"); + assert!( + similarity > 0.95, + "Exact match should have very high similarity" + ); + } + + #[test] + fn test_char_ngram_similarity_typo() { + let msg = NormalizedMessage::from_text("thank you very much"); + // Common typo: "thnks" instead of "thanks" + let similarity = msg.char_ngram_similarity("thnks you very much"); + assert!( + similarity > 0.50, + "Should handle single-character typo with decent similarity: {}", + similarity + ); + } + + #[test] + fn test_char_ngram_similarity_small_edit() { + let msg = NormalizedMessage::from_text("this doesn't work"); + let similarity = msg.char_ngram_similarity("this doesnt work"); + assert!( + similarity > 0.70, + "Should handle punctuation removal gracefully: {}", + similarity + ); + } + + #[test] + fn test_char_ngram_similarity_word_insertion() { + let msg = NormalizedMessage::from_text("i don't understand"); + let similarity = msg.char_ngram_similarity("i really don't understand"); + assert!( + similarity > 0.40, + "Should be robust to word insertions: {}", + similarity + ); + } + + #[test] + fn test_token_cosine_similarity_exact_match() { + let msg = NormalizedMessage::from_text("this is not helpful"); + let similarity = msg.token_cosine_similarity("this is not helpful"); + assert!( + (similarity - 1.0).abs() < 0.01, + "Exact match should have cosine similarity of 1.0" + ); + } + + #[test] + fn test_token_cosine_similarity_word_order() { + let msg = NormalizedMessage::from_text("not helpful at all"); + let similarity = 
msg.token_cosine_similarity("helpful not at all"); + assert!( + similarity > 0.95, + "Should be robust to word order changes: {}", + similarity + ); + } + + #[test] + fn test_token_cosine_similarity_frequency() { + let msg = NormalizedMessage::from_text("help help help please"); + let similarity = msg.token_cosine_similarity("help please"); + assert!( + similarity > 0.7 && similarity < 1.0, + "Should account for frequency differences: {}", + similarity + ); + } + + #[test] + fn test_token_cosine_similarity_long_message_with_context() { + let msg = NormalizedMessage::from_text( + "I've been trying to set up my account for the past hour \ + and the verification email never arrived. I checked my spam folder \ + and still nothing. This is really frustrating and not helpful at all.", + ); + let similarity = msg.token_cosine_similarity("not helpful"); + assert!( + similarity > 0.15 && similarity < 0.7, + "Should detect pattern in long message with lower but non-zero similarity: {}", + similarity + ); + } + + #[test] + fn test_layered_matching_exact_hit() { + let msg = NormalizedMessage::from_text("thank you so much"); + assert!( + msg.layered_contains_phrase("thank you", 0.50, 0.60), + "Should match exact phrase in Layer 0" + ); + } + + #[test] + fn test_layered_matching_typo_hit() { + // Test that shows layered matching is more robust than exact matching alone + let msg = NormalizedMessage::from_text("it doesnt work for me"); + + // "doesnt work" should match "doesn't work" via character ngrams (high overlap) + assert!( + msg.layered_contains_phrase("doesn't work", 0.50, 0.60), + "Should match 'doesnt work' to 'doesn't work' via character ngrams" + ); + } + + #[test] + fn test_layered_matching_word_order_hit() { + let msg = NormalizedMessage::from_text("helpful not very"); + assert!( + msg.layered_contains_phrase("not helpful", 0.50, 0.60), + "Should match reordered words via token cosine in Layer 2" + ); + } + + #[test] + fn 
test_layered_matching_long_message_with_pattern() { + let msg = NormalizedMessage::from_text( + "I've tried everything and followed all the instructions \ + but this is not helpful at all and I'm getting frustrated", + ); + assert!( + msg.layered_contains_phrase("not helpful", 0.50, 0.60), + "Should detect pattern buried in long message" + ); + } + + #[test] + fn test_layered_matching_no_match() { + let msg = NormalizedMessage::from_text("everything is working perfectly"); + assert!( + !msg.layered_contains_phrase("not helpful", 0.50, 0.60), + "Should not match completely different content" + ); + } + + #[test] + fn test_char_ngram_vs_token_cosine_tradeoffs() { + // Character ngrams handle character-level changes well + let msg1 = NormalizedMessage::from_text("this doesnt work"); + let char_sim1 = msg1.char_ngram_similarity("this doesn't work"); + assert!( + char_sim1 > 0.70, + "Character ngrams should handle punctuation: {}", + char_sim1 + ); + + // Token cosine is better for word order and long messages with semantic overlap + let msg2 = + NormalizedMessage::from_text("I really appreciate all your help with this issue today"); + let token_sim2 = msg2.token_cosine_similarity("thank you for help"); + assert!( + token_sim2 > 0.15, + "Token cosine should detect semantic overlap: {}", + token_sim2 + ); + } + + // ======================================================================== + // Existing Tests + // ======================================================================== + + fn preprocess_messages(messages: &[Message]) -> Vec<(usize, Role, NormalizedMessage)> { + messages + .iter() + .enumerate() + .map(|(i, msg)| { + let text = msg.content.to_string(); + (i, msg.role.clone(), NormalizedMessage::from_text(&text)) + }) + .collect() + } + + #[test] + fn test_turn_count_efficient() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Hello"), + create_message(Role::Assistant, "Hi! 
How can I help?"), + create_message(Role::User, "Thanks!"), + ]; + + let signal = analyzer.analyze_turn_count(&messages); + assert_eq!(signal.total_turns, 3); + assert_eq!(signal.user_turns, 2); + assert_eq!(signal.assistant_turns, 1); + assert!(!signal.is_concerning); + assert!(!signal.is_excessive); + assert!(signal.efficiency_score > 0.9); + println!("test_turn_count_efficient took: {:?}", start.elapsed()); + } + + #[test] + fn test_turn_count_excessive() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let mut messages = Vec::new(); + for i in 0..15 { + messages.push(create_message( + if i % 2 == 0 { + Role::User + } else { + Role::Assistant + }, + &format!("Message {}", i), + )); + } + + let signal = analyzer.analyze_turn_count(&messages); + assert_eq!(signal.total_turns, 15); + assert!(signal.is_concerning); + assert!(signal.is_excessive); + assert!(signal.efficiency_score < 0.5); + println!("test_turn_count_excessive took: {:?}", start.elapsed()); + } + + #[test] + fn test_follow_up_detection() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Show me restaurants"), + create_message(Role::Assistant, "Here are some options"), + create_message(Role::User, "No, I meant Italian restaurants"), + create_message(Role::Assistant, "Here are Italian restaurants"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_follow_up(&normalized_messages); + assert_eq!(signal.repair_count, 1); + assert!(signal.repair_ratio > 0.0); + println!("test_follow_up_detection took: {:?}", start.elapsed()); + } + + #[test] + fn test_frustration_detection() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "THIS IS RIDICULOUS!!!"), + create_message(Role::Assistant, "I apologize for the frustration"), + create_message(Role::User, "This doesn't 
work at all"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + assert!(signal.has_frustration); + assert!(signal.frustration_count >= 2); + assert!(signal.severity > 0); + println!("test_frustration_detection took: {:?}", start.elapsed()); + } + + #[test] + fn test_positive_feedback_detection() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Can you help me?"), + create_message(Role::Assistant, "Sure!"), + create_message(Role::User, "Thank you! That's exactly what I needed."), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_positive_feedback(&normalized_messages); + assert!(signal.has_positive_feedback); + assert!(signal.positive_count >= 1); + assert!(signal.confidence > 0.5); + println!( + "test_positive_feedback_detection took: {:?}", + start.elapsed() + ); + } + + #[test] + fn test_escalation_detection() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "This isn't working"), + create_message(Role::Assistant, "Let me help"), + create_message(Role::User, "I need to speak to a human agent"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_escalation(&normalized_messages); + assert!(signal.escalation_requested); + assert_eq!(signal.escalation_count, 1); + println!("test_escalation_detection took: {:?}", start.elapsed()); + } + + #[test] + fn test_repetition_detection() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "What's the weather?"), + create_message( + Role::Assistant, + "I can help you with the weather information", + ), + create_message(Role::User, "Show me the forecast"), + create_message(Role::Assistant, "Sure, I can 
help you with the forecast"), + create_message(Role::User, "Stop repeating yourself"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_repetition(&normalized_messages); + + for rep in &signal.repetitions { + println!( + " - Messages {:?}, similarity: {:.3}, type: {:?}", + rep.message_indices, rep.similarity, rep.repetition_type + ); + } + + assert!(signal.repetition_count > 0, + "Should detect the subtle repetition between 'I can help you with the weather information' \ + and 'Sure, I can help you with the forecast'"); + println!("test_repetition_detection took: {:?}", start.elapsed()); + } + + #[test] + fn test_full_analysis_excellent() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "I need to book a flight"), + create_message(Role::Assistant, "Sure! Where would you like to go?"), + create_message(Role::User, "New York"), + create_message(Role::Assistant, "Great! 
I found several options."), + create_message(Role::User, "Perfect!"), + ]; + + let report = analyzer.analyze(&messages); + assert!(matches!( + report.overall_quality, + InteractionQuality::Excellent | InteractionQuality::Good + )); + assert!(report.positive_feedback.has_positive_feedback); + assert!(!report.frustration.has_frustration); + println!("test_full_analysis_excellent took: {:?}", start.elapsed()); + } + + #[test] + fn test_full_analysis_poor() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Help me"), + create_message(Role::Assistant, "How can I assist?"), + create_message(Role::User, "No, I meant something else"), + create_message(Role::Assistant, "What do you need?"), + create_message(Role::User, "THIS DOESN'T WORK!!!"), + create_message(Role::Assistant, "I apologize"), + create_message(Role::User, "Let me speak to a human"), + ]; + + let report = analyzer.analyze(&messages); + assert!(matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + )); + assert!(report.frustration.has_frustration); + assert!(report.escalation.escalation_requested); + println!("test_full_analysis_poor took: {:?}", start.elapsed()); + } + + #[test] + fn test_fuzzy_matching_gratitude() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Can you help me?"), + create_message(Role::Assistant, "Sure!"), + create_message(Role::User, "thnaks! 
that's exactly what i needed."), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_positive_feedback(&normalized_messages); + assert!(signal.has_positive_feedback); + assert!(signal.positive_count >= 1); + println!("test_fuzzy_matching_gratitude took: {:?}", start.elapsed()); + } + + #[test] + fn test_fuzzy_matching_escalation() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "This isn't working"), + create_message(Role::Assistant, "Let me help"), + create_message(Role::User, "i need to speek to a human agnet"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_escalation(&normalized_messages); + assert!(signal.escalation_requested); + assert_eq!(signal.escalation_count, 1); + println!("test_fuzzy_matching_escalation took: {:?}", start.elapsed()); + } + + #[test] + fn test_fuzzy_matching_repair() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Show me restaurants"), + create_message(Role::Assistant, "Here are some options"), + create_message(Role::User, "no i ment Italian restaurants"), + create_message(Role::Assistant, "Here are Italian restaurants"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_follow_up(&normalized_messages); + assert!(signal.repair_count >= 1); + println!("test_fuzzy_matching_repair took: {:?}", start.elapsed()); + } + + #[test] + fn test_fuzzy_matching_complaint() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + // Use a complaint that should match - "doesnt work" is close enough to "doesn't work" + let messages = vec![ + create_message(Role::User, "this doesnt work at all"), // Common typo: missing apostrophe + create_message(Role::Assistant, "I apologize"), + ]; + + let normalized_messages = 
preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + + // The layered matching should catch this via character ngrams or token cosine + // "doesnt work" has high character-level similarity to "doesn't work" + assert!( + signal.has_frustration, + "Should detect frustration from complaint pattern" + ); + assert!(signal.frustration_count >= 1); + println!("test_fuzzy_matching_complaint took: {:?}", start.elapsed()); + } + + #[test] + fn test_exact_match_priority() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message(Role::User, "thank you so much")]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_positive_feedback(&normalized_messages); + assert!(signal.has_positive_feedback); + // Should detect exact match, not fuzzy + assert!(signal.indicators[0].snippet.contains("thank you")); + assert!(!signal.indicators[0].snippet.contains("fuzzy")); + println!("test_exact_match_priority took: {:?}", start.elapsed()); + } + + // ======================================================================== + // Anti-Tests: Verify fixes stay fixed + // ======================================================================== + + #[test] + fn test_hello_not_profanity() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message(Role::User, "hello there")]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + assert!( + !signal.has_frustration, + "\"hello\" should not trigger profanity detection" + ); + } + + #[test] + fn test_prepare_not_escalation() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "Can you help me prepare for the meeting?", + )]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_escalation(&normalized_messages); 
+ assert!( + !signal.escalation_requested, + "\"prepare\" should not trigger escalation (rep pattern removed)" + ); + } + + #[test] + fn test_unicode_apostrophe_confusion() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "I'm confused"), // Unicode apostrophe + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + assert!( + signal.has_frustration, + "Unicode apostrophe 'I'm confused' should trigger confusion" + ); + } + + #[test] + fn test_unicode_quotes_work() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "\u{201C}doesn\u{2019}t work\u{201D} with unicode quotes", + )]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + assert!( + signal.has_frustration, + "Unicode quotes should be normalized and match patterns" + ); + } + + #[test] + fn test_absolute_not_profanity() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message(Role::User, "That's absolute nonsense")]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized_messages); + // Should match on "nonsense" logic, not on "bs" substring + let has_bs_match = signal + .indicators + .iter() + .any(|ind| ind.snippet.contains("bs")); + assert!( + !has_bs_match, + "\"absolute\" should not trigger 'bs' profanity match" + ); + } + + #[test] + fn test_stopwords_not_rephrase() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Help me with X"), + create_message(Role::Assistant, "Sure"), + create_message(Role::User, "Help me with Y"), + ]; + + let normalized_messages = preprocess_messages(&messages); + let signal = analyzer.analyze_follow_up(&normalized_messages); + // Should not detect as rephrase since only 
stopwords overlap + assert_eq!( + signal.repair_count, 0, + "Messages with only stopword overlap should not be rephrases" + ); + } + + #[test] + fn test_frustrated_user_with_legitimate_repair() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + + use hermesllm::apis::openai::{FunctionCall, ToolCall}; + + // Helper to create a message with tool calls + let create_assistant_with_tools = + |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { + Message { + role: Role::Assistant, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: Some(vec![ToolCall { + id: tool_id.to_string(), + call_type: "function".to_string(), + function: FunctionCall { + name: tool_name.to_string(), + arguments: args.to_string(), + }, + }]), + tool_call_id: None, + } + }; + + // Helper to create a tool response message + let create_tool_message = |tool_call_id: &str, content: &str| -> Message { + Message { + role: Role::Tool, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: None, + tool_call_id: Some(tool_call_id.to_string()), + } + }; + + // Scenario: User DOES mention New York in first message, making "I already told you" legitimate + let messages = vec![ + create_message( + Role::User, + "I need to book a flight from New York to Paris for December 20th", + ), + create_assistant_with_tools( + "I'll help you search for flights to Paris.", + "call_123", + "search_flights", + r#"{"origin": "NYC", "destination": "Paris", "date": "2025-12-20"}"#, + ), + create_tool_message("call_123", r#"{"flights": []}"#), + create_message( + Role::Assistant, + "I couldn't find any flights. 
Could you provide your departure city?", + ), + create_message(Role::User, "I already told you, from New York!"), + create_assistant_with_tools( + "Let me try again.", + "call_456", + "search_flights", + r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"#, + ), + create_tool_message("call_456", r#"{"flights": []}"#), + create_message( + Role::Assistant, + "I'm still not finding results. Let me check the system.", + ), + create_message( + Role::User, + "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?", + ), + create_message( + Role::Assistant, + "I sincerely apologize for the frustration with the search tool.", + ), + create_message( + Role::User, + "Forget it. I need to speak to a human agent. This is a waste of time.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // Tool messages should be filtered out, so we should only analyze text messages + // That's 4 user messages + 5 assistant text messages = 9 turns + assert_eq!( + report.turn_count.total_turns, 9, + "Should count 9 text messages (tool messages filtered out)" + ); + assert!( + report.turn_count.is_concerning, + "Should flag concerning turn count" + ); + + // Should detect frustration (all caps, complaints) + assert!( + report.frustration.has_frustration, + "Should detect frustration" + ); + assert!( + report.frustration.frustration_count >= 2, + "Should detect multiple frustration indicators" + ); + assert!( + report.frustration.severity >= 2, + "Should have moderate or higher frustration severity" + ); + + // Should detect escalation request + assert!( + report.escalation.escalation_requested, + "Should detect escalation to human agent" + ); + assert!( + report.escalation.escalation_count >= 1, + "Should detect at least one escalation" + ); + + // Overall quality should be Poor or Severe + assert!( + matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ), + "Quality should be Poor or Severe, got 
{:?}", + report.overall_quality + ); + + println!( + "test_frustrated_user_with_legitimate_repair took: {:?}", + start.elapsed() + ); + } + + #[test] + fn test_frustrated_user_false_claim() { + let start = Instant::now(); + let analyzer = TextBasedSignalAnalyzer::new(); + + use hermesllm::apis::openai::{FunctionCall, ToolCall}; + + // Helper to create a message with tool calls + let create_assistant_with_tools = + |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { + Message { + role: Role::Assistant, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: Some(vec![ToolCall { + id: tool_id.to_string(), + call_type: "function".to_string(), + function: FunctionCall { + name: tool_name.to_string(), + arguments: args.to_string(), + }, + }]), + tool_call_id: None, + } + }; + + // Helper to create a tool response message + let create_tool_message = |tool_call_id: &str, content: &str| -> Message { + Message { + role: Role::Tool, + content: MessageContent::Text(content.to_string()), + name: None, + tool_calls: None, + tool_call_id: Some(tool_call_id.to_string()), + } + }; + + // Scenario: User NEVER mentions New York in first message but claims "I already told you" + // This represents realistic frustrated user behavior - exaggeration/misremembering + let messages = vec![ + create_message( + Role::User, + "I need to book a flight to Paris for December 20th", + ), + create_assistant_with_tools( + "I'll help you search for flights to Paris.", + "call_123", + "search_flights", + r#"{"destination": "Paris", "date": "2025-12-20"}"#, + ), + create_tool_message("call_123", r#"{"error": "origin required"}"#), + create_message( + Role::Assistant, + "I couldn't find any flights. 
Could you provide your departure city?", + ), + create_message(Role::User, "I already told you, from New York!"), // False claim - never mentioned it + create_assistant_with_tools( + "Let me try again.", + "call_456", + "search_flights", + r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"#, + ), + create_tool_message("call_456", r#"{"flights": []}"#), + create_message( + Role::Assistant, + "I'm still not finding results. Let me check the system.", + ), + create_message( + Role::User, + "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?", + ), + create_message( + Role::Assistant, + "I sincerely apologize for the frustration with the search tool.", + ), + create_message( + Role::User, + "Forget it. I need to speak to a human agent. This is a waste of time.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // Tool messages should be filtered out, so we should only analyze text messages + // That's 4 user messages + 5 assistant text messages = 9 turns + assert_eq!( + report.turn_count.total_turns, 9, + "Should count 9 text messages (tool messages filtered out)" + ); + assert!( + report.turn_count.is_concerning, + "Should flag concerning turn count" + ); + + // Should detect frustration (all caps, complaints, false claims) + assert!( + report.frustration.has_frustration, + "Should detect frustration" + ); + assert!( + report.frustration.frustration_count >= 2, + "Should detect multiple frustration indicators" + ); + assert!( + report.frustration.severity >= 2, + "Should have moderate or higher frustration severity" + ); + + // Should detect escalation request + assert!( + report.escalation.escalation_requested, + "Should detect escalation to human agent" + ); + assert!( + report.escalation.escalation_count >= 1, + "Should detect at least one escalation" + ); + + // Note: May detect false positive "positive feedback" due to fuzzy matching + // e.g., "I already told YOU" matches "you rock", "THIS is 
RIDICULOUS" matches "this helps" + // However, the overall quality should still be Poor/Severe due to frustration+escalation + + // Overall quality should be Poor or Severe (frustration + escalation indicates poor interaction) + assert!( + matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ), + "Quality should be Poor or Severe for frustrated user with false claims, got {:?}", + report.overall_quality + ); + + println!( + "test_frustrated_user_false_claim took: {:?}", + start.elapsed() + ); + } + + // false negative tests + #[test] + fn test_dissatisfaction_polite_not_working_for_me() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Thanks, but this still isn't working for me."), // Polite dissatisfaction, e.g., I appreciate it, but this isn't what I was looking for. + create_message(Role::Assistant, "Sorry—what error do you see?"), + ]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized); + assert!( + signal.has_frustration, + "Polite dissatisfaction should be detected" + ); + } + + #[test] + fn test_dissatisfaction_giving_up_without_escalation() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "Never mind, I'll figure it out myself.", + )]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_escalation(&normalized); + assert!( + signal.escalation_requested, + "Giving up should count as escalation/quit intent" + ); + } + + #[test] + fn test_dissatisfaction_same_problem_again() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "I'm running into the same issue again.", + )]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized); + assert!( + signal.has_frustration, + "'same issue again' should be detected" + ); + } + + 
#[test] + fn test_unsatisfied_incomplete() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message(Role::User, "This feels incomplete.")]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized); + assert!( + signal.has_frustration, + "Should detect 'incomplete' dissatisfaction" + ); + } + + #[test] + fn test_low_mood_overwhelming() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "This is overwhelming and I'm not sure what to do.", + )]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized); + assert!(signal.has_frustration, "Should detect overwhelmed language"); + } + + #[test] + fn test_low_mood_exhausted_trying() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![create_message( + Role::User, + "I'm exhausted trying to get this working.", + )]; + let normalized = preprocess_messages(&messages); + let signal = analyzer.analyze_frustration(&normalized); + assert!( + signal.has_frustration, + "Should detect exhaustion/struggle language" + ); + } + + #[test] + fn test_common_polite_unresolved_dissatisfaction() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "I'm trying to set up SSH keys for GitHub."), + create_message( + Role::Assistant, + "Sure. 
First generate a key using ssh-keygen.", + ), + create_message(Role::User, "I did that already."), + create_message( + Role::Assistant, + "Then add the key to your GitHub account settings.", + ), + create_message(Role::User, "I've done that too."), + create_message( + Role::Assistant, + "After that, make sure your SSH agent is running.", + ), + create_message( + Role::User, + "Okay, but this still doesn't seem to fix the issue.", + ), + create_message(Role::Assistant, "What error message are you seeing?"), + create_message(Role::User, "It's just not connecting the way I expected."), + ]; + + let report = analyzer.analyze(&messages); + + // This is a common false negative if you only look for caps/profanity. + // Desired: detect dissatisfaction/frustration (or at least not rate as Excellent). + assert!( + report.frustration.has_frustration || report.follow_up.repair_count >= 1, + "Should detect polite unresolved dissatisfaction via frustration or follow-up indicators" + ); + + assert!( + !matches!(report.overall_quality, InteractionQuality::Excellent), + "Should not classify unresolved dissatisfaction as Excellent" + ); + } + + #[test] + fn test_common_resigned_giving_up_quietly() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message( + Role::User, + "Can you explain how to deploy this with Docker?", + ), + create_message( + Role::Assistant, + "You need to write a Dockerfile and build an image.", + ), + create_message(Role::User, "I tried that."), + create_message(Role::Assistant, "Then you can run docker-compose up."), + create_message(Role::User, "I did, but it didn’t really help."), + create_message(Role::Assistant, "What error are you getting?"), + create_message( + Role::User, + "Honestly, never mind. I’ll just try something else.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // Many systems miss "never mind / I'll try something else" if they only look for "human agent". 
+ assert!( + report.escalation.escalation_requested || report.frustration.has_frustration, + "Resigned quitting language should trigger escalation or frustration" + ); + + assert!( + matches!( + report.overall_quality, + InteractionQuality::Poor | InteractionQuality::Severe + ) || report.escalation.escalation_requested + || report.frustration.has_frustration, + "Giving up should not be classified as a high-quality interaction" + ); + } + + #[test] + fn test_common_discouraged_overwhelmed_low_mood() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "I'm trying to understand backpropagation."), + create_message( + Role::Assistant, + "It's a way to compute gradients efficiently.", + ), + create_message(Role::User, "I’ve read that explanation already."), + create_message(Role::Assistant, "Would you like a mathematical derivation?"), + create_message(Role::User, "Maybe, but I’m still having trouble following."), + create_message(Role::Assistant, "I can walk through a simple example."), + create_message( + Role::User, + "That might help, but honestly this is pretty overwhelming.", + ), + create_message(Role::Assistant, "Let’s slow it down step by step."), + create_message( + Role::User, + "Yeah… I’m just feeling kind of discouraged right now.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // This is negative affect without caps/profanity. Should still count as frustration/negative signal. 
+ assert!( + report.frustration.has_frustration, + "Overwhelmed/discouraged language should be detected as negative sentiment/frustration" + ); + + assert!( + !matches!(report.overall_quality, InteractionQuality::Excellent), + "Low-mood discouragement should not be classified as Excellent" + ); + } + + #[test] + fn test_common_misalignment_not_what_i_asked() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "How do I optimize this SQL query?"), + create_message( + Role::Assistant, + "You can add indexes to improve performance.", + ), + create_message(Role::User, "I already have indexes."), + create_message(Role::Assistant, "Then you could consider query caching."), + create_message(Role::User, "That’s not really what I was asking about."), + create_message( + Role::Assistant, + "What specifically are you trying to optimize?", + ), + create_message( + Role::User, + "The execution plan — this answer doesn’t address that.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // Misalignment often shows as follow-up repair or frustration. 
+ assert!( + report.follow_up.repair_count >= 1 || report.frustration.has_frustration, + "Misalignment ('not what I asked') should trigger repair or frustration signals" + ); + + assert!( + !matches!(report.overall_quality, InteractionQuality::Excellent), + "Misalignment should not be rated as Excellent" + ); + } + + #[test] + fn test_common_false_negative_polite_disappointment_complexity() { + let analyzer = TextBasedSignalAnalyzer::new(); + let messages = vec![ + create_message(Role::User, "Can you help me write a regex for this?"), + create_message(Role::Assistant, "Sure, try this pattern: ^[a-z]+$"), + create_message(Role::User, "I tested it."), + create_message(Role::Assistant, "Did it work?"), + create_message(Role::User, "Not quite — it matches more than it should."), + create_message(Role::Assistant, "You can refine it with a lookahead."), + create_message( + Role::User, + "I see… this is more complicated than I expected.", + ), + ]; + + let report = analyzer.analyze(&messages); + + // Polite disappointment often becomes a false negative. + assert!( + report.frustration.has_frustration || report.follow_up.repair_count >= 1, + "Polite dissatisfaction ('not quite', 'more complicated than expected') should trigger a negative signal" + ); + + assert!( + !matches!(report.overall_quality, InteractionQuality::Excellent), + "Polite disappointment should not be classified as Excellent" + ); + } + + #[test] + fn test_catastrophic_failure_looping_assistant() { + let analyzer = TextBasedSignalAnalyzer::new(); + + // Catastrophic failure: assistant stuck in loop, user increasingly frustrated + let messages = vec![ + // Turn 1: Initial request + create_message(Role::User, "I need to cancel my subscription"), + create_message(Role::Assistant, "I can help you with account management. 
What would you like to do today?"), + + // Turn 3: First repair - repetition clarification + create_message(Role::User, "I just said I want to cancel my subscription"), + // Turn 4: Exact repetition from assistant + create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"), + + // Turn 5: Repair (negation) + frustration (ALL CAPS) + create_message(Role::User, "No, I meant CANCEL. My. Subscription."), + // Turn 6: Near-duplicate repetition + create_message(Role::Assistant, "I understand you need help with your account. Let me assist you with that."), + + // Turn 7: Severe frustration (complaint + excessive punctuation + ALL CAPS) + create_message(Role::User, "This is not helpful!! I want to CANCEL my subscription!!!"), + // Turn 8: Exact repetition again (looping) + create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"), + + // Turn 9: Repair (explicit rephrase) + frustration (complaint) + create_message(Role::User, "This doesnt work. Let me rephrase: I need to terminate my account and stop being charged"), + // Turn 10: Near-duplicate repetition (looping continues) + create_message(Role::Assistant, "I understand you need help with your account. Let me assist you with that."), + + // Turn 11: Escalation (multiple types) + frustration + create_message(Role::User, "I give up. Can I speak to a real person? 
This is a waste of time"), + ]; + + let report = analyzer.analyze(&messages); + + // Validate turn count + assert_eq!( + report.turn_count.total_turns, 11, + "Should have 11 total turns" + ); + assert_eq!(report.turn_count.user_turns, 6, "Should have 6 user turns"); + assert_eq!( + report.turn_count.assistant_turns, 5, + "Should have 5 assistant turns" + ); + assert!( + report.turn_count.is_concerning, + "11 turns should be concerning (>7)" + ); + assert!( + !report.turn_count.is_excessive, + "11 turns should not be excessive (<=12)" + ); + assert!( + report.turn_count.efficiency_score < 0.5, + "Efficiency should be low" + ); + + // Validate repair detection (USER signals - query reformulation) + // Detected repairs: + // 1. "I just said I want to cancel..." - pattern: "I just said" + // 2. "No, I meant CANCEL..." - pattern: "No, I meant" + // 3. "Let me rephrase: I need to terminate..." - pattern: "let me rephrase" + // Note: "This is not helpful!!" is frustration (not repair) + // Note: "I give up..." is escalation (not repair) + assert_eq!( + report.follow_up.repair_count, 3, + "Should detect exactly 3 repair attempts from user messages" + ); + assert_eq!( + report.follow_up.repair_ratio, 0.5, + "Repair ratio should be 0.5 (3 repairs / 6 user messages)" + ); + assert!( + report.follow_up.is_concerning, + "50% repair ratio should be highly concerning (threshold is 30%)" + ); + + // Validate frustration detection + assert!( + report.frustration.has_frustration, + "Should detect frustration" + ); + assert!( + report.frustration.frustration_count >= 4, + "Should detect multiple frustration indicators: found {}", + report.frustration.frustration_count + ); + assert!( + report.frustration.severity >= 2, + "Should be at least moderate frustration" + ); + + // Validate repetition/looping detection (ASSISTANT signals - not following instructions) + // The assistant repeats the same unhelpful responses multiple times: + // 1. "I can help you with account management..." 
appears 3 times (exact repetition) + // 2. "I understand you need help with your account..." appears 2 times (near-duplicate) + assert!( + report.repetition.repetition_count >= 4, + "Should detect at least 4 assistant repetitions (exact + near-duplicates)" + ); + assert!( + report.repetition.has_looping, + "Should detect looping (>2 repetitions indicates stuck agent)" + ); + assert!( + report.repetition.severity >= 2, + "Should be moderate to severe looping (assistant not adapting)" + ); + + // Validate escalation detection + assert!( + report.escalation.escalation_requested, + "Should detect escalation request" + ); + assert!( + report.escalation.escalation_count >= 2, + "Should detect multiple escalation indicators: 'give up' + 'speak to a real person'" + ); + + // Validate overall quality + assert_eq!(report.overall_quality, InteractionQuality::Severe, "Should be classified as Severe due to escalation + excessive frustration + looping + high repair ratio"); + } +} diff --git a/crates/brightstaff/src/signals/mod.rs b/crates/brightstaff/src/signals/mod.rs new file mode 100644 index 00000000..83db943e --- /dev/null +++ b/crates/brightstaff/src/signals/mod.rs @@ -0,0 +1,3 @@ +mod analyzer; + +pub use analyzer::*; diff --git a/crates/brightstaff/src/tracing/constants.rs b/crates/brightstaff/src/tracing/constants.rs index aac48802..d1102531 100644 --- a/crates/brightstaff/src/tracing/constants.rs +++ b/crates/brightstaff/src/tracing/constants.rs @@ -139,6 +139,45 @@ pub mod error { pub const STACK_TRACE: &str = "error.stack_trace"; } +// ============================================================================= +// Span Attributes - Agentic Signals +// ============================================================================= + +/// Behavioral quality indicators for agent interactions +/// These signals are computed automatically from conversation patterns +pub mod signals { + /// Overall quality assessment + /// Values: "Excellent", "Good", "Neutral", "Poor", 
"Severe" + pub const QUALITY: &str = "signals.quality"; + + /// Total number of turns in the conversation + pub const TURN_COUNT: &str = "signals.turn_count"; + + /// Efficiency score (0.0-1.0) + pub const EFFICIENCY_SCORE: &str = "signals.efficiency_score"; + + /// Number of repair attempts detected + pub const REPAIR_COUNT: &str = "signals.follow_up.repair.count"; + + /// Ratio of repairs to user turns + pub const REPAIR_RATIO: &str = "signals.follow_up.repair.ratio"; + + /// Number of frustration indicators detected + pub const FRUSTRATION_COUNT: &str = "signals.frustration.count"; + + /// Frustration severity level (0-3) + pub const FRUSTRATION_SEVERITY: &str = "signals.frustration.severity"; + + /// Number of repetition instances detected + pub const REPETITION_COUNT: &str = "signals.repetition.count"; + + /// Whether escalation was requested (user asked for human help) + pub const ESCALATION_REQUESTED: &str = "signals.escalation.requested"; + + /// Number of positive feedback indicators detected + pub const POSITIVE_FEEDBACK_COUNT: &str = "signals.positive_feedback.count"; +} + // ============================================================================= // Operation Names // ============================================================================= diff --git a/crates/brightstaff/src/tracing/mod.rs b/crates/brightstaff/src/tracing/mod.rs index 4c7f099f..09ec6f2a 100644 --- a/crates/brightstaff/src/tracing/mod.rs +++ b/crates/brightstaff/src/tracing/mod.rs @@ -1,3 +1,5 @@ mod constants; -pub use constants::{error, http, llm, operation_component, routing, OperationNameBuilder}; +pub use constants::{ + error, http, llm, operation_component, routing, signals, OperationNameBuilder, +}; diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs index e49173dc..720e24d3 100644 --- a/crates/hermesllm/src/apis/openai_responses.rs +++ b/crates/hermesllm/src/apis/openai_responses.rs @@ -1127,82 +1127,16 @@ impl 
ProviderRequest for ResponsesAPIRequest { } fn get_messages(&self) -> Vec { - use crate::apis::openai::{Message, MessageContent, Role}; + use crate::transforms::request::from_openai::ResponsesInputConverter; - let mut openai_messages = Vec::new(); + // Use the shared converter to get the full conversion with image support + let converter = ResponsesInputConverter { + input: self.input.clone(), + instructions: self.instructions.clone(), + }; - // Add instructions as system message if present - if let Some(instructions) = &self.instructions { - openai_messages.push(Message { - role: Role::System, - content: MessageContent::Text(instructions.clone()), - name: None, - tool_calls: None, - tool_call_id: None, - }); - } - - // Convert input to messages - match &self.input { - InputParam::Text(text) => { - openai_messages.push(Message { - role: Role::User, - content: MessageContent::Text(text.clone()), - name: None, - tool_calls: None, - tool_call_id: None, - }); - } - InputParam::Items(items) => { - for item in items { - match item { - InputItem::Message(msg) => { - // Convert message role - let role = match msg.role { - MessageRole::User => Role::User, - MessageRole::Assistant => Role::Assistant, - MessageRole::System => Role::System, - MessageRole::Developer => Role::System, // Map developer to system - }; - - // Extract text from message content - let content = match &msg.content { - crate::apis::openai_responses::MessageContent::Text(text) => { - text.clone() - } - crate::apis::openai_responses::MessageContent::Items(items) => { - items - .iter() - .filter_map(|c| { - if let InputContent::InputText { text } = c { - Some(text.clone()) - } else { - None - } - }) - .collect::>() - .join("\n") - } - }; - - openai_messages.push(Message { - role, - content: MessageContent::Text(content), - name: None, - tool_calls: None, - tool_call_id: None, - }); - } - // Skip other input item types for now - InputItem::ItemReference { .. } | InputItem::FunctionCallOutput { .. 
} => { - // These are not yet supported in agent framework - } - } - } - } - } - - openai_messages + // Convert and return, falling back to empty vec on error + converter.try_into().unwrap_or_else(|_| Vec::new()) } fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) { diff --git a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index 2a133041..e39cfed3 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -24,6 +24,150 @@ use crate::transforms::*; type AnthropicMessagesRequest = MessagesRequest; +// ============================================================================ +// RESPONSES API INPUT CONVERSION +// ============================================================================ + +/// Helper struct for converting ResponsesAPI input to OpenAI messages +pub struct ResponsesInputConverter { + pub input: InputParam, + pub instructions: Option, +} + +impl TryFrom for Vec { + type Error = TransformError; + + fn try_from(converter: ResponsesInputConverter) -> Result { + // Convert input to messages + match converter.input { + InputParam::Text(text) => { + // Simple text input becomes a user message + let mut messages = Vec::new(); + + // Add instructions as system message if present + if let Some(instructions) = converter.instructions { + messages.push(Message { + role: Role::System, + content: MessageContent::Text(instructions), + name: None, + tool_call_id: None, + tool_calls: None, + }); + } + + // Add the user message + messages.push(Message { + role: Role::User, + content: MessageContent::Text(text), + name: None, + tool_call_id: None, + tool_calls: None, + }); + + Ok(messages) + } + InputParam::Items(items) => { + // Convert input items to messages + let mut converted_messages = Vec::new(); + + // Add instructions as system message if present + if let Some(instructions) = converter.instructions { + 
converted_messages.push(Message { + role: Role::System, + content: MessageContent::Text(instructions), + name: None, + tool_call_id: None, + tool_calls: None, + }); + } + + // Convert each input item + for item in items { + if let InputItem::Message(input_msg) = item { + let role = match input_msg.role { + MessageRole::User => Role::User, + MessageRole::Assistant => Role::Assistant, + MessageRole::System => Role::System, + MessageRole::Developer => Role::System, // Map developer to system + }; + + // Convert content based on MessageContent type + let content = match &input_msg.content { + crate::apis::openai_responses::MessageContent::Text(text) => { + // Simple text content + MessageContent::Text(text.clone()) + } + crate::apis::openai_responses::MessageContent::Items(content_items) => { + // Check if it's a single text item (can use simple text format) + if content_items.len() == 1 { + if let InputContent::InputText { text } = &content_items[0] { + MessageContent::Text(text.clone()) + } else { + // Single non-text item - use parts format + MessageContent::Parts( + content_items.iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) + } + InputContent::InputImage { image_url, .. } => { + Some(crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + } + }) + } + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect() + ) + } + } else { + // Multiple content items - convert to parts + MessageContent::Parts( + content_items + .iter() + .filter_map(|c| match c { + InputContent::InputText { text } => { + Some(crate::apis::openai::ContentPart::Text { + text: text.clone(), + }) + } + InputContent::InputImage { image_url, .. 
} => Some( + crate::apis::openai::ContentPart::ImageUrl { + image_url: crate::apis::openai::ImageUrl { + url: image_url.clone(), + detail: None, + }, + }, + ), + InputContent::InputFile { .. } => None, // Skip files for now + InputContent::InputAudio { .. } => None, // Skip audio for now + }) + .collect(), + ) + } + } + }; + + converted_messages.push(Message { + role, + content, + name: None, + tool_call_id: None, + tool_calls: None, + }); + } + } + + Ok(converted_messages) + } + } + } +} + // ============================================================================ // MAIN REQUEST TRANSFORMATIONS // ============================================================================ @@ -253,117 +397,12 @@ impl TryFrom for ChatCompletionsRequest { type Error = TransformError; fn try_from(req: ResponsesAPIRequest) -> Result { - // Convert input to messages - let messages = match req.input { - InputParam::Text(text) => { - // Simple text input becomes a user message - vec![Message { - role: Role::User, - content: MessageContent::Text(text), - name: None, - tool_call_id: None, - tool_calls: None, - }] - } - InputParam::Items(items) => { - // Convert input items to messages - let mut converted_messages = Vec::new(); - - // Add instructions as system message if present - if let Some(instructions) = &req.instructions { - converted_messages.push(Message { - role: Role::System, - content: MessageContent::Text(instructions.clone()), - name: None, - tool_call_id: None, - tool_calls: None, - }); - } - - // Convert each input item - for item in items { - if let InputItem::Message(input_msg) = item { - let role = match input_msg.role { - MessageRole::User => Role::User, - MessageRole::Assistant => Role::Assistant, - MessageRole::System => Role::System, - MessageRole::Developer => Role::System, // Map developer to system - }; - - // Convert content based on MessageContent type - let content = match &input_msg.content { - crate::apis::openai_responses::MessageContent::Text(text) => { 
- // Simple text content - MessageContent::Text(text.clone()) - } - crate::apis::openai_responses::MessageContent::Items(content_items) => { - // Check if it's a single text item (can use simple text format) - if content_items.len() == 1 { - if let InputContent::InputText { text } = &content_items[0] { - MessageContent::Text(text.clone()) - } else { - // Single non-text item - use parts format - MessageContent::Parts( - content_items.iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { text: text.clone() }) - } - InputContent::InputImage { image_url, .. } => { - Some(crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - } - }) - } - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. } => None, // Skip audio for now - }) - .collect() - ) - } - } else { - // Multiple content items - convert to parts - MessageContent::Parts( - content_items - .iter() - .filter_map(|c| match c { - InputContent::InputText { text } => { - Some(crate::apis::openai::ContentPart::Text { - text: text.clone(), - }) - } - InputContent::InputImage { image_url, .. } => Some( - crate::apis::openai::ContentPart::ImageUrl { - image_url: crate::apis::openai::ImageUrl { - url: image_url.clone(), - detail: None, - }, - }, - ), - InputContent::InputFile { .. } => None, // Skip files for now - InputContent::InputAudio { .. 
} => None, // Skip audio for now - }) - .collect(), - ) - } - } - }; - - converted_messages.push(Message { - role, - content, - name: None, - tool_call_id: None, - tool_calls: None, - }); - } - } - - converted_messages - } + // Convert input to messages using the shared converter + let converter = ResponsesInputConverter { + input: req.input, + instructions: req.instructions.clone(), }; + let messages: Vec = converter.try_into()?; // Build the ChatCompletionsRequest Ok(ChatCompletionsRequest { diff --git a/docs/source/_static/img/signals_trace.png b/docs/source/_static/img/signals_trace.png new file mode 100644 index 00000000..2d04e26f Binary files /dev/null and b/docs/source/_static/img/signals_trace.png differ diff --git a/docs/source/concepts/signals.rst b/docs/source/concepts/signals.rst new file mode 100644 index 00000000..ec1750e1 --- /dev/null +++ b/docs/source/concepts/signals.rst @@ -0,0 +1,359 @@ +.. -*- coding: utf-8 -*- + +======== +Signals™ +======== + +Agentic Signals are behavioral and execution quality indicators that act as early warning signs of agent performance—highlighting both brilliant successes and **severe failures**. These signals are computed directly from conversation traces without requiring manual labeling or domain expertise, making them practical for production observability at scale. + +The Problem: Knowing What's "Good" +================================== + +One of the hardest parts of building agents is measuring how well they perform in the real world. + +**Offline testing** relies on hand-picked examples and happy-path scenarios, missing the messy diversity of real usage. Developers manually prompt models, evaluate responses, and tune prompts by guesswork—a slow, incomplete feedback loop. + +**Production debugging** floods developers with traces and logs but provides little guidance on which interactions actually matter. Finding failures means painstakingly reconstructing sessions and manually labeling quality issues.
+ +You can't score every response with an LLM-as-judge (too expensive, too slow) or manually review every trace (doesn't scale). What you need are **behavioral signals**—fast, economical proxies that don’t label quality outright but dramatically shrink the search space, pointing to sessions most likely to be broken or brilliant. + +What Are Behavioral Signals? +============================ + +Behavioral signals are canaries in the coal mine—early, objective indicators that something may have gone wrong (or gone exceptionally well). They don’t explain *why* an agent failed, but they reliably signal *where* attention is needed. + +These signals emerge naturally from the rhythm of interaction: + +- A user rephrasing the same request +- Sharp increases in conversation length +- Frustrated follow-up messages (ALL CAPS, "this doesn’t work", excessive !!!/???) +- Agent repetition / looping +- Expressions of gratitude or satisfaction +- Requests to speak to a human / contact support + +Individually, these clues are shallow; together, they form a fingerprint of agent performance. Embedded directly into traces, they make it easy to spot friction as it happens: where users struggle, where agents loop, and where escalations occur. + +Signals vs Response Quality +=========================== + +Behavioral signals and response quality are complementary. + +**Response Quality** + Domain-specific correctness: did the agent do the right thing given business rules, user intent, and operational context? This often requires subject-matter experts or outcome instrumentation and is time-intensive but irreplaceable. + +**Behavioral Signals** + Observable patterns that correlate with quality: high repair frequency, excessive turns, frustration markers, repetition, escalation, and positive feedback. Fast to compute and valuable for prioritizing which traces deserve inspection. + +Used together, signals tell you *where to look*, and quality evaluation tells you *what went wrong (or right)*. 
+ +How It Works +============ + +Signals are computed automatically by the gateway and emitted as **OpenTelemetry trace attributes** to your existing observability stack (Jaeger, Honeycomb, Grafana Tempo, etc.). No additional libraries or instrumentation required—just configure your OTEL collector endpoint. + +Each conversation trace is enriched with signal attributes that you can query, filter, and visualize in your observability platform. The gateway analyzes message content (performing text normalization, Unicode handling, and pattern matching) to compute behavioral signals in real-time. + +**OTEL Trace Attributes** + +Signal data is exported as structured span attributes: + +- ``signals.quality`` - Overall assessment (Excellent/Good/Neutral/Poor/Severe) +- ``signals.turn_count`` - Total number of turns in the conversation +- ``signals.efficiency_score`` - Efficiency metric (0.0-1.0) +- ``signals.repair.count`` - Number of repair attempts detected (when present) +- ``signals.repair.ratio`` - Ratio of repairs to user turns (when present) +- ``signals.frustration.count`` - Number of frustration indicators detected +- ``signals.frustration.severity`` - Frustration level (0-3) +- ``signals.repetition.count`` - Number of repetition instances detected +- ``signals.escalation.requested`` - Boolean escalation flag ("true" when present) +- ``signals.positive_feedback.count`` - Number of positive feedback indicators + +**Visual Flag Marker** + +When concerning signals are detected (frustration, looping, escalation, or poor/severe quality), the flag marker **🚩** is automatically appended to the span's operation name, making problematic traces easy to spot in your trace visualizations. 
+ +**Querying in Your Observability Platform** + +Example queries: + +- Find all severe interactions: ``signals.quality = "Severe"`` +- Find flagged traces: search for **🚩** in span names +- Find long conversations: ``signals.turn_count > 10`` +- Find inefficient interactions: ``signals.efficiency_score < 0.5`` +- Find high repair rates: ``signals.repair.ratio > 0.3`` +- Find frustrated users: ``signals.frustration.severity >= 2`` +- Find looping agents: ``signals.repetition.count >= 3`` +- Find positive interactions: ``signals.positive_feedback.count >= 2`` +- Find escalations: ``signals.escalation.requested = "true"`` + +.. image:: /_static/img/signals_trace.png + :width: 100% + :align: center + + +Core Signal Types +================= + +The signals system tracks six categories of behavioral indicators. + +Turn Count & Efficiency +----------------------- + +**What it measures** + Number of user–assistant exchanges. + +**Why it matters** + Long conversations often indicate unclear intent resolution, confusion, or inefficiency. Very short conversations can correlate with crisp resolution. + +**Key metrics** + +- Total turn count +- Warning thresholds (concerning: >7 turns, excessive: >12 turns) +- Efficiency score (0.0–1.0) + +**Efficiency scoring** + Baseline expectation is ~5 turns (tunable). Efficiency stays at 1.0 up to the baseline, then declines with an inverse penalty as turns exceed baseline:: + + efficiency = 1 / (1 + 0.3 * (turns - baseline)) + +Follow-Up & Repair Frequency +---------------------------- + +**What it measures** + How often users clarify, correct, or rephrase requests. This is a **user signal** tracking query reformulation behavior—when users must repair or rephrase their requests because the agent didn't understand or respond appropriately. + +**Why it matters** + High repair frequency is a proxy for misunderstanding or intent drift. 
When users repeatedly rephrase the same request, it indicates the agent is failing to grasp or act on the user's intent. + +**Key metrics** + +- Repair count and ratio (repairs / user turns) +- Concerning threshold: >30% repair ratio +- Detected repair phrases (exact or fuzzy) + +**Common patterns detected** + +- Explicit corrections: "I meant", "correction" +- Negations: "No, I...", "that's not" +- Rephrasing: "let me rephrase", "to clarify" +- Mistake acknowledgment: "my mistake", "I was wrong" +- "Similar rephrase" heuristic based on token overlap (with stopwords downweighted) + +User Frustration +---------------- + +**What it measures** + Observable frustration indicators and emotional escalation. + +**Why it matters** + Catching frustration early enables intervention before users abandon or escalate. + +**Detection patterns** + +- **Complaints**: "this doesn't work", "not helpful", "waste of time" +- **Confusion**: "I don't understand", "makes no sense", "I'm confused" +- **Tone markers**: + + - ALL CAPS (>=10 alphabetic chars and >=80% uppercase) + - Excessive punctuation (>=3 exclamation marks or >=3 question marks) + +- **Profanity**: token-based (avoids substring false positives like "absolute" -> "bs") + +**Severity levels** + +- **None (0)**: no indicators +- **Mild (1)**: 1–2 indicators +- **Moderate (2)**: 3–4 indicators +- **Severe (3)**: 5+ indicators + +Repetition & Looping +-------------------- + +**What it measures** + Assistant repetition / degenerative loops. This is an **assistant signal** tracking when the agent repeats itself, fails to follow instructions, or gets stuck in loops—indicating the agent is not making progress or adapting its responses. + +**Why it matters** + Often indicates missing state tracking, broken tool integration, prompt issues, or the agent ignoring user corrections. High repetition means the agent is not learning from the conversation context. 
+ +**Detection method** + +- Compare assistant messages using **bigram Jaccard similarity** +- Classify: + + - **Exact**: similarity >= 0.85 + - **Near-duplicate**: similarity >= 0.50 + +- Looping is flagged when repetition instances exceed 2 in a session. + +**Severity levels** + +- **None (0)**: 0 instances +- **Mild (1)**: 1–2 instances +- **Moderate (2)**: 3–4 instances +- **Severe (3)**: 5+ instances + +Positive Feedback +----------------- + +**What it measures** + User expressions of satisfaction, gratitude, and success. + +**Why it matters** + Strong positive signals identify exemplar traces for prompt engineering and evaluation. + +**Detection patterns** + +- Gratitude: "thank you", "appreciate it" +- Satisfaction: "that's great", "awesome", "love it" +- Success confirmation: "got it", "that worked", "perfect" + +**Confidence scoring** + +- 1 indicator: 0.6 +- 2 indicators: 0.8 +- 3+ indicators: 0.95 + +Escalation Requests +------------------- + +**What it measures** + Requests for human help/support or threats to quit. + +**Why it matters** + Escalation is a strong signal that the agent failed to resolve the interaction. + +**Detection patterns** + +- Human requests: "speak to a human", "real person", "live agent" +- Support: "contact support", "customer service", "help desk" +- Quit threats: "I'm done", "forget it", "I give up" + +Overall Quality Assessment +========================== + +Signals are aggregated into an overall interaction quality on a 5-point scale. + +**Excellent** + Strong positive signals, efficient resolution, low friction. + +**Good** + Mostly positive with minor clarifications; some back-and-forth but successful. + +**Neutral** + Mixed signals; neither clearly good nor bad. + +**Poor** + Concerning negative patterns (high friction, multiple repairs, moderate frustration). High abandonment risk. + +**Severe** + Critical issues—escalation requested, severe frustration, severe looping, or excessive turns (>12). 
Requires immediate attention. + +This assessment uses a scoring model that weighs positive factors (efficiency, positive feedback) against negative ones (frustration, repairs, repetition, escalation). + +Sampling and Prioritization +=========================== + +In production, trace data is overwhelming. Signals provide a lightweight first layer of analysis to prioritize which sessions deserve review. + +Workflow: + +1. Gateway captures conversation messages and computes signals +2. Signal attributes are emitted to OTEL spans automatically +3. Your observability platform ingests and indexes the attributes +4. Query/filter by signal attributes to surface outliers (poor/severe and exemplars) +5. Review high-information traces to identify improvement opportunities +6. Update prompts, routing, or policies based on findings +7. Redeploy and monitor signal metrics to validate improvements + +This creates a reinforcement loop where traces become both diagnostic data and training signal. + +Trace Filtering and Telemetry +============================= + +Signal attributes are automatically added to OpenTelemetry spans, making them immediately queryable in your observability platform. + +**Visual Filtering** + +When concerning signals are detected, the flag marker **🚩** (U+1F6A9) is automatically appended to the span's operation name. This makes flagged sessions immediately visible in trace visualizations without requiring attribute filtering. 
+ +**Example Span Attributes**:: + + # Span name: "POST /v1/chat/completions gpt-4 🚩" + signals.quality = "Severe" + signals.turn_count = 15 + signals.efficiency_score = 0.234 + signals.repair.count = 4 + signals.repair.ratio = 0.571 + signals.frustration.severity = 3 + signals.frustration.count = 5 + signals.escalation.requested = "true" + signals.repetition.count = 4 + +**Building Dashboards** + +Use signal attributes to build monitoring dashboards in Grafana, Honeycomb, Datadog, etc.: + +- **Quality distribution**: Count of traces by ``signals.quality`` +- **P95 turn count**: 95th percentile of ``signals.turn_count`` +- **Average efficiency**: Mean of ``signals.efficiency_score`` +- **High repair rate**: Percentage where ``signals.repair.ratio > 0.3`` +- **Frustration rate**: Percentage where ``signals.frustration.severity >= 2`` +- **Escalation rate**: Percentage where ``signals.escalation.requested = "true"`` +- **Looping rate**: Percentage where ``signals.repetition.count >= 3`` +- **Positive feedback rate**: Percentage where ``signals.positive_feedback.count >= 1`` + +**Creating Alerts** + +Set up alerts based on signal thresholds: + +- Alert when severe interaction count exceeds threshold in 1-hour window +- Alert on sudden spike in frustration rate (>2x baseline) +- Alert when escalation rate exceeds 5% of total conversations +- Alert on degraded efficiency (P95 turn count increases >50%) + +Best Practices +============== + +Start simple: + +- Alert or page on **Severe** sessions (or on spikes in Severe rate) +- Review **Poor** sessions within 24 hours +- Sample **Excellent** sessions as exemplars + +Combine multiple signals to infer failure modes: + +- Looping: repetition severity >= 2 + excessive turns +- User giving up: frustration severity >= 2 + escalation requested +- Misunderstood intent: repair ratio > 30% + excessive turns +- Working well: positive feedback + high efficiency + no frustration + +Limitations and Considerations 
+============================== + +Signals don’t capture: + +- Task completion / real outcomes +- Factual or domain correctness +- Silent abandonment (user leaves without expressing frustration) +- Non-English nuance (pattern libraries are English-oriented) + +Mitigation strategies: + +- Periodically sample flagged sessions and measure false positives/negatives +- Tune baselines per use case and user population +- Add domain-specific phrase libraries where needed +- Combine signals with non-text metrics (tool failures, disconnects, latency) + +.. note:: + Behavioral signals complement—but do not replace—domain-specific response quality evaluation. Use signals to prioritize which traces to inspect, then apply domain expertise and outcome checks to diagnose root causes. + +.. tip:: + The flag marker in the span name provides instant visual feedback in trace UIs, while the structured attributes (``signals.quality``, ``signals.frustration.severity``, etc.) enable powerful querying and aggregation in your observability platform. + +See Also +======== + +- :doc:`../guides/observability/tracing` - Distributed tracing for agent systems +- :doc:`../guides/observability/monitoring` - Metrics and dashboards +- :doc:`../guides/observability/access_logging` - Request/response logging +- :doc:`../guides/observability/observability` - Complete observability guide diff --git a/docs/source/get_started/overview.rst b/docs/source/get_started/overview.rst index 43a0cb45..05955ede 100644 --- a/docs/source/get_started/overview.rst +++ b/docs/source/get_started/overview.rst @@ -2,7 +2,7 @@ Overview ======== -`Plano `_ is delivery infrastructure for agentic apps. A models-native proxy server and data plane designed to help you build agents faster, and deliver them reliably to production. +`Plano `_ is delivery infrastructure for agentic apps. A smart proxy server and data plane designed to help you build agents faster, and deliver them reliably to production. 
Plano pulls out the rote plumbing work (the “hidden AI middleware”) and decouples you from brittle, ever‑changing framework abstractions. It centralizes what shouldn’t be bespoke in every codebase like agent routing and orchestration, rich agentic signals and traces for continuous improvement, guardrail filters for safety and moderation, and smart LLM routing APIs for UX and DX agility. Use any language or AI framework, and ship agents to production faster with Plano. diff --git a/docs/source/guides/observability/tracing.rst b/docs/source/guides/observability/tracing.rst index 46c10f37..aab3b069 100644 --- a/docs/source/guides/observability/tracing.rst +++ b/docs/source/guides/observability/tracing.rst @@ -28,6 +28,120 @@ tools. :align: center +Understanding Plano Traces +-------------------------- + +Plano creates structured traces that capture the complete flow of requests through your AI system. Each trace consists of multiple spans representing different stages of processing. + +Inbound Request Handling +~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a request enters Plano, it creates an **inbound span** (``plano(inbound)``) that represents the initial request reception and processing. This span captures: + +- HTTP request details (method, path, headers) +- Request payload size +- Initial validation and authentication + +Orchestration & Routing +~~~~~~~~~~~~~~~~~~~~~~~~ + +For agent systems, Plano performs intelligent routing through orchestration spans: + +- **Agent Orchestration** (``plano(orchestrator)``): When multiple agents are available, Plano uses an LLM to analyze the user's intent and select the most appropriate agent. This span captures the orchestration decision-making process. + +- **LLM Routing** (``plano(routing)``): For direct LLM requests, Plano determines the optimal endpoint based on your routing strategy (round-robin, least-latency, cost-optimized). 
This span includes: + + - Routing strategy used + - Selected upstream endpoint + - Route determination time + - Fallback indicators (if applicable) + +Agent Processing +~~~~~~~~~~~~~~~~ + +When requests are routed to agents, Plano creates spans for agent execution: + +- **Agent Filter Chains** (``plano(filter)``): If filters are configured (guardrails, context enrichment, query rewriting), each filter execution is captured in its own span, showing the transformation pipeline. + +- **Agent Execution** (``plano(agent)``): The main agent processing span that captures the agent's work, including any tools invoked and intermediate reasoning steps. + +Outbound LLM Calls +~~~~~~~~~~~~~~~~~~ + +All LLM calls—whether from Plano's routing layer or from agents—are traced with **LLM spans** (``plano(llm)``) that capture: + +- Model name and provider (e.g., ``gpt-4``, ``claude-3-sonnet``) +- Request parameters (temperature, max_tokens, top_p) +- Token usage (prompt_tokens, completion_tokens) +- Streaming indicators and time-to-first-token +- Response metadata + +**Example Span Attributes**:: + + # LLM call span + llm.model = "gpt-4" + llm.provider = "openai" + llm.usage.prompt_tokens = 150 + llm.usage.completion_tokens = 75 + llm.duration_ms = 1250 + llm.time_to_first_token = 320 + +Handoff to Upstream Services +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When Plano forwards requests to upstream services (agents, APIs, or LLM providers), it creates **handoff spans** (``plano(handoff)``) that capture: + +- Upstream endpoint URL +- Request/response sizes +- HTTP status codes +- Upstream response times + +This creates a complete end-to-end trace showing the full request lifecycle through all system components. + +Behavioral Signals in Traces +---------------------------- + +Plano automatically enriches OpenTelemetry traces with :doc:`../../concepts/signals` — behavioral quality indicators computed from conversation patterns. 
These signals are attached as span attributes, providing immediate visibility into interaction quality. + +**What Signals Provide** + +Signals act as early warning indicators embedded in your traces: + +- **Quality Assessment**: Overall interaction quality (Excellent/Good/Neutral/Poor/Severe) +- **Efficiency Metrics**: Turn count, efficiency scores, repair frequency +- **User Sentiment**: Frustration indicators, positive feedback, escalation requests +- **Agent Behavior**: Repetition detection, looping patterns + +**Visual Flag Markers** + +When concerning signals are detected (frustration, looping, escalation, or poor/severe quality), Plano automatically appends a flag marker **🚩** to the span's operation name. This makes problematic traces immediately visible in your tracing UI without requiring additional queries. + +**Example Span with Signals**:: + + # Span name: "POST /v1/chat/completions gpt-4 🚩" + # Standard LLM attributes: + llm.model = "gpt-4" + llm.usage.total_tokens = 225 + + # Behavioral signal attributes: + signals.quality = "Severe" + signals.turn_count = 15 + signals.efficiency_score = 0.234 + signals.frustration.severity = 3 + signals.escalation.requested = "true" + +**Querying Signal Data** + +In your observability platform (Jaeger, Grafana Tempo, Datadog, etc.), filter traces by signal attributes: + +- Find severe interactions: ``signals.quality = "Severe"`` +- Find frustrated users: ``signals.frustration.severity >= 2`` +- Find inefficient flows: ``signals.efficiency_score < 0.5`` +- Find escalations: ``signals.escalation.requested = "true"`` + +For complete details on all available signals, detection methods, and best practices, see the :doc:`../../concepts/signals` guide. + + Benefits of Using ``Traceparent`` Headers ----------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 158891f4..a79ea84f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -5,7 +5,7 @@ Welcome to Plano! 
:width: 100% :align: center -`Plano `_ is delivery infrastructure for agentic apps. A models-native proxy server and data plane designed to help you build agents faster, and deliver them reliably to production. +`Plano `_ is delivery infrastructure for agentic apps. A smart proxy server and data plane designed to help you build agents faster, and deliver them reliably to production. Plano pulls out the rote plumbing work (aka “hidden AI middleware”) and decouples you from brittle, ever‑changing framework abstractions. It centralizes what shouldn’t be bespoke in every codebase like agent routing and orchestration, rich agentic signals and traces for continuous improvement, guardrail filters for safety and moderation, and smart LLM routing APIs for UX and DX agility. Use any language or AI framework, and ship agents to production faster with Plano. @@ -36,6 +36,7 @@ Built by contributors to the widely adopted `Envoy Proxy