mirror of
https://github.com/katanemo/plano.git
synced 2026-06-17 15:25:17 +02:00
added more tests and fixed documentation. PR 100% ready
This commit is contained in:
parent
c2cfd358f5
commit
7d517ace77
3 changed files with 134 additions and 5 deletions
|
|
@ -2465,4 +2465,128 @@ mod tests {
|
|||
"Polite disappointment should not be classified as Excellent"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_catastrophic_failure_looping_assistant() {
|
||||
let analyzer = SignalAnalyzer::new();
|
||||
|
||||
// Catastrophic failure: assistant stuck in loop, user increasingly frustrated
|
||||
let messages = vec![
|
||||
// Turns 1-2: Initial request, answered with a generic prompt that ignores it
|
||||
create_message(Role::User, "I need to cancel my subscription"),
|
||||
create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"),
|
||||
|
||||
// Turn 3: First repair - repetition clarification
|
||||
create_message(Role::User, "I just said I want to cancel my subscription"),
|
||||
// Turn 4: Exact repetition from assistant
|
||||
create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"),
|
||||
|
||||
// Turn 5: Repair (negation) + frustration (ALL CAPS)
|
||||
create_message(Role::User, "No, I meant CANCEL. My. Subscription."),
|
||||
// Turn 6: Near-duplicate repetition
|
||||
create_message(Role::Assistant, "I understand you need help with your account. Let me assist you with that."),
|
||||
|
||||
// Turn 7: Severe frustration (complaint + excessive punctuation + ALL CAPS)
|
||||
create_message(Role::User, "This is not helpful!! I want to CANCEL my subscription!!!"),
|
||||
// Turn 8: Exact repetition again (looping)
|
||||
create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"),
|
||||
|
||||
// Turn 9: Repair (explicit rephrase) + frustration (complaint)
|
||||
create_message(Role::User, "This doesnt work. Let me rephrase: I need to terminate my account and stop being charged"),
|
||||
// Turn 10: Near-duplicate repetition (looping continues)
|
||||
create_message(Role::Assistant, "I understand you need help with your account. Let me assist you with that."),
|
||||
|
||||
// Turn 11: Escalation (multiple types) + frustration
|
||||
create_message(Role::User, "I give up. Can I speak to a real person? This is a waste of time"),
|
||||
];
|
||||
|
||||
let report = analyzer.analyze(&messages);
|
||||
|
||||
// Validate turn count
|
||||
assert_eq!(
|
||||
report.turn_count.total_turns, 11,
|
||||
"Should have 11 total turns"
|
||||
);
|
||||
assert_eq!(report.turn_count.user_turns, 6, "Should have 6 user turns");
|
||||
assert_eq!(
|
||||
report.turn_count.assistant_turns, 5,
|
||||
"Should have 5 assistant turns"
|
||||
);
|
||||
assert!(
|
||||
report.turn_count.is_concerning,
|
||||
"11 turns should be concerning (>7)"
|
||||
);
|
||||
assert!(
|
||||
!report.turn_count.is_excessive,
|
||||
"11 turns should not be excessive (<=12)"
|
||||
);
|
||||
assert!(
|
||||
report.turn_count.efficiency_score < 0.5,
|
||||
"Efficiency should be low"
|
||||
);
|
||||
|
||||
// Validate repair detection (USER signals - query reformulation)
|
||||
// Detected repairs:
|
||||
// 1. "I just said I want to cancel..." - pattern: "I just said"
|
||||
// 2. "No, I meant CANCEL..." - pattern: "No, I meant"
|
||||
// 3. "Let me rephrase: I need to terminate..." - pattern: "let me rephrase"
|
||||
// Note: "This is not helpful!!" is frustration (not repair)
|
||||
// Note: "I give up..." is escalation (not repair)
|
||||
assert_eq!(
|
||||
report.follow_up.repair_count, 3,
|
||||
"Should detect exactly 3 repair attempts from user messages"
|
||||
);
|
||||
assert_eq!(
|
||||
report.follow_up.repair_ratio, 0.5,
|
||||
"Repair ratio should be 0.5 (3 repairs / 6 user messages)"
|
||||
);
|
||||
assert!(
|
||||
report.follow_up.is_concerning,
|
||||
"50% repair ratio should be highly concerning (threshold is 30%)"
|
||||
);
|
||||
|
||||
// Validate frustration detection
|
||||
assert!(
|
||||
report.frustration.has_frustration,
|
||||
"Should detect frustration"
|
||||
);
|
||||
assert!(
|
||||
report.frustration.frustration_count >= 4,
|
||||
"Should detect multiple frustration indicators"
|
||||
);
|
||||
assert!(
|
||||
report.frustration.severity >= 2,
|
||||
"Should be at least moderate frustration"
|
||||
);
|
||||
|
||||
// Validate repetition/looping detection (ASSISTANT signals - not following instructions)
|
||||
// The assistant repeats the same unhelpful responses multiple times:
|
||||
// 1. "I can help you with account management..." appears 3 times (exact repetition)
|
||||
// 2. "I understand you need help with your account..." appears 2 times (near-duplicate)
|
||||
assert!(
|
||||
report.repetition.repetition_count >= 4,
|
||||
"Should detect at least 4 assistant repetitions (exact + near-duplicates)"
|
||||
);
|
||||
assert!(
|
||||
report.repetition.has_looping,
|
||||
"Should detect looping (>2 repetitions indicates stuck agent)"
|
||||
);
|
||||
assert!(
|
||||
report.repetition.severity >= 2,
|
||||
"Should be moderate to severe looping (assistant not adapting)"
|
||||
);
|
||||
|
||||
// Validate escalation detection
|
||||
assert!(
|
||||
report.escalation.escalation_requested,
|
||||
"Should detect escalation request"
|
||||
);
|
||||
assert!(
|
||||
report.escalation.escalation_count >= 2,
|
||||
"Should detect multiple escalation indicators: 'give up' + 'speak to a real person'"
|
||||
);
|
||||
|
||||
// Validate overall quality
|
||||
assert_eq!(report.overall_quality, InteractionQuality::Severe, "Should be classified as Severe due to escalation + excessive frustration + looping + high repair ratio");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
BIN
docs/source/_static/img/signals_trace.png
Normal file
BIN
docs/source/_static/img/signals_trace.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.8 MiB |
|
|
@ -86,6 +86,11 @@ Example queries:
|
|||
- Find positive interactions: ``signals.positive_feedback.count >= 2``
|
||||
- Find escalations: ``signals.escalation.requested = "true"``
|
||||
|
||||
.. image:: /_static/img/signals_trace.png
|
||||
:width: 100%
|
||||
:align: center
|
||||
|
||||
|
||||
Core Signal Types
|
||||
=================
|
||||
|
||||
|
|
@ -115,10 +120,10 @@ Follow-Up & Repair Frequency
|
|||
----------------------------
|
||||
|
||||
**What it measures**
|
||||
How often users clarify, correct, or rephrase requests.
|
||||
How often users clarify, correct, or rephrase requests. This is a **user signal** tracking query reformulation behavior—when users must repair or rephrase their requests because the agent didn't understand or respond appropriately.
|
||||
|
||||
**Why it matters**
|
||||
High repair frequency is a proxy for misunderstanding or intent drift.
|
||||
High repair frequency is a proxy for misunderstanding or intent drift. When users repeatedly rephrase the same request, it indicates the agent is failing to grasp or act on the user's intent.
|
||||
|
||||
**Key metrics**
|
||||
|
||||
|
|
@ -165,10 +170,10 @@ Repetition & Looping
|
|||
--------------------
|
||||
|
||||
**What it measures**
|
||||
Assistant repetition / degenerative loops.
|
||||
Assistant repetition / degenerative loops. This is an **assistant signal** tracking when the agent repeats itself, fails to follow instructions, or gets stuck in loops—indicating the agent is not making progress or adapting its responses.
|
||||
|
||||
**Why it matters**
|
||||
Often indicates missing state tracking, broken tool integration, or prompt issues.
|
||||
Often indicates missing state tracking, broken tool integration, prompt issues, or the agent ignoring user corrections. High repetition means the agent is not learning from the conversation context.
|
||||
|
||||
**Detection method**
|
||||
|
||||
|
|
@ -176,7 +181,7 @@ Repetition & Looping
|
|||
- Classify:
|
||||
|
||||
- **Exact**: similarity >= 0.85
|
||||
- **Near-duplicate**: similarity >= 0.60
|
||||
- **Near-duplicate**: similarity >= 0.50
|
||||
|
||||
- Looping is flagged when repetition instances exceed 2 in a session.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue