"""Tests for iai_mcp.bedtime -- Task 1. Covers 14 behaviours from the plan: 1. English positive -- "good night" / "heading to bed" / "tired" 2. English negative (phrase alone, no dual-gate) 3. Russian positive 4. Japanese positive 5. Arabic positive 6. de/fr/es/zh positive (one phrase per language at minimum) 7. Cross-lingual fallback -- EN always tried; RU NOT tried under language="en" 8. Dual-gate: phrase alone NOT enough (no quiet window -> None) 9. Dual-gate: inside quiet window -> dict 10. Dual-gate: within 30min of start -> dict 11. Dual-gate: 1h before start -> None 12. Fixture-driven corpus: 5 positive + 5 negative per language 13. False positive rate < 10% on phrase-only check across all 8 fixtures 14. ReDoS protection: 10KB input under 100ms total across all patterns """ from __future__ import annotations import time from datetime import datetime, timezone from pathlib import Path from zoneinfo import ZoneInfo import pytest from iai_mcp import bedtime from iai_mcp.bedtime import ( WIND_DOWN_BY_LANG, WIND_DOWN_GATE_MINUTES_BEFORE, WIND_DOWN_LANGUAGES_SUPPORTED, detect_wind_down, detect_wind_down_phrase, is_late_in_quiet_window, ) UTC = timezone.utc FIXTURES = Path(__file__).parent / "fixtures" / "bedtime" # ---------------------------------------------------------------- phrase gate def test_english_positive() -> None: for cue in [ "good night", "I'm heading to bed", "I'm tired, going to sleep", "catch you tomorrow", "it's bedtime", "Goodnight!", ]: matched, pattern = detect_wind_down_phrase(cue, "en") assert matched, f"expected EN positive for {cue!r}" assert pattern def test_english_phrase_matches_even_rhetorical() -> None: """Phrase alone IS enough for the phrase gate -- the dual gate adds the quiet-window filter. This test locks the phrase behaviour in isolation so dual-gate tests can differentiate.""" cue = "the villain said good night and laughed" matched, pattern = detect_wind_down_phrase(cue, "en") assert matched, "phrase gate alone is intentionally permissive" assert "night" in pattern.lower() def test_russian_positive() -> None: for cue in [ "пойду спать", "спокойной ночи", "устал, иду в постель", "до завтра", "пора ложиться", ]: matched, _ = detect_wind_down_phrase(cue, "ru") assert matched, f"expected RU positive for {cue!r}" def test_japanese_positive() -> None: for cue in [ "おやすみ", "おやすみなさい", "寝ます", "また明日", "疲れた", ]: matched, _ = detect_wind_down_phrase(cue, "ja") assert matched, f"expected JA positive for {cue!r}" def test_arabic_positive() -> None: for cue in [ "تصبح على خير", "ليلة سعيدة", "أنا متعب سأنام", ]: matched, _ = detect_wind_down_phrase(cue, "ar") assert matched, f"expected AR positive for {cue!r}" def test_de_fr_es_zh_positive() -> None: cases: dict[str, list[str]] = { "de": ["gute Nacht", "ich bin müde", "bis morgen"], "fr": ["bonne nuit", "je suis fatigué", "à demain"], "es": ["buenas noches", "estoy cansado", "hasta mañana"], "zh": ["晚安", "我要睡觉", "累了"], } for lang, cues in cases.items(): for cue in cues: matched, _ = detect_wind_down_phrase(cue, lang) assert matched, f"expected {lang.upper()} positive for {cue!r}" def test_cross_lingual_en_is_fallback_but_ru_is_not() -> None: # EN fallback always tried: "good night" under language="ru" still matches. matched_en_under_ru, _ = detect_wind_down_phrase("good night", "ru") assert matched_en_under_ru, "EN fallback must trigger regardless of language" # RU is NOT tried under language="en": a purely Russian cue must NOT match. matched_ru_under_en, _ = detect_wind_down_phrase("я пойду спать", "en") assert not matched_ru_under_en, ( "RU phrases must not fall back under language=en" ) def test_phrase_empty_cue_no_match() -> None: assert detect_wind_down_phrase("", "en") == (False, "") assert detect_wind_down_phrase("", "ru") == (False, "") def test_phrase_unknown_language_still_tries_english() -> None: """Language we don't support (e.g. 'ko') must still try EN fallback.""" matched, _ = detect_wind_down_phrase("good night", "ko") assert matched, "EN fallback required for unsupported languages too" # ---------------------------------------------------------------- quiet-window gate def _utc(y: int, m: int, d: int, hh: int, mm: int = 0) -> datetime: return datetime(y, m, d, hh, mm, tzinfo=UTC) def test_is_late_no_window() -> None: assert is_late_in_quiet_window(None, _utc(2026, 4, 18, 22, 0), UTC) is False def test_is_late_inside_window() -> None: # window = (44, 16) means start at bucket 44 = 22:00, duration 8h. # 23:30 local should be inside. assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 18, 23, 30), UTC, ) is True def test_is_late_within_30min_of_start() -> None: # start 22:00, now 21:45 -> within 30min -> True. assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 18, 21, 45), UTC, ) is True def test_is_late_exactly_30min_before_start() -> None: # Boundary: 21:30 should still count (within 30min threshold, inclusive). assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 18, 21, 30), UTC, ) is True def test_is_late_one_hour_before_start() -> None: # start 22:00, now 21:00 -> 60min before -> False. assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 18, 21, 0), UTC, ) is False def test_is_late_window_wraps_midnight() -> None: # window = (44, 16): 22:00 start + 8h = 06:00 next morning. # 02:30 local should be inside (post-midnight part of the window). assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 19, 2, 30), UTC, ) is True def test_is_late_outside_window_afternoon() -> None: # window = (44, 16): 22:00-06:00. 15:00 afternoon -> outside + not within 30min. assert is_late_in_quiet_window( (44, 16), _utc(2026, 4, 18, 15, 0), UTC, ) is False # ---------------------------------------------------------------- dual-gate def test_dual_gate_phrase_alone_not_enough() -> None: # Phrase matches but no quiet window set -> None. result = detect_wind_down( "good night", "en", state={}, now=_utc(2026, 4, 18, 12, 0), tz=UTC, ) assert result is None def test_dual_gate_no_phrase_inside_window() -> None: # Inside window but no phrase match -> None. result = detect_wind_down( "let me check the code", "en", state={"quiet_window": (44, 16)}, now=_utc(2026, 4, 18, 23, 30), tz=UTC, ) assert result is None def test_dual_gate_both_pass_inside_window() -> None: result = detect_wind_down( "good night", "en", state={"quiet_window": (44, 16)}, now=_utc(2026, 4, 18, 23, 30), tz=UTC, ) assert result is not None assert result["message_hint"] == "user_wind_down_detected" assert "night" in result["matched_pattern"].lower() assert result["quiet_window_start_bucket"] == 44 assert result["quiet_window_duration"] == 16 def test_dual_gate_both_pass_30min_before_window() -> None: # 21:45 local, window starts 22:00 -> within 30min threshold. result = detect_wind_down( "good night", "en", state={"quiet_window": (44, 16)}, now=_utc(2026, 4, 18, 21, 45), tz=UTC, ) assert result is not None assert result["quiet_window_start_bucket"] == 44 def test_dual_gate_phrase_but_too_early() -> None: # 21:00 local, window starts 22:00 -> 60min too early -> None. result = detect_wind_down( "good night", "en", state={"quiet_window": (44, 16)}, now=_utc(2026, 4, 18, 21, 0), tz=UTC, ) assert result is None # ---------------------------------------------------------------- fixture corpus _LANGS = sorted(WIND_DOWN_BY_LANG.keys()) @pytest.mark.parametrize("lang", _LANGS) def test_fixture_corpus(lang: str) -> None: fp = FIXTURES / f"{lang}.txt" assert fp.exists(), f"fixture file missing: {fp}" lines = [ ln.strip() for ln in fp.read_text(encoding="utf-8").splitlines() if ln.strip() and not ln.lstrip().startswith("#") ] assert len(lines) >= 10, f"{lang}: expected >=10 fixture lines, got {len(lines)}" for line in lines: assert "\t" in line, f"{lang}: fixture line missing tab separator: {line!r}" sentence, expected = line.rsplit("\t", 1) matched, _ = detect_wind_down_phrase(sentence, lang) assert matched == (expected == "yes"), ( f"{lang}: {sentence!r} expected {expected} got {matched}" ) def test_fixture_corpus_false_positive_rate_under_10_percent() -> None: """Across all 8 languages (80 lines = 40 pos + 40 neg), the phrase-only false positive rate MUST be < 10%. The dual gate ratchets this down to the target of <5% in practice.""" fp_count = 0 neg_total = 0 for lang in _LANGS: fp = FIXTURES / f"{lang}.txt" for line in fp.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue if "\t" not in line: continue sentence, expected = line.rsplit("\t", 1) if expected == "no": neg_total += 1 matched, _ = detect_wind_down_phrase(sentence, lang) if matched: fp_count += 1 assert neg_total >= 40, f"expected >=40 negative fixtures, got {neg_total}" fpr = fp_count / neg_total assert fpr < 0.10, ( f"phrase-only FPR {fpr:.2%} exceeds 10% ceiling " f"({fp_count}/{neg_total}). Tighten fixtures or patterns." ) # ---------------------------------------------------------------- ReDoS guard def test_redos_protection_bounded_quantifiers_under_100ms() -> None: """All patterns are pre-compiled and use bounded quantifiers. 10KB of 'a' characters must execute in < 100ms across every pattern.""" big = "a" * 10240 deadline = 0.100 # seconds total_start = time.monotonic() for lang, patterns in bedtime._COMPILED.items(): for p in patterns: t0 = time.monotonic() p.search(big) if time.monotonic() - t0 > deadline: pytest.fail( f"ReDoS suspected: {lang} pattern {p.pattern!r} took " f">{deadline}s on 10KB input" ) total_elapsed = time.monotonic() - total_start assert total_elapsed < 1.0, ( f"combined ReDoS sweep took {total_elapsed:.3f}s (budget 1.0s)" ) # ---------------------------------------------------------------- coverage sanity def test_language_coverage_is_exactly_eight_d11() -> None: """wind-down regex must cover exactly the 8 shield.py languages.""" assert WIND_DOWN_LANGUAGES_SUPPORTED == frozenset( {"en", "ru", "ja", "ar", "de", "fr", "es", "zh"}, ) assert len(WIND_DOWN_BY_LANG) == 8 def test_gate_minutes_before_is_thirty_d09() -> None: """D-09 dual-gate: 30 minutes before quiet-window start counts as late.""" assert WIND_DOWN_GATE_MINUTES_BEFORE == 30