# iai-mcp-opencode/tests/test_batch_guard.py

"""Tests for 02-REVIEW.md H-02 (batch scaffold silently debits budget +
flips effective_tier=tier1 on a stub that produces no output).

Bug: submit_batch_consolidation called budget.record_spend BEFORE the real
SDK call and returned (True, "ok", []). run_heavy_consolidation then saw
ok_batch=True and set effective_tier="tier1", logging it in the
consolidation event. Users inspecting `iai-mcp audit` saw Tier-1 events
that were factually false.

Fix:
    - Scaffold path returns (False, "stub: batch API not yet wired", []).
    - NO budget.record_spend call during the stub period.
    - Emit one info-severity llm_health event documenting the gap so the
      audit CLI reflects honest state.
    - run_heavy_consolidation sees ok_batch=False and keeps tier0; the
      cls_consolidation_run event payload carries batch_submitted=False.

Constitutional contract (D-GUARD budget honesty + audit repudiability):
    Budget ledger rows MUST correspond to real API spend. Tier flags in
    the event log MUST correspond to real Tier-1 output. Both invariants
    were silently violated by the scaffold.
"""
from __future__ import annotations

import pytest

from iai_mcp.events import query_events
from iai_mcp.guard import BudgetLedger, RateLimitLedger
from iai_mcp.store import MemoryStore


def _tasks(n: int = 1) -> list[dict]:
    """Build ``n`` synthetic consolidation task payloads.

    Each payload carries a sequential ``task_id`` (``t0``, ``t1``, ...), a
    matching prompt string, and fixed prompt/output token estimates of 500
    and 200 respectively.
    """
    payloads: list[dict] = []
    for idx in range(n):
        payloads.append(
            {
                "task_id": f"t{idx}",
                "prompt": f"summarise cluster {idx}",
                "prompt_tok": 500,
                "output_tok": 200,
            }
        )
    return payloads


# ==================================================== H-02: batch scaffold guard


def test_batch_stub_returns_false_with_scaffold_reason(tmp_path, monkeypatch):
    """The scaffold answers (False, "stub: batch API not yet wired", []).

    The call is made with an API key in the environment, llm_enabled=True,
    and freshly constructed budget/rate ledgers, so a refusal here comes
    from the stub itself. This is the assertion that prevents the caller
    from flipping to tier1.
    """
    from iai_mcp.batch import submit_batch_consolidation

    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-test-key")
    store = MemoryStore(path=tmp_path)

    outcome = submit_batch_consolidation(
        store,
        _tasks(3),
        BudgetLedger(store),
        RateLimitLedger(store),
        llm_enabled=True,
    )
    ok, reason, results = outcome

    assert ok is False, "scaffold must return ok=False until real SDK wire-up lands"
    assert reason.startswith("stub:"), (
        f"reason must advertise scaffold status, got {reason!r}"
    )
    assert "batch API not yet wired" in reason
    assert results == [], "scaffold produces empty result list"


def test_batch_stub_does_not_debit_budget(tmp_path, monkeypatch):
    """Daily and monthly spend are identical before and after a stub call.

    Spend may only be recorded once a real anthropic.batches.create
    response succeeds; the scaffold must therefore leave both counters
    exactly where they were.
    """
    from iai_mcp.batch import submit_batch_consolidation

    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-test-key")
    store = MemoryStore(path=tmp_path)
    budget = BudgetLedger(store)
    rate = RateLimitLedger(store)

    # Snapshot the counters (daily first, then monthly) around the call.
    before_daily, before_monthly = budget.daily_used(), budget.monthly_used()

    submit_batch_consolidation(
        store,
        _tasks(5),
        budget,
        rate,
        llm_enabled=True,
    )

    after_daily, after_monthly = budget.daily_used(), budget.monthly_used()

    assert after_daily == before_daily, (
        f"daily spend changed during stub: {before_daily} -> {after_daily}"
    )
    assert after_monthly == before_monthly


def test_batch_stub_emits_info_llm_health_event(tmp_path, monkeypatch):
    """A stub call leaves an info-severity llm_health event behind.

    The event must be tagged component="batch_consolidation" and its note
    must mention the scaffold / not-yet-wired status, so that
    `iai-mcp audit` shows the gap explicitly. Info (rather than
    warning/critical) is required because the stub is intentional
    behaviour, not a failure.
    """
    from iai_mcp.batch import submit_batch_consolidation

    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-test-key")
    store = MemoryStore(path=tmp_path)
    budget = BudgetLedger(store)
    rate = RateLimitLedger(store)

    submit_batch_consolidation(
        store, _tasks(), budget, rate, llm_enabled=True,
    )

    # Keep only the llm_health events emitted by the batch component.
    batch_events = []
    for event in query_events(store, kind="llm_health"):
        if event["data"].get("component") == "batch_consolidation":
            batch_events.append(event)
    assert len(batch_events) >= 1, "must emit llm_health for batch stub"

    ev = batch_events[0]
    assert ev["severity"] == "info", (
        f"scaffold event must be info-severity, got {ev['severity']!r}"
    )

    note = ev["data"].get("note") or ""
    lowered = note.lower()
    assert any(marker in lowered for marker in ("scaffold", "not yet wired")), (
        f"event note must advertise scaffold/not-yet-wired status, got {note!r}"
    )


def test_run_heavy_does_not_flip_tier1_on_stub(tmp_path, monkeypatch):
    """run_heavy_consolidation must not set effective_tier='tier1' while
    submit_batch_consolidation is a stub.

    The run is given everything Tier-1 would need (API key in the
    environment, has_api_key=True, llm_enabled=True, fresh budget and rate
    ledgers). Because the stub reports ok_batch=False, both the returned
    result and the persisted cls_consolidation_run event must stay on
    tier0 with batch_submitted=False.
    """
    # BudgetLedger / RateLimitLedger come from the module-level import; only
    # the sleep module needs to be imported lazily here.
    from iai_mcp.sleep import SleepConfig, run_heavy_consolidation

    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-test-key")
    store = MemoryStore(path=tmp_path)
    budget = BudgetLedger(store)
    rate = RateLimitLedger(store)

    cfg = SleepConfig(llm_enabled=True)
    result = run_heavy_consolidation(
        store,
        session_id="h-stub",
        config=cfg,
        budget=budget,
        rate=rate,
        has_api_key=True,
    )

    assert result["tier"] == "tier0", (
        f"effective_tier must stay tier0 during scaffold, got {result['tier']!r}"
    )

    # The persisted heavy-mode event must tell the same story as the return
    # value: no batch was submitted ...
    events = query_events(store, kind="cls_consolidation_run")
    heavy = [e for e in events if e["data"].get("mode") == "heavy"]
    assert len(heavy) >= 1, "no heavy cls_consolidation_run event recorded"
    payload = heavy[0]["data"]
    assert payload["batch_submitted"] is False, (
        "batch_submitted flag must honestly reflect stub state"
    )
    # ... and the event's recorded tier is the effective tier (tier0), not
    # the tier the run would have been eligible for.
    assert payload.get("tier") == "tier0"


def test_run_heavy_does_not_debit_budget_during_stub(tmp_path, monkeypatch):
    """End-to-end: heavy consolidation with full Tier-1 eligibility must not
    record a batch_consolidation spend row while submit_batch_consolidation
    is a stub.

    Only rows with kind == "batch_consolidation" are checked. Total spend is
    deliberately NOT compared before/after, because schema_induction_tier1
    may record a small spend of its own when eligible.
    """
    from iai_mcp.sleep import SleepConfig, run_heavy_consolidation

    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-test-key")
    store = MemoryStore(path=tmp_path)
    budget = BudgetLedger(store)
    rate = RateLimitLedger(store)

    cfg = SleepConfig(llm_enabled=True)
    run_heavy_consolidation(
        store,
        session_id="h-no-debit",
        config=cfg,
        budget=budget,
        rate=rate,
        has_api_key=True,
    )

    # Inspect the raw ledger table rather than the aggregate counters so the
    # check is scoped to batch rows only. An empty ledger trivially passes.
    tbl = store.db.open_table("budget_ledger")
    df = tbl.to_pandas()
    if not df.empty:
        batch_rows = df[df["kind"] == "batch_consolidation"]
        assert len(batch_rows) == 0, (
            "stub must not record a batch_consolidation spend row"
        )