# iai-mcp-opencode/tests/test_bench_total_session_cost.py

"""OPS-12 regression guard: 3-turn sanity for total_session_cost.

Plan 05-05 (D5-08) — CI-runnable guard for bench/total_session_cost.py.
The full 10-turn script runs ad-hoc on this dev Mac and populates
the published bench report rows; this test exercises the shape
contracts and the minimal-vs-standard invariant at CI speed.

Acceptance contracts:
  - minimal total <= standard total (TOK-11 sanity; if not, Plan 05-03
    regressed somewhere)
  - per_turn list has exactly 10 entries (fixed D5-08 script)
  - counter mode honestly disclosed in the JSON output
    (anthropic-count-tokens | tiktoken-cl100k-proxy | heuristic-char4,
    or "injected" when a test supplies its own counter)
  - reference-gate failure flips passed=False

See:
- bench/total_session_cost.py — the harness under guard
- bench/tokens.py — 3-tier counter fallback pattern reused here
- internal architecture spec, Task 3 — the behavior contract
"""
from __future__ import annotations

import pytest


def test_total_session_cost_reports_per_turn():
    """A minimal-depth run reports one entry per turn of the D5-08 script.

    Verifies the shape of the returned mapping: ``per_turn`` is a list with
    exactly 10 entries (the fixed M-07 / D5-08 sequence), ``total_tokens``
    equals the sum of those entries, and the ``adapter`` / ``wake_depth``
    labels are present with the expected values.
    """
    from bench.total_session_cost import run_total_session_cost

    result = run_total_session_cost(wake_depth="minimal")

    # Shape of the per-turn breakdown.
    assert "per_turn" in result
    turns = result["per_turn"]
    assert isinstance(turns, list)
    turn_count = len(turns)
    assert turn_count == 10, (
        f"D5-08 script has 10 turns; got {turn_count}"
    )

    # The headline total must be exactly the per-turn sum.
    assert result["total_tokens"] == sum(turns)

    # Identification labels carried in the result.
    for key, expected in (("adapter", "iai-mcp"), ("wake_depth", "minimal")):
        assert result[key] == expected


def test_total_session_cost_minimal_le_standard():
    """TOK-11 invariant: the minimal wake depth is never the pricier one.

    Runs the same script once with ``wake_depth="minimal"`` and once with
    ``wake_depth="standard"`` and requires the minimal total to be less
    than or equal to the standard total. A failure here points at a
    regression in Plan 05-03's lazy session-start work.
    """
    from bench.total_session_cost import run_total_session_cost

    # Both runs complete before either total is read.
    runs = {
        depth: run_total_session_cost(wake_depth=depth)
        for depth in ("minimal", "standard")
    }
    lean_total = runs["minimal"]["total_tokens"]
    full_total = runs["standard"]["total_tokens"]

    assert lean_total <= full_total, (
        f"minimal {lean_total} > standard {full_total}"
        " — TOK-11 regression"
    )


def test_total_session_cost_counter_mode_disclosed():
    """BENCH_REPORT honesty: the result must name the token counter used.

    The ``mode`` field has to be one of the known counter labels so that
    downstream reports can flag numbers that did not come from the
    official counter.
    """
    from bench.total_session_cost import run_total_session_cost

    # Kept as a tuple: an unhashable ``mode`` should fail the assertion,
    # not raise from a set lookup.
    known_modes = (
        "anthropic-count-tokens",
        "tiktoken-cl100k-proxy",
        "heuristic-char4",
        "injected",
    )

    result = run_total_session_cost(wake_depth="minimal")
    reported_mode = result["mode"]
    assert reported_mode in known_modes


def test_total_session_cost_fails_when_above_ref():
    """A reference number below IAI's total flips the comparative gate.

    The test passes ``mempalace_ref=1``, an impossibly low reference, so
    the outcome does not depend on the host's tokenizer. It expects
    ``passed`` to be ``False`` and the reference to be echoed back under
    ``refs["mempalace"]``.
    """
    from bench.total_session_cost import run_total_session_cost

    impossibly_low_ref = 1
    result = run_total_session_cost(
        wake_depth="standard",
        mempalace_ref=impossibly_low_ref,
    )

    gate_passed = result["passed"]
    assert gate_passed is False
    recorded_ref = result["refs"]["mempalace"]
    assert recorded_ref == 1


def test_total_session_cost_passes_without_refs():
    """With no reference numbers supplied, the gate degenerates to a pass.

    The run is expected to report ``passed is True`` and an empty ``refs``
    mapping. Disclosure that no references were compared is left to the
    report prose, not to this result.
    """
    from bench.total_session_cost import run_total_session_cost

    result = run_total_session_cost(wake_depth="minimal")

    gate_passed = result["passed"]
    assert gate_passed is True

    supplied_refs = result["refs"]
    assert supplied_refs == {}


def test_total_session_cost_main_exits_int():
    """CLI entry-point returns an integer exit status of 0 or 1.

    This is the bench CI contract: ``main`` is called with an explicit
    ``argv`` and its return value is used as the process exit code.
    """
    from bench import total_session_cost

    code = total_session_cost.main(argv=["--wake-depth", "minimal"])

    # Membership compares by equality, so 0.0 / 1.0 would satisfy
    # `code in (0, 1)` on their own; pin the type as the test name promises.
    assert isinstance(code, int), (
        f"main() returned {type(code).__name__}; expected an int exit code"
    )
    assert code in (0, 1)


def test_total_session_cost_injected_counter():
    """A caller-supplied token counter is used and reported as ``injected``.

    Passing a deterministic ``count_tokens_fn`` keeps this test independent
    of drift in the proxy tokeniser. The run must label its mode
    ``"injected"`` and report a total of at least 10 tokens.
    """
    from bench.total_session_cost import run_total_session_cost

    def _one_token_per_char(text: str) -> int:
        # One token per character, never fewer than one per call.
        return len(text) or 1

    result = run_total_session_cost(
        wake_depth="minimal",
        count_tokens_fn=_one_token_per_char,
    )

    assert result["mode"] == "injected"
    # Lower bound: at least 1 token per turn across the 10 turns.
    assert result["total_tokens"] >= 10