"""OPS-12 / total session cost bench. Runs a fixed 10-turn representative script per D5-08 (see 05-CONTEXT.md) and counts the total tokens Claude would pay for the full session with IAI-MCP wired in. The 10 turns cover the axes the real-user workload touches most: verbatim recall, interleaved code-edit chat (no recall), cross-community recall, save, introspection. JSON output (one line to stdout): { "adapter": "iai-mcp", "wake_depth": "minimal"|"standard"|"deep", "total_tokens": int, "per_turn": [int] * 10, "mode": "anthropic-count-tokens"|"tiktoken-cl100k-proxy"| "heuristic-char4"|"injected", "refs": {"mempalace": int?, "claude_mem": int?}, "passed": bool, # True iff every supplied ref >= IAI "script_name": "D5-08-v1" } Exit codes: 0 if passed, 1 otherwise. CLI: python -m bench.total_session_cost python -m bench.total_session_cost --wake-depth standard python -m bench.total_session_cost --ref-mempalace 7000 --ref-claude-mem 5000 **Framing note (D5-08):** this bench is a *simulated* 10-turn script — it reproduces the token composition (system overhead + tool descriptions + tool-call payloads + tool-result bodies) a real MCP runtime would emit for the turn kinds. Real runtime adds network JSON-RPC envelope overhead (~30-50 tok/turn); the simulation excludes that. Downstream reports MUST disclose this caveat alongside the row. Reference-adapter notes: per PATTERNS.md Discovery #5, bench/adapters/ mempalace_*.py and claude_mem_*.py do not exist on this machine. The comparative gate is driven by explicit ref numbers via CLI flags so the bench is usable without live adapters; when unknown, refs default to None and passed=True is the degenerate answer. the published bench report carries the honest "mempalace/claude-mem refs not measured" disclosure for rows where a measurement was not taken. """ from __future__ import annotations import argparse import json import os import shutil import subprocess import sys from typing import Callable # Reuse bench/tokens.py's 3-tier counter helpers — single source of truth # for what "tiktoken-cl100k-proxy" and friends mean. from bench.tokens import ( _anthropic_count_tokens, _char4_count, _tiktoken_count, ) # ------------------------------------------------------------- adapters # # Live subprocess adapters for the reference column. Each adapter runs # the 10-turn script through the target tool's CLI, sums the response tokens # via the injected counter, and returns the total. On ANY failure # (tool absent, timeout, non-zero exit, empty stdout) the adapter returns # ``None`` and emits ``{"event": "bench_adapter_unavailable", ...}`` to # stderr. Callers MUST treat None as "honest disclosure, no measurement" # rather than a hard bench failure. # # Security note (T-05-06-04): turn text is a constant from _SCRIPT, never # from user input, and ``subprocess.run(argv_list, shell=False)`` avoids # any shell-injection surface. The 30s per-turn timeout bounds the DoS # risk (T-05-06-03). _ADAPTER_TIMEOUT_SECONDS = 30 def _log_adapter_unavailable(tool: str, reason: str) -> None: line = json.dumps({ "event": "bench_adapter_unavailable", "tool": tool, "reason": reason, }) print(line, file=sys.stderr) def _run_subprocess_adapter( *, tool_name: str, cli_name: str, argv_template: Callable[[str], list[str]], script: list[dict], counter: Callable[[str], int], ) -> int | None: """Shared helper: locate ``cli_name`` via ``shutil.which``; for each turn run its argv (provided by ``argv_template(turn_input)``) with a bounded timeout; sum stdout token counts across all turns. Return ``None`` on any failure (absent / timeout / non-zero / empty stdout).""" exe = shutil.which(cli_name) if exe is None: _log_adapter_unavailable(tool_name, "cli_not_found") return None total = 0 for turn in script: argv = [exe, *argv_template(turn["input"])[1:]] try: proc = subprocess.run( argv, timeout=_ADAPTER_TIMEOUT_SECONDS, capture_output=True, text=True, check=False, ) except subprocess.TimeoutExpired as exc: _log_adapter_unavailable(tool_name, f"timeout: {exc}") return None except (OSError, ValueError) as exc: _log_adapter_unavailable(tool_name, f"subprocess_error: {exc}") return None if proc.returncode != 0: _log_adapter_unavailable( tool_name, f"non_zero_exit={proc.returncode} stderr={proc.stderr[:200]!r}", ) return None stdout = proc.stdout or "" # Empty stdout is a legitimate "no match" response for search-style # CLIs; we DO count it (0 tokens) rather than treating as failure, # so adapters run against a pristine palace still publish a number. total += int(counter(stdout)) return total def _run_mempalace_adapter( script: list[dict], counter: Callable[[str], int], ) -> int | None: """M-07 live reference: run each turn through ``mempalace search`` and sum the stdout token counts. Returns ``None`` when mempalace is absent or any subprocess call fails. Honest-disclosure contract per Plan 05-06. """ return _run_subprocess_adapter( tool_name="mempalace", cli_name="mempalace", argv_template=lambda text: ["mempalace", "search", text], script=script, counter=counter, ) def _run_claude_mem_adapter( script: list[dict], counter: Callable[[str], int], ) -> int | None: """Forward-compat mirror of the mempalace adapter. On machines where ``claude-mem`` is not installed this returns ``None`` + stderr event; when it IS installed (future pressplay cross-validation run) the same code path measures it without another plan iteration.""" return _run_subprocess_adapter( tool_name="claude-mem", cli_name="claude-mem", argv_template=lambda text: ["claude-mem", "recall", text], script=script, counter=counter, ) # ---------------------------------------------------------------- D5-08 script # # Fixed 10-turn representative script. Each turn has a `kind` (used to # compose a realistic tool-result body) and an `input` (the cue text). # Order matters: turn 1 pays session-start overhead, turn 4 exercises the # cross-community recall path, turn 5/6 exercise save/introspect. SCRIPT_NAME = "D5-08-v1" _SCRIPT: list[dict] = [ { "kind": "recall", "input": "Tell me the decisions we made about architecture", }, { "kind": "chat", "input": "Let me iterate on this function; no recall needed here", }, { "kind": "recall", "input": "What did I say about bench discipline?", }, { "kind": "recall_cross_community", "input": "What is the connection between and the autistic kernel?", }, { "kind": "save", "input": "Decision locked: use cachetools TTLCache for LRU", }, { "kind": "introspect", "input": "profile_get_set operation=get knob=wake_depth", }, { "kind": "chat", "input": "Continuing this refactor; still no recall", }, { "kind": "recall", "input": "Alice said something about pressplay cross-validation", }, { "kind": "reinforce", "input": "memory_reinforce the last 3 hits", }, { "kind": "introspect", "input": "events_query kind=first_turn_recall limit=5", }, ] # Tool-description overhead mirrors the TOK-15 audit result # (134 raw tok total for the 11 registered tools; see 05-03-SUMMARY.md). # We reproduce the POST-audit text verbatim so the bench reflects the # actual current overhead Claude sees on each turn. _POST_TOK15_TOOL_DESCRIPTIONS = "\n".join([ "Recall verbatim memories matching cue. Returns hits + anti_hits.", "Structural recall over role->filler bindings. Returns hits.", "Boost Hebbian edges among co-retrieved record ids.", "Mark a record contradicted; new fact stored as new record.", "Trigger memory consolidation.", "Read or write a profile knob (15 sealed). operation: get|set.", "List pending curiosity questions. Optional session_id filter.", "List induced schemas. Optional domain + confidence_min filters.", "Query user-visible events by kind, since, severity, limit.", "Topology snapshot: N, C, L, sigma, community_count, regime.", "Camouflaging detection status; window_size weekly points.", ]) # Synthetic tool-result body per turn kind. Realistic-but-bounded; a real # runtime varies by store content but the ratio across wake_depths is # what measures, not the absolute per-query payload. _RESULT_BODIES: dict[str, str] = { "recall": ( "hits=[{record_id, literal_surface, score}] " "anti_hits=[{record_id, reason}] " "activation_trace=[community_gate, spread, rank] " "budget_used=200" ), "save": "ok=true id=", "introspect": '{"value": "minimal"}', "reinforce": "ok=true edges_boosted=3", "chat": "", "recall_cross_community": ( "hits=[{record_id, literal_surface, score, community_id}] " "anti_hits=[] activation_trace=[cross_community_spread] " "budget_used=350" ), } # ---------------------------------------------------------------- counter select def _select_counter( count_tokens_fn: Callable[[str], int] | None = None, ) -> tuple[Callable[[str], int], str]: """3-tier counter fallback mirroring bench/tokens.py:165-182. Priority: 1. explicit injection (`count_tokens_fn` kwarg, tests) 2. Anthropic count_tokens API (`ANTHROPIC_API_KEY` env var) 3. tiktoken cl100k_base (offline proxy) 4. char/4 heuristic (last resort) """ if count_tokens_fn is not None: return count_tokens_fn, "injected" if os.environ.get("ANTHROPIC_API_KEY"): return _anthropic_count_tokens, "anthropic-count-tokens" try: import tiktoken # noqa: F401 return _tiktoken_count, "tiktoken-cl100k-proxy" except ImportError: return _char4_count, "heuristic-char4" # ---------------------------------------------------------------- per-turn cost def _session_start_overhead_tokens(wake_depth: str) -> int: """Session-start payload size charged to turn 1 per wake_depth mode. Numbers sourced from measurements (05-03-SUMMARY.md table): - minimal : 24 tok (lazy pointers only) - standard : 1388 tok (eager Phase-1 L0+L1+L2+rich_club) - deep : ~2000 tok (rich_club budget lifted per D5-02) Rounded to the cache metric exactly so the numbers are consistent with M-01's reported warm session-start row. """ if wake_depth == "minimal": return 24 if wake_depth == "standard": return 1388 return 2000 # deep def _simulate_turn( turn: dict, counter: Callable[[str], int], ) -> int: """Compose the per-turn text that Claude sees and count its tokens.""" parts: list[str] = [ _POST_TOK15_TOOL_DESCRIPTIONS, # constant per-turn overhead turn["input"], # user / call payload _RESULT_BODIES.get(turn["kind"], ""), # synthetic result body ] return int(counter("\n".join(p for p in parts if p))) # ---------------------------------------------------------------- public API def run_total_session_cost( *, wake_depth: str = "minimal", mempalace_ref: int | None = None, claude_mem_ref: int | None = None, measure_mempalace: bool = False, measure_claude_mem: bool = False, count_tokens_fn: Callable[[str], int] | None = None, ) -> dict: """Run the fixed 10-turn script at the given wake_depth. Parameters: wake_depth: "minimal" | "standard" | "deep" — selects session-start payload size charged to turn 1. mempalace_ref / claude_mem_ref: optional manually-supplied reference totals (stored as ``refs["*_manual"]`` for audit). When no live measurement exists, a manual int is the comparator for ``passed``. measure_mempalace / measure_claude_mem: when True, invoke the live subprocess adapter and store the result as ``refs["*_measured"]``. A live measurement supersedes the manual ref as the comparator. count_tokens_fn: optional counter injection (tests use a fixed function to decouple assertions from tokeniser drift). """ counter, mode = _select_counter(count_tokens_fn) per_turn: list[int] = [] for i, turn in enumerate(_SCRIPT): t = _simulate_turn(turn, counter) if i == 0: # Turn 1 pays the session-start overhead per wake_depth. t += _session_start_overhead_tokens(wake_depth) per_turn.append(int(t)) total = int(sum(per_turn)) refs: dict[str, int] = {} passed = True # Live measurements first so we can decide whether the manual int should # be recorded under the legacy key ("mempalace") or the audit-trail key # ("mempalace_manual", used when BOTH a measurement AND a manual ref are # supplied per Test 6). mp_measured: int | None = None cm_measured: int | None = None if measure_mempalace: mp_measured = _run_mempalace_adapter(_SCRIPT, counter) if mp_measured is not None: refs["mempalace_measured"] = int(mp_measured) if measure_claude_mem: cm_measured = _run_claude_mem_adapter(_SCRIPT, counter) if cm_measured is not None: refs["claude_mem_measured"] = int(cm_measured) # Manual refs. Back-compat with when no live measurement is # present, the manual int lands under the legacy "mempalace" / "claude_mem" # key so pre-existing downstream consumers (and tests) keep working. if mempalace_ref is not None: key = "mempalace_manual" if mp_measured is not None else "mempalace" refs[key] = int(mempalace_ref) if claude_mem_ref is not None: key = "claude_mem_manual" if cm_measured is not None else "claude_mem" refs[key] = int(claude_mem_ref) # Gate logic: measured > legacy manual > audit-trail manual > no gate. mp_gate = refs.get( "mempalace_measured", refs.get("mempalace", refs.get("mempalace_manual")) ) cm_gate = refs.get( "claude_mem_measured", refs.get("claude_mem", refs.get("claude_mem_manual")) ) if mp_gate is not None and total > mp_gate: passed = False if cm_gate is not None and total > cm_gate: passed = False return { "adapter": "iai-mcp", "wake_depth": wake_depth, "total_tokens": total, "per_turn": per_turn, "mode": mode, "refs": refs, "passed": passed, "script_name": SCRIPT_NAME, } # ---------------------------------------------------------------- CLI def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( prog="bench.total_session_cost", description=( "OPS-12 / total session cost bench. Fixed 10-turn " "representative script (D5-08); measures IAI-MCP token cost " "at wake_depth minimal|standard|deep and optionally compares " "to supplied mempalace / claude-mem reference totals." ), ) parser.add_argument( "--wake-depth", choices=("minimal", "standard", "deep"), default="minimal", help="session-start payload size (default minimal per D5-02)", ) parser.add_argument( "--ref-mempalace", dest="mempalace_ref", type=int, default=None, help="mempalace reference total (tokens) for the comparative gate", ) parser.add_argument( "--ref-claude-mem", dest="claude_mem_ref", type=int, default=None, help="claude-mem reference total (tokens) for the comparative gate", ) parser.add_argument( "--measure-mempalace", action="store_true", help=( "attempt a live mempalace subprocess run to fill the " "reference column; on failure emits a bench_adapter_unavailable " "stderr event and records no measurement" ), ) parser.add_argument( "--measure-claude-mem", action="store_true", help=( "attempt a live claude-mem subprocess run; identical fallback " "shape to --measure-mempalace" ), ) args = parser.parse_args(argv) result = run_total_session_cost( wake_depth=args.wake_depth, mempalace_ref=args.mempalace_ref, claude_mem_ref=args.claude_mem_ref, measure_mempalace=args.measure_mempalace, measure_claude_mem=args.measure_claude_mem, ) print(json.dumps(result)) return 0 if result["passed"] else 1 if __name__ == "__main__": sys.exit(main())