Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
477 lines
17 KiB
Python
477 lines
17 KiB
Python
"""OPS-12 / total session cost bench.
|
|
|
|
Runs a fixed 10-turn representative script per D5-08 (see 05-CONTEXT.md)
|
|
and counts the total tokens Claude would pay for the full session with
|
|
IAI-MCP wired in. The 10 turns cover the axes the real-user workload
|
|
touches most: verbatim recall, interleaved code-edit chat (no recall),
|
|
cross-community recall, save, introspection.
|
|
|
|
JSON output (one line to stdout):
|
|
|
|
{
|
|
"adapter": "iai-mcp",
|
|
"wake_depth": "minimal"|"standard"|"deep",
|
|
"total_tokens": int,
|
|
"per_turn": [int] * 10,
|
|
"mode": "anthropic-count-tokens"|"tiktoken-cl100k-proxy"|
|
|
"heuristic-char4"|"injected",
|
|
"refs": {"mempalace": int?, "claude_mem": int?},
|
|
"passed": bool, # True iff every supplied ref >= IAI
|
|
"script_name": "D5-08-v1"
|
|
}
|
|
|
|
Exit codes:
|
|
0 if passed, 1 otherwise.
|
|
|
|
CLI:
|
|
python -m bench.total_session_cost
|
|
python -m bench.total_session_cost --wake-depth standard
|
|
python -m bench.total_session_cost --ref-mempalace 7000 --ref-claude-mem 5000
|
|
|
|
**Framing note (D5-08):** this bench is a *simulated* 10-turn script —
|
|
it reproduces the token composition (system overhead + tool descriptions
|
|
+ tool-call payloads + tool-result bodies) a real MCP runtime would emit
|
|
for the turn kinds. Real runtime adds network JSON-RPC envelope
|
|
overhead (~30-50 tok/turn); the simulation excludes that. Downstream
|
|
reports MUST disclose this caveat alongside the row.
|
|
|
|
Reference-adapter notes: per PATTERNS.md Discovery #5, bench/adapters/
|
|
mempalace_*.py and claude_mem_*.py do not exist on this machine. The
|
|
comparative gate is driven by explicit ref numbers via CLI flags so the
|
|
bench is usable without live adapters; when unknown, refs default to
|
|
None and passed=True is the degenerate answer. The published bench report
|
|
carries the honest "mempalace/claude-mem refs not measured" disclosure
|
|
for rows where a measurement was not taken.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from typing import Callable
|
|
|
|
# Reuse bench/tokens.py's 3-tier counter helpers — single source of truth
|
|
# for what "tiktoken-cl100k-proxy" and friends mean.
|
|
from bench.tokens import (
|
|
_anthropic_count_tokens,
|
|
_char4_count,
|
|
_tiktoken_count,
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------- adapters
|
|
#
|
|
# Live subprocess adapters for the reference column. Each adapter runs
|
|
# the 10-turn script through the target tool's CLI, sums the response tokens
|
|
# via the injected counter, and returns the total. On ANY failure
|
|
# (tool absent, timeout, non-zero exit, subprocess error) the adapter returns
|
|
# ``None`` and emits ``{"event": "bench_adapter_unavailable", ...}`` to
|
|
# stderr. Callers MUST treat None as "honest disclosure, no measurement"
|
|
# rather than a hard bench failure.
|
|
#
|
|
# Security note (T-05-06-04): turn text is a constant from _SCRIPT, never
|
|
# from user input, and ``subprocess.run(argv_list, shell=False)`` avoids
|
|
# any shell-injection surface. The 30s per-turn timeout bounds the DoS
|
|
# risk (T-05-06-03).
|
|
|
|
_ADAPTER_TIMEOUT_SECONDS = 30
|
|
|
|
|
|
def _log_adapter_unavailable(tool: str, reason: str) -> None:
    """Write a single-line JSON ``bench_adapter_unavailable`` event to stderr.

    Args:
        tool: Name of the reference tool that produced no measurement.
        reason: Short text describing why the measurement is missing.
    """
    payload = {
        "event": "bench_adapter_unavailable",
        "tool": tool,
        "reason": reason,
    }
    # stderr keeps the event out of the JSON result that goes to stdout.
    sys.stderr.write(json.dumps(payload) + "\n")
|
|
|
|
|
|
def _run_subprocess_adapter(
    *,
    tool_name: str,
    cli_name: str,
    argv_template: Callable[[str], list[str]],
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Run every scripted turn through an external CLI and total its output.

    Args:
        tool_name: Label used in the ``bench_adapter_unavailable`` event.
        cli_name: Executable name resolved with ``shutil.which``.
        argv_template: Builds the argv for one turn from its cue text. Its
            first element is discarded and replaced by the resolved path.
        script: Turns to replay; each must provide an ``"input"`` entry.
        counter: Token counter applied to each turn's captured stdout.

    Returns:
        The summed token count over all turns, or ``None`` when the CLI is
        not found, a call times out, the process cannot be started, or it
        exits non-zero. The first such failure abandons the whole run.
        Empty stdout is NOT a failure: it is still handed to ``counter``.
    """
    resolved = shutil.which(cli_name)
    if resolved is None:
        _log_adapter_unavailable(tool_name, "cli_not_found")
        return None

    running_total = 0
    for step in script:
        # Keep the template's arguments, but launch the resolved executable.
        tail = argv_template(step["input"])[1:]
        command = [resolved, *tail]
        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                text=True,
                check=False,
                timeout=_ADAPTER_TIMEOUT_SECONDS,
            )
        except subprocess.TimeoutExpired as err:
            _log_adapter_unavailable(tool_name, f"timeout: {err}")
            return None
        except (OSError, ValueError) as err:
            _log_adapter_unavailable(tool_name, f"subprocess_error: {err}")
            return None

        if completed.returncode != 0:
            failure = (
                f"non_zero_exit={completed.returncode} "
                f"stderr={completed.stderr[:200]!r}"
            )
            _log_adapter_unavailable(tool_name, failure)
            return None

        # A search-style CLI may legitimately print nothing for "no match";
        # that output is counted like any other instead of failing the run.
        running_total += int(counter(completed.stdout or ""))

    return running_total
|
|
|
|
|
|
def _run_mempalace_adapter(
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Measure the script against the ``mempalace`` CLI (M-07 live reference).

    Each turn's cue is sent as ``mempalace search <cue>`` through
    ``_run_subprocess_adapter``, whose result is returned unchanged: a token
    total, or ``None`` when no measurement could be taken.

    Args:
        script: Turns to replay.
        counter: Token counter applied to the CLI output.
    """

    def _search_argv(cue: str) -> list[str]:
        return ["mempalace", "search", cue]

    return _run_subprocess_adapter(
        tool_name="mempalace",
        cli_name="mempalace",
        argv_template=_search_argv,
        script=script,
        counter=counter,
    )
|
|
|
|
|
|
def _run_claude_mem_adapter(
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Measure the script against the ``claude-mem`` CLI.

    Mirrors the mempalace adapter: each cue is sent as
    ``claude-mem recall <cue>`` through ``_run_subprocess_adapter``, and that
    helper's result (token total or ``None``) is returned unchanged.

    Args:
        script: Turns to replay.
        counter: Token counter applied to the CLI output.
    """

    def _recall_argv(cue: str) -> list[str]:
        return ["claude-mem", "recall", cue]

    return _run_subprocess_adapter(
        tool_name="claude-mem",
        cli_name="claude-mem",
        argv_template=_recall_argv,
        script=script,
        counter=counter,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------- D5-08 script
|
|
#
|
|
# Fixed 10-turn representative script. Each turn has a `kind` (used to
|
|
# compose a realistic tool-result body) and an `input` (the cue text).
|
|
# Order matters: turn 1 pays session-start overhead, turn 4 exercises the
|
|
# cross-community recall path, turn 5/6 exercise save/introspect.
|
|
|
|
# Identifier reported as "script_name" in the JSON output.
SCRIPT_NAME = "D5-08-v1"

# The fixed session, written as (kind, input) pairs in play order and expanded
# into the {"kind", "input"} dict shape the rest of the module indexes.
# NOTE(review): the 4th cue reads "between and the autistic kernel" -- a term
# appears to be missing after "between"; confirm against D5-08 before changing
# it, since the text feeds the token counts.
_SCRIPT: list[dict] = [
    {"kind": kind, "input": cue}
    for kind, cue in (
        ("recall", "Tell me the decisions we made about architecture"),
        ("chat", "Let me iterate on this function; no recall needed here"),
        ("recall", "What did I say about bench discipline?"),
        (
            "recall_cross_community",
            "What is the connection between and the autistic kernel?",
        ),
        ("save", "Decision locked: use cachetools TTLCache for LRU"),
        ("introspect", "profile_get_set operation=get knob=wake_depth"),
        ("chat", "Continuing this refactor; still no recall"),
        ("recall", "Alice said something about pressplay cross-validation"),
        ("reinforce", "memory_reinforce the last 3 hits"),
        ("introspect", "events_query kind=first_turn_recall limit=5"),
    )
]
|
|
|
|
|
|
# Tool-description overhead mirrors the TOK-15 audit result
|
|
# (134 raw tok total for the 11 registered tools; see 05-03-SUMMARY.md).
|
|
# We reproduce the POST-audit text verbatim so the bench reflects the
|
|
# actual current overhead Claude sees on each turn.
|
|
# One description per line, newline-separated, with no trailing newline.
# This text is prepended to every simulated turn as constant overhead.
_POST_TOK15_TOOL_DESCRIPTIONS = (
    "Recall verbatim memories matching cue. Returns hits + anti_hits.\n"
    "Structural recall over role->filler bindings. Returns hits.\n"
    "Boost Hebbian edges among co-retrieved record ids.\n"
    "Mark a record contradicted; new fact stored as new record.\n"
    "Trigger memory consolidation.\n"
    "Read or write a profile knob (15 sealed). operation: get|set.\n"
    "List pending curiosity questions. Optional session_id filter.\n"
    "List induced schemas. Optional domain + confidence_min filters.\n"
    "Query user-visible events by kind, since, severity, limit.\n"
    "Topology snapshot: N, C, L, sigma, community_count, regime.\n"
    "Camouflaging detection status; window_size weekly points."
)
|
|
|
|
# Synthetic tool-result body per turn kind. Realistic-but-bounded; a real
|
|
# runtime varies by store content but the ratio across wake_depths is
|
|
# what this bench measures, not the absolute per-query payload.
|
|
# Synthetic tool-result text keyed by turn kind. "chat" turns call no tool, so
# their body is empty. The two recall bodies are assembled from their
# space-separated fields.
_RESULT_BODIES: dict[str, str] = {
    "recall": " ".join(
        (
            "hits=[{record_id, literal_surface, score}]",
            "anti_hits=[{record_id, reason}]",
            "activation_trace=[community_gate, spread, rank]",
            "budget_used=200",
        )
    ),
    "save": "ok=true id=<uuid>",
    "introspect": '{"value": "minimal"}',
    "reinforce": "ok=true edges_boosted=3",
    "chat": "",
    "recall_cross_community": " ".join(
        (
            "hits=[{record_id, literal_surface, score, community_id}]",
            "anti_hits=[]",
            "activation_trace=[cross_community_spread]",
            "budget_used=350",
        )
    ),
}
|
|
|
|
|
|
# ---------------------------------------------------------------- counter select
|
|
|
|
def _select_counter(
    count_tokens_fn: Callable[[str], int] | None = None,
) -> tuple[Callable[[str], int], str]:
    """Choose the token counter and the ``mode`` label reported with it.

    The first matching rule wins:
        1. ``count_tokens_fn`` was supplied          -> "injected"
        2. ``ANTHROPIC_API_KEY`` is set and non-empty -> "anthropic-count-tokens"
        3. ``tiktoken`` can be imported              -> "tiktoken-cl100k-proxy"
        4. otherwise                                 -> "heuristic-char4"

    Args:
        count_tokens_fn: Optional counter to use as-is.

    Returns:
        A ``(counter, mode)`` pair.
    """
    if count_tokens_fn is not None:
        return count_tokens_fn, "injected"

    if os.getenv("ANTHROPIC_API_KEY"):
        return _anthropic_count_tokens, "anthropic-count-tokens"

    # The import is only an availability probe; the module itself is unused.
    try:
        import tiktoken  # noqa: F401
    except ImportError:
        return _char4_count, "heuristic-char4"
    else:
        return _tiktoken_count, "tiktoken-cl100k-proxy"
|
|
|
|
|
|
# ---------------------------------------------------------------- per-turn cost
|
|
|
|
def _session_start_overhead_tokens(wake_depth: str) -> int:
    """Return the session-start token cost that is added to the first turn.

    Per the 05-03-SUMMARY.md figures quoted in this file:
        - "minimal"  -> 24   (lazy pointers only)
        - "standard" -> 1388 (eager Phase-1 L0+L1+L2+rich_club)
        - "deep"     -> 2000 (approximate; rich_club budget lifted per D5-02)

    Any value other than "minimal" or "standard" is charged the deep cost.

    Args:
        wake_depth: Wake-depth mode name.
    """
    shallow_costs = {"minimal": 24, "standard": 1388}
    deep_cost = 2000
    return shallow_costs.get(wake_depth, deep_cost)
|
|
|
|
|
|
def _simulate_turn(
    turn: dict,
    counter: Callable[[str], int],
) -> int:
    """Count the tokens of the text composed for one scripted turn.

    The text is the tool descriptions, the turn's ``"input"`` and the
    synthetic result body for its ``"kind"`` (empty for unknown kinds),
    joined by newlines with empty pieces left out.

    Args:
        turn: Mapping with ``"kind"`` and ``"input"`` entries.
        counter: Token counter applied to the composed text.

    Returns:
        The counter's result coerced to ``int``.
    """
    segments = [
        _POST_TOK15_TOOL_DESCRIPTIONS,          # same overhead on every turn
        turn["input"],                          # user / call payload
        _RESULT_BODIES.get(turn["kind"], ""),   # synthetic tool result
    ]
    # filter(None, ...) drops empty segments so they add no stray newline.
    rendered = "\n".join(filter(None, segments))
    return int(counter(rendered))
|
|
|
|
|
|
# ---------------------------------------------------------------- public API
|
|
|
|
def run_total_session_cost(
    *,
    wake_depth: str = "minimal",
    mempalace_ref: int | None = None,
    claude_mem_ref: int | None = None,
    measure_mempalace: bool = False,
    measure_claude_mem: bool = False,
    count_tokens_fn: Callable[[str], int] | None = None,
) -> dict:
    """Cost the fixed script at ``wake_depth`` and gate it against references.

    Args:
        wake_depth: "minimal" | "standard" | "deep"; selects the
            session-start overhead added to the first turn.
        mempalace_ref / claude_mem_ref: Optional manually supplied reference
            totals. Without a live measurement for the tool the value is
            stored as ``refs["mempalace"]`` / ``refs["claude_mem"]``; with
            one it is stored as ``refs["*_manual"]`` for audit only.
        measure_mempalace / measure_claude_mem: When true, run the live
            subprocess adapter. A successful measurement is stored as
            ``refs["*_measured"]`` and becomes that tool's comparator.
        count_tokens_fn: Optional counter injection (used by tests to
            decouple assertions from tokeniser drift).

    Returns:
        A dict with keys ``adapter``, ``wake_depth``, ``total_tokens``,
        ``per_turn``, ``mode``, ``refs``, ``passed`` and ``script_name``.
        ``passed`` is false when the total exceeds any available comparator
        and true otherwise, including when there is no comparator at all.
    """
    counter, mode = _select_counter(count_tokens_fn)

    # Every turn is costed the same way; only the first one additionally
    # carries the session-start payload.
    per_turn: list[int] = [_simulate_turn(turn, counter) for turn in _SCRIPT]
    if per_turn:
        per_turn[0] += _session_start_overhead_tokens(wake_depth)
    total = int(sum(per_turn))

    refs: dict[str, int] = {}

    # Live measurements run first (mempalace, then claude-mem) because their
    # outcome decides which key a manual reference is filed under.
    live_mempalace = (
        _run_mempalace_adapter(_SCRIPT, counter) if measure_mempalace else None
    )
    if live_mempalace is not None:
        refs["mempalace_measured"] = int(live_mempalace)

    live_claude_mem = (
        _run_claude_mem_adapter(_SCRIPT, counter) if measure_claude_mem else None
    )
    if live_claude_mem is not None:
        refs["claude_mem_measured"] = int(live_claude_mem)

    # Manual references keep the legacy key unless a live number exists for
    # the same tool, in which case they move to the "*_manual" audit key.
    if mempalace_ref is not None:
        manual_key = "mempalace" if live_mempalace is None else "mempalace_manual"
        refs[manual_key] = int(mempalace_ref)
    if claude_mem_ref is not None:
        manual_key = "claude_mem" if live_claude_mem is None else "claude_mem_manual"
        refs[manual_key] = int(claude_mem_ref)

    def _comparator(prefix: str) -> int | None:
        # Precedence: measured > legacy manual > audit-trail manual > none.
        for key in (f"{prefix}_measured", prefix, f"{prefix}_manual"):
            if key in refs:
                return refs[key]
        return None

    comparators = (_comparator("mempalace"), _comparator("claude_mem"))
    passed = all(limit is None or total <= limit for limit in comparators)

    return {
        "adapter": "iai-mcp",
        "wake_depth": wake_depth,
        "total_tokens": total,
        "per_turn": per_turn,
        "mode": mode,
        "refs": refs,
        "passed": passed,
        "script_name": SCRIPT_NAME,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------- CLI
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse flags, run the bench, print the result.

    Args:
        argv: Arguments to parse; ``None`` lets argparse use the process
            command line.

    Returns:
        0 when the result's ``passed`` flag is true, 1 otherwise. The result
        is printed to stdout as a single JSON line.
    """
    summary = (
        "OPS-12 / total session cost bench. Fixed 10-turn "
        "representative script (D5-08); measures IAI-MCP token cost "
        "at wake_depth minimal|standard|deep and optionally compares "
        "to supplied mempalace / claude-mem reference totals."
    )
    cli = argparse.ArgumentParser(
        prog="bench.total_session_cost",
        description=summary,
    )

    cli.add_argument(
        "--wake-depth",
        default="minimal",
        choices=("minimal", "standard", "deep"),
        help="session-start payload size (default minimal per D5-02)",
    )

    # Manually supplied reference totals for the comparative gate.
    cli.add_argument(
        "--ref-mempalace",
        type=int,
        default=None,
        dest="mempalace_ref",
        help="mempalace reference total (tokens) for the comparative gate",
    )
    cli.add_argument(
        "--ref-claude-mem",
        type=int,
        default=None,
        dest="claude_mem_ref",
        help="claude-mem reference total (tokens) for the comparative gate",
    )

    # Opt-in live measurements through the subprocess adapters.
    cli.add_argument(
        "--measure-mempalace",
        action="store_true",
        help=(
            "attempt a live mempalace subprocess run to fill the "
            "reference column; on failure emits a bench_adapter_unavailable "
            "stderr event and records no measurement"
        ),
    )
    cli.add_argument(
        "--measure-claude-mem",
        action="store_true",
        help=(
            "attempt a live claude-mem subprocess run; identical fallback "
            "shape to --measure-mempalace"
        ),
    )

    options = cli.parse_args(argv)

    # Each option's dest is the name of a run_total_session_cost keyword, so
    # the parsed namespace can be unpacked directly.
    outcome = run_total_session_cost(**vars(options))
    print(json.dumps(outcome))

    if outcome["passed"]:
        return 0
    return 1
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|