Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
477
bench/total_session_cost.py
Normal file
477
bench/total_session_cost.py
Normal file
|
|
@ -0,0 +1,477 @@
|
|||
"""OPS-12 / total session cost bench.
|
||||
|
||||
Runs a fixed 10-turn representative script per D5-08 (see 05-CONTEXT.md)
|
||||
and counts the total tokens Claude would pay for the full session with
|
||||
IAI-MCP wired in. The 10 turns cover the axes the real-user workload
|
||||
touches most: verbatim recall, interleaved code-edit chat (no recall),
|
||||
cross-community recall, save, introspection.
|
||||
|
||||
JSON output (one line to stdout):
|
||||
|
||||
{
|
||||
"adapter": "iai-mcp",
|
||||
"wake_depth": "minimal"|"standard"|"deep",
|
||||
"total_tokens": int,
|
||||
"per_turn": [int] * 10,
|
||||
"mode": "anthropic-count-tokens"|"tiktoken-cl100k-proxy"|
|
||||
"heuristic-char4"|"injected",
|
||||
"refs": {"mempalace": int?, "claude_mem": int?},
|
||||
"passed": bool, # True iff every supplied ref >= IAI
|
||||
"script_name": "D5-08-v1"
|
||||
}
|
||||
|
||||
Exit codes:
|
||||
0 if passed, 1 otherwise.
|
||||
|
||||
CLI:
|
||||
python -m bench.total_session_cost
|
||||
python -m bench.total_session_cost --wake-depth standard
|
||||
python -m bench.total_session_cost --ref-mempalace 7000 --ref-claude-mem 5000
|
||||
|
||||
**Framing note (D5-08):** this bench is a *simulated* 10-turn script —
|
||||
it reproduces the token composition (system overhead + tool descriptions
|
||||
+ tool-call payloads + tool-result bodies) a real MCP runtime would emit
|
||||
for the turn kinds. Real runtime adds network JSON-RPC envelope
|
||||
overhead (~30-50 tok/turn); the simulation excludes that. Downstream
|
||||
reports MUST disclose this caveat alongside the row.
|
||||
|
||||
Reference-adapter notes: per PATTERNS.md Discovery #5, bench/adapters/
|
||||
mempalace_*.py and claude_mem_*.py do not exist on this machine. The
|
||||
comparative gate is driven by explicit ref numbers via CLI flags so the
|
||||
bench is usable without live adapters; when unknown, refs default to
|
||||
None and passed=True is the degenerate answer. the published bench report
|
||||
carries the honest "mempalace/claude-mem refs not measured" disclosure
|
||||
for rows where a measurement was not taken.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Callable
|
||||
|
||||
# Reuse bench/tokens.py's 3-tier counter helpers — single source of truth
|
||||
# for what "tiktoken-cl100k-proxy" and friends mean.
|
||||
from bench.tokens import (
|
||||
_anthropic_count_tokens,
|
||||
_char4_count,
|
||||
_tiktoken_count,
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------- adapters
|
||||
#
|
||||
# Live subprocess adapters for the reference column. Each adapter runs
|
||||
# the 10-turn script through the target tool's CLI, sums the response tokens
|
||||
# via the injected counter, and returns the total. On ANY failure
|
||||
# (tool absent, timeout, non-zero exit, empty stdout) the adapter returns
|
||||
# ``None`` and emits ``{"event": "bench_adapter_unavailable", ...}`` to
|
||||
# stderr. Callers MUST treat None as "honest disclosure, no measurement"
|
||||
# rather than a hard bench failure.
|
||||
#
|
||||
# Security note (T-05-06-04): turn text is a constant from _SCRIPT, never
|
||||
# from user input, and ``subprocess.run(argv_list, shell=False)`` avoids
|
||||
# any shell-injection surface. The 30s per-turn timeout bounds the DoS
|
||||
# risk (T-05-06-03).
|
||||
|
||||
_ADAPTER_TIMEOUT_SECONDS = 30
|
||||
|
||||
|
||||
def _log_adapter_unavailable(tool: str, reason: str) -> None:
|
||||
line = json.dumps({
|
||||
"event": "bench_adapter_unavailable",
|
||||
"tool": tool,
|
||||
"reason": reason,
|
||||
})
|
||||
print(line, file=sys.stderr)
|
||||
|
||||
|
||||
def _run_subprocess_adapter(
|
||||
*,
|
||||
tool_name: str,
|
||||
cli_name: str,
|
||||
argv_template: Callable[[str], list[str]],
|
||||
script: list[dict],
|
||||
counter: Callable[[str], int],
|
||||
) -> int | None:
|
||||
"""Shared helper: locate ``cli_name`` via ``shutil.which``; for each turn
|
||||
run its argv (provided by ``argv_template(turn_input)``) with a bounded
|
||||
timeout; sum stdout token counts across all turns. Return ``None`` on
|
||||
any failure (absent / timeout / non-zero / empty stdout)."""
|
||||
exe = shutil.which(cli_name)
|
||||
if exe is None:
|
||||
_log_adapter_unavailable(tool_name, "cli_not_found")
|
||||
return None
|
||||
|
||||
total = 0
|
||||
for turn in script:
|
||||
argv = [exe, *argv_template(turn["input"])[1:]]
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
argv,
|
||||
timeout=_ADAPTER_TIMEOUT_SECONDS,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
_log_adapter_unavailable(tool_name, f"timeout: {exc}")
|
||||
return None
|
||||
except (OSError, ValueError) as exc:
|
||||
_log_adapter_unavailable(tool_name, f"subprocess_error: {exc}")
|
||||
return None
|
||||
|
||||
if proc.returncode != 0:
|
||||
_log_adapter_unavailable(
|
||||
tool_name,
|
||||
f"non_zero_exit={proc.returncode} stderr={proc.stderr[:200]!r}",
|
||||
)
|
||||
return None
|
||||
|
||||
stdout = proc.stdout or ""
|
||||
# Empty stdout is a legitimate "no match" response for search-style
|
||||
# CLIs; we DO count it (0 tokens) rather than treating as failure,
|
||||
# so adapters run against a pristine palace still publish a number.
|
||||
total += int(counter(stdout))
|
||||
|
||||
return total
|
||||
|
||||
|
||||
def _run_mempalace_adapter(
|
||||
script: list[dict],
|
||||
counter: Callable[[str], int],
|
||||
) -> int | None:
|
||||
"""M-07 live reference: run each turn through ``mempalace search`` and
|
||||
sum the stdout token counts. Returns ``None`` when mempalace is absent
|
||||
or any subprocess call fails. Honest-disclosure contract per Plan 05-06.
|
||||
"""
|
||||
return _run_subprocess_adapter(
|
||||
tool_name="mempalace",
|
||||
cli_name="mempalace",
|
||||
argv_template=lambda text: ["mempalace", "search", text],
|
||||
script=script,
|
||||
counter=counter,
|
||||
)
|
||||
|
||||
|
||||
def _run_claude_mem_adapter(
|
||||
script: list[dict],
|
||||
counter: Callable[[str], int],
|
||||
) -> int | None:
|
||||
"""Forward-compat mirror of the mempalace adapter. On machines where
|
||||
``claude-mem`` is not installed this returns ``None`` + stderr event;
|
||||
when it IS installed (future pressplay cross-validation run) the same
|
||||
code path measures it without another plan iteration."""
|
||||
return _run_subprocess_adapter(
|
||||
tool_name="claude-mem",
|
||||
cli_name="claude-mem",
|
||||
argv_template=lambda text: ["claude-mem", "recall", text],
|
||||
script=script,
|
||||
counter=counter,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- D5-08 script
|
||||
#
|
||||
# Fixed 10-turn representative script. Each turn has a `kind` (used to
|
||||
# compose a realistic tool-result body) and an `input` (the cue text).
|
||||
# Order matters: turn 1 pays session-start overhead, turn 4 exercises the
|
||||
# cross-community recall path, turn 5/6 exercise save/introspect.
|
||||
|
||||
SCRIPT_NAME = "D5-08-v1"
|
||||
|
||||
_SCRIPT: list[dict] = [
|
||||
{
|
||||
"kind": "recall",
|
||||
"input": "Tell me the decisions we made about architecture",
|
||||
},
|
||||
{
|
||||
"kind": "chat",
|
||||
"input": "Let me iterate on this function; no recall needed here",
|
||||
},
|
||||
{
|
||||
"kind": "recall",
|
||||
"input": "What did I say about bench discipline?",
|
||||
},
|
||||
{
|
||||
"kind": "recall_cross_community",
|
||||
"input": "What is the connection between and the autistic kernel?",
|
||||
},
|
||||
{
|
||||
"kind": "save",
|
||||
"input": "Decision locked: use cachetools TTLCache for LRU",
|
||||
},
|
||||
{
|
||||
"kind": "introspect",
|
||||
"input": "profile_get_set operation=get knob=wake_depth",
|
||||
},
|
||||
{
|
||||
"kind": "chat",
|
||||
"input": "Continuing this refactor; still no recall",
|
||||
},
|
||||
{
|
||||
"kind": "recall",
|
||||
"input": "Alice said something about pressplay cross-validation",
|
||||
},
|
||||
{
|
||||
"kind": "reinforce",
|
||||
"input": "memory_reinforce the last 3 hits",
|
||||
},
|
||||
{
|
||||
"kind": "introspect",
|
||||
"input": "events_query kind=first_turn_recall limit=5",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# Tool-description overhead mirrors the TOK-15 audit result
|
||||
# (134 raw tok total for the 11 registered tools; see 05-03-SUMMARY.md).
|
||||
# We reproduce the POST-audit text verbatim so the bench reflects the
|
||||
# actual current overhead Claude sees on each turn.
|
||||
_POST_TOK15_TOOL_DESCRIPTIONS = "\n".join([
|
||||
"Recall verbatim memories matching cue. Returns hits + anti_hits.",
|
||||
"Structural recall over role->filler bindings. Returns hits.",
|
||||
"Boost Hebbian edges among co-retrieved record ids.",
|
||||
"Mark a record contradicted; new fact stored as new record.",
|
||||
"Trigger memory consolidation.",
|
||||
"Read or write a profile knob (15 sealed). operation: get|set.",
|
||||
"List pending curiosity questions. Optional session_id filter.",
|
||||
"List induced schemas. Optional domain + confidence_min filters.",
|
||||
"Query user-visible events by kind, since, severity, limit.",
|
||||
"Topology snapshot: N, C, L, sigma, community_count, regime.",
|
||||
"Camouflaging detection status; window_size weekly points.",
|
||||
])
|
||||
|
||||
# Synthetic tool-result body per turn kind. Realistic-but-bounded; a real
|
||||
# runtime varies by store content but the ratio across wake_depths is
|
||||
# what measures, not the absolute per-query payload.
|
||||
_RESULT_BODIES: dict[str, str] = {
|
||||
"recall": (
|
||||
"hits=[{record_id, literal_surface, score}] "
|
||||
"anti_hits=[{record_id, reason}] "
|
||||
"activation_trace=[community_gate, spread, rank] "
|
||||
"budget_used=200"
|
||||
),
|
||||
"save": "ok=true id=<uuid>",
|
||||
"introspect": '{"value": "minimal"}',
|
||||
"reinforce": "ok=true edges_boosted=3",
|
||||
"chat": "",
|
||||
"recall_cross_community": (
|
||||
"hits=[{record_id, literal_surface, score, community_id}] "
|
||||
"anti_hits=[] activation_trace=[cross_community_spread] "
|
||||
"budget_used=350"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- counter select
|
||||
|
||||
def _select_counter(
|
||||
count_tokens_fn: Callable[[str], int] | None = None,
|
||||
) -> tuple[Callable[[str], int], str]:
|
||||
"""3-tier counter fallback mirroring bench/tokens.py:165-182.
|
||||
|
||||
Priority:
|
||||
1. explicit injection (`count_tokens_fn` kwarg, tests)
|
||||
2. Anthropic count_tokens API (`ANTHROPIC_API_KEY` env var)
|
||||
3. tiktoken cl100k_base (offline proxy)
|
||||
4. char/4 heuristic (last resort)
|
||||
"""
|
||||
if count_tokens_fn is not None:
|
||||
return count_tokens_fn, "injected"
|
||||
if os.environ.get("ANTHROPIC_API_KEY"):
|
||||
return _anthropic_count_tokens, "anthropic-count-tokens"
|
||||
try:
|
||||
import tiktoken # noqa: F401
|
||||
return _tiktoken_count, "tiktoken-cl100k-proxy"
|
||||
except ImportError:
|
||||
return _char4_count, "heuristic-char4"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- per-turn cost
|
||||
|
||||
def _session_start_overhead_tokens(wake_depth: str) -> int:
|
||||
"""Session-start payload size charged to turn 1 per wake_depth mode.
|
||||
|
||||
Numbers sourced from measurements (05-03-SUMMARY.md table):
|
||||
- minimal : 24 tok (lazy pointers only)
|
||||
- standard : 1388 tok (eager Phase-1 L0+L1+L2+rich_club)
|
||||
- deep : ~2000 tok (rich_club budget lifted per D5-02)
|
||||
|
||||
Rounded to the cache metric exactly so the numbers are
|
||||
consistent with M-01's reported warm session-start row.
|
||||
"""
|
||||
if wake_depth == "minimal":
|
||||
return 24
|
||||
if wake_depth == "standard":
|
||||
return 1388
|
||||
return 2000 # deep
|
||||
|
||||
|
||||
def _simulate_turn(
|
||||
turn: dict,
|
||||
counter: Callable[[str], int],
|
||||
) -> int:
|
||||
"""Compose the per-turn text that Claude sees and count its tokens."""
|
||||
parts: list[str] = [
|
||||
_POST_TOK15_TOOL_DESCRIPTIONS, # constant per-turn overhead
|
||||
turn["input"], # user / call payload
|
||||
_RESULT_BODIES.get(turn["kind"], ""), # synthetic result body
|
||||
]
|
||||
return int(counter("\n".join(p for p in parts if p)))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- public API
|
||||
|
||||
def run_total_session_cost(
|
||||
*,
|
||||
wake_depth: str = "minimal",
|
||||
mempalace_ref: int | None = None,
|
||||
claude_mem_ref: int | None = None,
|
||||
measure_mempalace: bool = False,
|
||||
measure_claude_mem: bool = False,
|
||||
count_tokens_fn: Callable[[str], int] | None = None,
|
||||
) -> dict:
|
||||
"""Run the fixed 10-turn script at the given wake_depth.
|
||||
|
||||
Parameters:
|
||||
wake_depth: "minimal" | "standard" | "deep" — selects session-start
|
||||
payload size charged to turn 1.
|
||||
mempalace_ref / claude_mem_ref: optional manually-supplied reference
|
||||
totals (stored as ``refs["*_manual"]`` for audit). When no live
|
||||
measurement exists, a manual int is the comparator for ``passed``.
|
||||
measure_mempalace / measure_claude_mem: when True, invoke the live
|
||||
subprocess adapter and store the result as ``refs["*_measured"]``.
|
||||
A live measurement supersedes the manual ref as the comparator.
|
||||
count_tokens_fn: optional counter injection (tests use a fixed
|
||||
function to decouple assertions from tokeniser drift).
|
||||
"""
|
||||
counter, mode = _select_counter(count_tokens_fn)
|
||||
|
||||
per_turn: list[int] = []
|
||||
for i, turn in enumerate(_SCRIPT):
|
||||
t = _simulate_turn(turn, counter)
|
||||
if i == 0:
|
||||
# Turn 1 pays the session-start overhead per wake_depth.
|
||||
t += _session_start_overhead_tokens(wake_depth)
|
||||
per_turn.append(int(t))
|
||||
|
||||
total = int(sum(per_turn))
|
||||
|
||||
refs: dict[str, int] = {}
|
||||
passed = True
|
||||
|
||||
# Live measurements first so we can decide whether the manual int should
|
||||
# be recorded under the legacy key ("mempalace") or the audit-trail key
|
||||
# ("mempalace_manual", used when BOTH a measurement AND a manual ref are
|
||||
# supplied per Test 6).
|
||||
mp_measured: int | None = None
|
||||
cm_measured: int | None = None
|
||||
if measure_mempalace:
|
||||
mp_measured = _run_mempalace_adapter(_SCRIPT, counter)
|
||||
if mp_measured is not None:
|
||||
refs["mempalace_measured"] = int(mp_measured)
|
||||
if measure_claude_mem:
|
||||
cm_measured = _run_claude_mem_adapter(_SCRIPT, counter)
|
||||
if cm_measured is not None:
|
||||
refs["claude_mem_measured"] = int(cm_measured)
|
||||
|
||||
# Manual refs. Back-compat with when no live measurement is
|
||||
# present, the manual int lands under the legacy "mempalace" / "claude_mem"
|
||||
# key so pre-existing downstream consumers (and tests) keep working.
|
||||
if mempalace_ref is not None:
|
||||
key = "mempalace_manual" if mp_measured is not None else "mempalace"
|
||||
refs[key] = int(mempalace_ref)
|
||||
if claude_mem_ref is not None:
|
||||
key = "claude_mem_manual" if cm_measured is not None else "claude_mem"
|
||||
refs[key] = int(claude_mem_ref)
|
||||
|
||||
# Gate logic: measured > legacy manual > audit-trail manual > no gate.
|
||||
mp_gate = refs.get(
|
||||
"mempalace_measured", refs.get("mempalace", refs.get("mempalace_manual"))
|
||||
)
|
||||
cm_gate = refs.get(
|
||||
"claude_mem_measured", refs.get("claude_mem", refs.get("claude_mem_manual"))
|
||||
)
|
||||
if mp_gate is not None and total > mp_gate:
|
||||
passed = False
|
||||
if cm_gate is not None and total > cm_gate:
|
||||
passed = False
|
||||
|
||||
return {
|
||||
"adapter": "iai-mcp",
|
||||
"wake_depth": wake_depth,
|
||||
"total_tokens": total,
|
||||
"per_turn": per_turn,
|
||||
"mode": mode,
|
||||
"refs": refs,
|
||||
"passed": passed,
|
||||
"script_name": SCRIPT_NAME,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- CLI
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="bench.total_session_cost",
|
||||
description=(
|
||||
"OPS-12 / total session cost bench. Fixed 10-turn "
|
||||
"representative script (D5-08); measures IAI-MCP token cost "
|
||||
"at wake_depth minimal|standard|deep and optionally compares "
|
||||
"to supplied mempalace / claude-mem reference totals."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--wake-depth",
|
||||
choices=("minimal", "standard", "deep"),
|
||||
default="minimal",
|
||||
help="session-start payload size (default minimal per D5-02)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ref-mempalace",
|
||||
dest="mempalace_ref",
|
||||
type=int, default=None,
|
||||
help="mempalace reference total (tokens) for the comparative gate",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ref-claude-mem",
|
||||
dest="claude_mem_ref",
|
||||
type=int, default=None,
|
||||
help="claude-mem reference total (tokens) for the comparative gate",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--measure-mempalace",
|
||||
action="store_true",
|
||||
help=(
|
||||
"attempt a live mempalace subprocess run to fill the "
|
||||
"reference column; on failure emits a bench_adapter_unavailable "
|
||||
"stderr event and records no measurement"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--measure-claude-mem",
|
||||
action="store_true",
|
||||
help=(
|
||||
"attempt a live claude-mem subprocess run; identical fallback "
|
||||
"shape to --measure-mempalace"
|
||||
),
|
||||
)
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
result = run_total_session_cost(
|
||||
wake_depth=args.wake_depth,
|
||||
mempalace_ref=args.mempalace_ref,
|
||||
claude_mem_ref=args.claude_mem_ref,
|
||||
measure_mempalace=args.measure_mempalace,
|
||||
measure_claude_mem=args.measure_claude_mem,
|
||||
)
|
||||
print(json.dumps(result))
|
||||
return 0 if result["passed"] else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue