Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/bench/tokens.py
+++ b/bench/tokens.py
@ -0,0 +1,249 @@
+"""bench/tokens.py -- session-start token benchmark harness.
+
+Measures session-start token budget three ways, preferring the most accurate
+source available at runtime:
+
+1. Anthropic `count_tokens` API (best). Used when ANTHROPIC_API_KEY is set.
+   Gives an honest billable-token count that includes Anthropic-side overhead
+   and exact tokeniser output. Model: claude-sonnet-4-5. This is the only mode
+   whose numbers are safe to publish (PROJECT.md: "honest mode-by-mode
+   benchmarks, not headline numbers").
+
+2. tiktoken cl100k_base fallback. OpenAI's tokeniser shipped with the tiktoken
+   package -- runs fully offline, no network, no key. It under-counts Claude by
+   ~5-10% on English and over-counts by ~10-15% on Cyrillic (GPT-4 tokeniser
+   packs multibyte differently). Acceptable for local dev and CI; the JSON
+   output always records mode so downstream dashboards can reject non-API
+   numbers from public charts.
+
+3. char/4 heuristic. Used only when both 1 and 2 are unavailable (e.g. minimal
+   CI image without tiktoken installed). Very rough; adequate only for sanity
+   checks on the order of magnitude.
+
+Thresholds:
+- steady warm-cache: <= STEADY_LIMIT (3000 tokens) on every warm run
+- first fresh session: <= FRESH_LIMIT (8000 tokens)
+
+Exit codes:
+- 0: both steady_ok and fresh_ok
+- 1: at least one failed
+
+JSON output format (one line to stdout):
+    {"fresh": int, "warm": [int, ...], "steady_ok": bool, "fresh_ok": bool,
+     "mode": "anthropic-count-tokens" | "tiktoken-cl100k-proxy" |
+             "heuristic-char4" | "injected",
+     "limits": {"steady": 3000, "fresh": 8000},
+     "payload_cached_tokens": int, "payload_dynamic_tokens": int}
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+from typing import Callable
+
+from iai_mcp.retrieve import build_runtime_graph
+from iai_mcp.session import SessionStartPayload, assemble_session_start
+from iai_mcp.store import MemoryStore
+
+# Token budget ceilings enforced by the benchmark.
+STEADY_LIMIT = 3_000  # ceiling for every warm-cache (steady-state) run
+FRESH_LIMIT = 8_000   # ceiling for the first fresh session (cache-populate premium)
+
+
+def _anthropic_count_tokens(text: str) -> int:
+    """Count the tokens of ``text`` via Anthropic's ``count_tokens`` endpoint.
+
+    ``text`` is submitted as a single user message for the
+    ``claude-sonnet-4-5`` model and the ``input_tokens`` figure of the
+    response is returned as an ``int``.
+
+    The ``anthropic`` package is imported on first use so that importing this
+    module does not require it. Nothing is caught here: a missing package,
+    missing credentials or a failed request all propagate to the caller.
+    """
+    import anthropic
+
+    request = {
+        "model": "claude-sonnet-4-5",
+        "messages": [{"role": "user", "content": text}],
+    }
+    response = anthropic.Anthropic().messages.count_tokens(**request)
+    return int(response.input_tokens)
+
+
+def _tiktoken_count(text: str) -> int:
+    """Count tokens offline with tiktoken's ``cl100k_base`` encoding.
+
+    This is a proxy for Claude's tokeniser, not an exact match.
+
+    Args:
+        text: the prompt string to tokenise.
+
+    Returns:
+        Number of ``cl100k_base`` tokens in ``text``.
+
+    Raises:
+        ImportError: if tiktoken is not installed -- the caller falls through
+            to the char/4 heuristic in that case.
+    """
+    import tiktoken
+
+    encoder = tiktoken.get_encoding("cl100k_base")
+    # By default encode() raises ValueError when the input contains the
+    # literal spelling of a special token (e.g. "<|endoftext|>"). The text
+    # being measured is arbitrary, so disable that check and tokenise such
+    # substrings as ordinary text instead of aborting the benchmark.
+    return len(encoder.encode(text, disallowed_special=()))
+
+
+def _char4_count(text: str) -> int:
+    """Estimate tokens as one per four characters, never less than one.
+
+    Last-resort fallback: tolerable for English prose, poor for CJK text.
+    """
+    estimate = len(text) // 4
+    if estimate < 1:
+        return 1
+    return estimate
+
+
+def _payload_to_prompt(payload: SessionStartPayload) -> str:
+    """Render a session-start payload as the single prompt string to count.
+
+    Intended to mirror the shape produced by the TypeScript wrapper's
+    buildCachedSystemPrompt so that the counted text matches what is sent.
+
+    Sections are emitted in a fixed order and joined by blank lines:
+
+    * ``# L0 identity`` and ``# L1 critical facts`` when those fields are
+      non-empty;
+    * one ``# L2 community`` section for every entry of ``payload.l2``;
+    * ``# Global rich-club`` when ``payload.rich_club`` is non-empty;
+    * a wake-depth dependent tail. At ``wake_depth == "minimal"`` the tail is
+      the compact handle alone (the three legacy pointer fields are not part
+      of the wire). At any other depth the tail is the non-empty
+      ``identity_pointer`` / ``brain_handle`` / ``topic_cluster_hint`` values
+      joined by single spaces; the compact handle is deliberately left out
+      there so it does not inflate the standard baseline.
+
+    The tail attributes are read with ``getattr`` defaults (``""``, and
+    ``"minimal"`` for ``wake_depth``), so a payload lacking them still
+    flattens. Returns ``""`` when no section applies.
+    """
+    sections: list[str] = [
+        f"{heading}\n{body}"
+        for heading, body in (
+            ("# L0 identity", payload.l0),
+            ("# L1 critical facts", payload.l1),
+        )
+        if body
+    ]
+    # L2 segments are emitted unconditionally, one section per entry.
+    sections.extend(f"# L2 community\n{segment}" for segment in payload.l2)
+    rich_club = payload.rich_club
+    if rich_club:
+        sections.append(f"# Global rich-club\n{rich_club}")
+
+    depth = getattr(payload, "wake_depth", "minimal")
+    if depth == "minimal":
+        tail = getattr(payload, "compact_handle", "")
+    else:
+        pointer_fields = ("identity_pointer", "brain_handle", "topic_cluster_hint")
+        pointers = (getattr(payload, field, "") for field in pointer_fields)
+        tail = " ".join(filter(None, pointers))
+    if tail:
+        sections.append(tail)
+
+    return "\n\n".join(sections)
+
+
+def _fresh_prompt(payload: SessionStartPayload) -> str:
+    """Build the prompt used to measure the first fresh-session request.
+
+    The first request of a fresh session pays the cache-populate premium, so
+    it is simulated as the flattened payload (the cached prefix) followed by
+    a block of dynamic tail content; a single counter call then measures
+    both parts together.
+
+    The tail is the phrase ``"dynamic tail content "`` repeated 125 times:
+    2,625 characters, i.e. roughly 650 tokens by the char/4 heuristic.
+    When the payload flattens to an empty string the tail alone is returned.
+    """
+    cached_prefix = _payload_to_prompt(payload)
+    dynamic_tail = "dynamic tail content " * 125
+    if not cached_prefix:
+        return dynamic_tail
+    return f"{cached_prefix}\n\n{dynamic_tail}"
+
+
+def _select_counter(
+    count_tokens_fn: Callable[[str], int] | None,
+) -> tuple[Callable[[str], int], str]:
+    """Pick the token counter and the ``mode`` label recorded in the output."""
+    if count_tokens_fn is not None:
+        return count_tokens_fn, "injected"
+
+    if os.environ.get("ANTHROPIC_API_KEY"):
+        try:
+            import anthropic  # noqa: F401
+        except ImportError:
+            # A key without the SDK cannot use the API; fall through to the
+            # offline counters below. The reported mode makes the downgrade
+            # visible to whoever consumes the JSON.
+            pass
+        else:
+            return _anthropic_count_tokens, "anthropic-count-tokens"
+
+    # Prefer tiktoken over char/4 -- it actually tokenises the text. Its
+    # known error margins against Claude are listed in the module docstring.
+    try:
+        import tiktoken  # noqa: F401
+    except ImportError:
+        return _char4_count, "heuristic-char4"
+    return _tiktoken_count, "tiktoken-cl100k-proxy"
+
+
+def run_token_bench(
+    store: MemoryStore | None = None,
+    n_runs: int = 3,
+    count_tokens_fn: Callable[[str], int] | None = None,
+    wake_depth: str = "minimal",
+) -> dict:
+    """Run the token benchmark.
+
+    Parameters:
+        store: optional MemoryStore override (tests pass an isolated tmp_path
+                store). A default ``MemoryStore()`` is opened when omitted.
+        n_runs: how many warm-cache repeats to measure (OPS-01 steady-state
+                needs at least 3 consecutive samples). Must be >= 1.
+        count_tokens_fn: optional token-counter injection (test-only); when
+                given it overrides the Anthropic API, tiktoken and the char/4
+                heuristic, and the mode is reported as ``injected``.
+        wake_depth: TOK-11 — selects session-start payload mode.
+                Default ``minimal`` measures the lazy <=30-tok handle; pass
+                ``standard`` for the Phase-1 eager dump baseline; ``deep`` for
+                the ≤2000-tok expanded rich_club.
+
+    Returns:
+        A dict with the keys described in the module docstring, plus
+        ``payload_cached_tokens`` and ``payload_dynamic_tokens`` copied from
+        the payload's ``total_cached_tokens`` / ``total_dynamic_tokens``.
+
+    Raises:
+        ValueError: if ``n_runs`` is less than 1. With no warm samples
+            ``steady_ok`` would be vacuously true and the gate would pass
+            without having measured anything.
+    """
+    if n_runs < 1:
+        raise ValueError(f"n_runs must be >= 1, got {n_runs}")
+
+    s = store if store is not None else MemoryStore()
+    records_count = s.db.open_table("records").count_rows()
+    if records_count > 0:
+        _graph, assignment, rc = build_runtime_graph(s)
+        payload = assemble_session_start(
+            s, assignment, rc, profile_state={"wake_depth": wake_depth},
+        )
+    else:
+        # Empty-store fallback: mint a representative compact handle so the
+        # warm-prompt count reflects the wire payload shape even before any
+        # record is written. Mirrors session.assemble_session_start at
+        # wake_depth=minimal.
+        from iai_mcp.handle import encode_compact_handle
+        from uuid import uuid4
+
+        _compact = encode_compact_handle("", str(uuid4())[:8], "none", 0)
+        payload = SessionStartPayload(
+            l0="",
+            l1="",
+            l2=[],
+            rich_club="",
+            total_cached_tokens=max(1, len(_compact) // 4),
+            total_dynamic_tokens=1000,
+            compact_handle=_compact,
+            wake_depth=wake_depth,
+        )
+
+    counter, mode = _select_counter(count_tokens_fn)
+
+    # An empty flattened payload is counted as a single "." so that counters
+    # which reject empty input still return a value for the warm runs.
+    warm_prompt = _payload_to_prompt(payload) or "."
+    fresh_prompt = _fresh_prompt(payload)
+    fresh = int(counter(fresh_prompt))
+    warm = [int(counter(warm_prompt)) for _ in range(n_runs)]
+
+    fresh_ok = fresh <= FRESH_LIMIT
+    steady_ok = all(w <= STEADY_LIMIT for w in warm)
+
+    return {
+        "fresh": fresh,
+        "warm": warm,
+        "steady_ok": steady_ok,
+        "fresh_ok": fresh_ok,
+        "mode": mode,
+        "limits": {"steady": STEADY_LIMIT, "fresh": FRESH_LIMIT},
+        "payload_cached_tokens": payload.total_cached_tokens,
+        "payload_dynamic_tokens": payload.total_dynamic_tokens,
+    }
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point for the token benchmark.
+
+    Parses ``--wake-depth`` (``minimal``, ``standard`` or ``deep``; default
+    ``minimal``) from ``argv`` -- or from the process arguments when ``argv``
+    is ``None`` -- runs the benchmark at that depth and prints the result
+    dict as one line of JSON on stdout.
+
+    Returns:
+        0 when both ``steady_ok`` and ``fresh_ok`` are true, otherwise 1.
+    """
+    import argparse
+
+    summary = (
+        "OPS-01/OPS-02 session-start token bench. "
+        "TOK-11 added --wake-depth for measuring the lazy <=30-tok payload "
+        "vs Phase-1 eager dump vs the deep variant."
+    )
+    cli = argparse.ArgumentParser(prog="bench.tokens", description=summary)
+    cli.add_argument(
+        "--wake-depth",
+        choices=("minimal", "standard", "deep"),
+        default="minimal",
+        help="Session-start payload mode (default: minimal per D5-02).",
+    )
+    options = cli.parse_args(argv)
+
+    report = run_token_bench(wake_depth=options.wake_depth)
+    print(json.dumps(report))
+
+    within_budget = report["steady_ok"] and report["fresh_ok"]
+    if within_budget:
+        return 0
+    return 1
+
+
+if __name__ == "__main__":
+    # Run as a script: hand the benchmark's pass/fail status to the shell.
+    exit_status = main()
+    sys.exit(exit_status)