# iai-mcp-opencode/bench/tokens.py

"""bench/tokens.py -- OPS-01/OPS-02 session-start token benchmark harness.

Measures session-start token budget three ways, preferring the most accurate
source available at runtime:

1. Anthropic `count_tokens` API (best). Used when ANTHROPIC_API_KEY is set.
   Gives an honest billable-token count that includes Anthropic-side overhead
   and exact tokeniser output. Model: claude-sonnet-4-5. This is the only mode
   whose numbers are safe to publish (PROJECT.md: "honest mode-by-mode
   benchmarks, not headline numbers").

2. tiktoken cl100k_base fallback. OpenAI's tokeniser shipped with the tiktoken
   package -- runs fully offline, no network, no key. It under-counts Claude by
   ~5-10% on English and over-counts by ~10-15% on Cyrillic (GPT-4 tokeniser
   packs multibyte differently). Acceptable for local dev and CI; the JSON
   output always records mode so downstream dashboards can reject non-API
   numbers from public charts.

3. char/4 heuristic. Used only when both 1 and 2 are unavailable (e.g. minimal
   CI image without tiktoken installed). Very rough; adequate only for sanity
   checks on the order of magnitude.

Thresholds:
- Steady state (warm cache): <= STEADY_LIMIT (3000 tokens) on every warm run
- First fresh session: <= FRESH_LIMIT (8000 tokens)

Exit codes:
- 0: both steady_ok and fresh_ok
- 1: at least one failed

JSON output format (one line to stdout):
    {"fresh": int, "warm": [int, ...], "steady_ok": bool, "fresh_ok": bool,
     "mode": "anthropic-count-tokens" | "tiktoken-cl100k-proxy" |
             "heuristic-char4" | "injected",
     "limits": {"steady": 3000, "fresh": 8000},
     "payload_cached_tokens": int, "payload_dynamic_tokens": int}
"""
from __future__ import annotations

import json
import os
import sys
from typing import Callable

from iai_mcp.retrieve import build_runtime_graph
from iai_mcp.session import SessionStartPayload, assemble_session_start
from iai_mcp.store import MemoryStore

# Session-start token budgets enforced by the bench (inclusive upper bounds).
STEADY_LIMIT = 3_000  # ceiling for every warm-cache (steady-state) run
FRESH_LIMIT = 8_000   # ceiling for the first fresh session (cache-populate premium)


def _anthropic_count_tokens(text: str) -> int:
    """Count tokens for *text* via the Anthropic ``count_tokens`` endpoint.

    The text is sent as a single user message against the
    ``claude-sonnet-4-5`` model, and the ``input_tokens`` value of the
    response is returned as an int.

    The ``anthropic`` package is imported lazily and nothing is caught here,
    so a missing package or a failed call propagates to the caller.
    """
    from anthropic import Anthropic

    # The client is built with no explicit arguments -- presumably it picks
    # up ANTHROPIC_API_KEY from the environment; confirm against the SDK docs.
    request = {
        "model": "claude-sonnet-4-5",
        "messages": [{"role": "user", "content": text}],
    }
    response = Anthropic().messages.count_tokens(**request)
    return int(response.input_tokens)


def _tiktoken_count(text: str) -> int:
    """Offline tiktoken cl100k_base as a proxy for Claude's tokeniser.

    Raises ImportError if tiktoken not installed -- caller falls through to
    the char/4 heuristic in that case.
    """
    import tiktoken
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))


def _char4_count(text: str) -> int:
    """Last-resort char/4 heuristic. Reasonable for English prose, bad for CJK."""
    return max(1, len(text) // 4)


def _payload_to_prompt(payload: SessionStartPayload) -> str:
    """Flatten the session-start payload to a single prompt string.

    Mirrors the TypeScript wrapper's buildCachedSystemPrompt shape so the
    counted prompt is faithful to what Anthropic actually receives.

    D5-02: at wake_depth=minimal, the legacy l0/l1/l2/rich_club
    fields are empty and the payload is three pointer handles. Include them
    alongside legacy segments so both modes flatten to a representative
    prompt string for counting.
    """
    parts: list[str] = []
    if payload.l0:
        parts.append(f"# L0 identity\n{payload.l0}")
    if payload.l1:
        parts.append(f"# L1 critical facts\n{payload.l1}")
    for segment in payload.l2:
        parts.append(f"# L2 community\n{segment}")
    if payload.rich_club:
        parts.append(f"# Global rich-club\n{payload.rich_club}")
    # / 05-06: lazy session-start wire payload.
    # Under wake_depth=minimal the wire is the compact handle alone
    # (the 3 legacy pointer fields stay on the dataclass for back-compat
    # callers but are NOT serialised to the wire).
    # Under standard/deep the wire is the Phase-1 eager L0/L1/L2/rich_club
    # plus the 3 legacy pointer fields, matching the pre-05-06 baseline.
    # The compact handle is carried on the dataclass under standard/deep
    # too so opt-in callers may read it, but it does NOT add to the wire
    # (that would inflate the standard baseline).
    compact = getattr(payload, "compact_handle", "")
    wake_depth = getattr(payload, "wake_depth", "minimal")
    if wake_depth == "minimal":
        if compact:
            parts.append(compact)
    else:
        lazy = [
            s for s in (
                getattr(payload, "identity_pointer", ""),
                getattr(payload, "brain_handle", ""),
                getattr(payload, "topic_cluster_hint", ""),
            ) if s
        ]
        if lazy:
            parts.append(" ".join(lazy))
    return "\n\n".join(parts)


def _fresh_prompt(payload: SessionStartPayload) -> str:
    """Build the prompt used to measure the first fresh-session request.

    The first fresh request pays the cache-populate premium, simulated here
    by appending a synthetic dynamic tail to the flattened payload: the
    phrase ``"dynamic tail content "`` repeated 125 times (2625 characters,
    roughly 650 tokens under the char/4 heuristic). Prefix and tail are
    separated by a blank line so a single count covers both parts.

    When the payload flattens to an empty string the tail alone is returned.
    """
    cached_prefix = _payload_to_prompt(payload)
    dynamic_tail = 125 * "dynamic tail content "
    if not cached_prefix:
        return dynamic_tail
    return "\n\n".join((cached_prefix, dynamic_tail))


def run_token_bench(
    store: MemoryStore | None = None,
    n_runs: int = 3,
    count_tokens_fn: Callable[[str], int] | None = None,
    wake_depth: str = "minimal",
) -> dict:
    """Run the session-start token benchmark and return its report.

    Parameters:
        store: optional MemoryStore override (tests pass an isolated tmp_path
                store). A default ``MemoryStore()`` is created when omitted.
        n_runs: how many warm-cache repeats to measure (OPS-01 steady-state
                needs at least 3 consecutive samples). Must be >= 1.
        count_tokens_fn: optional token-counter injection (test-only). When
                given it overrides all built-in counters (Anthropic API,
                tiktoken and the char/4 heuristic) and the mode is reported
                as ``"injected"``.
        wake_depth: TOK-11 -- selects the session-start payload mode and is
                forwarded to ``assemble_session_start`` via ``profile_state``.
                Default ``minimal`` measures the lazy handle; pass
                ``standard`` for the Phase-1 eager dump baseline or ``deep``
                for the expanded rich_club variant.

    Returns:
        A dict with keys ``fresh`` (int), ``warm`` (list of ``n_runs`` ints),
        ``steady_ok`` / ``fresh_ok`` (bool), ``mode`` (str), ``limits``
        (the steady/fresh thresholds), plus ``payload_cached_tokens`` and
        ``payload_dynamic_tokens`` copied from the assembled payload.

    Raises:
        ValueError: if ``n_runs`` is less than 1. Without any warm sample
            ``steady_ok`` would be vacuously true and the bench would report
            a pass it never measured.
    """
    # Validate before touching the store so a bad call fails fast and cheap.
    if n_runs < 1:
        raise ValueError(
            f"n_runs must be >= 1 to measure the steady state (got {n_runs!r})"
        )

    s = store if store is not None else MemoryStore()
    records_count = s.db.open_table("records").count_rows()
    if records_count > 0:
        _graph, assignment, rc = build_runtime_graph(s)
        payload = assemble_session_start(
            s, assignment, rc, profile_state={"wake_depth": wake_depth},
        )
    else:
        # Empty-store fallback: mint a representative compact handle so the
        # warm-prompt count reflects the wire payload shape even before any
        # record is written. Mirrors session.assemble_session_start at
        # wake_depth=minimal.
        from iai_mcp.handle import encode_compact_handle
        from uuid import uuid4

        _compact = encode_compact_handle("", str(uuid4())[:8], "none", 0)
        payload = SessionStartPayload(
            l0="",
            l1="",
            l2=[],
            rich_club="",
            total_cached_tokens=max(1, len(_compact) // 4),
            total_dynamic_tokens=1000,
            compact_handle=_compact,
            wake_depth=wake_depth,
        )

    # Counter selection, most trusted first: injected > Anthropic API >
    # tiktoken proxy > char/4 heuristic. The chosen mode is always recorded
    # so consumers can tell API-backed numbers from approximations.
    counter: Callable[[str], int]
    mode: str
    if count_tokens_fn is not None:
        counter = count_tokens_fn
        mode = "injected"
    elif os.environ.get("ANTHROPIC_API_KEY"):
        counter = _anthropic_count_tokens
        mode = "anthropic-count-tokens"
    else:
        # Prefer tiktoken over char/4 -- it actually tokenises the text.
        try:
            import tiktoken  # noqa: F401
        except ImportError:
            counter = _char4_count
            mode = "heuristic-char4"
        else:
            counter = _tiktoken_count
            mode = "tiktoken-cl100k-proxy"

    # An empty flattened payload is replaced by "." so the counter is never
    # handed an empty string.
    warm_prompt = _payload_to_prompt(payload) or "."
    fresh_prompt = _fresh_prompt(payload)
    fresh = int(counter(fresh_prompt))
    warm = [int(counter(warm_prompt)) for _ in range(n_runs)]

    fresh_ok = fresh <= FRESH_LIMIT
    steady_ok = all(w <= STEADY_LIMIT for w in warm)

    return {
        "fresh": fresh,
        "warm": warm,
        "steady_ok": steady_ok,
        "fresh_ok": fresh_ok,
        "mode": mode,
        "limits": {"steady": STEADY_LIMIT, "fresh": FRESH_LIMIT},
        "payload_cached_tokens": payload.total_cached_tokens,
        "payload_dynamic_tokens": payload.total_dynamic_tokens,
    }


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the token bench.

    Parses ``--wake-depth`` (``minimal`` | ``standard`` | ``deep``, default
    ``minimal``) from *argv* -- or from ``sys.argv`` when *argv* is None --
    runs the benchmark at that depth and prints the report as one JSON line
    on stdout.

    Returns 0 when both the steady and the fresh budget hold, 1 otherwise.
    """
    from argparse import ArgumentParser

    description = (
        "OPS-01/OPS-02 session-start token bench. TOK-11 added "
        "--wake-depth for measuring the lazy <=30-tok payload vs Phase-1 "
        "eager dump vs the deep variant."
    )
    cli = ArgumentParser(prog="bench.tokens", description=description)
    cli.add_argument(
        "--wake-depth",
        default="minimal",
        choices=("minimal", "standard", "deep"),
        help="Session-start payload mode (default: minimal per D5-02).",
    )
    options = cli.parse_args(argv)

    report = run_token_bench(wake_depth=options.wake_depth)
    print(json.dumps(report))

    if report["steady_ok"] and report["fresh_ok"]:
        return 0
    return 1


# Script entry: hand main()'s return value to the shell as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)