Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
249 lines
9.2 KiB
Python
249 lines
9.2 KiB
Python
"""bench/tokens.py -- / benchmark harness.
|
|
|
|
Measures session-start token budget three ways, preferring the most accurate
|
|
source available at runtime:
|
|
|
|
1. Anthropic `count_tokens` API (best). Used when ANTHROPIC_API_KEY is set.
|
|
Gives an honest billable-token count that includes Anthropic-side overhead
|
|
and exact tokeniser output. Model: claude-sonnet-4-5. This is the only mode
|
|
whose numbers are safe to publish (PROJECT.md: "honest mode-by-mode
|
|
benchmarks, not headline numbers").
|
|
|
|
2. tiktoken cl100k_base fallback. OpenAI's tokeniser shipped with the tiktoken
|
|
package -- runs fully offline, no network, no key. It under-counts Claude by
|
|
~5-10% on English and over-counts by ~10-15% on Cyrillic (GPT-4 tokeniser
|
|
packs multibyte differently). Acceptable for local dev and CI; the JSON
|
|
output always records mode so downstream dashboards can reject non-API
|
|
numbers from public charts.
|
|
|
|
3. char/4 heuristic. Used only when both 1 and 2 are unavailable (e.g. minimal
|
|
CI image without tiktoken installed). Very rough; adequate only for sanity
|
|
checks on the order of magnitude.
|
|
|
|
Thresholds:
|
|
- OPS-01 (steady warm-cache): <= STEADY_LIMIT (3000 tokens) on every warm run
|
|
- OPS-02 (first fresh session): <= FRESH_LIMIT (8000 tokens)
|
|
|
|
Exit codes:
|
|
- 0: both steady_ok and fresh_ok
|
|
- 1: at least one failed
|
|
|
|
JSON output format (one line to stdout):
|
|
    {"fresh": int, "warm": [int, ...], "steady_ok": bool, "fresh_ok": bool,
     "mode": "anthropic-count-tokens" | "tiktoken-cl100k-proxy" |
             "heuristic-char4" | "injected",
     "limits": {"steady": 3000, "fresh": 8000},
     "payload_cached_tokens": int, "payload_dynamic_tokens": int}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from typing import Callable
|
|
|
|
from iai_mcp.retrieve import build_runtime_graph
|
|
from iai_mcp.session import SessionStartPayload, assemble_session_start
|
|
from iai_mcp.store import MemoryStore
|
|
|
|
# Session-start token budgets enforced by the benchmark.
STEADY_LIMIT = 3_000  # ceiling for every warm-cache (steady-state) run
FRESH_LIMIT = 8_000  # ceiling for the first fresh session (cache-populate premium)
|
|
|
|
|
|
def _anthropic_count_tokens(text: str) -> int:
    """Count the tokens of *text* through Anthropic's ``count_tokens`` API.

    The text is submitted as a single user message against the
    ``claude-sonnet-4-5`` model and the ``input_tokens`` figure of the
    response is returned as an ``int``.

    Nothing is caught here: a missing ``anthropic`` package, absent
    credentials or a failed request all surface to the caller as exceptions.
    """
    # Imported lazily so the module can be loaded without the SDK installed.
    import anthropic

    request = {
        "model": "claude-sonnet-4-5",
        "messages": [{"role": "user", "content": text}],
    }
    response = anthropic.Anthropic().messages.count_tokens(**request)
    return int(response.input_tokens)
|
|
|
|
|
|
def _tiktoken_count(text: str) -> int:
    """Return how many ``cl100k_base`` tokens tiktoken produces for *text*.

    This is only a proxy for Claude's tokeniser.

    Raises:
        ImportError: when ``tiktoken`` is not installed. The caller is
            expected to fall through to the char/4 heuristic in that case.
    """
    # Lazy import: tiktoken is an optional dependency.
    import tiktoken

    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    return len(token_ids)
|
|
|
|
|
|
def _char4_count(text: str) -> int:
|
|
"""Last-resort char/4 heuristic. Reasonable for English prose, bad for CJK."""
|
|
return max(1, len(text) // 4)
|
|
|
|
|
|
def _payload_to_prompt(payload: SessionStartPayload) -> str:
|
|
"""Flatten the session-start payload to a single prompt string.
|
|
|
|
Mirrors the TypeScript wrapper's buildCachedSystemPrompt shape so the
|
|
counted prompt is faithful to what Anthropic actually receives.
|
|
|
|
D5-02: at wake_depth=minimal, the legacy l0/l1/l2/rich_club
|
|
fields are empty and the payload is three pointer handles. Include them
|
|
alongside legacy segments so both modes flatten to a representative
|
|
prompt string for counting.
|
|
"""
|
|
parts: list[str] = []
|
|
if payload.l0:
|
|
parts.append(f"# L0 identity\n{payload.l0}")
|
|
if payload.l1:
|
|
parts.append(f"# L1 critical facts\n{payload.l1}")
|
|
for segment in payload.l2:
|
|
parts.append(f"# L2 community\n{segment}")
|
|
if payload.rich_club:
|
|
parts.append(f"# Global rich-club\n{payload.rich_club}")
|
|
# / 05-06: lazy session-start wire payload.
|
|
# Under wake_depth=minimal the wire is the compact handle alone
|
|
# (the 3 legacy pointer fields stay on the dataclass for back-compat
|
|
# callers but are NOT serialised to the wire).
|
|
# Under standard/deep the wire is the Phase-1 eager L0/L1/L2/rich_club
|
|
# plus the 3 legacy pointer fields, matching the pre-05-06 baseline.
|
|
# The compact handle is carried on the dataclass under standard/deep
|
|
# too so opt-in callers may read it, but it does NOT add to the wire
|
|
# (that would inflate the standard baseline).
|
|
compact = getattr(payload, "compact_handle", "")
|
|
wake_depth = getattr(payload, "wake_depth", "minimal")
|
|
if wake_depth == "minimal":
|
|
if compact:
|
|
parts.append(compact)
|
|
else:
|
|
lazy = [
|
|
s for s in (
|
|
getattr(payload, "identity_pointer", ""),
|
|
getattr(payload, "brain_handle", ""),
|
|
getattr(payload, "topic_cluster_hint", ""),
|
|
) if s
|
|
]
|
|
if lazy:
|
|
parts.append(" ".join(lazy))
|
|
return "\n\n".join(parts)
|
|
|
|
|
|
def _fresh_prompt(payload: SessionStartPayload) -> str:
    """Build the prompt counted for the first fresh-session request.

    The first fresh session pays the cache-populate premium. That is
    simulated by appending a block of dynamic tail content (standing in for
    the D-10 dynamic reserve) to the flattened payload, separated by a blank
    line, so a single count covers both parts.

    The tail is the phrase ``"dynamic tail content "`` repeated 125 times:
    2625 characters, i.e. roughly 650 tokens under the char/4 heuristic.
    When the flattened payload is empty, the tail alone is returned.
    """
    dynamic_tail = "dynamic tail content " * 125
    cached_prefix = _payload_to_prompt(payload)
    if not cached_prefix:
        return dynamic_tail
    return f"{cached_prefix}\n\n{dynamic_tail}"
|
|
|
|
|
|
def run_token_bench(
    store: MemoryStore | None = None,
    n_runs: int = 3,
    count_tokens_fn: Callable[[str], int] | None = None,
    wake_depth: str = "minimal",
) -> dict:
    """Run the session-start token benchmark and return its verdict.

    Parameters:
        store: optional MemoryStore override (tests pass an isolated tmp_path
            store). A default ``MemoryStore()`` is created when omitted.
        n_runs: how many warm-cache repeats to measure (OPS-01 steady-state
            needs at least 3 consecutive samples). Must be >= 1.
        count_tokens_fn: optional token-counter injection (test-only); when
            given it overrides the Anthropic API, tiktoken and the char/4
            heuristic, and the reported mode is ``"injected"``.
        wake_depth: TOK-11 -- selects the session-start payload mode.
            Default ``minimal`` measures the lazy <=30-tok handle; pass
            ``standard`` for the Phase-1 eager dump baseline; ``deep`` for
            the <=2000-tok expanded rich_club.

    Returns:
        A dict with keys ``fresh`` (int), ``warm`` (list of ``n_runs`` ints),
        ``steady_ok`` / ``fresh_ok`` (bool), ``mode`` (which counter was
        used), ``limits`` (the two thresholds), and
        ``payload_cached_tokens`` / ``payload_dynamic_tokens`` copied from
        the payload.

    Raises:
        ValueError: if ``n_runs`` is less than 1. With no warm samples the
            steady-state check would pass vacuously, reporting a green
            benchmark that measured nothing.
    """
    # Validate before touching the store so bad input fails fast and cheaply.
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1 (got {n_runs!r})")

    active_store = store if store is not None else MemoryStore()

    # --- 1. Obtain the session-start payload -------------------------------
    if active_store.db.open_table("records").count_rows() > 0:
        _graph, assignment, rich_club = build_runtime_graph(active_store)
        payload = assemble_session_start(
            active_store,
            assignment,
            rich_club,
            profile_state={"wake_depth": wake_depth},
        )
    else:
        # Empty-store fallback: mint a representative compact handle so the
        # warm-prompt count reflects the wire payload shape even before any
        # record is written.
        from iai_mcp.handle import encode_compact_handle
        from uuid import uuid4

        handle = encode_compact_handle("", str(uuid4())[:8], "none", 0)
        payload = SessionStartPayload(
            l0="",
            l1="",
            l2=[],
            rich_club="",
            total_cached_tokens=max(1, len(handle) // 4),
            total_dynamic_tokens=1000,
            compact_handle=handle,
            wake_depth=wake_depth,
        )

    # --- 2. Pick the most accurate counter available -----------------------
    counter: Callable[[str], int]
    mode: str
    if count_tokens_fn is not None:
        counter, mode = count_tokens_fn, "injected"
    elif os.environ.get("ANTHROPIC_API_KEY"):
        counter, mode = _anthropic_count_tokens, "anthropic-count-tokens"
    else:
        # Prefer tiktoken over char/4 -- it actually tokenises the text.
        # The import is only a probe for availability.
        try:
            import tiktoken  # noqa: F401
        except ImportError:
            counter, mode = _char4_count, "heuristic-char4"
        else:
            counter, mode = _tiktoken_count, "tiktoken-cl100k-proxy"

    # --- 3. Measure --------------------------------------------------------
    # An empty flattened payload is replaced by "." -- presumably so the
    # counter is never handed an empty string.
    warm_prompt = _payload_to_prompt(payload) or "."
    fresh_prompt = _fresh_prompt(payload)
    fresh = int(counter(fresh_prompt))
    warm = [int(counter(warm_prompt)) for _ in range(n_runs)]

    # --- 4. Verdict --------------------------------------------------------
    return {
        "fresh": fresh,
        "warm": warm,
        "steady_ok": all(sample <= STEADY_LIMIT for sample in warm),
        "fresh_ok": fresh <= FRESH_LIMIT,
        "mode": mode,
        "limits": {"steady": STEADY_LIMIT, "fresh": FRESH_LIMIT},
        "payload_cached_tokens": payload.total_cached_tokens,
        "payload_dynamic_tokens": payload.total_dynamic_tokens,
    }
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the token benchmark.

    Parses ``--wake-depth`` (``minimal``, ``standard`` or ``deep``; default
    ``minimal``) from *argv* -- or from ``sys.argv`` when *argv* is ``None``
    -- runs the benchmark, and prints the result as one line of JSON on
    stdout.

    Returns 0 when both the steady-state and the fresh-session checks pass,
    otherwise 1.
    """
    import argparse

    description = (
        "OPS-01/OPS-02 session-start token bench. TOK-11 added "
        "--wake-depth for measuring the lazy <=30-tok payload vs Phase-1 "
        "eager dump vs the deep variant."
    )
    cli = argparse.ArgumentParser(prog="bench.tokens", description=description)
    cli.add_argument(
        "--wake-depth",
        choices=("minimal", "standard", "deep"),
        default="minimal",
        help="Session-start payload mode (default: minimal per D5-02).",
    )
    options = cli.parse_args(argv)

    report = run_token_bench(wake_depth=options.wake_depth)
    print(json.dumps(report))

    passed = report["steady_ok"] and report["fresh_ok"]
    return 0 if passed else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand the benchmark verdict (0 = pass, 1 = fail) to the shell.
    exit_status = main()
    sys.exit(exit_status)
|