Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
249
bench/tokens.py
Normal file
249
bench/tokens.py
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
"""bench/tokens.py -- / benchmark harness.
|
||||
|
||||
Measures session-start token budget three ways, preferring the most accurate
|
||||
source available at runtime:
|
||||
|
||||
1. Anthropic `count_tokens` API (best). Used when ANTHROPIC_API_KEY is set.
|
||||
Gives an honest billable-token count that includes Anthropic-side overhead
|
||||
and exact tokeniser output. Model: claude-sonnet-4-5. This is the only mode
|
||||
whose numbers are safe to publish (PROJECT.md: "honest mode-by-mode
|
||||
benchmarks, not headline numbers").
|
||||
|
||||
2. tiktoken cl100k_base fallback. OpenAI's tokeniser shipped with the tiktoken
|
||||
package -- runs fully offline, no network, no key. It under-counts Claude by
|
||||
~5-10% on English and over-counts by ~10-15% on Cyrillic (GPT-4 tokeniser
|
||||
packs multibyte differently). Acceptable for local dev and CI; the JSON
|
||||
output always records mode so downstream dashboards can reject non-API
|
||||
numbers from public charts.
|
||||
|
||||
3. char/4 heuristic. Used only when both 1 and 2 are unavailable (e.g. minimal
|
||||
CI image without tiktoken installed). Very rough; adequate only for sanity
|
||||
checks on the order of magnitude.
|
||||
|
||||
Thresholds:
|
||||
- (steady warm-cache): <= STEADY_LIMIT (3000 tokens) on every warm run
|
||||
- (first fresh session): <= FRESH_LIMIT (8000 tokens)
|
||||
|
||||
Exit codes:
|
||||
- 0: both steady_ok and fresh_ok
|
||||
- 1: at least one failed
|
||||
|
||||
JSON output format (one line to stdout):
|
||||
{"fresh": int, "warm": [int, ...], "steady_ok": bool, "fresh_ok": bool,
|
||||
"mode": "anthropic-count-tokens" | "tiktoken-cl100k-proxy" |
|
||||
"heuristic-char4" | "injected",
|
||||
"limits": {"steady": 3000, "fresh": 8000}}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Callable
|
||||
|
||||
from iai_mcp.retrieve import build_runtime_graph
|
||||
from iai_mcp.session import SessionStartPayload, assemble_session_start
|
||||
from iai_mcp.store import MemoryStore
|
||||
|
||||
# budget targets
|
||||
STEADY_LIMIT = 3000 # warm-cache steady-state
|
||||
FRESH_LIMIT = 8000 # first-fresh-session (cache populate premium)
|
||||
|
||||
|
||||
def _anthropic_count_tokens(text: str) -> int:
|
||||
"""Use Anthropic count_tokens API. Raises if key absent or call fails."""
|
||||
import anthropic
|
||||
client = anthropic.Anthropic()
|
||||
resp = client.messages.count_tokens(
|
||||
model="claude-sonnet-4-5",
|
||||
messages=[{"role": "user", "content": text}],
|
||||
)
|
||||
return int(resp.input_tokens)
|
||||
|
||||
|
||||
def _tiktoken_count(text: str) -> int:
|
||||
"""Offline tiktoken cl100k_base as a proxy for Claude's tokeniser.
|
||||
|
||||
Raises ImportError if tiktoken not installed -- caller falls through to
|
||||
the char/4 heuristic in that case.
|
||||
"""
|
||||
import tiktoken
|
||||
enc = tiktoken.get_encoding("cl100k_base")
|
||||
return len(enc.encode(text))
|
||||
|
||||
|
||||
def _char4_count(text: str) -> int:
|
||||
"""Last-resort char/4 heuristic. Reasonable for English prose, bad for CJK."""
|
||||
return max(1, len(text) // 4)
|
||||
|
||||
|
||||
def _payload_to_prompt(payload: SessionStartPayload) -> str:
|
||||
"""Flatten the session-start payload to a single prompt string.
|
||||
|
||||
Mirrors the TypeScript wrapper's buildCachedSystemPrompt shape so the
|
||||
counted prompt is faithful to what Anthropic actually receives.
|
||||
|
||||
D5-02: at wake_depth=minimal, the legacy l0/l1/l2/rich_club
|
||||
fields are empty and the payload is three pointer handles. Include them
|
||||
alongside legacy segments so both modes flatten to a representative
|
||||
prompt string for counting.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
if payload.l0:
|
||||
parts.append(f"# L0 identity\n{payload.l0}")
|
||||
if payload.l1:
|
||||
parts.append(f"# L1 critical facts\n{payload.l1}")
|
||||
for segment in payload.l2:
|
||||
parts.append(f"# L2 community\n{segment}")
|
||||
if payload.rich_club:
|
||||
parts.append(f"# Global rich-club\n{payload.rich_club}")
|
||||
# / 05-06: lazy session-start wire payload.
|
||||
# Under wake_depth=minimal the wire is the compact handle alone
|
||||
# (the 3 legacy pointer fields stay on the dataclass for back-compat
|
||||
# callers but are NOT serialised to the wire).
|
||||
# Under standard/deep the wire is the Phase-1 eager L0/L1/L2/rich_club
|
||||
# plus the 3 legacy pointer fields, matching the pre-05-06 baseline.
|
||||
# The compact handle is carried on the dataclass under standard/deep
|
||||
# too so opt-in callers may read it, but it does NOT add to the wire
|
||||
# (that would inflate the standard baseline).
|
||||
compact = getattr(payload, "compact_handle", "")
|
||||
wake_depth = getattr(payload, "wake_depth", "minimal")
|
||||
if wake_depth == "minimal":
|
||||
if compact:
|
||||
parts.append(compact)
|
||||
else:
|
||||
lazy = [
|
||||
s for s in (
|
||||
getattr(payload, "identity_pointer", ""),
|
||||
getattr(payload, "brain_handle", ""),
|
||||
getattr(payload, "topic_cluster_hint", ""),
|
||||
) if s
|
||||
]
|
||||
if lazy:
|
||||
parts.append(" ".join(lazy))
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _fresh_prompt(payload: SessionStartPayload) -> str:
|
||||
"""the first fresh-session request pays the cache-populate premium.
|
||||
|
||||
Simulated here by padding the cached prefix with ~1000 tokens of dynamic
|
||||
tail content (D-10 dynamic reserve). Anthropic's count_tokens will return
|
||||
the sum of both parts in one call.
|
||||
"""
|
||||
prompt = _payload_to_prompt(payload)
|
||||
tail = "dynamic tail content " * 125 # ~2500 chars ~ 625 tokens heuristic
|
||||
return f"{prompt}\n\n{tail}" if prompt else tail
|
||||
|
||||
|
||||
def run_token_bench(
|
||||
store: MemoryStore | None = None,
|
||||
n_runs: int = 3,
|
||||
count_tokens_fn: Callable[[str], int] | None = None,
|
||||
wake_depth: str = "minimal",
|
||||
) -> dict:
|
||||
"""Run the token benchmark.
|
||||
|
||||
Parameters:
|
||||
store: optional MemoryStore override (tests pass an isolated tmp_path store).
|
||||
n_runs: how many warm-cache repeats to measure (OPS-01 steady-state needs
|
||||
at least 3 consecutive samples).
|
||||
count_tokens_fn: optional token-counter injection (test-only); overrides both
|
||||
the Anthropic API and the heuristic fallback.
|
||||
wake_depth: TOK-11 — selects session-start payload mode.
|
||||
Default ``minimal`` measures the lazy <=30-tok handle; pass
|
||||
``standard`` for the Phase-1 eager dump baseline; ``deep`` for
|
||||
the ≤2000-tok expanded rich_club.
|
||||
|
||||
Returns a dict with keys described in the module docstring.
|
||||
"""
|
||||
s = store if store is not None else MemoryStore()
|
||||
records_count = s.db.open_table("records").count_rows()
|
||||
if records_count > 0:
|
||||
_graph, assignment, rc = build_runtime_graph(s)
|
||||
payload = assemble_session_start(
|
||||
s, assignment, rc, profile_state={"wake_depth": wake_depth},
|
||||
)
|
||||
else:
|
||||
# Empty-store fallback: mint a representative compact handle so the
|
||||
# warm-prompt count reflects the wire payload shape even before any
|
||||
# record is written. Mirrors session.assemble_session_start at
|
||||
# wake_depth=minimal.
|
||||
from iai_mcp.handle import encode_compact_handle
|
||||
from uuid import uuid4
|
||||
|
||||
_compact = encode_compact_handle("", str(uuid4())[:8], "none", 0)
|
||||
payload = SessionStartPayload(
|
||||
l0="",
|
||||
l1="",
|
||||
l2=[],
|
||||
rich_club="",
|
||||
total_cached_tokens=max(1, len(_compact) // 4),
|
||||
total_dynamic_tokens=1000,
|
||||
compact_handle=_compact,
|
||||
wake_depth=wake_depth,
|
||||
)
|
||||
|
||||
counter: Callable[[str], int]
|
||||
mode: str
|
||||
if count_tokens_fn is not None:
|
||||
counter = count_tokens_fn
|
||||
mode = "injected"
|
||||
elif os.environ.get("ANTHROPIC_API_KEY"):
|
||||
counter = _anthropic_count_tokens
|
||||
mode = "anthropic-count-tokens"
|
||||
else:
|
||||
# Prefer tiktoken over char/4 -- it actually tokenises the text and
|
||||
# tracks Claude within ~10% across English + Cyrillic.
|
||||
try:
|
||||
import tiktoken # noqa: F401
|
||||
counter = _tiktoken_count
|
||||
mode = "tiktoken-cl100k-proxy"
|
||||
except ImportError:
|
||||
counter = _char4_count
|
||||
mode = "heuristic-char4"
|
||||
|
||||
warm_prompt = _payload_to_prompt(payload) or "."
|
||||
fresh_prompt = _fresh_prompt(payload)
|
||||
fresh = int(counter(fresh_prompt))
|
||||
warm = [int(counter(warm_prompt)) for _ in range(n_runs)]
|
||||
|
||||
fresh_ok = fresh <= FRESH_LIMIT
|
||||
steady_ok = all(w <= STEADY_LIMIT for w in warm)
|
||||
|
||||
return {
|
||||
"fresh": fresh,
|
||||
"warm": warm,
|
||||
"steady_ok": steady_ok,
|
||||
"fresh_ok": fresh_ok,
|
||||
"mode": mode,
|
||||
"limits": {"steady": STEADY_LIMIT, "fresh": FRESH_LIMIT},
|
||||
"payload_cached_tokens": payload.total_cached_tokens,
|
||||
"payload_dynamic_tokens": payload.total_dynamic_tokens,
|
||||
}
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="bench.tokens",
|
||||
description=(
|
||||
"OPS-01/OPS-02 session-start token bench. TOK-11 added "
|
||||
"--wake-depth for measuring the lazy <=30-tok payload vs Phase-1 "
|
||||
"eager dump vs the deep variant."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--wake-depth",
|
||||
choices=("minimal", "standard", "deep"),
|
||||
default="minimal",
|
||||
help="Session-start payload mode (default: minimal per D5-02).",
|
||||
)
|
||||
args = parser.parse_args(argv)
|
||||
result = run_token_bench(wake_depth=args.wake_depth)
|
||||
print(json.dumps(result))
|
||||
return 0 if (result["steady_ok"] and result["fresh_ok"]) else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue