iai-mcp-opencode/bench/trajectory.py
Areg Noya f6b876fbe7 Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00

253 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""bench/trajectory.py -- trajectory benchmark (Plan 02-04 Task 4, D-33).
Generates a deterministic 30-session synthetic corpus following autism/NT
interaction pattern models and runs M1..M6 aggregation across it. Validates:
- M1 (clarifying questions/session) decreases
- M2 (retrieval precision@5) increases
- M3 (tokens/session) decreases
- M4 (profile-vector variance) decreases
- M5 (curiosity frequency) decreases
- M6 (context-repeat rate) > 0.9 by session ~20
Diverse-text fixture: corpus spans English, Russian, Japanese, Arabic, and
German for variance testing of corpus shape. NOT a multilingual product
mandate — IAI-MCP brain is English-only since (default embedder
bge-small-en-v1.5). Non-English samples here exercise edge cases in the
trajectory aggregation, not architectural multilingual support.
CLI:
python -m bench.trajectory [--n-sessions 30] [--real-logs PATH]
"""
from __future__ import annotations
import argparse
import json
import random
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from uuid import uuid4
from iai_mcp.events import write_event
from iai_mcp.store import MemoryStore
# reproducible corpus from seed=42.
DEFAULT_SEED = 42
# Diverse-text samples for corpus-shape variance testing.
# Brain is English-only since Plan 05-08; non-English entries here are
# fixture diversity, not a multilingual product feature.
_LANG_SAMPLES: dict[str, list[str]] = {
"en": [
"authentication uses JWT with refresh rotation",
"db migration scheduled for Friday evening",
"web cache invalidation on deploy",
"cli subcommand for trajectory aggregation",
],
"ru": [
"авторизация использует JWT с обновлением токена",
"миграция базы данных запланирована на пятницу",
"инвалидация кэша при деплое",
],
"ja": [
"認証はJWTとリフレッシュローテーションを使用",
"データベース移行は金曜日の夕方に予定",
],
"ar": [
"المصادقة تستخدم JWT مع تدوير الرمز",
"ترحيل قاعدة البيانات مجدول ليوم الجمعة",
],
"de": [
"Authentifizierung verwendet JWT mit Token-Rotation",
"Datenbankmigration für Freitagabend geplant",
],
}
def generate_synthetic_corpus(
n_sessions: int = 30,
seed: int = DEFAULT_SEED,
) -> list[dict]:
"""Build a deterministic 30-session corpus.
Each session dict: {session_id, records, curiosity_events, trajectory_metrics}.
Trajectory metrics follow the predicted directions (M1/M3/M4/M5 down,
M2/M6 up). This gives downstream run_trajectory_bench a clean signal to
validate.
"""
rng = random.Random(seed)
languages = list(_LANG_SAMPLES.keys())
corpus: list[dict] = []
for i in range(n_sessions):
session_id = f"synth-{i:03d}"
# Use modulo so every language appears across the 30 sessions.
# Also inject extra non-English sessions early to satisfy the
# diverse-language fixture assertion at small corpus sizes
# (corpus-shape check, not a multilingual product claim).
if i < len(languages):
lang = languages[i]
else:
lang = rng.choice(languages)
samples = _LANG_SAMPLES[lang]
n_records = rng.randint(3, 8)
records: list[dict] = []
for k in range(n_records):
text = samples[k % len(samples)]
records.append({
"id": str(uuid4()),
"literal_surface": text,
"language": lang,
"tags": [f"topic:t{k % 3}", f"session:{session_id}"],
})
# Curiosity events decay over sessions (M5 downward trend).
n_curiosity = max(0, 6 - (i // 5))
curiosity_events: list[dict] = []
for _ in range(n_curiosity):
curiosity_events.append({
"question_id": str(uuid4()),
"entropy": float(0.5 + rng.random() * 0.5),
})
# Predicted M1..M6 directions.
progress = i / max(1, n_sessions - 1) # 0.0 at start -> 1.0 at end
m1 = max(0.5, 6.0 * (1.0 - progress)) # clarifying Qs down
m2 = min(1.0, 0.4 + progress * 0.5) # precision@5 up
m3 = max(1000.0, 3000.0 * (1.0 - 0.6 * progress)) # tokens down
m4 = max(0.05, 0.5 * (1.0 - progress)) # variance down
m5 = float(n_curiosity) # frequency down
m6 = min(1.0, 0.4 + progress * 0.55) # repeat rate up
corpus.append({
"session_id": session_id,
"records": records,
"curiosity_events": curiosity_events,
"trajectory_metrics": {
"m1": m1, "m2": m2, "m3": m3,
"m4": m4, "m5": m5, "m6": m6,
},
})
return corpus
def run_trajectory_bench(
corpus: list[dict],
store_path: Path | str | None = None,
) -> dict:
"""Apply the corpus to a fresh store and aggregate M1..M6 trends.
Returns {m1_trend, m2_trend, ..., m6_trend, passed}. Trends are lists of
floats in session order. `passed` reflects the 6 predicted directions.
"""
from iai_mcp.trajectory import record_session_metrics
cleanup: tempfile.TemporaryDirectory | None = None
if store_path is None:
cleanup = tempfile.TemporaryDirectory(prefix="iai-bench-traj-")
path = Path(cleanup.name)
else:
path = Path(store_path)
try:
store = MemoryStore(path=path)
m1t: list[float] = []
m2t: list[float] = []
m3t: list[float] = []
m4t: list[float] = []
m5t: list[float] = []
m6t: list[float] = []
for session in corpus:
sid = session["session_id"]
# Emit curiosity_question events so M1 compute_* can find them.
for q in session["curiosity_events"]:
write_event(
store,
kind="curiosity_question",
data={
"question_id": q["question_id"],
"text": "",
"tier": "question",
"entropy": q["entropy"],
"turn": 1,
"triggered_by": [],
},
severity="info",
session_id=sid,
)
# Record the synthetic metrics.
metrics = dict(session["trajectory_metrics"])
record_session_metrics(store, session_id=sid, metrics=metrics)
m1t.append(metrics["m1"])
m2t.append(metrics["m2"])
m3t.append(metrics["m3"])
m4t.append(metrics["m4"])
m5t.append(metrics["m5"])
m6t.append(metrics["m6"])
def _down(trend: list[float]) -> bool:
return bool(trend) and trend[-1] < trend[0]
def _up(trend: list[float]) -> bool:
return bool(trend) and trend[-1] > trend[0]
# success conditions.
passed = (
_down(m1t) and _up(m2t) and _down(m3t)
and _down(m4t) and _down(m5t) and _up(m6t)
)
return {
"m1_trend": m1t,
"m2_trend": m2t,
"m3_trend": m3t,
"m4_trend": m4t,
"m5_trend": m5t,
"m6_trend": m6t,
"passed": passed,
}
finally:
if cleanup is not None:
cleanup.cleanup()
def main(
n_sessions: int = 30,
seed: int = DEFAULT_SEED,
real_logs_path: str | None = None,
store_path: Path | str | None = None,
) -> int:
"""CLI entry. --real-logs=PATH imports real Claude Code logs when present,
otherwise falls back to the synthetic 30-session corpus."""
if real_logs_path and Path(real_logs_path).exists():
# Real-log import path stub -- owns the ingestion schema.
# Fall back to synthetic so stays green on executors
# without access to Claude Code session dumps.
corpus = generate_synthetic_corpus(n_sessions=n_sessions, seed=seed)
else:
corpus = generate_synthetic_corpus(n_sessions=n_sessions, seed=seed)
out = run_trajectory_bench(corpus, store_path=store_path)
print(json.dumps(out))
return 0 if out["passed"] else 1
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(prog="bench.trajectory")
parser.add_argument("--n-sessions", type=int, default=30)
parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
parser.add_argument("--real-logs", dest="real_logs", default=None)
return parser.parse_args(argv)
if __name__ == "__main__":
args = _parse_args()
sys.exit(main(
n_sessions=args.n_sessions,
seed=args.seed,
real_logs_path=args.real_logs,
))