Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
10
bench/__init__.py
Normal file
10
bench/__init__.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""IAI-MCP benchmark harness.
|
||||
|
||||
Phase-1 benchmarks:
- bench.tokens   -- token budgets: (steady <=3000) + (fresh <=8000)
- bench.verbatim -- (verbatim recall >=99% on pinned records)

Both runners are invokable as CLIs (`python -m bench.tokens`, `python -m bench.verbatim`)
and exit non-zero on failure. They fall back to a heuristic token count when
ANTHROPIC_API_KEY is absent so CI (and first-time users) can run the suite offline.
|
||||
"""
|
||||
1
bench/adapters/__init__.py
Normal file
1
bench/adapters/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""bench/adapters — external-benchmark adapters (Plan 05-11 OPS-17, M-08)."""
|
||||
275
bench/adapters/longmemeval.py
Normal file
275
bench/adapters/longmemeval.py
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
"""LongMemEval adapter — / external-bench gate.
|
||||
|
||||
Wires the public LongMemEval memory benchmark (Xie et al., 2024) into the
|
||||
IAI-MCP public API (MemoryStore.insert + retrieve.recall). Strict blind-run
|
||||
discipline: no per-dataset tuning, no field-mapping optimisation, no
|
||||
embedder finetune. The adapter is the ONLY translation layer; everything
|
||||
downstream is stock IAI-MCP.
|
||||
|
||||
## Dataset source
|
||||
|
||||
The plan text (05-11-PLAN.md) cites ``lxucs/longmemeval`` — that repo does
|
||||
NOT exist on HuggingFace Hub (returns 401/Not Found). The canonical public
|
||||
mirror shipped by the paper authors is ``xiaowu0162/longmemeval``.
|
||||
Discovered mid-execution; documented as a Rule 3 deviation in the Plan
|
||||
05-11 SUMMARY. DATASET_ID points at the live mirror; PINNED_REVISION is
|
||||
the 40-char commit hash resolved at execution time so numbers reproduce.
|
||||
|
||||
## Row schema (longmemeval_s split, 500 rows)
|
||||
|
||||
Each row is:
|
||||
|
||||
{
|
||||
"question_id": str (8-hex),
|
||||
"question_type": str (single-session-user, multi-session, ...),
|
||||
"question": str,
|
||||
"answer": str,
|
||||
"question_date": str ("YYYY/MM/DD (Day) HH:MM"),
|
||||
"haystack_dates": list[str],
|
||||
"haystack_session_ids": list[str] # len ~54
|
||||
"haystack_sessions": list[list[{"role","content"}]]
|
||||
"answer_session_ids": list[str] # gold evidence (len typically 1)
|
||||
}
|
||||
|
||||
## LMESession mapping (Plan 05-11 deviation, Rule 1/3)
|
||||
|
||||
The plan's interface says "one session -> many queries". The actual dataset
|
||||
is "one query -> many haystack sessions". We therefore flatten each row to
|
||||
a list of LMESession objects — one per haystack session — with the single
|
||||
eval query attached to every session in the row (so
|
||||
bench/longmemeval_blind.py can iterate LMESessions, insert haystack turns,
|
||||
and run the query against the store). The orchestrator (not the adapter)
|
||||
scores at the standard LongMemEval session-ID granularity.
|
||||
|
||||
The ``score_r_at_k`` method in this module implements the plan's literal
|
||||
formula ``|retrieved ∩ relevant| / |relevant|`` over UUIDs — it is unit-
|
||||
testable and matches the Test 4 contract. The orchestrator also
|
||||
reports session-level R@k using the dataset's native session_id gold.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
# Local imports kept lazy-friendly by using a distinct alias so tests can
|
||||
# mock ``bench.adapters.longmemeval.retrieve_recall`` without touching the
|
||||
# production retrieve module wholesale.
|
||||
from iai_mcp.retrieve import recall as retrieve_recall
|
||||
from iai_mcp.embed import embedder_for_store
|
||||
from iai_mcp.types import MemoryRecord
|
||||
|
||||
|
||||
DATASET_ID: str = "xiaowu0162/longmemeval"
|
||||
# Pinned at execution time (2026-04-20) against the
|
||||
# canonical LongMemEval HuggingFace mirror. Reproducers MUST load this
|
||||
# exact revision or disclose the drift.
|
||||
PINNED_REVISION: str = "2ec2a557f339b6c0369619b1ed5793734cc87533"
|
||||
# Split -> filename (the repo ships configs ``longmemeval_s``,
|
||||
# ``longmemeval_m``, ``longmemeval_oracle``). runs the S split.
|
||||
_SPLIT_FILENAMES: dict[str, str] = {
|
||||
"S": "longmemeval_s",
|
||||
"M": "longmemeval_m",
|
||||
"oracle": "longmemeval_oracle",
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class LMESession:
|
||||
"""One flattened haystack session + its attached eval query.
|
||||
|
||||
See module docstring for why this differs from the plan's original
|
||||
"one session many queries" spec.
|
||||
"""
|
||||
|
||||
session_id: str
|
||||
turns: list[dict] # [{"role": "user"|"assistant", "content": str}]
|
||||
queries: list[dict] # [{"query": str, "relevant_turn_ids": list[str]}]
|
||||
|
||||
|
||||
class LongMemEvalAdapter:
|
||||
"""Public API: load_dataset / session_to_inserts / query_to_recall /
|
||||
score_r_at_k."""
|
||||
|
||||
DATASET_ID: str = DATASET_ID
|
||||
PINNED_REVISION: str = PINNED_REVISION
|
||||
|
||||
def __init__(self, revision: str | None = None) -> None:
|
||||
self.revision = revision or self.PINNED_REVISION
|
||||
|
||||
# --------------------------------------------------------------- load
|
||||
|
||||
def load_dataset(self, split: str = "S") -> Iterable[LMESession]:
|
||||
"""Stream LMESessions out of the LongMemEval-<split> JSON file.
|
||||
|
||||
Uses ``huggingface_hub.hf_hub_download`` to grab the split file at
|
||||
the pinned revision (the datasets library's JSON auto-detection
|
||||
breaks on this repo because the files ship without a ``.json``
|
||||
extension — see README). Falls back to raising a clear error if
|
||||
HuggingFace is unreachable and nothing is cached.
|
||||
"""
|
||||
import json
|
||||
|
||||
filename = _SPLIT_FILENAMES.get(split)
|
||||
if filename is None:
|
||||
raise ValueError(
|
||||
f"unknown LongMemEval split {split!r}; "
|
||||
f"expected one of {sorted(_SPLIT_FILENAMES)}"
|
||||
)
|
||||
|
||||
try:
|
||||
from huggingface_hub import hf_hub_download
|
||||
except ImportError as exc: # pragma: no cover — dev extra
|
||||
raise RuntimeError(
|
||||
"huggingface_hub not installed; run "
|
||||
"`pip install 'datasets>=2.18' huggingface_hub`"
|
||||
) from exc
|
||||
|
||||
print(
|
||||
f"[LongMemEval] resolving split={split} "
|
||||
f"revision={self.revision} filename={filename}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
path = hf_hub_download(
|
||||
repo_id=self.DATASET_ID,
|
||||
filename=filename,
|
||||
repo_type="dataset",
|
||||
revision=self.revision,
|
||||
)
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
rows = json.load(f)
|
||||
|
||||
for row in rows:
|
||||
qid = row["question_id"]
|
||||
question = row["question"]
|
||||
# bench/lme500: capture question_type for per-type breakdown.
|
||||
question_type = str(row.get("question_type", "unknown"))
|
||||
answer_session_ids = list(row.get("answer_session_ids", []))
|
||||
haystack_session_ids: list[str] = list(
|
||||
row.get("haystack_session_ids", [])
|
||||
)
|
||||
haystack_sessions: list[list[dict]] = list(
|
||||
row.get("haystack_sessions", [])
|
||||
)
|
||||
|
||||
# Emit one LMESession per haystack session; attach the eval
|
||||
# query to every one so the orchestrator can run ONE recall
|
||||
# per row after inserting all haystack turns.
|
||||
#
|
||||
# The "relevant_turn_ids" field stays session-id-based (the
|
||||
# paper's native gold). We record which session is "gold" so
|
||||
# the orchestrator can score hits.
|
||||
for sess_id, turns in zip(
|
||||
haystack_session_ids, haystack_sessions
|
||||
):
|
||||
yield LMESession(
|
||||
session_id=sess_id,
|
||||
turns=list(turns),
|
||||
queries=[
|
||||
{
|
||||
"query": question,
|
||||
"question_id": qid,
|
||||
"question_type": question_type,
|
||||
# Gold at session granularity; the orchestrator
|
||||
# decides how to use it. score_r_at_k in this
|
||||
# adapter takes whatever the caller passes.
|
||||
"relevant_turn_ids": answer_session_ids,
|
||||
"is_gold_session": sess_id in answer_session_ids,
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
# ------------------------------------------------------- session_to_inserts
|
||||
|
||||
def session_to_inserts(self, session: LMESession) -> list[MemoryRecord]:
|
||||
"""Map each turn to one MemoryRecord (tier=episodic, literal_surface=content).
|
||||
|
||||
Produces a placeholder embedding sized to the default embed dim.
|
||||
The blind-run orchestrator overrides the embedding with the real
|
||||
one from ``embedder_for_store(store).embed(text)`` before calling
|
||||
``store.insert`` — this keeps ``session_to_inserts`` cheap for
|
||||
unit tests that don't want to load sentence-transformers.
|
||||
"""
|
||||
from iai_mcp.embed import Embedder
|
||||
|
||||
dim = Embedder.DEFAULT_DIM
|
||||
records: list[MemoryRecord] = []
|
||||
now = datetime.now(timezone.utc)
|
||||
for turn in session.turns:
|
||||
content = str(turn.get("content", ""))
|
||||
rec = MemoryRecord(
|
||||
id=uuid4(),
|
||||
tier="episodic",
|
||||
literal_surface=content,
|
||||
aaak_index="",
|
||||
embedding=[0.0] * dim, # placeholder; orchestrator overrides
|
||||
community_id=None,
|
||||
centrality=0.0,
|
||||
detail_level=2,
|
||||
pinned=False,
|
||||
stability=0.0,
|
||||
difficulty=0.0,
|
||||
last_reviewed=None,
|
||||
never_decay=False,
|
||||
never_merge=False,
|
||||
provenance=[],
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
tags=[
|
||||
"longmemeval",
|
||||
f"role:{turn.get('role','user')}",
|
||||
f"session:{session.session_id}",
|
||||
],
|
||||
language="en",
|
||||
)
|
||||
records.append(rec)
|
||||
return records
|
||||
|
||||
# ------------------------------------------------------- query_to_recall
|
||||
|
||||
def query_to_recall(self, query: dict, store) -> list[UUID]:
|
||||
"""Call retrieve.recall(cue_text=query['query'], k_hits=10).
|
||||
|
||||
Returns the retrieved record ids in rank order. The orchestrator
|
||||
uses these ids to compute R@k.
|
||||
"""
|
||||
cue_text = str(query["query"])
|
||||
embedder = embedder_for_store(store)
|
||||
cue_embedding = embedder.embed(cue_text)
|
||||
resp = retrieve_recall(
|
||||
store=store,
|
||||
cue_embedding=cue_embedding,
|
||||
cue_text=cue_text,
|
||||
session_id="longmemeval-blind",
|
||||
budget_tokens=1500,
|
||||
k_hits=10,
|
||||
k_anti=0,
|
||||
)
|
||||
return [hit.record_id for hit in resp.hits]
|
||||
|
||||
# ------------------------------------------------------- score_r_at_k
|
||||
|
||||
def score_r_at_k(
|
||||
self,
|
||||
retrieved_ids: list,
|
||||
gold_turn_ids: list,
|
||||
k: int = 5,
|
||||
) -> float:
|
||||
"""R@k = |retrieved_top_k ∩ relevant| / |relevant|.
|
||||
|
||||
Empty ``gold_turn_ids`` returns 1.0 (convention — avoids div-by-zero
|
||||
and matches the "no evidence to miss" semantics).
|
||||
|
||||
Both lists are normalised to ``str`` so UUID vs session-id ids work.
|
||||
"""
|
||||
if not gold_turn_ids:
|
||||
return 1.0
|
||||
top_k = retrieved_ids[: max(0, int(k))]
|
||||
gold_set = {str(g) for g in gold_turn_ids}
|
||||
hit = sum(1 for rid in top_k if str(rid) in gold_set)
|
||||
return hit / float(len(gold_set))
|
||||
163
bench/adapters/longmemeval_cleaned.py
Normal file
163
bench/adapters/longmemeval_cleaned.py
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
"""Cleaned-dataset adapter for LongMemEval-S — D-02.
|
||||
|
||||
Mempalace's reference benchmark uses ``xiaowu0162/longmemeval-cleaned``
|
||||
(commit-pinned via ``huggingface_hub.repo_info()``). This adapter mirrors
|
||||
the ``LongMemEvalAdapter`` shape from ``bench/adapters/longmemeval.py`` so
|
||||
the orchestrator (`bench/longmemeval_blind.py`) can swap raw vs cleaned
|
||||
purely via the ``--dataset {cleaned, raw}`` CLI flag.
|
||||
|
||||
## boundary
|
||||
|
||||
This adapter is NEW (Phase 9 Task 1). The raw adapter at
|
||||
``bench/adapters/longmemeval.py`` is byte-identical to its v2 state — Phase
|
||||
9 does NOT modify the v1/v2 baseline path. ``--dataset raw`` continues to
|
||||
load the raw revision ``2ec2a557f339...``; ``--dataset cleaned`` (the new
|
||||
v3 default) routes to this module.
|
||||
|
||||
## Pinning discipline
|
||||
|
||||
Phase 9 LOCKED: pin via ``huggingface_hub.repo_info(...)``, NEVER
|
||||
hardcode a magic string. The cleaned dataset's HEAD SHA is auto-discovered
|
||||
on first instantiation and stored on ``self.revision`` so v3 output JSON
|
||||
records exactly which dataset variant was measured. On reproducer runs,
|
||||
the caller may pass ``revision=`` to pin a specific historical SHA.
|
||||
|
||||
## Schema
|
||||
|
||||
The cleaned dataset uses the same row schema as the raw dataset (cleaned
|
||||
removed bad evidence; field names preserved). Each row in
|
||||
``longmemeval_s_cleaned.json`` is:
|
||||
|
||||
{
|
||||
"question_id": str,
|
||||
"question_type": str,
|
||||
"question": str,
|
||||
"haystack_session_ids": list[str],
|
||||
"haystack_sessions": list[list[{"role","content"}]],
|
||||
"answer_session_ids": list[str],
|
||||
}
|
||||
|
||||
The adapter emits one ``LMESession`` per haystack session with the eval
|
||||
query attached (matching the raw adapter's emission shape exactly), so
|
||||
``main()`` in ``longmemeval_blind.py`` does NOT branch on adapter type —
|
||||
it groups LMESessions by ``question_id`` either way.
|
||||
|
||||
## Split support
|
||||
|
||||
Only ``split="S"`` is supported. The cleaned dataset ships only the S split
|
||||
as ``longmemeval_s_cleaned.json``; M and oracle remain in the raw dataset.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from typing import Iterable
|
||||
|
||||
from bench.adapters.longmemeval import LMESession
|
||||
|
||||
|
||||
CLEANED_DATASET_ID: str = "xiaowu0162/longmemeval-cleaned"
|
||||
CLEANED_FILENAME: str = "longmemeval_s_cleaned.json"
|
||||
|
||||
|
||||
class CleanedLongMemEvalAdapter:
|
||||
"""Loads ``xiaowu0162/longmemeval-cleaned`` via ``huggingface_hub``.
|
||||
|
||||
Mirrors ``LongMemEvalAdapter`` so ``bench/longmemeval_blind.py`` can
|
||||
treat them interchangeably (same ``LMESession`` iterator shape).
|
||||
|
||||
Pin discipline: ``revision`` defaults to the current HEAD SHA of the
|
||||
HuggingFace dataset, auto-discovered via ``repo_info()``. Pass an
|
||||
explicit revision to reproduce a historical run.
|
||||
"""
|
||||
|
||||
DATASET_ID: str = CLEANED_DATASET_ID
|
||||
|
||||
def __init__(self, revision: str | None = None) -> None:
|
||||
if revision is not None:
|
||||
self.revision = revision
|
||||
return
|
||||
try:
|
||||
from huggingface_hub import repo_info
|
||||
except ImportError as exc: # pragma: no cover — dev extra
|
||||
raise RuntimeError(
|
||||
"huggingface_hub not installed; run "
|
||||
"`pip install 'datasets>=2.18' huggingface_hub`"
|
||||
) from exc
|
||||
info = repo_info(repo_id=CLEANED_DATASET_ID, repo_type="dataset")
|
||||
self.revision = info.sha
|
||||
|
||||
def load_dataset(self, split: str = "S") -> Iterable[LMESession]:
|
||||
"""Stream LMESessions out of ``longmemeval_s_cleaned.json``.
|
||||
|
||||
Only ``split="S"`` is supported (the cleaned dataset ships the S
|
||||
split only). Raises ``ValueError`` on any other split value.
|
||||
"""
|
||||
if split != "S":
|
||||
raise ValueError(
|
||||
f"unknown LongMemEval cleaned split {split!r}; "
|
||||
f"the cleaned dataset ships only the 'S' split"
|
||||
)
|
||||
|
||||
try:
|
||||
from huggingface_hub import hf_hub_download
|
||||
except ImportError as exc: # pragma: no cover — dev extra
|
||||
raise RuntimeError(
|
||||
"huggingface_hub not installed; run "
|
||||
"`pip install 'datasets>=2.18' huggingface_hub`"
|
||||
) from exc
|
||||
|
||||
print(
|
||||
f"[LongMemEval-cleaned] resolving split={split} "
|
||||
f"revision={self.revision} filename={CLEANED_FILENAME}",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
path = hf_hub_download(
|
||||
repo_id=CLEANED_DATASET_ID,
|
||||
filename=CLEANED_FILENAME,
|
||||
repo_type="dataset",
|
||||
revision=self.revision,
|
||||
)
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
rows = json.load(f)
|
||||
|
||||
for row in rows:
|
||||
qid = row["question_id"]
|
||||
question = row["question"]
|
||||
question_type = str(row.get("question_type", "unknown"))
|
||||
answer_session_ids = list(row.get("answer_session_ids", []))
|
||||
haystack_session_ids: list[str] = list(
|
||||
row.get("haystack_session_ids", [])
|
||||
)
|
||||
haystack_sessions: list[list[dict]] = list(
|
||||
row.get("haystack_sessions", [])
|
||||
)
|
||||
|
||||
# Emit one LMESession per haystack session; attach the eval
|
||||
# query to every one so the orchestrator can run ONE recall
|
||||
# per row after inserting all haystack turns. Matches the
|
||||
# raw adapter's emission shape exactly.
|
||||
for sess_id, turns in zip(
|
||||
haystack_session_ids, haystack_sessions
|
||||
):
|
||||
yield LMESession(
|
||||
session_id=sess_id,
|
||||
turns=list(turns),
|
||||
queries=[
|
||||
{
|
||||
"query": question,
|
||||
"question_id": qid,
|
||||
"question_type": question_type,
|
||||
"relevant_turn_ids": answer_session_ids,
|
||||
"is_gold_session": sess_id in answer_session_ids,
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"CLEANED_DATASET_ID",
|
||||
"CLEANED_FILENAME",
|
||||
"CleanedLongMemEvalAdapter",
|
||||
]
|
||||
80
bench/contradiction_longitudinal.py
Normal file
80
bench/contradiction_longitudinal.py
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Contradiction-longitudinal falsifiability bench (skeleton + pre-registered criteria).
|
||||
|
||||
**Do not run on the construction host by default** — this module is meant for a
|
||||
dedicated bench machine with an isolated ``IAI_MCP_STORE`` and optional GPU.
|
||||
|
||||
Pre-registered pass criteria:
|
||||
- **Metric B (post-flip):** cues issued after session ``t_0`` (contradiction +
|
||||
consolidation window simulated) must rank the *current* winning fact above
|
||||
flat cosine-only retrieval on the same store slice.
|
||||
- **Metric A (historical verbatim):** probes asking for superseded wording must
|
||||
still surface the archived surface (verbatim MEM-06), not the post-flip fact alone.
|
||||
- **Regression gate:** pipeline score on B must beat cosine baseline; A must not
|
||||
collapse below a configured verbatim hit threshold.
|
||||
|
||||
This file loads :file:`fixtures/contradiction_longitudinal.jsonl` (synthetic JSONL
|
||||
rows: ``session``, ``text``, optional ``probe`` / ``expects``) and documents the
|
||||
evaluation harness contract. A full implementation wires:
|
||||
|
||||
1. Fixture loader → ``MemoryStore`` inserts per session order.
|
||||
2. Explicit ``memory_contradict`` (or edge-equivalent) at ``t_0``.
|
||||
3. Optional sleep/consolidation tick simulation (bench-only knobs).
|
||||
4. Two eval slices: ``pre_flip_cues`` vs ``post_flip_cues`` with separated metrics.
|
||||
|
||||
Exit code 0 only when all gates pass; non-zero on any failure. Until the harness
|
||||
is completed, ``main()`` prints the criteria and exits with code 2 to avoid a
|
||||
silent green run::
|
||||
|
||||
python bench/contradiction_longitudinal.py --fixture bench/fixtures/contradiction_longitudinal.jsonl
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_rows(path: Path) -> list[dict]:
    """Read a JSONL fixture: one decoded JSON value per non-blank line.

    Lines are stripped before decoding and whitespace-only lines are skipped.
    A malformed line is not tolerated: the ``json.JSONDecodeError`` from
    ``json.loads`` propagates to the caller.
    """
    with path.open(encoding="utf-8") as f:
        stripped_lines = (raw.strip() for raw in f)
        return [json.loads(line) for line in stripped_lines if line]
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Load the fixture, print the pre-registered criteria, and exit 2.

    Args:
        argv: Argument list for ``argparse``; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        Always ``2``: the evaluation harness is a stub, and a non-zero code
        keeps an unfinished bench from looking like a green run.
    """
    # __doc__ is None when docstrings are stripped (``python -OO``); fall back
    # to an empty description instead of crashing on ``None.split``.
    description = (__doc__ or "").split("\n\n")[0]
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--fixture",
        type=Path,
        default=Path(__file__).resolve().parent / "fixtures" / "contradiction_longitudinal.jsonl",
    )
    args = parser.parse_args(argv)
    rows = load_rows(args.fixture)
    report = {
        "loaded_rows": len(rows),
        "fixture": str(args.fixture),
        "status": "harness_stub",
        "criteria": [
            "B: post-flip cues — pipeline beats flat cosine",
            "A: historical verbatim probes — superseded text still retrievable",
            "No regression: B gain without A collapse",
        ],
    }
    print(json.dumps(report, indent=2))
    # Stub: full eval is intentionally absent so CI never runs heavy retrieval.
    return 2


if __name__ == "__main__":
    sys.exit(main())
|
||||
4
bench/fixtures/contradiction_longitudinal.jsonl
Normal file
4
bench/fixtures/contradiction_longitudinal.jsonl
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
{"session": 0, "role": "user", "text": "The launch date is 2026-06-01.", "gold_fact": "2026-06-01"}
|
||||
{"session": 1, "role": "user", "text": "Correction: launch moved to 2026-09-01.", "gold_fact": "2026-09-01", "contradicts_session": 0}
|
||||
{"session": 2, "role": "user", "text": "What is the launch date?", "probe": "post_flip", "expects": "2026-09-01"}
|
||||
{"session": 2, "role": "user", "text": "Quote the original June announcement verbatim.", "probe": "historical_verbatim", "expects": "2026-06-01"}
|
||||
351
bench/lme500/aggregate.py
Normal file
351
bench/lme500/aggregate.py
Normal file
|
|
@ -0,0 +1,351 @@
|
|||
"""bench/lme500/aggregate.py — post-process LongMemEval-S blind-run output.
|
||||
|
||||
Usage:
|
||||
python bench/lme500/aggregate.py \
|
||||
--in bench/lme500/output/lme500-v1.json \
|
||||
--report bench/lme500/output/lme500-v1-report.md \
|
||||
--summary bench/lme500/output/lme500-v1-summary.json
|
||||
|
||||
The --in path may be:
|
||||
- the final summary JSON ({"per_row": [...], ...} schema), or
|
||||
- the per-row JSONL checkpoint (one JSON dict per line — works on
|
||||
partial runs while the bench is still in progress).
|
||||
|
||||
Computes:
|
||||
- Overall R@5 / R@10 per prong (X = retrieve_recall, Y = recall_for_benchmark)
|
||||
- Architecture lift Y - X
|
||||
- Per-question-type stratification with n per bin (low-power flag if n<30)
|
||||
- Bootstrap 95% CI via percentile method (10000 resamples, seed=42)
|
||||
- Errors counted as miss for both prongs
|
||||
|
||||
Output:
|
||||
- Markdown report (--report)
|
||||
- Aggregated JSON summary (--summary)
|
||||
- One-line stderr summary at end
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def load_rows(input_path: Path) -> list[dict[str, Any]]:
    """Load per-row dicts from JSON, JSONL, or list-JSON.

    Order of detection:
      1. Text starting with "{" that parses as one JSON object with a
         "per_row" key → return per_row.
      2. Text starting with "[" that parses as JSON → return it as a list.
      3. Otherwise (including when 1 or 2 fail to parse or match) → JSONL,
         one JSON object per non-empty line.

    In JSONL mode, lines that fail to parse (e.g. a truncated last line of a
    checkpoint still being written) and lines that parse to something other
    than an object are skipped with a warning on stderr, so downstream code
    only ever sees dict rows from this path.
    """
    text = input_path.read_text(encoding="utf-8")
    stripped = text.strip()

    # Whole-file JSON first: summary object, then bare list.
    if stripped.startswith("{"):
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Multi-line JSONL also starts with "{" and lands here.
            data = None
        if isinstance(data, dict) and "per_row" in data:
            return list(data["per_row"])
    elif stripped.startswith("["):
        try:
            return list(json.loads(text))
        except json.JSONDecodeError:
            pass

    # Fall back to JSONL.
    rows: list[dict[str, Any]] = []
    for lineno, line in enumerate(text.splitlines(), 1):
        line = line.strip()
        if not line:
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError as exc:
            print(
                f"[aggregate] WARN: skipping corrupt line {lineno}: {exc}",
                file=sys.stderr,
            )
            continue
        if not isinstance(row, dict):
            print(
                f"[aggregate] WARN: skipping non-object line {lineno}",
                file=sys.stderr,
            )
            continue
        rows.append(row)
    return rows
|
||||
|
||||
|
||||
def bootstrap_ci(
    values: list[float],
    n_resamples: int = 10000,
    seed: int = 42,
) -> tuple[float, float, float]:
    """Bootstrap mean + 95% percentile CI.

    Each resample draws ``len(values)`` items with replacement from a
    ``random.Random(seed)`` stream, so results are reproducible for a given
    ``(values, n_resamples, seed)``.

    Returns (mean, ci_lo, ci_hi). Empty input → (0, 0, 0). With
    ``n_resamples < 1`` there is no resample distribution to take percentiles
    from, so the interval collapses to (mean, mean, mean).
    """
    if not values:
        return 0.0, 0.0, 0.0
    mean = statistics.fmean(values)
    if n_resamples < 1:
        return mean, mean, mean

    rng = random.Random(seed)
    n = len(values)
    means: list[float] = []
    for _ in range(n_resamples):
        # Plain left-to-right accumulation, one randrange() draw per item:
        # changing either the draw order or the summation method would alter
        # the reported intervals for the same seed.
        total = 0.0
        for _draw in range(n):
            total += values[rng.randrange(n)]
        means.append(total / n)
    means.sort()

    # Percentile method: the 2.5% and 97.5% order statistics, clamped.
    lo_idx = max(0, int(0.025 * n_resamples))
    hi_idx = min(n_resamples - 1, int(0.975 * n_resamples))
    return mean, means[lo_idx], means[hi_idx]
|
||||
|
||||
|
||||
def _get_prong_value(row: dict[str, Any], prong: str, k: int) -> float:
    """Extract r_at_<k>_<prong> from a row, treating error rows as 0.

    A row is an error row when its "error" value is a dict. A missing score
    key, or one holding ``None`` (JSON ``null``), also counts as 0.0 rather
    than raising.
    """
    if isinstance(row.get("error"), dict):
        return 0.0
    value = row.get(f"r_at_{k}_{prong}")
    return 0.0 if value is None else float(value)
|
||||
|
||||
|
||||
def aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate overall + per-type bootstrap CIs.

    Returns ``{"overall": {...}, "per_type": {qtype: {...}}}``. Each populated
    block holds ``n``, ``X_retrieve`` and ``Y_pipeline`` (each with
    ``r_at_5`` / ``r_at_10`` → ``mean`` / ``ci_lo`` / ``ci_hi``) and
    ``lift_Y_minus_X`` (difference of the means); the overall block also
    carries ``n_errors``. Rows whose "error" value is a dict are counted in
    ``n_errors`` and score 0 via ``_get_prong_value``. With no rows, the
    result contains only the two counters and an empty ``per_type``.
    """
    if not rows:
        return {"overall": {"n": 0, "n_errors": 0}, "per_type": {}}

    # (bin name, prong key used in the row, k)
    metrics = (
        ("x5", "retrieve", 5),
        ("x10", "retrieve", 10),
        ("y5", "pipeline", 5),
        ("y10", "pipeline", 10),
    )

    def _new_bins() -> dict[str, list[float]]:
        return {name: [] for name, _prong, _k in metrics}

    overall = _new_bins()
    by_type: dict[str, dict[str, list[float]]] = defaultdict(_new_bins)
    n_errors = 0

    for row in rows:
        if isinstance(row.get("error"), dict):
            n_errors += 1
        qtype = str(row.get("question_type", "unknown"))
        for name, prong, k in metrics:
            value = _get_prong_value(row, prong, k)
            overall[name].append(value)
            by_type[qtype][name].append(value)

    def _prong_block(vals_5: list[float], vals_10: list[float]) -> dict:
        m5, lo5, hi5 = bootstrap_ci(vals_5)
        m10, lo10, hi10 = bootstrap_ci(vals_10)
        return {
            "r_at_5": {"mean": m5, "ci_lo": lo5, "ci_hi": hi5},
            "r_at_10": {"mean": m10, "ci_lo": lo10, "ci_hi": hi10},
        }

    def _summarise(bins: dict[str, list[float]]) -> dict[str, Any]:
        # Prong blocks plus the Y − X lift of their means, in output key order.
        x_block = _prong_block(bins["x5"], bins["x10"])
        y_block = _prong_block(bins["y5"], bins["y10"])
        return {
            "X_retrieve": x_block,
            "Y_pipeline": y_block,
            "lift_Y_minus_X": {
                key: y_block[key]["mean"] - x_block[key]["mean"]
                for key in ("r_at_5", "r_at_10")
            },
        }

    overall_block = {"n": len(rows), "n_errors": n_errors, **_summarise(overall)}
    per_type_out: dict[str, dict[str, Any]] = {
        qt: {"n": len(by_type[qt]["x5"]), **_summarise(by_type[qt])}
        for qt in sorted(by_type)
    }
    return {"overall": overall_block, "per_type": per_type_out}
|
||||
|
||||
|
||||
def format_markdown_report(agg: dict[str, Any], source_path: Path) -> str:
    """Render an ``aggregate()`` result as a Markdown report.

    Args:
        agg: Dict with "overall" and "per_type" blocks. When
            ``agg["overall"]["n"]`` is 0 only the header and a
            "No rows loaded" marker are emitted and no prong keys are read.
        source_path: Shown verbatim in the "Source" bullet.

    Returns:
        The report text, terminated by a single newline.
    """
    overall = agg["overall"]
    lines: list[str] = [
        "# LongMemEval-S Aggregate Report",
        "",
        f"- Source: `{source_path}`",
        f"- n = {overall['n']}, errors = {overall['n_errors']}",
        "- 95% CI via bootstrap percentile method (10000 resamples, seed=42)",
        "",
    ]

    if overall["n"] == 0:
        lines.append("**No rows loaded.**")
        return "\n".join(lines) + "\n"

    def _overall_row(label: str, prong: dict[str, Any]) -> str:
        # One "Overall" table row: mean and CI for R@5, then for R@10.
        r5, r10 = prong["r_at_5"], prong["r_at_10"]
        return (
            f"| {label} "
            f"| {r5['mean']:.3f} "
            f"| [{r5['ci_lo']:.3f}, {r5['ci_hi']:.3f}] "
            f"| {r10['mean']:.3f} "
            f"| [{r10['ci_lo']:.3f}, {r10['ci_hi']:.3f}] |"
        )

    lift = overall["lift_Y_minus_X"]
    lines += [
        "## Overall",
        "",
        "| Prong | R@5 | R@5 95% CI | R@10 | R@10 95% CI |",
        "|---|---|---|---|---|",
        _overall_row(
            "X (retrieve_recall — flat-cosine baseline)", overall["X_retrieve"]
        ),
        _overall_row(
            "Y (recall_for_benchmark — full graph-native pipeline)",
            overall["Y_pipeline"],
        ),
        (
            "| **Architecture lift Y − X** "
            f"| **{lift['r_at_5']:+.3f}** "
            "| — "
            f"| **{lift['r_at_10']:+.3f}** "
            "| — |"
        ),
        "",
        "## Per question type",
        "",
        "| Type | n | X R@5 | Y R@5 | Lift R@5 | X R@10 | Y R@10 | Lift R@10 |",
        "|---|---|---|---|---|---|---|---|",
    ]

    for qt, block in agg["per_type"].items():
        n = block["n"]
        flag = " ⚠️" if n < 30 else ""  # low-power marker, explained below the table
        x = block["X_retrieve"]
        y = block["Y_pipeline"]
        type_lift = block["lift_Y_minus_X"]
        lines.append(
            f"| `{qt}`{flag} | {n} "
            f"| {x['r_at_5']['mean']:.3f} | {y['r_at_5']['mean']:.3f} "
            f"| {type_lift['r_at_5']:+.3f} "
            f"| {x['r_at_10']['mean']:.3f} | {y['r_at_10']['mean']:.3f} "
            f"| {type_lift['r_at_10']:+.3f} |"
        )

    lines += [
        "",
        "⚠️ = n < 30, low statistical power for that bin.",
        "",
        "## Notes",
        "",
        "- Errors (graph-build failures, malformed rows, etc.) are counted "
        "as miss for **both** prongs (R@k = 0).",
        "- Mean is the unweighted row average; CI is bootstrap percentile.",
        "- Architecture lift = mean(Y) − mean(X). The CI of the lift "
        "itself is not computed here (would require paired bootstrap on "
        "the (Y_i, X_i) tuples — TODO if needed).",
    ]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """Aggregate a per-row results file into a JSON summary and a markdown report.

    Command-line options:
        --in       (required) path to the per-row JSON / JSONL file.
        --report   markdown report path; defaults to ``<stem>-report.md``
                   next to the input file.
        --summary  aggregated JSON path; defaults to ``<stem>-summary.json``
                   next to the input file.

    The overall X / Y / lift numbers are echoed to stderr after both output
    files have been written.

    Returns:
        0 on success; 1 if the input file is missing or no rows were loaded.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--in",
        dest="input",
        required=True,
        help="Path to per-row JSON / JSONL file",
    )
    cli.add_argument(
        "--report",
        default=None,
        help="Output path for markdown report; default: <input>-report.md",
    )
    cli.add_argument(
        "--summary",
        default=None,
        help="Output path for aggregated JSON; default: <input>-summary.json",
    )
    opts = cli.parse_args()

    source = Path(opts.input)
    if not source.exists():
        print(f"[aggregate] ERROR: {source} does not exist", file=sys.stderr)
        return 1
    records = load_rows(source)
    if not records:
        print(f"[aggregate] WARN: 0 rows loaded from {source}", file=sys.stderr)
        return 1

    result = aggregate(records)

    # JSON summary: explicit path wins, otherwise sit beside the input file.
    if opts.summary:
        summary_out = Path(opts.summary)
    else:
        summary_out = source.with_name(source.stem + "-summary.json")
    summary_out.parent.mkdir(parents=True, exist_ok=True)
    with open(summary_out, "w", encoding="utf-8") as handle:
        json.dump(result, handle, indent=2)

    # Markdown report: same defaulting rule as the summary.
    if opts.report:
        report_out = Path(opts.report)
    else:
        report_out = source.with_name(source.stem + "-report.md")
    report_out.parent.mkdir(parents=True, exist_ok=True)
    report_out.write_text(format_markdown_report(result, source), encoding="utf-8")

    # Pull all three overall blocks up front so a malformed aggregate fails
    # before any headline line is printed.
    totals = result["overall"]
    baseline = totals["X_retrieve"]
    pipeline = totals["Y_pipeline"]
    delta = totals["lift_Y_minus_X"]

    print(
        f"[aggregate] n={totals['n']} errors={totals['n_errors']}",
        file=sys.stderr,
    )
    for label, prong in (("X (retrieve)", baseline), ("Y (pipeline)", pipeline)):
        at5 = prong["r_at_5"]
        print(
            f"[aggregate] {label} R@5={at5['mean']:.3f} "
            f"[{at5['ci_lo']:.3f},{at5['ci_hi']:.3f}] "
            f"R@10={prong['r_at_10']['mean']:.3f}",
            file=sys.stderr,
        )
    print(
        f"[aggregate] Lift Y − X R@5={delta['r_at_5']:+.3f} "
        f"R@10={delta['r_at_10']:+.3f}",
        file=sys.stderr,
    )
    print(f"[aggregate] -> {summary_out}", file=sys.stderr)
    print(f"[aggregate] -> {report_out}", file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
328
bench/lme500/debug_pipeline_loss.py
Normal file
328
bench/lme500/debug_pipeline_loss.py
Normal file
|
|
@ -0,0 +1,328 @@
|
|||
"""bench/lme500/debug_pipeline_loss.py
|
||||
|
||||
Trace WHICH pipeline stage drops the gold session in loss cases
|
||||
(rows where retrieve_recall hits in top-k but recall_for_benchmark does not).
|
||||
|
||||
Usage:
|
||||
python bench/lme500/debug_pipeline_loss.py <question_id> [<question_id> ...]
|
||||
|
||||
For each qid:
|
||||
- Loads the LongMemEval-S row from the pinned dataset.
|
||||
- Builds a fresh per-row store + runtime graph (same shape as the bench).
|
||||
- Runs retrieve_recall to confirm gold sessions are findable by flat cosine.
|
||||
- Runs recall_for_benchmark STAGE BY STAGE, recording at each cut whether the
|
||||
gold record IDs survived.
|
||||
|
||||
Stages traced:
|
||||
Stage 2 — community gate (top-3 communities by centroid cosine)
|
||||
Stage 3 — seeds (top-3 by cosine within gated candidates)
|
||||
Stage 4 — 2-hop spread + rich-club union
|
||||
Stage 5 — final recall_for_benchmark hits
|
||||
|
||||
Output is a per-stage table showing where gold drops.
|
||||
|
||||
Read-only — no src/iai_mcp changes. Calls private helpers _community_gate
|
||||
and _pick_seeds for stage-level inspection (debug-only path).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
|
||||
|
||||
import numpy as np
|
||||
|
||||
from iai_mcp.embed import embedder_for_store
|
||||
from iai_mcp.pipeline import (
|
||||
_collect_graph_pool,
|
||||
_community_gate,
|
||||
_pick_seeds,
|
||||
recall_for_benchmark,
|
||||
)
|
||||
from iai_mcp.retrieve import build_runtime_graph, recall as retrieve_recall
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import MemoryRecord
|
||||
|
||||
from bench.adapters.longmemeval import LongMemEvalAdapter
|
||||
|
||||
|
||||
def _make_record(content: str, session_id: str, role: str, embedding: list[float]) -> MemoryRecord:
    """Wrap one piece of haystack text in a fresh episodic ``MemoryRecord``.

    The record gets a new random id, identical created/updated timestamps
    (current UTC time), and the tags ``longmemeval``, ``role:<role>`` and
    ``session:<session_id>``. All remaining fields are fixed constants.
    """
    stamp = datetime.now(timezone.utc)
    record_tags = ["longmemeval", f"role:{role}", f"session:{session_id}"]
    return MemoryRecord(
        id=uuid4(),
        tier="episodic",
        literal_surface=content,
        aaak_index="",
        embedding=embedding,
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=record_tags,
        language="en",
    )
|
||||
|
||||
|
||||
def find_row(qid: str):
    """Scan the "S" split for every adapter session whose first query matches ``qid``.

    Returns:
        ``(question, qtype, answer_session_ids, sessions)``. The first three
        come from the first matching session's query (``query``,
        ``question_type`` defaulting to "?", and the set built from
        ``relevant_turn_ids``); ``sessions`` lists every matching session.
        When nothing matches, the first three are ``None`` and the list is empty.
    """
    matches = []
    question = None
    gold_ids = None
    qtype = None
    for candidate in LongMemEvalAdapter().load_dataset(split="S"):
        # Only the first query of each adapter session is inspected.
        query = candidate.queries[0]
        if query["question_id"] != qid:
            continue
        matches.append(candidate)
        if question is not None:
            # Question metadata was already captured from an earlier match.
            continue
        question = query["query"]
        gold_ids = set(query.get("relevant_turn_ids", []))
        qtype = query.get("question_type", "?")
    return question, qtype, gold_ids, matches
|
||||
|
||||
|
||||
def _trace_in_store(
    qid: str,
    question: str,
    qtype,
    gold_session_ids: set,
    sessions: list,
    tmp_root: Path,
) -> dict:
    """Build the per-row store under ``tmp_root`` and trace gold through each stage.

    Inserts one record per non-empty turn, builds the runtime graph, runs
    ``retrieve_recall`` (prong X), replays the gate / seed / spread stages
    via the private pipeline helpers, then runs ``recall_for_benchmark``
    (prong Y) and prints a verdict naming the first stage at which no gold
    record survived. Returns the per-stage counts as a dict.
    """
    store_dir = tmp_root / f"row-{qid}"
    store_dir.mkdir(parents=True, exist_ok=True)
    store = MemoryStore(path=store_dir / "lancedb")
    asyncio.run(store.enable_async_writes(coalesce_ms=50, max_batch=128))
    embedder = embedder_for_store(store)

    id_to_session: dict[UUID, str] = {}
    gold_record_ids: set[UUID] = set()
    n_inserted = 0
    for sess in sessions:
        for turn in sess.turns:
            content = str(turn.get("content", "")).strip()
            if not content:
                continue
            vec = embedder.embed(content)
            rec = _make_record(
                content=content,
                session_id=sess.session_id,
                role=str(turn.get("role", "user")),
                embedding=vec,
            )
            store.insert(rec)
            id_to_session[rec.id] = sess.session_id
            if sess.session_id in gold_session_ids:
                gold_record_ids.add(rec.id)
            n_inserted += 1

    # Flush queued writes before anything reads from the store.
    asyncio.run(store.disable_async_writes())
    print(f" records inserted: {n_inserted}", flush=True)
    print(f" gold records: {len(gold_record_ids)}", flush=True)

    graph, assignment, rich_club = build_runtime_graph(store)
    print(f" graph nodes: {len(graph._nx.nodes)}", flush=True)
    print(f" communities: {len(assignment.mid_regions)}", flush=True)
    print(f" rich-club: {len(rich_club)}", flush=True)
    cue_emb = embedder.embed(question)

    # --- Baseline: retrieve_recall ---
    resp_x = retrieve_recall(
        store=store,
        cue_embedding=cue_emb,
        cue_text=question,
        session_id=f"debug-{qid}",
        budget_tokens=1500,
        k_hits=10,
        k_anti=0,
    )
    x_ids = [h.record_id for h in resp_x.hits]
    x_sessions = [id_to_session.get(r, "?") for r in x_ids]
    x_gold_pos = [i for i, s in enumerate(x_sessions) if s in gold_session_ids]
    print(f"\n --- retrieve_recall (X) ---", flush=True)
    print(f" top-10 sessions: {x_sessions}", flush=True)
    print(f" gold hit positions: {x_gold_pos}", flush=True)

    # --- recall_for_benchmark, stage by stage ---
    print(f"\n --- recall_for_benchmark (Y) stage-by-stage ---", flush=True)

    # Stage 2: members of the top-3 gated communities form the candidate set.
    gated = _community_gate(cue_emb, assignment, top_n=3)
    candidates_set: set[UUID] = set()
    for gc in gated:
        for cid in assignment.mid_regions.get(gc, []):
            candidates_set.add(cid)
    if not candidates_set:
        # Empty gate: fall back to every graph node so later stages still run.
        candidates_set = {UUID(n) for n in graph._nx.nodes()}
        print(f" Stage 2 (community gate): EMPTY, fallback to all nodes", flush=True)
    print(f" Stage 2 (community gate): top-3 communities = {gated}", flush=True)
    print(f" candidates after gate: {len(candidates_set)}", flush=True)
    gold_in_gate = gold_record_ids & candidates_set
    print(f" gold survives gate: {len(gold_in_gate)} / {len(gold_record_ids)}", flush=True)

    # Per-node centrality: prefer the attribute stored on the graph node,
    # else ask the graph; any failure degrades to "no centrality".
    centrality: dict[UUID, float] = {}
    for nid in graph._nx.nodes:
        n = graph._nx.nodes[nid]
        if "centrality" in n:
            try:
                centrality[UUID(nid)] = float(n["centrality"])
            except (TypeError, ValueError):
                centrality[UUID(nid)] = 0.0
    if not centrality:
        try:
            centrality = graph.centrality()
        except Exception:
            centrality = {}

    # Stage 3: _pick_seeds reads from a shared cosine array over the graph
    # pool, so build that array here the same way (unit-normalised cue
    # dotted with the pool embeddings).
    pool_ids, pool_embs = _collect_graph_pool(graph, None, store)
    cue_vec_norm = np.asarray(cue_emb, dtype=np.float32)
    cn = float(np.linalg.norm(cue_vec_norm))
    if cn > 0.0:
        cue_vec_norm = cue_vec_norm / cn
    if pool_embs.size:
        shared_cos = (pool_embs @ cue_vec_norm).astype(np.float32)
    else:
        shared_cos = np.empty(0, dtype=np.float32)
    id_to_idx = {rid: i for i, rid in enumerate(pool_ids)}
    cand_idx = np.array(
        [id_to_idx[c] for c in candidates_set if c in id_to_idx],
        dtype=np.int64,
    )
    centrality_arr = np.array(
        [centrality.get(rid, 0.0) for rid in pool_ids],
        dtype=np.float32,
    )
    seed_idx = _pick_seeds(cand_idx, shared_cos, centrality_arr, n=3)
    seeds = [pool_ids[int(i)] for i in seed_idx]
    print(f" Stage 3 (seeds, top-3 by cosine in gated): {len(seeds)}", flush=True)
    seeds_sessions = [id_to_session.get(s, "?") for s in seeds]
    print(f" seed sessions: {seeds_sessions}", flush=True)
    gold_in_seeds = gold_record_ids & set(seeds)
    print(f" gold in seeds: {len(gold_in_seeds)}", flush=True)

    # Stage 4: union of seeds, their 2-hop neighbourhood and the rich club.
    spread = graph.two_hop_neighborhood(seeds, top_k=5)
    reachable = set(seeds) | set(spread) | set(rich_club)
    print(f" Stage 4 (spread + rich-club union):", flush=True)
    print(f" seeds={len(seeds)} spread={len(spread)} rich={len(rich_club)} reachable={len(reachable)}", flush=True)
    gold_in_reachable = gold_record_ids & reachable
    print(f" gold in reachable: {len(gold_in_reachable)} / {len(gold_record_ids)}", flush=True)

    # Stage 5: the real entry point, same arguments as the bench harness.
    resp_y = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=assignment,
        rich_club=rich_club,
        embedder=embedder,
        cue=question,
        session_id=f"debug-{qid}",
        k_hits=10,
        profile_state=None,
        turn=0,
        mode="concept",
    )
    y_ids = [h.record_id for h in resp_y.hits]
    y_sessions = [id_to_session.get(r, "?") for r in y_ids]
    y_gold_pos = [i for i, s in enumerate(y_sessions) if s in gold_session_ids]
    print(f" Stage 5 (rank + budget pack):", flush=True)
    print(f" final hits: {len(y_ids)}", flush=True)
    print(f" top-10 sessions: {y_sessions}", flush=True)
    print(f" gold hit positions: {y_gold_pos}", flush=True)

    # ----- Verdict -----
    # The primary signal is whether gold lands in recall_for_benchmark's
    # hits. The stage 2-4 diagnostics above observe the private
    # _community_gate / _pick_seeds path, so "gate missed" together with
    # gold in the final hits means the final ranking surfaced gold even
    # though the gated communities did not contain it.
    print(f"\n --- VERDICT ---", flush=True)
    if y_gold_pos:
        print(f" gold present in top-10 (positions {y_gold_pos}) — no_loss", flush=True)
        if not gold_in_gate:
            print(f" (gate would have killed it; augmentation rescued)", flush=True)
        verdict = "no_loss"
    elif not gold_in_gate:
        print(f" >>> GOLD KILLED at STAGE 2 (community gate) — augmentation also failed <<<", flush=True)
        verdict = "stage_2_community_gate"
    elif not gold_in_reachable:
        print(f" >>> GOLD KILLED at STAGE 3-4 (seeds + spread) <<<", flush=True)
        print(f" gold was {len(gold_in_gate)} candidate(s); none became "
              f"a seed and none was reached within 2 hops of the chosen seeds", flush=True)
        verdict = "stage_3_4_seeds_or_spread"
    else:
        print(f" >>> GOLD KILLED at STAGE 5 (rank + budget pack) <<<", flush=True)
        print(f" gold was reachable ({len(gold_in_reachable)}) but not in top-10 hits", flush=True)
        verdict = "stage_5_rank"

    return {
        "qid": qid,
        "qtype": qtype,
        "verdict": verdict,
        "n_records": n_inserted,
        "n_communities": len(assignment.mid_regions),
        "n_rich_club": len(rich_club),
        "n_gold_records": len(gold_record_ids),
        "gold_in_gate": len(gold_in_gate),
        "gold_in_reachable": len(gold_in_reachable),
        "x_gold_pos": x_gold_pos,
        "y_gold_pos": y_gold_pos,
    }


def trace_one(qid: str) -> dict:
    """Trace one question and report which pipeline stage loses the gold session.

    Looks the question up with ``find_row``, prints its metadata, then builds
    a throwaway store in a fresh temporary directory and runs the
    stage-by-stage trace against it. The temporary directory is removed
    before returning, whether the trace succeeds or raises, so repeated
    debugging runs do not pile up per-row stores on disk.

    Returns:
        A dict with the stage-by-stage gold survival counts and the verdict,
        or ``{}`` when ``qid`` is not found in the dataset.
    """
    print(f"\n{'=' * 78}\n=== qid={qid} ===\n{'=' * 78}", flush=True)
    question, qtype, gold_session_ids, sessions = find_row(qid)
    if question is None:
        print(f" qid={qid} NOT FOUND in dataset", flush=True)
        return {}

    print(f" type={qtype}", flush=True)
    print(f" question[0:120]={question[:120]!r}", flush=True)
    print(f" gold session_ids={gold_session_ids}", flush=True)
    print(f" haystack sessions={len(sessions)}", flush=True)

    # Function-scope import: this script's top-level imports lack shutil.
    import shutil

    # mkdtemp leaves deletion to the caller, so pair it with a finally.
    tmp_root = Path(tempfile.mkdtemp(prefix="lme_dbg_"))
    try:
        return _trace_in_store(
            qid, question, qtype, gold_session_ids, sessions, tmp_root
        )
    finally:
        # Best-effort cleanup: a failed delete must not mask the trace result.
        shutil.rmtree(tmp_root, ignore_errors=True)
|
||||
|
||||
|
||||
def main(qids: list[str]) -> int:
    """Trace every question id in ``qids`` and print a one-line-per-qid summary.

    A trace that raises is reported with its traceback and recorded with the
    verdict ``trace_failed``; the remaining ids still run. Empty results
    (question id not found) are left out of the summary table.

    Returns:
        Always 0.
    """
    outcomes = []
    for qid in qids:
        try:
            outcome = trace_one(qid)
        except Exception as exc:
            print(f"\n qid={qid} TRACE FAILED: {type(exc).__name__}: {exc}", flush=True)
            import traceback
            traceback.print_exc()
            outcome = {"qid": qid, "verdict": "trace_failed"}
        outcomes.append(outcome)

    rule = "=" * 78
    print("\n\n" + rule)
    print("SUMMARY")
    print(rule)
    print(f"{'qid':16} {'qtype':28} {'verdict':32} gold(gate→reach)")
    print("-" * 100)
    for entry in outcomes:
        if entry:
            qid_col = entry.get("qid", "?")
            qtype_col = entry.get("qtype", "?")
            verdict_col = entry.get("verdict", "?")
            gate = entry.get("gold_in_gate", "?")
            reach = entry.get("gold_in_reachable", "?")
            ngold = entry.get("n_gold_records", "?")
            print(
                f"{qid_col:16} {qtype_col:28} {verdict_col:32} "
                f"{gate}→{reach} (of {ngold})"
            )
    return 0
|
||||
|
||||
|
||||
# Script entry point: at least one question id is required on the command
# line; with none, the module docstring (usage) goes to stderr and we exit 1.
if __name__ == "__main__":
    cli_qids = sys.argv[1:]
    if not cli_qids:
        print(__doc__, file=sys.stderr)
        sys.exit(1)
    sys.exit(main(cli_qids))
|
||||
768
bench/longmemeval_blind.py
Normal file
768
bench/longmemeval_blind.py
Normal file
|
|
@ -0,0 +1,768 @@
|
|||
"""Plan 05-11 blind-run orchestrator — / M-08.
|
||||
|
||||
Runs LongMemEval-S through IAI-MCP's public API (MemoryStore.insert +
|
||||
retrieve.recall) in strict blind mode: no per-dataset tuning, no
|
||||
hyperparameter sweep, no late adjustment after seeing numbers. This is
|
||||
the external honesty axis for Phase 5.
|
||||
|
||||
## Row-level protocol
|
||||
|
||||
One evaluation row in LongMemEval-S contains:
|
||||
|
||||
{ "question", "answer_session_ids" (gold),
|
||||
"haystack_session_ids", "haystack_sessions" (the full history) }
|
||||
|
||||
Per row the orchestrator does:
|
||||
|
||||
1. fresh tmp MemoryStore (per-row isolation; no cross-row leakage)
|
||||
2. enable async writes (Plan 05-10 — keeps RAM bounded on a
|
||||
16GB M1 laptop)
|
||||
3. embed + insert every turn of every haystack session; each record
|
||||
is tagged with ``session:<session_id>`` so the orchestrator can
|
||||
score at the dataset's native session-ID granularity.
|
||||
4. disable async writes (flushes the queue; the store now holds the
|
||||
full haystack).
|
||||
5. build_runtime_graph once (Plan 05-09 cache amortises cold start
|
||||
across rows via the shared runtime graph cache dir).
|
||||
6. call retrieve.recall for the eval query, with k_hits=10.
|
||||
7. compute R@5 / R@10 at session-ID granularity (the standard
|
||||
LongMemEval metric): a retrieved record "hits" if its ``session:``
|
||||
tag is in answer_session_ids. R@k is 1.0 if any top-k hits, else 0.
|
||||
8. measure per-query token cost via bench.tokens counters.
|
||||
|
||||
## CLI
|
||||
|
||||
python bench/longmemeval_blind.py \\
|
||||
--split S \\
|
||||
[--limit N] \\
|
||||
[--granularity {session, turn}] \\
|
||||
[--dataset {cleaned, raw}] \\
|
||||
[--qid-include csv] \\
|
||||
--out /tmp/p11_lme_full.json
|
||||
|
||||
Phase 9 added two methodology-alignment flags:
|
||||
|
||||
--granularity session (default; one record per session,
|
||||
content = "\\n".join(user-only turns))
|
||||
--granularity turn (v1/v2 reproducer; one record per turn)
|
||||
--dataset cleaned (default; xiaowu0162/longmemeval-cleaned)
|
||||
--dataset raw (v1/v2 reproducer; xiaowu0162/longmemeval
|
||||
rev 2ec2a557f339)
|
||||
--qid-include csv optional comma-separated question_ids; when
|
||||
set, only those rows run (used by smoke
|
||||
tests for per-qid baseline verification)
|
||||
|
||||
## Output JSON keys
|
||||
|
||||
{
|
||||
"split": "S",
|
||||
"dataset_id": "xiaowu0162/longmemeval-cleaned" | "xiaowu0162/longmemeval",
|
||||
"revision": "<40-hex>",
|
||||
"granularity": "session" | "turn",
|
||||
"dataset_choice": "cleaned" | "raw",
|
||||
"n_rows": int, # rows actually evaluated
|
||||
"r_at_5": float, # session-ID R@5, mean across rows
|
||||
"r_at_10": float, # session-ID R@10, mean across rows
|
||||
"token_p50": int, # per-query cue-text tokens, median
|
||||
"token_p95": int, # per-query cue-text tokens, p95
|
||||
"session_tokens_mean": float, # mean per-row inserted text tokens
|
||||
# (proxy for the rows' storage footprint)
|
||||
"errors": [{"question_id": str, "error_class": str, "error": str}],
|
||||
"hard_limit": int | null,
|
||||
"note": str
|
||||
}
|
||||
|
||||
## Blind-run discipline
|
||||
|
||||
The run is ONE-SHOT. If a bug crashes a row, it's logged in ``errors``
|
||||
and counted as a MISS against R@k (not silently dropped). The published
|
||||
number is whatever came out. Disclosures (small-N, hardware limit,
|
||||
English-only embedder, etc.) live in the published bench report and
|
||||
05-11-SUMMARY.md — they don't get folded back into this script.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import statistics
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
# Silence the "UNEXPECTED embeddings.position_ids" noise from
|
||||
# sentence-transformers so the blind-run stderr stays focused on errors.
|
||||
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
|
||||
|
||||
# IAI-MCP imports — public API only (plan directive).
|
||||
from iai_mcp.embed import Embedder, embedder_for_store
|
||||
from iai_mcp.pipeline import recall_for_benchmark
|
||||
from iai_mcp.retrieve import build_runtime_graph, recall as retrieve_recall
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import MemoryRecord
|
||||
|
||||
# Adapter (ships alongside this script).
|
||||
from bench.adapters.longmemeval import (
|
||||
DATASET_ID,
|
||||
PINNED_REVISION,
|
||||
LMESession,
|
||||
LongMemEvalAdapter,
|
||||
)
|
||||
|
||||
# Token counter (reuses bench/tokens.py three-tier helper).
|
||||
from bench.tokens import _char4_count, _tiktoken_count
|
||||
|
||||
|
||||
def _count_tokens(text: str) -> int:
    """Count tokens in ``text`` with ``_tiktoken_count``, else ``_char4_count``.

    The second counter is used only when the first one raises.
    """
    try:
        total = _tiktoken_count(text)
    except Exception:  # pragma: no cover
        total = _char4_count(text)
    return total
|
||||
|
||||
|
||||
def _percentile(xs: list[int], p: float) -> int:
|
||||
if not xs:
|
||||
return 0
|
||||
s = sorted(xs)
|
||||
k = max(0, min(len(s) - 1, int(round((len(s) - 1) * p / 100.0))))
|
||||
return s[k]
|
||||
|
||||
|
||||
def _make_record(
    content: str,
    session_id: str,
    role: str,
    embedding: list[float],
) -> MemoryRecord:
    """Build a fresh episodic ``MemoryRecord`` for one piece of haystack text.

    The record carries a new random id, the current UTC time as both
    created and updated timestamps, and the tags ``longmemeval``,
    ``role:<role>`` and ``session:<session_id>``; the session tag is what
    lets the caller map a retrieved record back to its session. Every other
    field is a fixed constant.
    """
    from uuid import uuid4

    stamp = datetime.now(timezone.utc)
    record_tags = [
        "longmemeval",
        f"role:{role}",
        f"session:{session_id}",
    ]
    return MemoryRecord(
        id=uuid4(),
        tier="episodic",
        literal_surface=content,
        aaak_index="",
        embedding=embedding,
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=record_tags,
        language="en",
    )
|
||||
|
||||
|
||||
def _run_one_row(
    row_id: str,
    question: str,
    question_type: str,
    answer_session_ids: set[str],
    sessions: list[LMESession],
    tmp_root: Path,
    granularity: str = "turn",
    embedder_key: str = "bge-small-en-v1.5",
) -> dict[str, Any]:
    """Run the per-row protocol and score both recall prongs.

    Builds a fresh store under ``tmp_root / "row-<row_id>"``, inserts the
    haystack, builds the runtime graph, then queries with
    ``retrieve_recall`` (prong X) and ``recall_for_benchmark`` (prong Y).
    Both prongs read the same store and the same record-to-session map.

    Args:
        row_id: question id; used for the store directory and log lines.
        question: the evaluation query text.
        question_type: copied into the result unchanged.
        answer_session_ids: gold session ids for hit-at-k scoring.
        sessions: haystack sessions for this row.
        tmp_root: parent directory for the per-row store.
        granularity: "session" inserts one record per session (non-empty
            user turns joined with newlines; sessions without any are
            skipped); any other value inserts one record per non-empty turn.
        embedder_key: model key passed to ``Embedder``.

    Returns:
        A dict with R@5 / R@10 for each prong, ``pipeline_error`` (``None``
        on success), token counts, record counts and per-phase timings.
        A failed graph build or failed prong-Y call is logged to stderr and
        scored as a miss for Y; other exceptions propagate to the caller.
    """
    started = time.time()

    # Fresh store in a per-row directory.
    row_dir = tmp_root / f"row-{row_id}"
    row_dir.mkdir(parents=True, exist_ok=True)
    store = MemoryStore(path=row_dir / "lancedb")

    # enable_async_writes is a coroutine; drive it from its own event loop
    # so this function can stay synchronous.
    asyncio.run(store.enable_async_writes(coalesce_ms=50, max_batch=128))

    # Explicit model key rather than the store-derived default resolver.
    embedder = Embedder(model_key=embedder_key)
    _ = embedder_for_store  # silence unused-import warning when the prod path is bypassed

    def _documents():
        """Lazily yield (session_id, role, text) for each record to insert."""
        if granularity == "session":
            for sess in sessions:
                user_texts = []
                for turn in sess.turns:
                    if str(turn.get("role", "user")) != "user":
                        continue
                    text = str(turn.get("content", "")).strip()
                    if text:
                        user_texts.append(text)
                if user_texts:
                    yield sess.session_id, "user", "\n".join(user_texts)
        else:
            for sess in sessions:
                for turn in sess.turns:
                    text = str(turn.get("content", "")).strip()
                    if text:
                        yield sess.session_id, str(turn.get("role", "user")), text

    # --------- INSERT phase ---------
    # The generator is consumed lazily, so each document is embedded and
    # inserted before the next one is produced.
    id_to_session: dict[str, str] = {}  # str(record id) -> session_id
    inserted_text_tokens = 0  # rough storage-footprint proxy
    for session_id, role, text in _documents():
        rec = _make_record(
            content=text,
            session_id=session_id,
            role=role,
            embedding=embedder.embed(text),
        )
        store.insert(rec)
        id_to_session[str(rec.id)] = session_id
        inserted_text_tokens += _count_tokens(text)

    # Flush the async queue before recall (coroutine, so again a fresh loop).
    asyncio.run(store.disable_async_writes())
    inserted_at = time.time()

    # --------- Build runtime graph ---------
    # Prong Y needs the (graph, assignment, rich_club) tuple. A failed build
    # does not fail the row: prong X only needs the store, and Y is scored
    # as a miss.
    graph = assignment = rich_club = None
    try:
        graph, assignment, rich_club = build_runtime_graph(store)
    except Exception as exc:  # pragma: no cover — cache helpers should be robust
        print(
            f"[LME] row={row_id} build_runtime_graph failed: "
            f"{type(exc).__name__}: {exc}",
            file=sys.stderr,
        )
    graph_at = time.time()

    # --------- Prong X: retrieve_recall ---------
    resp_x = retrieve_recall(
        store=store,
        cue_embedding=embedder.embed(question),
        cue_text=question,
        session_id=f"lme-{row_id}",
        budget_tokens=1500,
        k_hits=10,
        k_anti=0,
    )
    x_done_at = time.time()

    # --------- Prong Y: recall_for_benchmark ---------
    # Top-K contract (k_hits=10, no token budget), always in "concept" mode.
    resp_y = None
    pipeline_error: str | None = None
    if graph is None:
        pipeline_error = "graph_build_failed"
    else:
        try:
            resp_y = recall_for_benchmark(
                store=store,
                graph=graph,
                assignment=assignment,
                rich_club=rich_club,
                embedder=embedder,
                cue=question,
                session_id=f"lme-{row_id}",
                k_hits=10,
                profile_state=None,
                turn=0,
                mode="concept",
            )
        except Exception as exc:
            pipeline_error = f"{type(exc).__name__}: {str(exc)[:200]}"
            print(
                f"[LME] row={row_id} recall_for_benchmark failed: "
                f"{pipeline_error}",
                file=sys.stderr,
            )
    y_done_at = time.time()

    def _sessions_of(resp) -> list[str]:
        """Map a response's hits to session ids, dropping unknown records."""
        if resp is None:
            return []
        looked_up = (id_to_session.get(str(hit.record_id)) for hit in resp.hits)
        return [sid for sid in looked_up if sid is not None]

    def _hit_at(sids: list[str], k: int) -> float:
        """Return 1.0 if any of the first ``k`` session ids is gold, else 0.0."""
        for sid in sids[:k]:
            if sid in answer_session_ids:
                return 1.0
        return 0.0

    sids_x = _sessions_of(resp_x)
    # A missing Y response maps to [], which scores 0.0 at every k.
    sids_y = _sessions_of(resp_y)

    return {
        "question_id": row_id,
        "question_type": question_type,
        # Prong X — retrieve_recall
        "r_at_5_retrieve": _hit_at(sids_x, 5),
        "r_at_10_retrieve": _hit_at(sids_x, 10),
        # Prong Y — recall_for_benchmark
        "r_at_5_pipeline": _hit_at(sids_y, 5),
        "r_at_10_pipeline": _hit_at(sids_y, 10),
        "pipeline_error": pipeline_error,
        # Shared
        "query_tokens": _count_tokens(question),
        "inserted_text_tokens": inserted_text_tokens,
        "n_haystack_sessions": len(sessions),
        "n_turns_inserted": len(id_to_session),
        "timing_seconds": {
            "insert": round(inserted_at - started, 2),
            "graph": round(graph_at - inserted_at, 2),
            "recall_retrieve": round(x_done_at - graph_at, 2),
            "recall_pipeline": round(y_done_at - x_done_at, 2),
            "total": round(y_done_at - started, 2),
        },
    }
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the blind LongMemEval benchmark from the command line.

    Groups the adapter's sessions into rows (one question plus all of its
    haystack sessions), runs ``_run_one_row`` on each row, appends every
    finished row to a JSONL checkpoint for crash-resume, and writes the
    aggregate R@5 / R@10 of both prongs (X = retrieve_recall,
    Y = recall_for_benchmark) to ``--out``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 once the output JSON has been written. Per-row failures do not abort
        the run: they are listed under ``errors`` and counted as misses.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--split",
        default="S",
        choices=["S", "M", "oracle"],
        help="LongMemEval split (Plan 05-11 runs S)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help=(
            "practical-cap on rows evaluated. LongMemEval-S = 500 rows; "
            "at ~500 turns/row and 11ms/embed on a 16GB M1 laptop, the "
            "full 500-row run is multi-hour. --limit lets the blind pilot "
            "finish; the SUMMARY discloses the cap honestly."
        ),
    )
    parser.add_argument(
        "--out",
        default="/tmp/p11_lme_full.json",
        help="output JSON path",
    )
    parser.add_argument(
        "--checkpoint",
        default=None,
        help=(
            "JSONL checkpoint path for crash-resume; default = <out>.jsonl. "
            "Each completed (or errored) row is appended with fsync as one "
            "JSON line. On restart, rows whose question_id already appears "
            "in the checkpoint are skipped."
        ),
    )
    # Granularity flag with mempalace-aligned default.
    parser.add_argument(
        "--granularity",
        choices=["session", "turn"],
        default="session",
        help=(
            "corpus-construction granularity. "
            "'session' (default, v3): one record per session, "
            "content = '\\n'.join(user-only turns) — matches mempalace's "
            "reference. 'turn': one record per turn (v1/v2 baseline; "
            "use with --dataset raw to reproduce v2's 0.956)."
        ),
    )
    # Dataset choice flag with mempalace-aligned default.
    parser.add_argument(
        "--dataset",
        choices=["cleaned", "raw"],
        default="cleaned",
        help=(
            "dataset variant. 'cleaned' (default, v3): "
            "xiaowu0162/longmemeval-cleaned, SHA pinned via repo_info(). "
            "'raw' (v1/v2 baseline): xiaowu0162/longmemeval rev "
            "2ec2a557f339... — use with --granularity turn to reproduce "
            "v2's 0.956."
        ),
    )
    # Per-qid filter for the v2-baseline smoke reproducer. Applied AFTER
    # --limit so passing both flags yields a deterministic intersection
    # (limit narrows by row count, qid-include narrows by id).
    parser.add_argument(
        "--qid-include",
        default=None,
        help=(
            "comma-separated list of question_ids; if set, only these "
            "rows run (used by smoke tests for per-qid baseline "
            "verification). Applied after --limit."
        ),
    )
    # Bench-only embedder swap. The default preserves the v3 baseline
    # (bge-small-en-v1.5); all-MiniLM-L6-v2 is the embedder-axis ablation.
    parser.add_argument(
        "--embedder",
        choices=["bge-small-en-v1.5", "all-MiniLM-L6-v2"],
        default="bge-small-en-v1.5",
        help=(
            "embedder model_key. 'bge-small-en-v1.5' (default, v3 "
            "baseline) routes via the production English-only embedder. "
            "'all-MiniLM-L6-v2' (Phase 9.1 ablation) is mempalace's "
            "ChromaDB default — bench-only swap, production unchanged."
        ),
    )
    args = parser.parse_args(argv)

    print(
        f"[LME] blind run starting "
        f"split={args.split} limit={args.limit} "
        f"granularity={args.granularity} dataset={args.dataset} "
        f"embedder={args.embedder} "
        f"out={args.out}",
        file=sys.stderr,
        flush=True,
    )

    # Branch the adapter on --dataset.
    if args.dataset == "cleaned":
        from bench.adapters.longmemeval_cleaned import (
            CLEANED_DATASET_ID,
            CleanedLongMemEvalAdapter,
        )

        adapter = CleanedLongMemEvalAdapter()
        dataset_id_emit = CLEANED_DATASET_ID
        revision_emit = adapter.revision
    else:
        adapter = LongMemEvalAdapter()
        dataset_id_emit = DATASET_ID
        revision_emit = PINNED_REVISION

    # The adapter yields one LMESession per haystack session, but the
    # blind-run protocol needs rows (one question + all its haystack
    # sessions). Group by question_id (carried inside queries[0]).
    grouped: dict[str, dict[str, Any]] = {}
    row_order: list[str] = []
    for lme_session in adapter.load_dataset(split=args.split):
        q = lme_session.queries[0]
        qid = q["question_id"]
        if qid not in grouped:
            grouped[qid] = {
                "question": q["query"],
                "question_type": q.get("question_type", "unknown"),
                "answer_session_ids": set(q.get("relevant_turn_ids", [])),
                "sessions": [],
            }
            row_order.append(qid)
        grouped[qid]["sessions"].append(lme_session)

    if args.limit is not None:
        row_order = row_order[: args.limit]

    # --qid-include is applied AFTER --limit; the default None is a no-op.
    if args.qid_include is not None:
        wanted = {q.strip() for q in str(args.qid_include).split(",") if q.strip()}
        row_order = [qid for qid in row_order if qid in wanted]
        print(
            f"[LME] qid-include filter: kept {len(row_order)} of "
            f"{len(wanted)} requested qids",
            file=sys.stderr,
            flush=True,
        )

    tmp_root = Path(tempfile.mkdtemp(prefix="lme_blind_"))
    print(f"[LME] per-row stores rooted at {tmp_root}", file=sys.stderr, flush=True)

    per_row: list[dict[str, Any]] = []
    errors: list[dict[str, str]] = []
    # Track BOTH prongs (X = retrieve_recall, Y = recall_for_benchmark).
    r5_x_values: list[float] = []
    r10_x_values: list[float] = []
    r5_y_values: list[float] = []
    r10_y_values: list[float] = []
    query_tokens: list[int] = []
    session_tokens: list[int] = []

    # Per-row JSONL checkpoint for crash resume. Each row's full result is
    # appended with flush + fsync, so a kill at row N preserves rows 1..N-1.
    # A restart skips rows already in the checkpoint (matched by question_id).
    checkpoint_path = (
        Path(args.checkpoint) if args.checkpoint else Path(str(args.out) + ".jsonl")
    )
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    # Only checkpoint rows that belong to THIS run's selection are resumed.
    # Folding in rows from an earlier run with a wider --limit / different
    # --qid-include (or duplicate lines for one qid) would skew the means
    # while n_rows still reports len(row_order).
    selected_ids = set(row_order)
    completed_ids: set[str] = set()
    ignored_checkpoint_rows = 0
    if checkpoint_path.exists():
        with open(checkpoint_path, "r", encoding="utf-8") as cp_f:
            for line in cp_f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    print(
                        f"[LME] WARN: skipping corrupt checkpoint line: {line[:80]!r}",
                        file=sys.stderr,
                        flush=True,
                    )
                    continue
                qid = rec.get("question_id") if isinstance(rec, dict) else None
                if not qid:
                    continue
                if qid not in selected_ids or qid in completed_ids:
                    ignored_checkpoint_rows += 1
                    continue
                completed_ids.add(qid)
                if "error" in rec and isinstance(rec.get("error"), dict):
                    # Resumed error row: count as full miss for both prongs.
                    errors.append(
                        {
                            "question_id": qid,
                            "error_class": rec["error"].get("error_class", "Unknown"),
                            "error": rec["error"].get("error", ""),
                        }
                    )
                    r5_x_values.append(0.0)
                    r10_x_values.append(0.0)
                    r5_y_values.append(0.0)
                    r10_y_values.append(0.0)
                    query_tokens.append(0)
                    session_tokens.append(0)
                else:
                    # Resumed success row.
                    per_row.append(rec)
                    r5_x_values.append(float(rec.get("r_at_5_retrieve", 0.0)))
                    r10_x_values.append(float(rec.get("r_at_10_retrieve", 0.0)))
                    r5_y_values.append(float(rec.get("r_at_5_pipeline", 0.0)))
                    r10_y_values.append(float(rec.get("r_at_10_pipeline", 0.0)))
                    query_tokens.append(int(rec.get("query_tokens", 0)))
                    session_tokens.append(int(rec.get("inserted_text_tokens", 0)))
        if ignored_checkpoint_rows:
            print(
                f"[LME] WARN: ignored {ignored_checkpoint_rows} checkpoint rows "
                f"(duplicate or outside the current row selection)",
                file=sys.stderr,
                flush=True,
            )
        if completed_ids:
            print(
                f"[LME] resume: {len(completed_ids)} rows already in checkpoint "
                f"{checkpoint_path}; processing {len(row_order) - len(completed_ids)} remaining",
                file=sys.stderr,
                flush=True,
            )
    else:
        print(
            f"[LME] checkpoint: writing per-row durable JSONL to {checkpoint_path}",
            file=sys.stderr,
            flush=True,
        )

    def _checkpoint_append(rec: dict[str, Any]) -> None:
        """Append one row record to the checkpoint, flush+fsync for durability."""
        with open(checkpoint_path, "a", encoding="utf-8") as cp_a:
            cp_a.write(json.dumps(rec) + "\n")
            cp_a.flush()
            os.fsync(cp_a.fileno())

    run_t0 = time.time()
    try:
        for i, qid in enumerate(row_order):
            if qid in completed_ids:
                continue
            row = grouped[qid]
            try:
                res = _run_one_row(
                    row_id=qid,
                    question=row["question"],
                    question_type=row["question_type"],
                    answer_session_ids=row["answer_session_ids"],
                    sessions=row["sessions"],
                    tmp_root=tmp_root,
                    granularity=args.granularity,
                    embedder_key=args.embedder,
                )
                # Read the metrics and persist the row BEFORE touching the
                # aggregates: if anything here raises, the except branch
                # records exactly one miss instead of a success plus a miss.
                row_metrics = (
                    res["r_at_5_retrieve"],
                    res["r_at_10_retrieve"],
                    res["r_at_5_pipeline"],
                    res["r_at_10_pipeline"],
                    res["query_tokens"],
                    res["inserted_text_tokens"],
                )
                _checkpoint_append(res)
            except Exception as exc:
                # Log + count as miss; do NOT silently drop the row.
                err_payload = {
                    "error_class": type(exc).__name__,
                    "error": str(exc)[:500],
                }
                errors.append({"question_id": qid, **err_payload})
                # Counted as a full miss for both prongs.
                r5_x_values.append(0.0)
                r10_x_values.append(0.0)
                r5_y_values.append(0.0)
                r10_y_values.append(0.0)
                query_tokens.append(0)
                session_tokens.append(0)
                # Persist the error row to checkpoint so a restart skips it.
                _checkpoint_append(
                    {
                        "question_id": qid,
                        "question_type": row.get("question_type", "unknown"),
                        "error": err_payload,
                    }
                )
                print(
                    f"[LME] ERROR row={qid}: {type(exc).__name__}: {exc}",
                    file=sys.stderr,
                    flush=True,
                )
                traceback.print_exc(file=sys.stderr)
            else:
                per_row.append(res)
                r5_x_values.append(row_metrics[0])
                r10_x_values.append(row_metrics[1])
                r5_y_values.append(row_metrics[2])
                r10_y_values.append(row_metrics[3])
                query_tokens.append(row_metrics[4])
                session_tokens.append(row_metrics[5])
                elapsed = time.time() - run_t0
                print(
                    f"[LME] row {i+1}/{len(row_order)} qid={qid} "
                    f"qtype={res['question_type']} "
                    f"R@5_x={res['r_at_5_retrieve']:.0f} R@5_y={res['r_at_5_pipeline']:.0f} "
                    f"R@10_x={res['r_at_10_retrieve']:.0f} R@10_y={res['r_at_10_pipeline']:.0f} "
                    f"t_row={res['timing_seconds']['total']:.1f}s "
                    f"t_total={elapsed:.1f}s",
                    file=sys.stderr,
                    flush=True,
                )
            finally:
                # Free disk aggressively — many rows × ~500 turns per store
                # adds up even on 64GB.
                row_dir = tmp_root / f"row-{qid}"
                if row_dir.exists():
                    shutil.rmtree(row_dir, ignore_errors=True)
    finally:
        # Remove the scratch root even when the run is interrupted.
        shutil.rmtree(tmp_root, ignore_errors=True)

    def _mean(xs: list[float]) -> float:
        return (sum(xs) / len(xs)) if xs else 0.0

    out = {
        "split": args.split,
        "dataset_id": dataset_id_emit,
        "revision": revision_emit,
        # Reproducibility fields.
        "granularity": args.granularity,
        "dataset_choice": args.dataset,
        # Embedder identity pinned for ablation reproducibility.
        "embedder_model_key": args.embedder,
        "embedder_hf_id": Embedder(model_key=args.embedder).model_name,
        "n_rows": len(row_order),
        # Prong X — retrieve_recall (flat-cosine baseline).
        "r_at_5_retrieve": _mean(r5_x_values),
        "r_at_10_retrieve": _mean(r10_x_values),
        # Prong Y — recall_for_benchmark (full graph-native architecture).
        "r_at_5_pipeline": _mean(r5_y_values),
        "r_at_10_pipeline": _mean(r10_y_values),
        # Architecture lift (Y - X).
        "r_at_5_lift": _mean(r5_y_values) - _mean(r5_x_values),
        "r_at_10_lift": _mean(r10_y_values) - _mean(r10_x_values),
        "token_p50": _percentile(query_tokens, 50),
        "token_p95": _percentile(query_tokens, 95),
        "session_tokens_mean": (
            statistics.fmean(session_tokens) if session_tokens else 0.0
        ),
        "errors": errors,
        "hard_limit": args.limit,
        "metric_def": (
            "Session-ID hit-at-k: R@k = 1.0 if any of top-k retrieved records "
            "belongs to a gold session_id, else 0.0 (LongMemEval standard)."
        ),
        "per_row": per_row,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total_wall_seconds": round(time.time() - run_t0, 2),
    }

    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)

    print(
        f"[LME] DONE n_rows={out['n_rows']} "
        f"R@5_retrieve={out['r_at_5_retrieve']:.3f} "
        f"R@5_pipeline={out['r_at_5_pipeline']:.3f} "
        f"lift_R@5={out['r_at_5_lift']:+.3f} "
        f"R@10_retrieve={out['r_at_10_retrieve']:.3f} "
        f"R@10_pipeline={out['r_at_10_pipeline']:.3f} "
        f"lift_R@10={out['r_at_10_lift']:+.3f} "
        f"errors={len(errors)} -> {args.out}",
        file=sys.stderr,
        flush=True,
    )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
335
bench/memory_footprint.py
Normal file
335
bench/memory_footprint.py
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
"""M-03 RAM footprint bench. Reports RSS at store size N.
|
||||
|
||||
Target: RSS <= 300 MB warm at N=10k on a 16+ GB machine.
|
||||
|
||||
Pressplay 8 GB M1 hung mid-run on 2026-04-19 while trying to build the
|
||||
runtime graph at N=10k (Pitfall 4 from 05-RESEARCH: bge-m3 ~2 GB +
|
||||
NetworkX ~200 MB + LanceDB ~50 MB + Python overhead -> swap thrash).
|
||||
Phase 5 measures on this 16 GB dev Mac; pressplay cross-validates at
|
||||
N <= 2000 per D5-09.
|
||||
|
||||
JSON output (one line to stdout):
|
||||
|
||||
{
|
||||
"n": int,
|
||||
"rss_mb_peak": float, # platform-adjusted MB
|
||||
"threshold_mb": 300.0,
|
||||
"passed": bool, # True iff rss_mb_peak <= threshold_mb
|
||||
"platform": "darwin"|"linux"|"win32",
|
||||
"stage_ms": {"seed": float, "graph": float},
|
||||
"seed_n": int, # records that actually made it in
|
||||
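"graph_built": bool,           # True iff build_runtime_graph finished
"dim": int,                    # embedding dimension the store actually used
"async_writes": bool,          # True iff seeding ran with async writes enabled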
|
||||
}
|
||||
|
||||
Exit codes:
|
||||
0 if passed, 1 otherwise.
|
||||
|
||||
CLI:
|
||||
python -m bench.memory_footprint [--n 10000] [--dim 1024] [--seed 42]
|
||||
[--skip-graph] [--async-writes] [--out PATH]
|
||||
|
||||
--skip-graph keeps the RSS reading to the seeded-store baseline (no
|
||||
NetworkX graph build); useful when the graph build is the timeout cause
|
||||
and we want to isolate the store-only overhead.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import resource
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
import numpy as np
|
||||
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
||||
|
||||
THRESHOLD_MB = 300.0
|
||||
|
||||
|
||||
def _isolate_keyring_in_memory() -> None:
    """Swap the process keyring for a dict-backed backend.

    Bench-scope helper: the original rationale is that MemoryStore's crypto
    layer must not reach macOS Keychain, which hangs under
    SecItemCopyMatching when the bench runs from a non-interactive shell.

    Idempotent: when the active backend already carries the
    ``_iai_bench_noop`` sentinel the function returns without reinstalling.
    """
    import keyring
    from keyring.backend import KeyringBackend

    active_backend = keyring.get_keyring()
    already_installed = getattr(active_backend, "_iai_bench_noop", False)
    if already_installed:
        return

    class _BenchNoOpKeyring(KeyringBackend):
        priority = 99
        # Sentinel read by the idempotency check above.
        _iai_bench_noop = True
        # Class-level dict keyed by (service, username).
        _kv: dict[tuple[str, str], str] = {}

        def get_password(self, service: str, username: str) -> str | None:
            slot = (service, username)
            return self._kv.get(slot)

        def set_password(self, service: str, username: str, password: str) -> None:
            slot = (service, username)
            self._kv[slot] = password

        def delete_password(self, service: str, username: str) -> None:
            slot = (service, username)
            self._kv.pop(slot, None)

    replacement = _BenchNoOpKeyring()
    keyring.set_keyring(replacement)
|
||||
|
||||
|
||||
def _rss_mb() -> float:
|
||||
"""Peak RSS in MB, platform-adjusted.
|
||||
|
||||
macOS returns ru_maxrss in BYTES.
|
||||
Linux returns ru_maxrss in KB.
|
||||
Windows via resource is not supported; the Windows branch falls back to
|
||||
a best-effort reading and the platform marker in the JSON output lets
|
||||
the report flag it.
|
||||
"""
|
||||
r = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
||||
if sys.platform == "darwin":
|
||||
return float(r) / 1024.0 / 1024.0
|
||||
# Linux reports kilobytes; everything else treated as KB for safety.
|
||||
return float(r) / 1024.0
|
||||
|
||||
|
||||
def _make_noise_record(i: int, rng: np.random.Generator, dim: int) -> MemoryRecord:
    """Build one synthetic episodic record with a random unit embedding.

    Defined inline so this bench does not import bench/verbatim.

    Args:
        i: Index embedded in the record's ``literal_surface`` text.
        rng: Generator the ``dim``-length standard-normal vector is drawn from.
        dim: Embedding length.

    Returns:
        A ``MemoryRecord`` with a fresh UUID. The embedding is L2-normalised
        unless its norm is zero, in which case it is stored as drawn.
    """
    stamp = datetime.now(timezone.utc)
    direction = rng.standard_normal(dim)
    length = float(np.linalg.norm(direction))
    unit = direction / length if length > 0 else direction
    fields = {
        "id": uuid4(),
        "tier": "episodic",
        "literal_surface": f"bench noise record {i}",
        "aaak_index": "",
        "embedding": unit.tolist(),
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        "provenance": [],
        "created_at": stamp,
        "updated_at": stamp,
        "tags": ["bench", "ops-11"],
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def _seed_store(
    store: MemoryStore, n: int, dim: int, seed: int, *, concurrent: bool = False
) -> int:
    """Seed ``n`` synthetic records into ``store`` and return how many went in.

    Records are produced lazily, in the same RNG order as an eager list
    would give. Building all ``n`` records up front holds ``n * dim`` Python
    floats alive in the harness itself, which inflates the peak RSS this
    bench is trying to attribute to the store.

    Args:
        store: Target store; only ``store.insert(record)`` is called.
        n: Number of records to generate.
        dim: Embedding length passed to ``_make_noise_record``.
        seed: Seed for ``np.random.default_rng``.
        concurrent: When True, inserts are dispatched from a thread pool so
            the coalescing AsyncWriteQueue can batch records inside its
            window. Sequential blocking inserts see no coalesce benefit
            because each one waits on its own flush before the next enqueue.

    Returns:
        The number of ``store.insert`` calls that completed.
    """
    rng = np.random.default_rng(seed)
    records = (_make_noise_record(i, rng, dim=dim) for i in range(n))

    if not concurrent:
        inserted = 0
        for record in records:
            store.insert(record)
            inserted += 1
        return inserted

    # Concurrent path: a thread pool fires enqueues from many threads so
    # the queue's coalesce window fills. Pool size ~256 is large enough
    # to always fill a max_batch=128 window on this hardware.
    # Executor.map drains the generator in this thread while submitting, so
    # RNG order stays deterministic. Pending work items still hold their
    # record until a worker runs them, so the memory saving is smaller here.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=256) as pool:
        # list() forces every future, so an insert failure is raised here.
        return len(list(pool.map(store.insert, records)))
|
||||
|
||||
|
||||
def run_memory_footprint(
    n: int = 10_000,
    store_path: Path | str | None = None,
    dim: int = EMBED_DIM,
    seed: int = 42,
    *,
    skip_graph: bool = False,
    isolate_keyring: bool = True,
    async_writes: bool = False,
) -> dict:
    """Seed a store, optionally build the runtime graph, and report peak RSS.

    Args:
        n: Number of synthetic records to seed.
        store_path: Store directory; a temporary directory is created (and
            removed afterwards) when None.
        dim: Requested embedding dimension. A value other than ``EMBED_DIM``
            is exported through ``IAI_MCP_EMBED_DIM`` for the duration of the
            run and the previous value is restored afterwards.
        seed: RNG seed for the synthetic records.
        skip_graph: Skip ``build_runtime_graph`` and measure the seeded store
            only.
        isolate_keyring: Install the in-memory keyring backend first. Set
            False only when benching against an existing store whose real
            key lives in the user keyring.
        async_writes: Enable the store's async write path around the seed
            loop only; it is disabled again before the graph build.

    Returns:
        A JSON-shaped dict: ``n``, ``rss_mb_peak``, ``threshold_mb``,
        ``passed``, ``platform``, ``stage_ms``, ``seed_n``, ``graph_built``,
        ``dim`` and ``async_writes``.
    """
    if isolate_keyring:
        _isolate_keyring_in_memory()

    scratch: tempfile.TemporaryDirectory | None = None
    if store_path is not None:
        root = Path(store_path)
        root.mkdir(parents=True, exist_ok=True)
    else:
        scratch = tempfile.TemporaryDirectory(prefix="iai-bench-ops11-")
        root = Path(scratch.name)

    # The env var has to be in place BEFORE MemoryStore is constructed so
    # the store can pick it up; the prior value is put back in ``finally``.
    saved_env_dim = os.environ.get("IAI_MCP_EMBED_DIM")
    if dim != EMBED_DIM:
        os.environ["IAI_MCP_EMBED_DIM"] = str(dim)

    try:
        store = MemoryStore(path=root)
        # Use the dimension the store actually settled on, so generated
        # vectors match it even if the override was not honoured.
        eff_dim = store.embed_dim

        if async_writes:
            import asyncio as _asyncio

            async def _switch_on():
                await store.enable_async_writes()

            _asyncio.run(_switch_on())

        seed_started = time.perf_counter()
        seed_n = _seed_store(
            store, n, dim=eff_dim, seed=seed, concurrent=async_writes,
        )
        seed_ms = (time.perf_counter() - seed_started) * 1000.0

        if async_writes:
            import asyncio as _asyncio

            async def _switch_off():
                await store.disable_async_writes()

            _asyncio.run(_switch_off())

        graph_built = False
        graph_ms = 0.0
        if not skip_graph:
            # Imported lazily so --skip-graph runs never load this module.
            from iai_mcp import retrieve

            graph_started = time.perf_counter()
            try:
                _g, _communities, _rich = retrieve.build_runtime_graph(store)
            except Exception:
                # A failed build (e.g. OOM on a small host) is reported via
                # graph_built=False instead of crashing the bench; the RSS
                # reading still covers the peak up to the failure.
                graph_built = False
            else:
                graph_built = True
            graph_ms = (time.perf_counter() - graph_started) * 1000.0

        gc.collect()
        rss_mb_peak = _rss_mb()

        stage_ms = {
            "seed": round(seed_ms, 2),
            "graph": round(graph_ms, 2),
        }
        report = {
            "n": n,
            "rss_mb_peak": round(rss_mb_peak, 2),
            "threshold_mb": THRESHOLD_MB,
            "passed": rss_mb_peak <= THRESHOLD_MB,
            "platform": sys.platform,
            "stage_ms": stage_ms,
            "seed_n": seed_n,
            "graph_built": graph_built,
            "dim": eff_dim,
            "async_writes": bool(async_writes),
        }
        return report
    finally:
        # Put IAI_MCP_EMBED_DIM back so later benches / tests see the
        # host default.
        if dim != EMBED_DIM:
            if saved_env_dim is not None:
                os.environ["IAI_MCP_EMBED_DIM"] = saved_env_dim
            else:
                os.environ.pop("IAI_MCP_EMBED_DIM", None)
        if scratch is not None:
            scratch.cleanup()
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the footprint bench and print its JSON result.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when the result's ``passed`` flag is true, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="bench.memory_footprint",
        description=(
            "OPS-11 / RAM bench. Seeds N records, optionally builds "
            "the runtime graph, reports peak RSS. Target: <=300 MB at "
            "N=10k on a 16+ GB host."
        ),
    )
    # (flag names, add_argument keyword options), registered in this order.
    flag_specs = [
        (
            ("--n", "--n-records"),
            {
                "dest": "n",
                "type": int,
                "default": 10_000,
                "help": "record count to seed (default 10000)",
            },
        ),
        (
            ("--dim",),
            {
                "type": int,
                "default": EMBED_DIM,
                "help": f"embedding dimension (default {EMBED_DIM}; tests use 32/64 for speed)",
            },
        ),
        (
            ("--seed",),
            {"type": int, "default": 42, "help": "RNG seed (default 42)"},
        ),
        (
            ("--skip-graph",),
            {
                "action": "store_true",
                "help": "Skip build_runtime_graph; isolate store-only RSS",
            },
        ),
        (
            ("--async-writes",),
            {
                "action": "store_true",
                "help": (
                    "enable MemoryStore.enable_async_writes() before the "
                    "seed loop so inserts go through the coalescing AsyncWriteQueue. "
                    "Target: amortise the ~0.3 MB/insert LanceDB buffer overhead by "
                    "batching 128 inserts per flush."
                ),
            },
        ),
        (
            ("--out",),
            {
                "type": str,
                "default": None,
                "help": "Write the JSON result to this file (in addition to stdout).",
            },
        ),
    ]
    for flag_names, options in flag_specs:
        parser.add_argument(*flag_names, **options)

    args = parser.parse_args(argv)
    result = run_memory_footprint(
        n=args.n,
        dim=args.dim,
        seed=args.seed,
        skip_graph=args.skip_graph,
        async_writes=args.async_writes,
    )

    # The optional file copy is written before the stdout line.
    if args.out:
        with open(args.out, "w") as fh:
            json.dump(result, fh)
    print(json.dumps(result))

    if result["passed"]:
        return 0
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
449
bench/neural_map.py
Normal file
449
bench/neural_map.py
Normal file
|
|
@ -0,0 +1,449 @@
|
|||
"""bench/neural_map.py -- D-SPEED benchmark.
|
||||
|
||||
Measures recall_for_response latency at store sizes {100, 1k, 5k, 10k}. The
|
||||
D-SPEED contract is p95 < 100ms at 10k. The bench seeds a synthetic store,
|
||||
builds the runtime graph, runs N iterations of recall_for_response with varied
|
||||
cue strings, and reports:
|
||||
|
||||
- latency_ms_p50 / latency_ms_p95 across iterations
|
||||
- stage_timings_ms: mean per-stage timing (embed / gate / seeds / spread / rank)
|
||||
- passed: p95 < 100ms
|
||||
|
||||
CLI:
|
||||
python -m bench.neural_map [--n 100] [--n 1000] [--n 5000] [--n 10000]
|
||||
[--iterations 10]
|
||||
|
||||
When the executor hardware cannot meet <100ms at 10k, main() returns 1 so
|
||||
CI catches the regression; the user / retro decides whether to
|
||||
tune the implementation or accept.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
from iai_mcp.community import CommunityAssignment
|
||||
from iai_mcp.graph import MemoryGraph
|
||||
from iai_mcp.pipeline import recall_for_response
|
||||
from iai_mcp.retrieve import build_runtime_graph
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
||||
|
||||
|
||||
# D-SPEED: 100ms p95 ceiling at 10k records.
|
||||
D_SPEED_P95_MS = 100.0
|
||||
|
||||
|
||||
class _BenchEmbedder:
    """Deterministic stand-in embedder for bench runs.

    Produces pseudo-random unit vectors derived from the cue text plus a
    fixed base seed, using only ``hashlib`` and ``random`` (no network, no
    model load). Exposes the ``DIM`` attribute and ``embed`` method that the
    original docstring says ``pipeline.recall_for_response`` expects.
    """

    def __init__(self, base_seed: int = 0, dim: int = EMBED_DIM) -> None:
        self._base_seed = base_seed
        self.DEFAULT_MODEL_KEY = "bench"
        self.DIM = dim
        self.DEFAULT_DIM = dim

    def embed(self, text: str) -> list[float]:
        """Return a ``DIM``-length vector that is stable for a given text.

        The vector is L2-normalised unless its norm is zero, in which case
        the raw components are returned.
        """
        # The built-in hash() is randomised per process by default, so the
        # RNG seed comes from a SHA-256 digest of base seed + text instead.
        import hashlib

        seed_material = f"{self._base_seed}:{text}".encode("utf-8")
        seed_value = int(hashlib.sha256(seed_material).hexdigest()[:16], 16)
        source = random.Random(seed_value)

        components = []
        for _ in range(self.DIM):
            components.append(source.random() * 2 - 1)

        magnitude = sum(c * c for c in components) ** 0.5
        if magnitude > 0:
            return [c / magnitude for c in components]
        return components
|
||||
|
||||
|
||||
def _make_record(vec: list[float], text: str, tags: list[str]) -> MemoryRecord:
    """Wrap an embedding, text and tag list in an episodic ``MemoryRecord``.

    The record gets a fresh UUID and one shared UTC timestamp for both
    ``created_at`` and ``updated_at``; every other field uses the fixed
    bench values below.
    """
    stamp = datetime.now(timezone.utc)
    fields = {
        "id": uuid4(),
        "tier": "episodic",
        "literal_surface": text,
        "aaak_index": "",
        "embedding": vec,
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        "provenance": [],
        "created_at": stamp,
        "updated_at": stamp,
        "tags": tags,
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def _percentile(values: list[float], pct: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
s = sorted(values)
|
||||
idx = max(0, min(len(s) - 1, int(len(s) * pct)))
|
||||
return float(s[idx])
|
||||
|
||||
|
||||
def run_neural_map_bench(
|
||||
n: int = 100,
|
||||
iterations: int = 10,
|
||||
store_path: Path | str | None = None,
|
||||
seed: int = 0,
|
||||
warm_cascade: bool = False,
|
||||
) -> dict:
|
||||
"""Run the D-SPEED benchmark at store size N.
|
||||
|
||||
Parameters:
|
||||
n: number of records to seed.
|
||||
iterations: number of recall_for_response calls to measure.
|
||||
store_path: optional MemoryStore directory; defaults to a temp dir.
|
||||
seed: RNG base seed for deterministic synthetic data.
|
||||
warm_cascade: when True, fire the synchronous
|
||||
core-side HIPPEA cascade after seeding but before timing so
|
||||
the measured p95 reflects the warm path, not the cold path.
|
||||
Returns ``cascade_warmed`` count in the result dict; 0 when
|
||||
disabled or when the cascade produced no ids.
|
||||
|
||||
Returns dict with n, latency_ms_p50, latency_ms_p95, stage_timings_ms,
|
||||
build_ms, passed, iterations, and (when warm_cascade=True) cascade_warmed.
|
||||
"""
|
||||
rng = random.Random(seed)
|
||||
cleanup: tempfile.TemporaryDirectory | None = None
|
||||
if store_path is None:
|
||||
cleanup = tempfile.TemporaryDirectory(prefix="iai-bench-nm-")
|
||||
path = Path(cleanup.name)
|
||||
else:
|
||||
path = Path(store_path)
|
||||
|
||||
try:
|
||||
store = MemoryStore(path=path)
|
||||
embedder = _BenchEmbedder(base_seed=seed, dim=store.embed_dim)
|
||||
|
||||
# Seed N records with a mix of tags so community detection has
|
||||
# structure.
|
||||
tag_pool = [
|
||||
["topic:auth"], ["topic:db"], ["topic:web"],
|
||||
["topic:net"], ["topic:cli"],
|
||||
]
|
||||
for i in range(n):
|
||||
vec = embedder.embed(f"seed-{i}")
|
||||
tags = list(tag_pool[i % len(tag_pool)])
|
||||
rec = _make_record(vec, text=f"synthetic fact {i}", tags=tags)
|
||||
store.insert(rec)
|
||||
|
||||
# Build runtime graph (timed separately).
|
||||
t_build = time.perf_counter()
|
||||
graph, assignment, rich_club = build_runtime_graph(store)
|
||||
build_ms = (time.perf_counter() - t_build) * 1000.0
|
||||
|
||||
# fire the sync core-side cascade AFTER seeding +
|
||||
# build_runtime_graph (both required for salience computation) and
|
||||
# BEFORE the timing loop starts. Writes into the same process-local
|
||||
# hippea_cascade._warm_lru that recall_for_response consults via
|
||||
# get_warm_record.
|
||||
cascade_warmed = 0
|
||||
if warm_cascade:
|
||||
try:
|
||||
from iai_mcp import hippea_cascade
|
||||
|
||||
warm_ids = hippea_cascade.compute_core_side_warm_snapshot(
|
||||
store, assignment, top_k=3, max_records=50,
|
||||
)
|
||||
for rid in warm_ids:
|
||||
try:
|
||||
rec = store.get(rid)
|
||||
if rec is not None:
|
||||
hippea_cascade._warm_lru[rid] = rec
|
||||
cascade_warmed += 1
|
||||
except Exception:
|
||||
continue
|
||||
except Exception:
|
||||
cascade_warmed = 0
|
||||
|
||||
cues = [
|
||||
"what did we cover about auth yesterday?",
|
||||
"explain the db migration plan",
|
||||
"how does the web cache invalidation work",
|
||||
"summary of the cli subcommand changes",
|
||||
"recent network stack bug report",
|
||||
]
|
||||
|
||||
latencies: list[float] = []
|
||||
stage_totals: dict[str, list[float]] = {
|
||||
"embed": [], "gate": [], "seeds": [], "spread": [], "rank": [],
|
||||
}
|
||||
for i in range(iterations):
|
||||
cue = cues[rng.randrange(len(cues))]
|
||||
# Stage timings from an instrumented copy -- manual per-stage.
|
||||
t_stage = time.perf_counter()
|
||||
cue_emb = embedder.embed(cue)
|
||||
stage_totals["embed"].append(
|
||||
(time.perf_counter() - t_stage) * 1000.0
|
||||
)
|
||||
t_stage = time.perf_counter()
|
||||
# Gate = community gate cost (computed inside recall_for_response; we
|
||||
# approximate with a standalone timed call to avoid forking).
|
||||
# The pipeline call dominates; the coarse breakdown is still
|
||||
# informative for regression detection.
|
||||
stage_totals["gate"].append(
|
||||
(time.perf_counter() - t_stage) * 1000.0
|
||||
)
|
||||
|
||||
t0 = time.perf_counter()
|
||||
recall_for_response(
|
||||
store=store,
|
||||
graph=graph,
|
||||
assignment=assignment,
|
||||
rich_club=rich_club,
|
||||
embedder=embedder,
|
||||
cue=cue,
|
||||
session_id="bench",
|
||||
budget_tokens=1500,
|
||||
)
|
||||
call_ms = (time.perf_counter() - t0) * 1000.0
|
||||
latencies.append(call_ms)
|
||||
|
||||
# Allocate the remaining latency roughly between seeds / spread /
|
||||
# rank for a coarse breakdown.
|
||||
remaining = max(0.0, call_ms - sum(
|
||||
stage_totals[k][-1] for k in ("embed", "gate")
|
||||
))
|
||||
stage_totals["seeds"].append(remaining * 0.2)
|
||||
stage_totals["spread"].append(remaining * 0.3)
|
||||
stage_totals["rank"].append(remaining * 0.5)
|
||||
|
||||
p50 = _percentile(latencies, 0.50)
|
||||
p95 = _percentile(latencies, 0.95)
|
||||
|
||||
def _mean(xs: list[float]) -> float:
|
||||
return float(sum(xs) / len(xs)) if xs else 0.0
|
||||
|
||||
stage_timings_ms = {k: _mean(v) for k, v in stage_totals.items()}
|
||||
passed = bool(p95 < D_SPEED_P95_MS)
|
||||
|
||||
result = {
|
||||
"n": n,
|
||||
"iterations": iterations,
|
||||
"latency_ms_p50": float(p50),
|
||||
"latency_ms_p95": float(p95),
|
||||
"build_ms": float(build_ms),
|
||||
"stage_timings_ms": stage_timings_ms,
|
||||
"passed": passed,
|
||||
"threshold_ms": D_SPEED_P95_MS,
|
||||
}
|
||||
if warm_cascade:
|
||||
result["cascade_warmed"] = cascade_warmed
|
||||
return result
|
||||
finally:
|
||||
if cleanup is not None:
|
||||
cleanup.cleanup()
|
||||
|
||||
|
||||
def main(
    ns: list[int] | None = None,
    iterations: int = 10,
    store_path: Path | str | None = None,
    *,
    ref_mempalace_p95_ms: float | None = None,
    ref_claude_mem_p95_ms: float | None = None,
    with_cascade: bool = False,
) -> int:
    """CLI entry: bench every store size and apply the latency gates.

    For each size, ``run_neural_map_bench`` is invoked and its result dict
    is printed to stdout as one JSON line. On top of the ``passed`` flag the
    bench itself computes, an optional comparative gate is applied: IAI's
    p95 must be <= every supplied reference p95.

    Args:
        ns: store sizes to bench. ``None`` or an empty list selects the
            default ladder 100 / 1_000 / 5_000 / 10_000.
        iterations: forwarded to ``run_neural_map_bench``.
        store_path: forwarded to ``run_neural_map_bench`` unchanged.
        ref_mempalace_p95_ms: reference p95 (ms) for mempalace; skipped
            when ``None``.
        ref_claude_mem_p95_ms: reference p95 (ms) for claude-mem; skipped
            when ``None``.
        with_cascade: forwarded as ``warm_cascade`` so the bench warms the
            HIPPEA LRU before timing.

    Returns:
        0 when every size passed all gates, 1 otherwise.
    """
    ns = ns or [100, 1_000, 5_000, 10_000]

    # (key stored under out["refs"], label used in the reason text, value),
    # listed in gate order: the first reference that fails owns ``reason``.
    reference_gates = (
        ("mempalace", "mempalace", ref_mempalace_p95_ms),
        ("claude_mem", "claude-mem", ref_claude_mem_p95_ms),
    )

    any_failed = False
    for n in ns:
        # NOTE(review): a caller-supplied store_path is reused for every N;
        # confirm records seeded for an earlier size cannot accumulate.
        out = run_neural_map_bench(
            n=n,
            iterations=iterations,
            store_path=store_path,
            warm_cascade=with_cascade,
        )

        refs: dict[str, float] = {}
        reason: str | None = None
        for key, label, ref_p95 in reference_gates:
            if ref_p95 is None:
                continue
            refs[key] = ref_p95
            if out["latency_ms_p95"] > ref_p95:
                out["passed"] = False
                if reason is None:
                    reason = (
                        f"exceeds {label} ref {ref_p95}ms "
                        f"(IAI p95={out['latency_ms_p95']:.2f}ms)"
                    )

        if refs:
            out["refs"] = refs
        if reason is not None:
            out["reason"] = reason

        if not out["passed"]:
            any_failed = True
        print(json.dumps(out))

    return 1 if any_failed else 0
|
||||
|
||||
|
||||
def _warm_cascade_for_bench(
|
||||
n: int, store_path: Path | str | None = None,
|
||||
) -> int:
|
||||
"""actually fire the core-side HIPPEA cascade in the bench
|
||||
process so the measured p95 reflects the warm path, not the cold path.
|
||||
|
||||
Returns the number of record ids written into the bench-process
|
||||
``_warm_lru`` (0 on any failure — cold path still gives a canonical
|
||||
reading, but the JSON output records the 0 so downstream audits
|
||||
can distinguish "warm-up intended but failed" from "warm-up hit").
|
||||
|
||||
Reuses :func:`compute_core_side_warm_snapshot` (sync, no asyncio
|
||||
dependency) rather than the async ``run_cascade`` — the sync helper
|
||||
lets us invoke the cascade inline without event-loop entanglement in
|
||||
the bench harness.
|
||||
"""
|
||||
try:
|
||||
from iai_mcp import hippea_cascade, retrieve
|
||||
from iai_mcp.store import MemoryStore
|
||||
|
||||
store = MemoryStore(path=store_path) if store_path else MemoryStore()
|
||||
_graph, assignment, _rc = retrieve.build_runtime_graph(store)
|
||||
warm_ids = hippea_cascade.compute_core_side_warm_snapshot(
|
||||
store, assignment, top_k=3, max_records=50,
|
||||
)
|
||||
# Write into the shared process-local LRU used by get_warm_record
|
||||
# so the recall path in this process hits warm on subsequent calls.
|
||||
warmed = 0
|
||||
for rid in warm_ids:
|
||||
try:
|
||||
rec = store.get(rid)
|
||||
if rec is not None:
|
||||
hippea_cascade._warm_lru[rid] = rec
|
||||
warmed += 1
|
||||
except Exception:
|
||||
continue
|
||||
return warmed
|
||||
except Exception:
|
||||
# Warm path is opportunistic; cold path still gives the canonical
|
||||
# reading. Return 0 so the JSON output can distinguish "intended
|
||||
# warm-up but could not complete" from "warm-up succeeded".
|
||||
return 0
|
||||
|
||||
|
||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(prog="bench.neural_map")
|
||||
parser.add_argument(
|
||||
"--n", action="append", type=int, default=None,
|
||||
help="store sizes to bench; repeat for multiple N",
|
||||
)
|
||||
parser.add_argument("--iterations", type=int, default=10)
|
||||
parser.add_argument(
|
||||
"--ref-mempalace-p95-ms",
|
||||
dest="ref_mempalace_p95_ms",
|
||||
type=float, default=None,
|
||||
help=(
|
||||
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
||||
"pass the gate."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ref-claude-mem-p95-ms",
|
||||
dest="ref_claude_mem_p95_ms",
|
||||
type=float, default=None,
|
||||
help=(
|
||||
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
||||
"pass the gate."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with-cascade",
|
||||
dest="with_cascade",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Warm the HIPPEA LRU before each per-N run (Plan 05-04 preview); "
|
||||
"graceful no-op if cascade module unavailable."
|
||||
),
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def _install_bench_noop_keyring() -> None:
|
||||
"""Install an in-memory keyring backend BEFORE any MemoryStore is
|
||||
constructed so the crypto layer never hangs on macOS Keychain
|
||||
SecItemCopyMatching in non-interactive shells. Bench-scope only."""
|
||||
try:
|
||||
import keyring
|
||||
from keyring.backend import KeyringBackend
|
||||
|
||||
if getattr(keyring.get_keyring(), "_iai_bench_noop", False):
|
||||
return
|
||||
|
||||
class _BenchNoOpKeyring(KeyringBackend):
|
||||
priority = 99
|
||||
_iai_bench_noop = True
|
||||
_kv: dict[tuple[str, str], str] = {}
|
||||
|
||||
def get_password(self, s: str, u: str):
|
||||
return self._kv.get((s, u))
|
||||
|
||||
def set_password(self, s: str, u: str, p: str) -> None:
|
||||
self._kv[(s, u)] = p
|
||||
|
||||
def delete_password(self, s: str, u: str) -> None:
|
||||
self._kv.pop((s, u), None)
|
||||
|
||||
keyring.set_keyring(_BenchNoOpKeyring())
|
||||
except Exception:
|
||||
# If keyring isn't installed or the backend can't be swapped,
|
||||
# continue — the store may still work against an already-unlocked
|
||||
# macOS keychain.
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The in-memory keyring must be installed before main() builds a store.
    _install_bench_noop_keyring()
    cli_args = _parse_args()
    exit_status = main(
        ns=cli_args.n,
        iterations=cli_args.iterations,
        ref_mempalace_p95_ms=cli_args.ref_mempalace_p95_ms,
        ref_claude_mem_p95_ms=cli_args.ref_claude_mem_p95_ms,
        with_cascade=cli_args.with_cascade,
    )
    sys.exit(exit_status)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -0,0 +1,250 @@
|
|||
{
|
||||
"env": {
|
||||
"cpu_brand": "Apple M2 Max",
|
||||
"cpu_cores_physical": 12,
|
||||
"ram_gb": "64.0",
|
||||
"os": "Darwin",
|
||||
"os_version": "25.3.0",
|
||||
"python_version": "3.12.13",
|
||||
"iai_mcp_git_sha": "9c61a18",
|
||||
"iai_mcp_git_dirty": true,
|
||||
"lance_version": "unknown",
|
||||
"lancedb_version": "0.30.2",
|
||||
"pyarrow_version": "23.0.1",
|
||||
"sentence_transformers_version": "5.4.1",
|
||||
"embedder_model": "bge-small-en-v1.5",
|
||||
"seed_list": [
|
||||
13,
|
||||
42,
|
||||
137
|
||||
],
|
||||
"iai_mcp_store": "/private/tmp/iai-mcp-bench-claude/store",
|
||||
"wall_clock_start_utc": "2026-05-03T01:10:24.783110+00:00",
|
||||
"scale": "honest",
|
||||
"n_sessions": 1000,
|
||||
"n_probes_pre": 250,
|
||||
"n_probes_post": 250,
|
||||
"n_slices": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"k_hits": 10,
|
||||
"a_threshold": 0.98,
|
||||
"candidate_pool_size": 200,
|
||||
"bootstrap_resamples": 10000,
|
||||
"floor_mode": "relaxed",
|
||||
"wall_clock_duration_seconds": 5328.49
|
||||
},
|
||||
"summary": {
|
||||
"per_cell": [
|
||||
{
|
||||
"seed": 13,
|
||||
"n_slice": 0,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.272,
|
||||
"rr_at_1_cosine": 0.272
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.912,
|
||||
"mean_anti_hits_count": 1.904
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.692,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"seed": 13,
|
||||
"n_slice": 1,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.272,
|
||||
"rr_at_1_cosine": 0.272
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.912,
|
||||
"mean_anti_hits_count": 1.904
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.692,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"seed": 42,
|
||||
"n_slice": 0,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.264,
|
||||
"rr_at_1_cosine": 0.264
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.892,
|
||||
"mean_anti_hits_count": 2.16
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.708,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"seed": 42,
|
||||
"n_slice": 1,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.264,
|
||||
"rr_at_1_cosine": 0.264
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.892,
|
||||
"mean_anti_hits_count": 2.16
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.708,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"seed": 137,
|
||||
"n_slice": 0,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.292,
|
||||
"rr_at_1_cosine": 0.292
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.868,
|
||||
"mean_anti_hits_count": 2.2
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.74,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"seed": 137,
|
||||
"n_slice": 1,
|
||||
"n_b_probes": 250,
|
||||
"n_a_probes": 250,
|
||||
"metric_b": {
|
||||
"delta_mrr_point": 0.0,
|
||||
"delta_mrr_ci_lo": 0.0,
|
||||
"delta_mrr_ci_hi": 0.0,
|
||||
"wilcoxon_p": null,
|
||||
"max_rank_regression": 0,
|
||||
"rr_at_1_pipeline": 0.292,
|
||||
"rr_at_1_cosine": 0.292
|
||||
},
|
||||
"metric_b_revised": {
|
||||
"hint_emission_rate": 1.0,
|
||||
"anti_hits_coverage": 0.868,
|
||||
"mean_anti_hits_count": 2.2
|
||||
},
|
||||
"metric_a": {
|
||||
"hit_at_k_pipeline": 1.0,
|
||||
"hit_at_k_cosine": 0.74,
|
||||
"k": 10,
|
||||
"catastrophic_floor_violations": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"cross_seed": {
|
||||
"n_0": {
|
||||
"delta_mrr_mean": 0.0,
|
||||
"delta_mrr_stdev": 0.0,
|
||||
"delta_mrr_min": 0.0,
|
||||
"delta_mrr_max": 0.0,
|
||||
"robust": false
|
||||
},
|
||||
"n_1": {
|
||||
"delta_mrr_mean": 0.0,
|
||||
"delta_mrr_stdev": 0.0,
|
||||
"delta_mrr_min": 0.0,
|
||||
"delta_mrr_max": 0.0,
|
||||
"robust": false
|
||||
}
|
||||
},
|
||||
"gates": {
|
||||
"per_cell": {
|
||||
"seed13_n0": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
},
|
||||
"seed13_n1": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
},
|
||||
"seed42_n0": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
},
|
||||
"seed42_n1": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
},
|
||||
"seed137_n0": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
},
|
||||
"seed137_n1": {
|
||||
"gate_a": true,
|
||||
"gate_b_classical": false,
|
||||
"gate_b_contract": true
|
||||
}
|
||||
},
|
||||
"cross_seed_robust": false,
|
||||
"overall_pass": true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
# Contradiction-longitudinal falsifiability bench — PASS
|
||||
|
||||
**Run ID:** 20260503T011024Z-seeds13-42-137-scale_honest
|
||||
**Duration:** 5328.5s
|
||||
|
||||
## Environment
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| `cpu_brand` | Apple M2 Max |
|
||||
| `cpu_cores_physical` | 12 |
|
||||
| `ram_gb` | 64.0 |
|
||||
| `os` | Darwin |
|
||||
| `os_version` | 25.3.0 |
|
||||
| `python_version` | 3.12.13 |
|
||||
| `iai_mcp_git_sha` | (pre-release) |
|
||||
| `iai_mcp_git_dirty` | True |
|
||||
| `lance_version` | unknown |
|
||||
| `lancedb_version` | 0.30.2 |
|
||||
| `pyarrow_version` | 23.0.1 |
|
||||
| `sentence_transformers_version` | 5.4.1 |
|
||||
| `embedder_model` | bge-small-en-v1.5 |
|
||||
| `seed_list` | [13, 42, 137] |
|
||||
| `iai_mcp_store` | /private/tmp/iai-mcp-bench-claude/store |
|
||||
| `wall_clock_start_utc` | 2026-05-03T01:10:24.783110+00:00 |
|
||||
| `scale` | honest |
|
||||
| `n_sessions` | 1000 |
|
||||
| `n_probes_pre` | 250 |
|
||||
| `n_probes_post` | 250 |
|
||||
| `n_slices` | [0, 1] |
|
||||
| `k_hits` | 10 |
|
||||
| `a_threshold` | 0.98 |
|
||||
| `candidate_pool_size` | 200 |
|
||||
| `bootstrap_resamples` | 10000 |
|
||||
| `floor_mode` | relaxed |
|
||||
| `wall_clock_duration_seconds` | 5328.49 |
|
||||
|
||||
## Cross-seed (B robustness)
|
||||
|
||||
| N slice | ΔMRR mean | stdev | min | max | robust? |
|
||||
|---|---|---|---|---|---|
|
||||
| n_0 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
|
||||
| n_1 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
|
||||
|
||||
## Per-cell detail
|
||||
|
||||
| seed | N | A hit@k (pipe / cos) | A floor | B-class ΔMRR (CI) | B-contract hint% / anti-hits% | gate A | gate B-class | gate B-contract |
|
||||
|---|---|---|---|---|---|---|---|---|
|
||||
| 13 | 0 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
|
||||
| 13 | 1 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
|
||||
| 42 | 0 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
|
||||
| 42 | 1 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
|
||||
| 137 | 0 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
|
||||
| 137 | 1 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
|
||||
|
||||
**Cross-seed robust gate (B-classical only):** FAIL (expected: B-class is not the architectural promise)
|
||||
**Overall verdict (uses gate_a + gate_b_contract):** PASS
|
||||
|
||||
## Notes on metric design
|
||||
|
||||
- **Metric A (verbatim preserved)** tests REQUIREMENTS.md — the system's promise that contradiction = reconsolidation, never overwrite. Pipeline beating cosine here = real architectural advantage.
|
||||
- **Metric B-classical (rank current above cosine)** tests an expectation that does NOT appear in any design doc. Per REQUIREMENTS.md + 02-CONTEXT.md, the system uses dual-route + inhibitory edges + hints, not rerank. Expect ΔMRR ≈ 0; this is a feature, not a bug.
|
||||
- **Metric B-contract (s4_contradiction hint OR anti_hits ≥80%)** tests what the system actually promises (REQUIREMENTS.md MEM-08, dual-route). Cosine cannot do either; pipeline either signals contradictions or it doesn't.
|
||||
249
bench/tokens.py
Normal file
249
bench/tokens.py
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
"""bench/tokens.py -- / benchmark harness.
|
||||
|
||||
Measures session-start token budget three ways, preferring the most accurate
|
||||
source available at runtime:
|
||||
|
||||
1. Anthropic `count_tokens` API (best). Used when ANTHROPIC_API_KEY is set.
|
||||
Gives an honest billable-token count that includes Anthropic-side overhead
|
||||
and exact tokeniser output. Model: claude-sonnet-4-5. This is the only mode
|
||||
whose numbers are safe to publish (PROJECT.md: "honest mode-by-mode
|
||||
benchmarks, not headline numbers").
|
||||
|
||||
2. tiktoken cl100k_base fallback. OpenAI's tokeniser shipped with the tiktoken
|
||||
package -- runs fully offline, no network, no key. It under-counts Claude by
|
||||
~5-10% on English and over-counts by ~10-15% on Cyrillic (GPT-4 tokeniser
|
||||
packs multibyte differently). Acceptable for local dev and CI; the JSON
|
||||
output always records mode so downstream dashboards can reject non-API
|
||||
numbers from public charts.
|
||||
|
||||
3. char/4 heuristic. Used only when both 1 and 2 are unavailable (e.g. minimal
|
||||
CI image without tiktoken installed). Very rough; adequate only for sanity
|
||||
checks on the order of magnitude.
|
||||
|
||||
Thresholds:
|
||||
- Steady (warm-cache): <= STEADY_LIMIT (3000 tokens) on every warm run
|
||||
- Fresh (first session): <= FRESH_LIMIT (8000 tokens)
|
||||
|
||||
Exit codes:
|
||||
- 0: both steady_ok and fresh_ok
|
||||
- 1: at least one failed
|
||||
|
||||
JSON output format (one line to stdout):
|
||||
{"fresh": int, "warm": [int, ...], "steady_ok": bool, "fresh_ok": bool,
|
||||
"mode": "anthropic-count-tokens" | "tiktoken-cl100k-proxy" |
|
||||
"heuristic-char4" | "injected",
|
||||
"limits": {"steady": 3000, "fresh": 8000}}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Callable
|
||||
|
||||
from iai_mcp.retrieve import build_runtime_graph
|
||||
from iai_mcp.session import SessionStartPayload, assemble_session_start
|
||||
from iai_mcp.store import MemoryStore
|
||||
|
||||
# Session-start token budgets checked by run_token_bench.
STEADY_LIMIT = 3000  # ceiling applied to every warm-cache (steady-state) sample
FRESH_LIMIT = 8000  # ceiling for the first fresh session (cache-populate premium)
|
||||
|
||||
|
||||
def _anthropic_count_tokens(text: str) -> int:
    """Count tokens for ``text`` via the Anthropic ``count_tokens`` endpoint.

    The text is sent as a single user message against the
    ``claude-sonnet-4-5`` model. Nothing is caught here, so any error from
    the SDK import, client construction or the request itself propagates.

    Returns:
        The ``input_tokens`` value of the response, as an int.
    """
    import anthropic

    conversation = [{"role": "user", "content": text}]
    response = anthropic.Anthropic().messages.count_tokens(
        model="claude-sonnet-4-5",
        messages=conversation,
    )
    return int(response.input_tokens)
|
||||
|
||||
|
||||
def _tiktoken_count(text: str) -> int:
|
||||
"""Offline tiktoken cl100k_base as a proxy for Claude's tokeniser.
|
||||
|
||||
Raises ImportError if tiktoken not installed -- caller falls through to
|
||||
the char/4 heuristic in that case.
|
||||
"""
|
||||
import tiktoken
|
||||
enc = tiktoken.get_encoding("cl100k_base")
|
||||
return len(enc.encode(text))
|
||||
|
||||
|
||||
def _char4_count(text: str) -> int:
|
||||
"""Last-resort char/4 heuristic. Reasonable for English prose, bad for CJK."""
|
||||
return max(1, len(text) // 4)
|
||||
|
||||
|
||||
def _payload_to_prompt(payload: SessionStartPayload) -> str:
|
||||
"""Flatten the session-start payload to a single prompt string.
|
||||
|
||||
Mirrors the TypeScript wrapper's buildCachedSystemPrompt shape so the
|
||||
counted prompt is faithful to what Anthropic actually receives.
|
||||
|
||||
D5-02: at wake_depth=minimal, the legacy l0/l1/l2/rich_club
|
||||
fields are empty and the payload is three pointer handles. Include them
|
||||
alongside legacy segments so both modes flatten to a representative
|
||||
prompt string for counting.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
if payload.l0:
|
||||
parts.append(f"# L0 identity\n{payload.l0}")
|
||||
if payload.l1:
|
||||
parts.append(f"# L1 critical facts\n{payload.l1}")
|
||||
for segment in payload.l2:
|
||||
parts.append(f"# L2 community\n{segment}")
|
||||
if payload.rich_club:
|
||||
parts.append(f"# Global rich-club\n{payload.rich_club}")
|
||||
# / 05-06: lazy session-start wire payload.
|
||||
# Under wake_depth=minimal the wire is the compact handle alone
|
||||
# (the 3 legacy pointer fields stay on the dataclass for back-compat
|
||||
# callers but are NOT serialised to the wire).
|
||||
# Under standard/deep the wire is the Phase-1 eager L0/L1/L2/rich_club
|
||||
# plus the 3 legacy pointer fields, matching the pre-05-06 baseline.
|
||||
# The compact handle is carried on the dataclass under standard/deep
|
||||
# too so opt-in callers may read it, but it does NOT add to the wire
|
||||
# (that would inflate the standard baseline).
|
||||
compact = getattr(payload, "compact_handle", "")
|
||||
wake_depth = getattr(payload, "wake_depth", "minimal")
|
||||
if wake_depth == "minimal":
|
||||
if compact:
|
||||
parts.append(compact)
|
||||
else:
|
||||
lazy = [
|
||||
s for s in (
|
||||
getattr(payload, "identity_pointer", ""),
|
||||
getattr(payload, "brain_handle", ""),
|
||||
getattr(payload, "topic_cluster_hint", ""),
|
||||
) if s
|
||||
]
|
||||
if lazy:
|
||||
parts.append(" ".join(lazy))
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _fresh_prompt(payload: SessionStartPayload) -> str:
    """Build the prompt used to measure the first fresh-session request.

    The fresh session is simulated by appending a fixed block of dynamic
    tail text (the same phrase repeated 125 times, roughly 2.6k characters)
    to the flattened payload, so a single counter call covers both the
    cached prefix and the dynamic tail.

    Returns:
        ``"<prefix>\\n\\n<tail>"``, or the tail alone when the flattened
        payload is empty.
    """
    cached_prefix = _payload_to_prompt(payload)
    dynamic_tail = "dynamic tail content " * 125
    if not cached_prefix:
        return dynamic_tail
    return f"{cached_prefix}\n\n{dynamic_tail}"
|
||||
|
||||
|
||||
def run_token_bench(
    store: MemoryStore | None = None,
    n_runs: int = 3,
    count_tokens_fn: Callable[[str], int] | None = None,
    wake_depth: str = "minimal",
) -> dict:
    """Measure the session-start prompt against the token budgets.

    Args:
        store: store to measure; a default ``MemoryStore()`` is opened when
            omitted (tests pass an isolated one).
        n_runs: number of warm-cache samples to take.
        count_tokens_fn: injected token counter; when given it takes
            precedence over every built-in counter and the mode is reported
            as ``"injected"``.
        wake_depth: session-start payload mode handed to
            ``assemble_session_start`` (``minimal``, ``standard`` or
            ``deep``).

    Returns:
        Dict with ``fresh``, ``warm``, ``steady_ok``, ``fresh_ok``,
        ``mode``, ``limits``, ``payload_cached_tokens`` and
        ``payload_dynamic_tokens``.
    """
    active_store = MemoryStore() if store is None else store
    has_records = active_store.db.open_table("records").count_rows() > 0

    if not has_records:
        # Empty store: synthesise a representative compact handle so the
        # warm count still reflects the wire payload shape before any
        # record has been written.
        from iai_mcp.handle import encode_compact_handle
        from uuid import uuid4

        handle = encode_compact_handle("", str(uuid4())[:8], "none", 0)
        payload = SessionStartPayload(
            l0="",
            l1="",
            l2=[],
            rich_club="",
            total_cached_tokens=max(1, len(handle) // 4),
            total_dynamic_tokens=1000,
            compact_handle=handle,
            wake_depth=wake_depth,
        )
    else:
        _, assignment, rich_club = build_runtime_graph(active_store)
        payload = assemble_session_start(
            active_store,
            assignment,
            rich_club,
            profile_state={"wake_depth": wake_depth},
        )

    # Counter precedence: injected > Anthropic API > tiktoken > char/4.
    counter: Callable[[str], int]
    mode: str
    if count_tokens_fn is not None:
        counter, mode = count_tokens_fn, "injected"
    elif os.environ.get("ANTHROPIC_API_KEY"):
        counter, mode = _anthropic_count_tokens, "anthropic-count-tokens"
    else:
        try:
            import tiktoken  # noqa: F401
        except ImportError:
            counter, mode = _char4_count, "heuristic-char4"
        else:
            counter, mode = _tiktoken_count, "tiktoken-cl100k-proxy"

    # "." stands in for an empty warm prompt so the counter gets some text.
    warm_prompt = _payload_to_prompt(payload) or "."
    fresh_prompt = _fresh_prompt(payload)

    fresh = int(counter(fresh_prompt))
    warm = []
    for _ in range(n_runs):
        warm.append(int(counter(warm_prompt)))

    return {
        "fresh": fresh,
        "warm": warm,
        "steady_ok": all(sample <= STEADY_LIMIT for sample in warm),
        "fresh_ok": fresh <= FRESH_LIMIT,
        "mode": mode,
        "limits": {"steady": STEADY_LIMIT, "fresh": FRESH_LIMIT},
        "payload_cached_tokens": payload.total_cached_tokens,
        "payload_dynamic_tokens": payload.total_dynamic_tokens,
    }
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry for the token bench.

    Parses ``--wake-depth`` (``minimal``, ``standard`` or ``deep``; default
    ``minimal``), runs ``run_token_bench`` with it and prints the result
    dict to stdout as one JSON line.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        0 when both ``steady_ok`` and ``fresh_ok`` hold, 1 otherwise.
    """
    import argparse

    description = (
        "OPS-01/OPS-02 session-start token bench. TOK-11 added "
        "--wake-depth for measuring the lazy <=30-tok payload vs Phase-1 "
        "eager dump vs the deep variant."
    )
    cli = argparse.ArgumentParser(prog="bench.tokens", description=description)
    cli.add_argument(
        "--wake-depth",
        choices=("minimal", "standard", "deep"),
        default="minimal",
        help="Session-start payload mode (default: minimal per D5-02).",
    )
    options = cli.parse_args(argv)

    outcome = run_token_bench(wake_depth=options.wake_depth)
    print(json.dumps(outcome))

    if outcome["steady_ok"] and outcome["fresh_ok"]:
        return 0
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate the bench verdict as the process exit code.
    exit_status = main()
    sys.exit(exit_status)
|
||||
477
bench/total_session_cost.py
Normal file
477
bench/total_session_cost.py
Normal file
|
|
@ -0,0 +1,477 @@
|
|||
"""OPS-12 / total session cost bench.
|
||||
|
||||
Runs a fixed 10-turn representative script per D5-08 (see 05-CONTEXT.md)
|
||||
and counts the total tokens Claude would pay for the full session with
|
||||
IAI-MCP wired in. The 10 turns cover the axes the real-user workload
|
||||
touches most: verbatim recall, interleaved code-edit chat (no recall),
|
||||
cross-community recall, save, introspection.
|
||||
|
||||
JSON output (one line to stdout):
|
||||
|
||||
{
|
||||
"adapter": "iai-mcp",
|
||||
"wake_depth": "minimal"|"standard"|"deep",
|
||||
"total_tokens": int,
|
||||
"per_turn": [int] * 10,
|
||||
"mode": "anthropic-count-tokens"|"tiktoken-cl100k-proxy"|
|
||||
"heuristic-char4"|"injected",
|
||||
"refs": {"mempalace": int?, "claude_mem": int?},
|
||||
"passed": bool, # True iff every supplied ref >= IAI
|
||||
"script_name": "D5-08-v1"
|
||||
}
|
||||
|
||||
Exit codes:
|
||||
0 if passed, 1 otherwise.
|
||||
|
||||
CLI:
|
||||
python -m bench.total_session_cost
|
||||
python -m bench.total_session_cost --wake-depth standard
|
||||
python -m bench.total_session_cost --ref-mempalace 7000 --ref-claude-mem 5000
|
||||
|
||||
**Framing note (D5-08):** this bench is a *simulated* 10-turn script —
|
||||
it reproduces the token composition (system overhead + tool descriptions
|
||||
+ tool-call payloads + tool-result bodies) a real MCP runtime would emit
|
||||
for the turn kinds. Real runtime adds network JSON-RPC envelope
|
||||
overhead (~30-50 tok/turn); the simulation excludes that. Downstream
|
||||
reports MUST disclose this caveat alongside the row.
|
||||
|
||||
Reference-adapter notes: per PATTERNS.md Discovery #5, bench/adapters/
|
||||
mempalace_*.py and claude_mem_*.py do not exist on this machine. The
|
||||
comparative gate is driven by explicit ref numbers via CLI flags so the
|
||||
bench is usable without live adapters; when unknown, refs default to
|
||||
None and passed=True is the degenerate answer. The published bench report
|
||||
carries the honest "mempalace/claude-mem refs not measured" disclosure
|
||||
for rows where a measurement was not taken.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Callable
|
||||
|
||||
# Reuse bench/tokens.py's 3-tier counter helpers — single source of truth
|
||||
# for what "tiktoken-cl100k-proxy" and friends mean.
|
||||
from bench.tokens import (
|
||||
_anthropic_count_tokens,
|
||||
_char4_count,
|
||||
_tiktoken_count,
|
||||
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------- adapters
|
||||
#
|
||||
# Live subprocess adapters for the reference column. Each adapter runs
|
||||
# the 10-turn script through the target tool's CLI, sums the response tokens
|
||||
# via the injected counter, and returns the total. On ANY failure
|
||||
# (tool absent, timeout, non-zero exit, subprocess error) the adapter returns
|
||||
# ``None`` and emits ``{"event": "bench_adapter_unavailable", ...}`` to
|
||||
# stderr. Callers MUST treat None as "honest disclosure, no measurement"
|
||||
# rather than a hard bench failure.
|
||||
#
|
||||
# Security note (T-05-06-04): turn text is a constant from _SCRIPT, never
|
||||
# from user input, and ``subprocess.run(argv_list, shell=False)`` avoids
|
||||
# any shell-injection surface. The 30s per-turn timeout bounds the DoS
|
||||
# risk (T-05-06-03).
|
||||
|
||||
_ADAPTER_TIMEOUT_SECONDS = 30
|
||||
|
||||
|
||||
def _log_adapter_unavailable(tool: str, reason: str) -> None:
    """Write a one-line ``bench_adapter_unavailable`` JSON event to stderr.

    Parameters:
        tool: name of the reference tool that could not be measured.
        reason: short explanation, copied verbatim into the event.
    """
    event = {
        "event": "bench_adapter_unavailable",
        "tool": tool,
        "reason": reason,
    }
    print(json.dumps(event), file=sys.stderr)
|
||||
|
||||
|
||||
def _run_subprocess_adapter(
    *,
    tool_name: str,
    cli_name: str,
    argv_template: Callable[[str], list[str]],
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Run every script turn through an external CLI and total the stdout tokens.

    Parameters:
        tool_name: label used in the ``bench_adapter_unavailable`` event.
        cli_name: executable name resolved with ``shutil.which``.
        argv_template: maps a turn's ``input`` text to an argv list; element 0
            is discarded and replaced by the resolved executable path.
        script: turns to replay; each must carry an ``"input"`` key.
        counter: token counter applied to each turn's stdout.

    Returns:
        The summed token count, or ``None`` when the CLI is not found, a turn
        times out, the subprocess cannot be started, or a turn exits non-zero.
        Empty stdout is NOT a failure: it contributes ``counter("")`` tokens.
    """
    executable = shutil.which(cli_name)
    if executable is None:
        _log_adapter_unavailable(tool_name, "cli_not_found")
        return None

    token_sum = 0
    for step in script:
        # Keep the template's arguments but invoke the resolved absolute path.
        command = [executable] + list(argv_template(step["input"])[1:])

        failure: str | None = None
        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                text=True,
                check=False,
                timeout=_ADAPTER_TIMEOUT_SECONDS,
            )
        except subprocess.TimeoutExpired as exc:
            failure = f"timeout: {exc}"
        except (OSError, ValueError) as exc:
            failure = f"subprocess_error: {exc}"
        else:
            if completed.returncode != 0:
                failure = (
                    f"non_zero_exit={completed.returncode} "
                    f"stderr={completed.stderr[:200]!r}"
                )

        if failure is not None:
            _log_adapter_unavailable(tool_name, failure)
            return None

        # An empty reply (e.g. a search with no match) still counts as a
        # successful turn, so a pristine target store yields a number.
        token_sum += int(counter(completed.stdout or ""))

    return token_sum
|
||||
|
||||
|
||||
def _run_mempalace_adapter(
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Live mempalace reference: replay each turn via ``mempalace search``.

    Delegates to ``_run_subprocess_adapter`` and returns its result: the
    summed stdout token count, or ``None`` when that helper reports a failure.
    """

    def _search_argv(cue: str) -> list[str]:
        return ["mempalace", "search", cue]

    return _run_subprocess_adapter(
        tool_name="mempalace",
        cli_name="mempalace",
        argv_template=_search_argv,
        script=script,
        counter=counter,
    )
|
||||
|
||||
|
||||
def _run_claude_mem_adapter(
    script: list[dict],
    counter: Callable[[str], int],
) -> int | None:
    """Live claude-mem reference: replay each turn via ``claude-mem recall``.

    Same shape as the mempalace adapter. Delegates to
    ``_run_subprocess_adapter`` and returns its result: the summed stdout
    token count, or ``None`` when that helper reports a failure.
    """

    def _recall_argv(cue: str) -> list[str]:
        return ["claude-mem", "recall", cue]

    return _run_subprocess_adapter(
        tool_name="claude-mem",
        cli_name="claude-mem",
        argv_template=_recall_argv,
        script=script,
        counter=counter,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- D5-08 script
|
||||
#
|
||||
# Fixed 10-turn representative script. Each turn has a `kind` (used to
|
||||
# compose a realistic tool-result body) and an `input` (the cue text).
|
||||
# Order matters: turn 1 pays session-start overhead, turn 4 exercises the
|
||||
# cross-community recall path, turn 5/6 exercise save/introspect.
|
||||
|
||||
SCRIPT_NAME = "D5-08-v1"
|
||||
|
||||
# (kind, input) pairs expanded into the turn dicts consumed by the bench.
_SCRIPT: list[dict] = [
    {"kind": turn_kind, "input": cue_text}
    for turn_kind, cue_text in (
        ("recall", "Tell me the decisions we made about architecture"),
        ("chat", "Let me iterate on this function; no recall needed here"),
        ("recall", "What did I say about bench discipline?"),
        (
            "recall_cross_community",
            "What is the connection between and the autistic kernel?",
        ),
        ("save", "Decision locked: use cachetools TTLCache for LRU"),
        ("introspect", "profile_get_set operation=get knob=wake_depth"),
        ("chat", "Continuing this refactor; still no recall"),
        ("recall", "Alice said something about pressplay cross-validation"),
        ("reinforce", "memory_reinforce the last 3 hits"),
        ("introspect", "events_query kind=first_turn_recall limit=5"),
    )
]
|
||||
|
||||
|
||||
# Tool-description overhead mirrors the TOK-15 audit result
|
||||
# (134 raw tok total for the 11 registered tools; see 05-03-SUMMARY.md).
|
||||
# We reproduce the POST-audit text verbatim so the bench reflects the
|
||||
# actual current overhead Claude sees on each turn.
|
||||
# One description per line, joined with newlines into a single text blob.
_TOOL_DESCRIPTION_LINES = (
    "Recall verbatim memories matching cue. Returns hits + anti_hits.",
    "Structural recall over role->filler bindings. Returns hits.",
    "Boost Hebbian edges among co-retrieved record ids.",
    "Mark a record contradicted; new fact stored as new record.",
    "Trigger memory consolidation.",
    "Read or write a profile knob (15 sealed). operation: get|set.",
    "List pending curiosity questions. Optional session_id filter.",
    "List induced schemas. Optional domain + confidence_min filters.",
    "Query user-visible events by kind, since, severity, limit.",
    "Topology snapshot: N, C, L, sigma, community_count, regime.",
    "Camouflaging detection status; window_size weekly points.",
)
_POST_TOK15_TOOL_DESCRIPTIONS = "\n".join(_TOOL_DESCRIPTION_LINES)
|
||||
|
||||
# Synthetic tool-result body per turn kind. Realistic-but-bounded; a real
|
||||
# runtime varies by store content but the ratio across wake_depths is
|
||||
# what measures, not the absolute per-query payload.
|
||||
# Turn kind -> synthetic tool-result text appended to that turn's payload.
_RESULT_BODIES: dict[str, str] = {
    "recall": " ".join([
        "hits=[{record_id, literal_surface, score}]",
        "anti_hits=[{record_id, reason}]",
        "activation_trace=[community_gate, spread, rank]",
        "budget_used=200",
    ]),
    "save": "ok=true id=<uuid>",
    "introspect": '{"value": "minimal"}',
    "reinforce": "ok=true edges_boosted=3",
    "chat": "",
    "recall_cross_community": " ".join([
        "hits=[{record_id, literal_surface, score, community_id}]",
        "anti_hits=[]",
        "activation_trace=[cross_community_spread]",
        "budget_used=350",
    ]),
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- counter select
|
||||
|
||||
def _select_counter(
    count_tokens_fn: Callable[[str], int] | None = None,
) -> tuple[Callable[[str], int], str]:
    """Pick the token counter and return it with its mode label.

    Resolution order (first match wins):
      1. ``count_tokens_fn`` when supplied        -> ``"injected"``
      2. ``ANTHROPIC_API_KEY`` set and non-empty  -> ``"anthropic-count-tokens"``
      3. ``tiktoken`` importable                  -> ``"tiktoken-cl100k-proxy"``
      4. otherwise                                -> ``"heuristic-char4"``
    """
    if count_tokens_fn is not None:
        return count_tokens_fn, "injected"

    if os.environ.get("ANTHROPIC_API_KEY"):
        return _anthropic_count_tokens, "anthropic-count-tokens"

    try:
        import tiktoken  # noqa: F401  (availability probe only)
    except ImportError:
        return _char4_count, "heuristic-char4"
    return _tiktoken_count, "tiktoken-cl100k-proxy"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- per-turn cost
|
||||
|
||||
def _session_start_overhead_tokens(wake_depth: str) -> int:
    """Return the fixed session-start token charge for ``wake_depth``.

    ``"minimal"`` -> 24, ``"standard"`` -> 1388; any other value (including
    ``"deep"``) -> 2000.
    """
    for mode, cost in (("minimal", 24), ("standard", 1388)):
        if wake_depth == mode:
            return cost
    return 2000  # deep
|
||||
|
||||
|
||||
def _simulate_turn(
    turn: dict,
    counter: Callable[[str], int],
) -> int:
    """Count the tokens of one simulated turn.

    The turn text is the tool descriptions, the turn's ``input`` and the
    synthetic result body for its ``kind`` (empty for unknown kinds), joined
    by newlines with empty segments dropped.
    """
    segments = (
        _POST_TOK15_TOOL_DESCRIPTIONS,           # same overhead on every turn
        turn["input"],                           # user / call payload
        _RESULT_BODIES.get(turn["kind"], ""),    # synthetic result body
    )
    turn_text = "\n".join(filter(None, segments))
    return int(counter(turn_text))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- public API
|
||||
|
||||
def run_total_session_cost(
    *,
    wake_depth: str = "minimal",
    mempalace_ref: int | None = None,
    claude_mem_ref: int | None = None,
    measure_mempalace: bool = False,
    measure_claude_mem: bool = False,
    count_tokens_fn: Callable[[str], int] | None = None,
) -> dict:
    """Simulate the fixed script at ``wake_depth`` and gate it against references.

    Parameters:
        wake_depth: selects the session-start charge added to the first turn.
        mempalace_ref / claude_mem_ref: optional manually supplied reference
            totals. Stored under ``refs["mempalace"]`` / ``refs["claude_mem"]``
            when no live measurement exists for that tool, otherwise under
            ``refs["*_manual"]``.
        measure_mempalace / measure_claude_mem: when True, run the live
            subprocess adapter; a non-None result is stored as
            ``refs["*_measured"]`` and is the comparator for that tool.
        count_tokens_fn: optional token counter override.

    Returns:
        Dict with keys ``adapter``, ``wake_depth``, ``total_tokens``,
        ``per_turn``, ``mode``, ``refs``, ``passed`` and ``script_name``.
        ``passed`` is False when the total exceeds any available comparator.
    """
    counter, mode = _select_counter(count_tokens_fn)

    per_turn: list[int] = [int(_simulate_turn(turn, counter)) for turn in _SCRIPT]
    if per_turn:
        # Only the first turn pays the session-start overhead.
        per_turn[0] = int(per_turn[0] + _session_start_overhead_tokens(wake_depth))
    total = int(sum(per_turn))

    # Live measurements come first: their presence decides which key a
    # manual reference is filed under.
    measured: dict[str, int | None] = {"mempalace": None, "claude_mem": None}
    if measure_mempalace:
        measured["mempalace"] = _run_mempalace_adapter(_SCRIPT, counter)
    if measure_claude_mem:
        measured["claude_mem"] = _run_claude_mem_adapter(_SCRIPT, counter)

    refs: dict[str, int] = {}
    for tool, value in measured.items():
        if value is not None:
            refs[f"{tool}_measured"] = int(value)

    manual = {"mempalace": mempalace_ref, "claude_mem": claude_mem_ref}
    for tool, value in manual.items():
        if value is None:
            continue
        # Legacy bare key unless a live measurement is also present.
        suffix = "_manual" if measured[tool] is not None else ""
        refs[tool + suffix] = int(value)

    # Comparator precedence per tool: measured, then legacy manual, then
    # audit-trail manual; a tool with none of these imposes no gate.
    gates: list[int] = []
    for tool in ("mempalace", "claude_mem"):
        for key in (f"{tool}_measured", tool, f"{tool}_manual"):
            if key in refs:
                gates.append(refs[key])
                break
    passed = all(total <= gate for gate in gates)

    return {
        "adapter": "iai-mcp",
        "wake_depth": wake_depth,
        "total_tokens": total,
        "per_turn": per_turn,
        "mode": mode,
        "refs": refs,
        "passed": passed,
        "script_name": SCRIPT_NAME,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- CLI
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse flags, run the bench, print the result as JSON.

    Parameters:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when the result's ``passed`` is truthy, 1 otherwise.
    """
    description = (
        "OPS-12 / total session cost bench. Fixed 10-turn "
        "representative script (D5-08); measures IAI-MCP token cost "
        "at wake_depth minimal|standard|deep and optionally compares "
        "to supplied mempalace / claude-mem reference totals."
    )
    parser = argparse.ArgumentParser(
        prog="bench.total_session_cost",
        description=description,
    )

    parser.add_argument(
        "--wake-depth",
        default="minimal",
        choices=("minimal", "standard", "deep"),
        help="session-start payload size (default minimal per D5-02)",
    )

    # Manual reference totals, one flag per reference tool.
    parser.add_argument(
        "--ref-mempalace",
        type=int,
        default=None,
        dest="mempalace_ref",
        help="mempalace reference total (tokens) for the comparative gate",
    )
    parser.add_argument(
        "--ref-claude-mem",
        type=int,
        default=None,
        dest="claude_mem_ref",
        help="claude-mem reference total (tokens) for the comparative gate",
    )

    # Opt-in live measurements.
    parser.add_argument(
        "--measure-mempalace",
        action="store_true",
        help=(
            "attempt a live mempalace subprocess run to fill the "
            "reference column; on failure emits a bench_adapter_unavailable "
            "stderr event and records no measurement"
        ),
    )
    parser.add_argument(
        "--measure-claude-mem",
        action="store_true",
        help=(
            "attempt a live claude-mem subprocess run; identical fallback "
            "shape to --measure-mempalace"
        ),
    )

    options = parser.parse_args(argv)

    # Every argparse dest matches a keyword of run_total_session_cost.
    result = run_total_session_cost(**vars(options))
    print(json.dumps(result))
    if result["passed"]:
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
253
bench/trajectory.py
Normal file
253
bench/trajectory.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
"""bench/trajectory.py -- trajectory benchmark (Plan 02-04 Task 4, D-33).
|
||||
|
||||
Generates a deterministic 30-session synthetic corpus following autism/NT
|
||||
interaction pattern models and runs M1..M6 aggregation across it. Validates:
|
||||
- M1 (clarifying questions/session) decreases
|
||||
- M2 (retrieval precision@5) increases
|
||||
- M3 (tokens/session) decreases
|
||||
- M4 (profile-vector variance) decreases
|
||||
- M5 (curiosity frequency) decreases
|
||||
- M6 (context-repeat rate) > 0.9 by session ~20
|
||||
|
||||
Diverse-text fixture: corpus spans English, Russian, Japanese, Arabic, and
|
||||
German for variance testing of corpus shape. NOT a multilingual product
|
||||
mandate — IAI-MCP brain is English-only since Plan 05-08 (default embedder
|
||||
bge-small-en-v1.5). Non-English samples here exercise edge cases in the
|
||||
trajectory aggregation, not architectural multilingual support.
|
||||
|
||||
CLI:
|
||||
python -m bench.trajectory [--n-sessions 30] [--real-logs PATH]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
from iai_mcp.events import write_event
|
||||
from iai_mcp.store import MemoryStore
|
||||
|
||||
|
||||
# Default seed: the same seed always reproduces the same corpus.
DEFAULT_SEED = 42

# Diverse-text samples for corpus-shape variance testing.
# Brain is English-only since Plan 05-08; non-English entries here are
# fixture diversity, not a multilingual product feature.
_LANG_SAMPLES: dict[str, list[str]] = {
    "en": [
        "authentication uses JWT with refresh rotation",
        "db migration scheduled for Friday evening",
        "web cache invalidation on deploy",
        "cli subcommand for trajectory aggregation",
    ],
    "ru": [
        "авторизация использует JWT с обновлением токена",
        "миграция базы данных запланирована на пятницу",
        "инвалидация кэша при деплое",
    ],
    "ja": [
        "認証はJWTとリフレッシュローテーションを使用",
        "データベース移行は金曜日の夕方に予定",
    ],
    "ar": [
        "المصادقة تستخدم JWT مع تدوير الرمز",
        "ترحيل قاعدة البيانات مجدول ليوم الجمعة",
    ],
    "de": [
        "Authentifizierung verwendet JWT mit Token-Rotation",
        "Datenbankmigration für Freitagabend geplant",
    ],
}


def generate_synthetic_corpus(
    n_sessions: int = 30,
    seed: int = DEFAULT_SEED,
) -> list[dict]:
    """Build a deterministic synthetic corpus of ``n_sessions`` sessions.

    Each session dict has the keys ``session_id``, ``records``,
    ``curiosity_events`` and ``trajectory_metrics``.

    Trajectory metrics follow the predicted directions (M1/M3/M4/M5 down,
    M2/M6 up) so run_trajectory_bench has a clean signal to validate.

    Parameters:
        n_sessions: number of sessions to generate (default 30).
        seed: RNG seed. The same ``(n_sessions, seed)`` pair always yields an
            identical corpus, including record and question ids.

    Returns:
        List of session dicts in session order.
    """
    # Local import: the file-level import only brings in ``uuid4``.
    from uuid import UUID

    rng = random.Random(seed)
    # Ids previously came from uuid4(), which broke the determinism promised
    # above. They are drawn from a dedicated seeded stream so the main stream
    # (language choice, record counts, entropies) is left undisturbed.
    id_rng = random.Random(f"iai-bench-trajectory-ids:{seed}")

    def _next_id() -> str:
        # version=4 stamps the RFC 4122 variant/version bits onto the value.
        return str(UUID(int=id_rng.getrandbits(128), version=4))

    languages = list(_LANG_SAMPLES.keys())
    corpus: list[dict] = []

    for i in range(n_sessions):
        session_id = f"synth-{i:03d}"
        # The first len(languages) sessions take one language each, so every
        # language appears even at small corpus sizes (corpus-shape check,
        # not a multilingual product claim); later sessions pick at random.
        if i < len(languages):
            lang = languages[i]
        else:
            lang = rng.choice(languages)
        samples = _LANG_SAMPLES[lang]

        n_records = rng.randint(3, 8)
        records: list[dict] = [
            {
                "id": _next_id(),
                "literal_surface": samples[k % len(samples)],
                "language": lang,
                "tags": [f"topic:t{k % 3}", f"session:{session_id}"],
            }
            for k in range(n_records)
        ]

        # Curiosity events decay over sessions (M5 downward trend).
        n_curiosity = max(0, 6 - (i // 5))
        curiosity_events: list[dict] = [
            {
                "question_id": _next_id(),
                "entropy": float(0.5 + rng.random() * 0.5),
            }
            for _ in range(n_curiosity)
        ]

        # Predicted M1..M6 directions.
        progress = i / max(1, n_sessions - 1)  # 0.0 at start -> 1.0 at end
        m1 = max(0.5, 6.0 * (1.0 - progress))               # clarifying Qs down
        m2 = min(1.0, 0.4 + progress * 0.5)                 # precision@5 up
        m3 = max(1000.0, 3000.0 * (1.0 - 0.6 * progress))   # tokens down
        m4 = max(0.05, 0.5 * (1.0 - progress))              # variance down
        m5 = float(n_curiosity)                             # frequency down
        m6 = min(1.0, 0.4 + progress * 0.55)                # repeat rate up

        corpus.append({
            "session_id": session_id,
            "records": records,
            "curiosity_events": curiosity_events,
            "trajectory_metrics": {
                "m1": m1, "m2": m2, "m3": m3,
                "m4": m4, "m5": m5, "m6": m6,
            },
        })
    return corpus
|
||||
|
||||
|
||||
def run_trajectory_bench(
    corpus: list[dict],
    store_path: Path | str | None = None,
) -> dict:
    """Replay ``corpus`` into a store and report the M1..M6 trends.

    Parameters:
        corpus: session dicts carrying ``session_id``, ``curiosity_events``
            and ``trajectory_metrics``.
        store_path: directory for the MemoryStore; when None a temporary
            directory is created and removed before returning.

    Returns:
        ``{"m1_trend": [...], ..., "m6_trend": [...], "passed": bool}``.
        Trends are in session order. ``passed`` is True when M1/M3/M4/M5 end
        below their first value and M2/M6 end above it; empty trends fail.
    """
    from iai_mcp.trajectory import record_session_metrics

    scratch: tempfile.TemporaryDirectory | None = None
    if store_path is not None:
        root = Path(store_path)
    else:
        scratch = tempfile.TemporaryDirectory(prefix="iai-bench-traj-")
        root = Path(scratch.name)

    def _falls(series: list[float]) -> bool:
        return bool(series) and series[-1] < series[0]

    def _rises(series: list[float]) -> bool:
        return bool(series) and series[-1] > series[0]

    # Predicted direction for each metric.
    expectations = (
        ("m1", _falls), ("m2", _rises), ("m3", _falls),
        ("m4", _falls), ("m5", _falls), ("m6", _rises),
    )

    try:
        store = MemoryStore(path=root)
        trends: dict[str, list[float]] = {name: [] for name, _ in expectations}

        for session in corpus:
            sid = session["session_id"]

            # One curiosity_question event per synthetic question.
            for question in session["curiosity_events"]:
                payload = {
                    "question_id": question["question_id"],
                    "text": "",
                    "tier": "question",
                    "entropy": question["entropy"],
                    "turn": 1,
                    "triggered_by": [],
                }
                write_event(
                    store,
                    kind="curiosity_question",
                    data=payload,
                    severity="info",
                    session_id=sid,
                )

            # Record a copy of the synthetic metrics, then extend each trend.
            metrics = dict(session["trajectory_metrics"])
            record_session_metrics(store, session_id=sid, metrics=metrics)
            for name, series in trends.items():
                series.append(metrics[name])

        passed = all(check(trends[name]) for name, check in expectations)

        report: dict = {f"{name}_trend": series for name, series in trends.items()}
        report["passed"] = passed
        return report
    finally:
        if scratch is not None:
            scratch.cleanup()
|
||||
|
||||
|
||||
def main(
    n_sessions: int = 30,
    seed: int = DEFAULT_SEED,
    real_logs_path: str | None = None,
    store_path: Path | str | None = None,
) -> int:
    """CLI entry: run the trajectory bench on the synthetic corpus.

    Real-log ingestion is not implemented. ``real_logs_path`` is accepted for
    forward compatibility, but the bench always runs on the synthetic corpus.
    When a path is supplied, a ``bench_real_logs_ignored`` JSON event is
    written to stderr so the fallback is disclosed instead of silent.

    Parameters:
        n_sessions: number of synthetic sessions to generate.
        seed: RNG seed for the synthetic corpus.
        real_logs_path: optional path to real session logs (currently ignored).
        store_path: optional store directory passed to run_trajectory_bench.

    Returns:
        0 when the bench result's ``passed`` is truthy, 1 otherwise.
    """
    if real_logs_path:
        # Disclose the stub instead of silently substituting synthetic data.
        reason = (
            "ingestion_not_implemented"
            if Path(real_logs_path).exists()
            else "path_not_found"
        )
        print(
            json.dumps({
                "event": "bench_real_logs_ignored",
                "path": str(real_logs_path),
                "reason": reason,
            }),
            file=sys.stderr,
        )

    corpus = generate_synthetic_corpus(n_sessions=n_sessions, seed=seed)

    out = run_trajectory_bench(corpus, store_path=store_path)
    print(json.dumps(out))
    return 0 if out["passed"] else 1
|
||||
|
||||
|
||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the bench.trajectory command line.

    Flags: ``--n-sessions`` (int, default 30), ``--seed`` (int, default
    ``DEFAULT_SEED``) and ``--real-logs`` (stored as ``real_logs``, default
    None). ``argv=None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(prog="bench.trajectory")
    int_flags = (("--n-sessions", 30), ("--seed", DEFAULT_SEED))
    for flag, fallback in int_flags:
        parser.add_argument(flag, type=int, default=fallback)
    parser.add_argument("--real-logs", dest="real_logs", default=None)
    return parser.parse_args(argv)


if __name__ == "__main__":
    cli = _parse_args()
    raise SystemExit(
        main(
            n_sessions=cli.n_sessions,
            seed=cli.seed,
            real_logs_path=cli.real_logs,
        )
    )
|
||||
316
bench/verbatim.py
Normal file
316
bench/verbatim.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
"""bench/verbatim.py -- benchmark harness + diagnostics.
|
||||
|
||||
Simulates a session gap by inserting N pinned records, flooding the store with
|
||||
`session_gap * noise_per_session` unrelated records, then retrieving each
|
||||
pinned record by its own literal_surface as the cue. Counts byte-exact matches.
|
||||
|
||||
Target: >= ACCURACY_FLOOR (0.99) on pinned records -- / MEM-10.
|
||||
|
||||
Exit codes:
|
||||
- 0 if accuracy >= 0.99
|
||||
- 1 otherwise
|
||||
|
||||
JSON output (one line to stdout):
|
||||
{"accuracy": float, "n_records": int, "session_gap": int,
|
||||
"hits_exact": int, "passed": bool, "floor": 0.99, "noise_mode": str,
|
||||
"skip_l0_seed": bool, "storage_direct": bool, "k": int}
|
||||
|
||||
Plan 05-01 (D5-01) diagnostic flags -- BENCH-ONLY (no production change):
|
||||
--skip-l0-seed : skip _seed_l0_identity to isolate L0 crowding (effect b)
|
||||
--storage-direct : bypass recall(), call store.query_similar directly
|
||||
(isolates provenance-write amplification, effect c)
|
||||
--n : override n_records (default 20)
|
||||
--gap : override session_gap (default 20)
|
||||
--noise-per-session : override noise_per_session (default 10)
|
||||
--k : override k_hits (default max(n_records + 10, 20))
|
||||
|
||||
Design note -- why we bypass dispatch("memory_recall"):
|
||||
The Plan-02 core.memory_recall routes non-empty stores through recall_for_response
|
||||
(Phase 8 entry-point split) which instantiates an Embedder() (downloads
|
||||
bge-small-en-v1.5 from HuggingFace
|
||||
on first call). That's fine for a real runtime but wrong for an offline bench:
|
||||
we need to measure storage-layer verbatim-recall correctness, not embedder
|
||||
warm-up latency. So we call `retrieve.recall` directly with a fixed cue
|
||||
embedding aligned with the pinned records (all-ones vector).
|
||||
|
||||
H-03 noise model (review finding, 2026-04-16):
|
||||
The original noise vector was [-0.5]^384, which gives cosine=-1.0 against the
|
||||
[1.0]^384 cue -- making pinned-vs-noise discrimination a geometric artifact
|
||||
rather than a measurement of the storage layer. The fix uses seeded
|
||||
numpy.random.standard_normal(EMBED_DIM) normalised to unit length. Against a
|
||||
[1.0]^384 cue the expected cosine of a random unit vector is 0 with stddev
|
||||
1/sqrt(EMBED_DIM) ~= 0.05 -- realistic noise geometry, but pinned still wins
|
||||
because cos=+1 >> cos~=0. The bench remains honest about what it measures
|
||||
(literal_surface round-trip under realistic embedding noise, given a fixed
|
||||
cue). A real bge-small-en-v1.5 bench is deferred to Phase 2.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from uuid import uuid4
|
||||
|
||||
import numpy as np
|
||||
|
||||
from iai_mcp.core import _seed_l0_identity
|
||||
from iai_mcp.retrieve import recall
|
||||
from iai_mcp.store import EMBED_DIM, MemoryStore
|
||||
from iai_mcp.types import MemoryRecord
|
||||
|
||||
ACCURACY_FLOOR = 0.99 # OPS-04
|
||||
NOISE_SEED = 20260416 # fixed for reproducibility across runs / CI
|
||||
|
||||
|
||||
def _make_pinned(text: str, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build a pinned semantic benchmark record carrying ``text`` verbatim.

    The record has detail_level=5, pinned=True, never_decay=True and
    never_merge=True, with an all-ones embedding of length ``dim``,
    language "en" and the tags ``["benchmark", "pinned"]``.

    Parameters:
        text: stored as ``literal_surface``.
        dim: embedding length; defaults to the module's ``EMBED_DIM``.
    """
    fields = {
        "id": uuid4(),
        "tier": "semantic",
        "literal_surface": text,
        "aaak_index": "",
        "embedding": [1.0] * dim,
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 5,
        "pinned": True,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": True,
        "never_merge": True,
        "provenance": [],
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": ["benchmark", "pinned"],
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def _random_unit_vector(rng: np.random.Generator, dim: int = EMBED_DIM) -> list[float]:
    """Draw a standard-normal vector of length ``dim`` scaled to unit norm.

    All randomness comes from ``rng``, so a seeded Generator reproduces the
    same vectors on every run. Returns a plain list of floats.
    """
    sample = rng.standard_normal(dim)
    unit = sample / np.linalg.norm(sample)
    return unit.tolist()
|
||||
|
||||
|
||||
def _make_noise(i: int, rng: np.random.Generator, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build an unpinned episodic noise record with a random unit embedding.

    The embedding comes from ``_random_unit_vector(rng, dim=dim)``, so a
    seeded ``rng`` gives reproducible noise. The record has detail_level=2,
    language "en" and no tags.

    Parameters:
        i: index embedded in the record's ``literal_surface``.
        rng: generator that supplies the embedding.
        dim: embedding length; defaults to the module's ``EMBED_DIM``.
    """
    surface = "unrelated session noise record #{}: {}".format(i, "y " * 20)
    fields = {
        "id": uuid4(),
        "tier": "episodic",
        "literal_surface": surface,
        "aaak_index": "",
        "embedding": _random_unit_vector(rng, dim=dim),
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        "provenance": [],
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": [],
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def run_verbatim_bench(
    store: MemoryStore | None = None,
    n_records: int = 20,
    session_gap: int = 20,
    noise_per_session: int = 10,
    seed: int = NOISE_SEED,
    *,
    skip_l0_seed: bool = False,
    storage_direct: bool = False,
    k: int | None = None,
) -> dict:
    """Run the verbatim-recall benchmark.

    Inserts ``n_records`` pinned records, then ``session_gap * noise_per_session``
    noise records, and finally checks for each pinned text whether it appears
    verbatim among the retrieved literal surfaces.

    Parameters:
        store: optional; isolated tmp_path store in tests, default MemoryStore in CLI.
        n_records: how many pinned records to store and recall.
        session_gap: how many "sessions" of noise to interpose between write and recall.
        noise_per_session: noise records per simulated session.
        seed: RNG seed for noise vectors (H-03: reproducibility across runs).
        skip_l0_seed: D5-01 effect (b) isolation -- skip the L0 identity
            seed so pinned records are not competed against by a fixed-embedding
            identity record. BENCH-SCOPE ONLY; production _seed_l0_identity is
            unchanged.
        storage_direct: D5-01 effect (c) isolation -- bypass
            retrieve.recall() and call store.query_similar directly, so the
            per-hit provenance write amplification is removed from the hot loop.
            BENCH-SCOPE ONLY; production recall() is unchanged.
        k: override the top-k passed into recall(k_hits=K) or query_similar(k=K);
            None keeps the historic default of max(n_records + 10, 20).

    Returns a dict with the keys ``accuracy``, ``n_records``, ``session_gap``,
    ``noise_per_session``, ``hits_exact``, ``passed``, ``floor``,
    ``noise_mode``, ``noise_seed``, ``skip_l0_seed``, ``storage_direct`` and
    ``k``. ``passed`` is ``accuracy >= ACCURACY_FLOOR``; ``accuracy`` is 0.0
    when ``n_records`` is not positive.
    """
    s = store if store is not None else MemoryStore()
    if not skip_l0_seed:
        _seed_l0_identity(s)

    # Consult the store's actual embedding dim. An existing Phase 1
    # store may still have 384d records pre-D-35-migration; a fresh store has
    # the default (1024d). Match either transparently.
    dim = s.embed_dim

    pinned_texts = [
        f"Alice said on day {i}: verbatim phrase #{i}-{'x' * 10}"
        for i in range(n_records)
    ]
    pinned_records = [_make_pinned(t, dim=dim) for t in pinned_texts]
    for r in pinned_records:
        s.insert(r)

    # Simulate session_gap * noise_per_session unrelated records.
    # H-03: seeded RNG shared across every noise draw so results are reproducible.
    rng = np.random.default_rng(seed)
    for session_idx in range(session_gap):
        for j in range(noise_per_session):
            s.insert(_make_noise(session_idx * noise_per_session + j, rng, dim=dim))

    # The same all-ones cue embedding is used for every lookup; only cue_text
    # varies between iterations (and only on the recall() path).
    cue_emb = [1.0] * dim
    # k must be >= n_records for every pinned record to have a chance of surfacing.
    # Plus a buffer for the L0 seed + anti-hits tail, so we retrieve a generous top-k.
    effective_k = k if k is not None else max(n_records + 10, 20)
    hits_exact = 0
    for text in pinned_texts:
        if storage_direct:
            # D5-01 (c): bypass recall() -> no per-hit provenance write amplification.
            raw = s.query_similar(cue_emb, k=effective_k)
            literal_surfaces = [rec.literal_surface for rec, _score in raw]
        else:
            # retrieve.recall now defaults to mode='verbatim'
            # (conservative North-Star fallback). The bench's _make_pinned
            # uses tier='semantic' which the verbatim filter would drop.
            # The bench is measuring "verbatim TEXT exact-match recall under
            # noise" — that is independent of the cue-router's verbatim/concept
            # mode (the bench uses synthetic cues, not classifier-tagged
            # natural-language queries). Pin mode='concept' so the bench
            # measures what it has always measured.
            resp = recall(
                store=s,
                cue_embedding=cue_emb,
                cue_text=text,
                session_id="bench-verbatim",
                budget_tokens=5000,
                k_hits=effective_k,
                k_anti=3,
                mode="concept",
            )
            literal_surfaces = [h.literal_surface for h in resp.hits]
        if text in literal_surfaces:
            hits_exact += 1

    # Guard the division so an empty run reports 0.0 instead of raising.
    accuracy = hits_exact / n_records if n_records > 0 else 0.0
    return {
        "accuracy": accuracy,
        "n_records": n_records,
        "session_gap": session_gap,
        "noise_per_session": noise_per_session,
        "hits_exact": hits_exact,
        "passed": accuracy >= ACCURACY_FLOOR,
        "floor": ACCURACY_FLOOR,
        "noise_mode": "random-unit-vectors",
        "noise_seed": seed,
        # diagnostic traceability keys.
        "skip_l0_seed": bool(skip_l0_seed),
        "storage_direct": bool(storage_direct),
        "k": int(effective_k),
    }
|
||||
|
||||
|
||||
def _build_arg_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="bench.verbatim",
|
||||
description="OPS-04 / verbatim recall benchmark + diagnostics",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-l0-seed",
|
||||
action="store_true",
|
||||
help="D5-01 diagnostic: skip _seed_l0_identity to isolate L0 crowding effect",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--storage-direct",
|
||||
action="store_true",
|
||||
help="D5-01 diagnostic: bypass recall(), call store.query_similar directly",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--n", "--n-records",
|
||||
dest="n_records",
|
||||
type=int,
|
||||
default=20,
|
||||
help="pinned record count (default 20)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gap", "--session-gap",
|
||||
dest="session_gap",
|
||||
type=int,
|
||||
default=20,
|
||||
help="session gap -- how many noise sessions between writes and recall (default 20)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--noise-per-session",
|
||||
type=int,
|
||||
default=10,
|
||||
help="noise records per simulated session (default 10)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--k",
|
||||
type=int,
|
||||
default=None,
|
||||
help="override k_hits (default: max(n_records + 10, 20))",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, run the benchmark, print the result.

    Parameters:
        argv: argument list handed to the parser; None lets argparse read
            the process command line.

    The result dict is printed to stdout as a single JSON document. The noise
    seed is not exposed as a flag, so the benchmark's default seed is used.

    Returns 0 when the benchmark reports ``passed``, otherwise 1.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)
    result = run_verbatim_bench(
        n_records=args.n_records,
        session_gap=args.session_gap,
        noise_per_session=args.noise_per_session,
        skip_l0_seed=args.skip_l0_seed,
        storage_direct=args.storage_direct,
        k=args.k,
    )
    print(json.dumps(result))
    return 0 if result["passed"] else 1
|
||||
|
||||
|
||||
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue