Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
258 lines
8.6 KiB
Python
258 lines
8.6 KiB
Python
"""Phase 8 redesign (08-CONTEXT.md D-07): benchmark top-K entry-point contract.
|
|
|
|
Tests the new public function `recall_for_benchmark(...)` introduced by
|
|
Plan 08-02. Contract:
|
|
|
|
- Signature: store, graph, assignment, rich_club, embedder, cue,
|
|
session_id, k_hits=10, profile_state=None, turn=0, mode='concept'.
|
|
- NO `budget_tokens` parameter — calling with `budget_tokens=1500`
|
|
MUST raise TypeError.
|
|
- Returns RecallResponse with `len(hits) <= k_hits` (cap honoured).
|
|
- Hits are sorted by score descending (R5 deterministic tie-break by
|
|
UUID-asc preserved from `_recall_core`).
|
|
- mode plumbing: bench callers pass `mode="concept"`; the parameter
|
|
threads through to `_recall_core` unchanged.
|
|
|
|
Cross-file: see `tests/test_recall_for_response.py` for the production
|
|
budget-pack contract, and `tests/test_recall_core_unit.py` for the
|
|
underlying `_recall_core` shape.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
|
|
from iai_mcp.community import CommunityAssignment
|
|
from iai_mcp.graph import MemoryGraph
|
|
from iai_mcp.store import MemoryStore
|
|
from iai_mcp.types import EMBED_DIM, MemoryRecord, RecallResponse
|
|
|
|
|
|
# ------------------------------------------------------------ test fixtures
|
|
|
|
|
|
class _FakeEmbedder:
    """Deterministic embedder double for the benchmark-recall tests.

    Every call hands back a fresh copy of one fixed vector, so a test
    controls the cue embedding simply by choosing that vector at
    construction time. The input text is ignored.
    """

    DIM = EMBED_DIM

    def __init__(self, vec: list[float] | None = None) -> None:
        """Store ``vec``; when omitted, use the unit vector along axis 0."""
        if vec is None:
            vec = [1.0] + [0.0] * (EMBED_DIM - 1)
        self._vec = vec

    def embed(self, text: str) -> list[float]:
        """Return a copy of the configured vector, regardless of ``text``."""
        return list(self._vec)

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return one independent copy of the configured vector per text."""
        copies: list[list[float]] = []
        for _ in texts:
            copies.append(list(self._vec))
        return copies
|
|
|
|
|
|
def _make(
    vec: list[float],
    text: str = "rec",
    aaak: str = "",
    tier: str = "episodic",
) -> MemoryRecord:
    """Build a minimal ``MemoryRecord`` around the given embedding.

    Only the embedding, literal surface, AAAK index and tier are
    caller-controlled; every other field gets a fixed neutral value.
    The record receives a fresh random UUID, and ``created_at`` /
    ``updated_at`` share one timezone-aware UTC timestamp.
    """
    stamp = datetime.now(timezone.utc)
    return MemoryRecord(
        # Identity and caller-supplied content.
        id=uuid4(),
        tier=tier,
        literal_surface=text,
        aaak_index=aaak,
        embedding=vec,
        # Graph placement: not assigned to any community yet.
        community_id=None,
        centrality=0.0,
        detail_level=2,
        # Retention / review state: all neutral.
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        # Bookkeeping.
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=[],
        language="en",
    )
|
|
|
|
|
|
def _build_store_and_graph(
    tmp_path,
    n: int,
    surface_len: int = 4,
) -> tuple[MemoryStore, MemoryGraph, list[MemoryRecord]]:
    """Create a store and a graph populated with ``n`` one-hot records.

    Record ``i`` has its single 1.0 on axis ``i % EMBED_DIM`` and a literal
    surface of ``surface_len`` copies of ``"x"``. Each record is inserted
    into a ``MemoryStore`` rooted at ``tmp_path / "lancedb"`` and then added
    as a node to a fresh ``MemoryGraph``, whose node attributes are filled
    in directly on the underlying ``_nx`` graph.

    Returns the ``(store, graph, records)`` triple, with records in
    insertion order.
    """
    store = MemoryStore(path=tmp_path / "lancedb")
    surface = "x" * surface_len  # identical for every record

    records: list[MemoryRecord] = []
    for index in range(n):
        one_hot = [0.0] * EMBED_DIM
        one_hot[index % EMBED_DIM] = 1.0
        record = _make(one_hot, text=surface)
        store.insert(record)
        records.append(record)

    graph = MemoryGraph()
    for record in records:
        graph.add_node(
            record.id,
            community_id=None,
            embedding=list(record.embedding),
        )
        # Write the node payload straight onto the node's attribute mapping.
        node_attrs = {
            "embedding": list(record.embedding),
            "surface": record.literal_surface,
            "centrality": 0.0,
            "tier": record.tier,
            "tags": [],
            "language": "en",
        }
        graph._nx.nodes[str(record.id)].update(node_attrs)

    return store, graph, records
|
|
|
|
|
|
def _flat_assignment(recs: list[MemoryRecord]) -> CommunityAssignment:
    """Place every record in one freshly minted community.

    The single community's centroid is the unit vector along axis 0, it is
    the only entry in ``top_communities``, and its mid-region lists every
    record id in input order.
    """
    community = uuid4()
    member_ids = [rec.id for rec in recs]
    axis0_centroid = [1.0] + [0.0] * (EMBED_DIM - 1)

    return CommunityAssignment(
        node_to_community=dict.fromkeys(member_ids, community),
        community_centroids={community: axis0_centroid},
        modularity=0.0,
        backend="flat",
        top_communities=[community],
        mid_regions={community: member_ids},
    )
|
|
|
|
|
|
# -------------------------------------------------- contract / signature tests
|
|
|
|
|
|
def test_recall_for_benchmark_no_budget_tokens_param(tmp_path) -> None:
    """Test 6: passing `budget_tokens=1500` must raise TypeError.

    The benchmark entry point is deliberately separate from the
    budget-packing one. If it accepted a token budget, an optional
    argument could let the two contracts silently swap semantics.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    store, graph, records = _build_store_and_graph(tmp_path, n=5)
    flat = _flat_assignment(records)

    with pytest.raises(TypeError):
        recall_for_benchmark(
            store=store,
            graph=graph,
            assignment=flat,
            rich_club=[],
            embedder=_FakeEmbedder(),
            cue="test",
            session_id="s6",
            budget_tokens=1500,  # not part of the benchmark signature
        )
|
|
|
|
|
|
def test_recall_for_benchmark_returns_at_most_k_hits(tmp_path) -> None:
    """Test 7: the `k_hits` cap is honoured (`len(hits) <= k_hits`).

    Twelve records are stored and five hits are requested; the test
    expects exactly five hits in a `RecallResponse`.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    store, graph, records = _build_store_and_graph(tmp_path, n=12)
    flat = _flat_assignment(records)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=flat,
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s7",
        k_hits=5,
    )

    assert isinstance(response, RecallResponse)
    assert len(response.hits) == 5
|
|
|
|
|
|
def test_recall_for_benchmark_hits_sorted_by_score_desc(tmp_path) -> None:
    """Test 8: hits are sorted by `score` descending (R5 deterministic order).

    An empty hit list is trivially "sorted", so the test first requires at
    least one hit; otherwise a regression that returns nothing would pass
    the ordering check vacuously.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    # 8 one-hot records, one per axis (distinct axes as long as
    # EMBED_DIM >= 8). The default fake embedder places the cue on axis 0.
    store, graph, recs = _build_store_and_graph(tmp_path, n=8)
    assignment = _flat_assignment(recs)

    resp = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=assignment,
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s8",
        k_hits=10,
    )

    scores = [h.score for h in resp.hits]
    # Guard against a vacuous pass: [] == sorted([]) is always true.
    assert scores, "recall_for_benchmark returned no hits; ordering check is vacuous"
    # Non-strict comparison on purpose: tied scores are allowed.
    assert scores == sorted(scores, reverse=True), (
        f"recall_for_benchmark hits not sorted desc by score: {scores}"
    )
|
|
|
|
|
|
def test_recall_for_benchmark_returns_fewer_when_pool_is_small(tmp_path) -> None:
    """Test 9: with k_hits=20 and only 8 ranked records, returns 8 hits.

    Here the limit is the natural exhaustion of `_recall_core.scored_hits`,
    not the `k_hits` cap.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    pool_size = 8
    store, graph, records = _build_store_and_graph(tmp_path, n=pool_size)
    flat = _flat_assignment(records)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=flat,
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s9",
        k_hits=20,
    )

    # Asking for 20 from a pool of 8 yields the whole pool.
    assert len(response.hits) == pool_size
|
|
|
|
|
|
def test_recall_for_benchmark_budget_used_is_informational(tmp_path) -> None:
    """Test 10: `budget_used` reflects the per-hit token estimate sum (not a cap)."""
    from iai_mcp.pipeline import recall_for_benchmark

    # Each surface is 200 characters, which the test expects to be estimated
    # at 50 tokens per hit. Three hits out of five records therefore give
    # budget_used = 3 * 50 = 150, reported for information only.
    store, graph, records = _build_store_and_graph(
        tmp_path, n=5, surface_len=200,
    )
    flat = _flat_assignment(records)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=flat,
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s10",
        k_hits=3,
    )

    assert len(response.hits) == 3
    assert response.budget_used == 150
|
|
|
|
|
|
def test_recall_for_benchmark_threads_mode_to_core(tmp_path) -> None:
    """D-02 mode plumbing: `mode='concept'` (bench default) flows through.

    The mode passed in must be reported back as `cue_mode` on the response.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    store, graph, records = _build_store_and_graph(tmp_path, n=5)
    flat = _flat_assignment(records)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=flat,
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s-mode",
        k_hits=10,
        mode="concept",
    )

    assert response.cue_mode == "concept"
|
|
|
|
|
|
def test_recall_for_benchmark_signature_has_no_budget_tokens_param() -> None:
    """The function signature exposes `k_hits` and `mode` but NOT `budget_tokens`.

    Checked by introspection, so no store or graph is needed.
    """
    import inspect

    from iai_mcp.pipeline import recall_for_benchmark

    params = inspect.signature(recall_for_benchmark).parameters

    assert "k_hits" in params
    assert "mode" in params
    assert "budget_tokens" not in params, (
        "recall_for_benchmark signature must NOT carry a budget_tokens "
        "parameter (D-07 contract split — the entry-point split exists so "
        "the two response shapes can never silently swap via an optional kwarg)."
    )
|
|
|
|
|
|
def test_recall_for_benchmark_default_k_hits_10() -> None:
    """The default k_hits is 10 (matches LongMemEval-S protocol convention)."""
    import inspect

    from iai_mcp.pipeline import recall_for_benchmark

    k_hits_param = inspect.signature(recall_for_benchmark).parameters["k_hits"]
    assert k_hits_param.default == 10
|