# iai-mcp-opencode/tests/test_recall_for_benchmark.py

"""Phase 8 redesign (08-CONTEXT.md D-07): benchmark top-K entry-point contract.

Tests the new public function `recall_for_benchmark(...)` introduced by
Plan 08-02. Contract:

- Signature: store, graph, assignment, rich_club, embedder, cue,
  session_id, k_hits=10, profile_state=None, turn=0, mode='concept'.
- NO `budget_tokens` parameter — calling with `budget_tokens=1500`
  MUST raise TypeError.
- Returns RecallResponse with `len(hits) <= k_hits` (cap honoured).
- Hits are sorted by score descending (R5 deterministic tie-break by
  UUID-asc preserved from `_recall_core`).
- mode plumbing: bench callers pass `mode="concept"`; the parameter
  threads through to `_recall_core` unchanged.

Cross-file: see `tests/test_recall_for_response.py` for the production
budget-pack contract, and `tests/test_recall_core_unit.py` for the
underlying `_recall_core` shape.
"""
from __future__ import annotations

from datetime import datetime, timezone
from uuid import uuid4

import pytest

from iai_mcp.community import CommunityAssignment
from iai_mcp.graph import MemoryGraph
from iai_mcp.store import MemoryStore
from iai_mcp.types import EMBED_DIM, MemoryRecord, RecallResponse


# ------------------------------------------------------------ test fixtures


class _FakeEmbedder:
    """Embedder test double that returns one fixed vector for any input.

    The vector is chosen at construction time, so each test controls the
    embedding its cue receives. When no vector is supplied, the default is
    1.0 on axis 0 followed by ``EMBED_DIM - 1`` zeros.
    """

    DIM = EMBED_DIM

    def __init__(self, vec: list[float] | None = None) -> None:
        if vec is None:
            # Default cue embedding: unit weight on axis 0, zeros elsewhere.
            vec = [1.0, *([0.0] * (EMBED_DIM - 1))]
        self._vec = vec

    def embed(self, text: str) -> list[float]:
        """Return a fresh copy of the configured vector; ``text`` is ignored."""
        return [*self._vec]

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return one independent copy of the configured vector per text."""
        batch: list[list[float]] = []
        for _ in texts:
            batch.append([*self._vec])
        return batch


def _make(
    vec: list[float], text: str = "rec", aaak: str = "", tier: str = "episodic",
) -> MemoryRecord:
    """Build a `MemoryRecord` with a fresh UUID and neutral default fields.

    Only the embedding, literal surface, AAAK index and tier are
    configurable; every other field is set to a fixed value. `created_at`
    and `updated_at` share a single UTC timestamp taken at call time.
    """
    stamp = datetime.now(timezone.utc)
    fields = {
        # Identity and caller-controlled content.
        "id": uuid4(),
        "tier": tier,
        "literal_surface": text,
        "aaak_index": aaak,
        "embedding": vec,
        # Graph placement: no community, zero centrality.
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        # Retention flags and review state, all at their "off" values.
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        # Bookkeeping.
        "provenance": [],
        "created_at": stamp,
        "updated_at": stamp,
        "tags": [],
        "language": "en",
    }
    return MemoryRecord(**fields)


def _build_store_and_graph(
    tmp_path, n: int, surface_len: int = 4,
) -> tuple[MemoryStore, MemoryGraph, list[MemoryRecord]]:
    """Create a store and a graph populated with `n` one-hot records.

    Record ``i`` has 1.0 at index ``i % EMBED_DIM`` and 0.0 elsewhere, and
    its literal surface is ``"x"`` repeated `surface_len` times. All records
    are inserted into a `MemoryStore` rooted at ``tmp_path / "lancedb"``
    first; afterwards each one is added to a new `MemoryGraph` and its node
    attributes are filled in directly on the underlying ``_nx`` graph.

    Returns the ``(store, graph, records)`` triple, records in insertion order.
    """
    store = MemoryStore(path=tmp_path / "lancedb")

    records: list[MemoryRecord] = []
    for index in range(n):
        hot_axis = index % EMBED_DIM
        one_hot = [1.0 if axis == hot_axis else 0.0 for axis in range(EMBED_DIM)]
        record = _make(one_hot, text="x" * surface_len)
        store.insert(record)
        records.append(record)

    graph = MemoryGraph()
    for record in records:
        graph.add_node(
            record.id, community_id=None, embedding=list(record.embedding),
        )
        # Write the node attributes straight onto the backing graph node,
        # keyed by the stringified record id.
        node_attrs = {
            "embedding": list(record.embedding),
            "surface": record.literal_surface,
            "centrality": 0.0,
            "tier": record.tier,
            "tags": [],
            "language": "en",
        }
        graph._nx.nodes[str(record.id)].update(node_attrs)

    return store, graph, records


def _flat_assignment(recs: list[MemoryRecord]) -> CommunityAssignment:
    """Place every record in `recs` into a single freshly-minted community.

    The community's centroid is 1.0 on axis 0 followed by zeros, it is the
    only entry in `top_communities`, and its mid-region lists all record ids
    in input order. Modularity is 0.0 and the backend label is ``"flat"``.
    """
    community = uuid4()
    member_ids = [record.id for record in recs]
    axis0_centroid = [1.0, *([0.0] * (EMBED_DIM - 1))]
    return CommunityAssignment(
        node_to_community=dict.fromkeys(member_ids, community),
        community_centroids={community: axis0_centroid},
        modularity=0.0,
        backend="flat",
        top_communities=[community],
        mid_regions={community: member_ids},
    )


# -------------------------------------------------- contract / signature tests


def test_recall_for_benchmark_no_budget_tokens_param(tmp_path) -> None:
    """Test 6: passing `budget_tokens` to the benchmark entry point fails.

    The top-K benchmark contract deliberately has no token-budget parameter.
    If it accepted one as an optional argument, the benchmark and production
    contracts could silently swap semantics, so the call must raise
    `TypeError`.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    store, graph, records = _build_store_and_graph(tmp_path, n=5)
    valid_kwargs = {
        "store": store,
        "graph": graph,
        "assignment": _flat_assignment(records),
        "rich_club": [],
        "embedder": _FakeEmbedder(),
        "cue": "test",
        "session_id": "s6",
    }

    # `budget_tokens` is the one kwarg that must not exist on this function.
    with pytest.raises(TypeError):
        recall_for_benchmark(**valid_kwargs, budget_tokens=1500)


def test_recall_for_benchmark_returns_at_most_k_hits(tmp_path) -> None:
    """Test 7: the `k_hits` cap is honoured.

    Twelve records are available and five are requested, so the response
    must be a `RecallResponse` carrying exactly five hits.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    requested = 5
    store, graph, records = _build_store_and_graph(tmp_path, n=12)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=_flat_assignment(records),
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s7",
        k_hits=requested,
    )

    assert isinstance(response, RecallResponse)
    assert len(response.hits) == requested


def test_recall_for_benchmark_hits_sorted_by_score_desc(tmp_path) -> None:
    """Test 8: hits are sorted by `score` descending (R5 deterministic order).

    The response must also contain at least one hit; otherwise the ordering
    comparison below would hold trivially for an empty list and the test
    would pass without checking anything.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    # 8 one-hot records, record i on axis i % EMBED_DIM. The default fake
    # embedder puts the cue on axis 0, so record 0 matches the cue's axis.
    # NOTE(review): the axes are distinct only if EMBED_DIM >= 8 — confirm.
    store, graph, recs = _build_store_and_graph(tmp_path, n=8)
    assignment = _flat_assignment(recs)

    resp = recall_for_benchmark(
        store=store, graph=graph, assignment=assignment,
        rich_club=[], embedder=_FakeEmbedder(),
        cue="test", session_id="s8", k_hits=10,
    )

    # Guard against a vacuous pass: an empty list is trivially "sorted".
    assert resp.hits, "recall_for_benchmark returned no hits; ordering unchecked"

    scores = [h.score for h in resp.hits]
    assert scores == sorted(scores, reverse=True), (
        f"recall_for_benchmark hits not sorted desc by score: {scores}"
    )


def test_recall_for_benchmark_returns_fewer_when_pool_is_small(tmp_path) -> None:
    """Test 9: asking for 20 hits from an 8-record pool yields 8 hits.

    Here the limiting factor is the number of ranked records available,
    not the `k_hits` cap.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    pool_size = 8
    store, graph, records = _build_store_and_graph(tmp_path, n=pool_size)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=_flat_assignment(records),
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s9",
        k_hits=20,
    )

    # k_hits exceeds the pool, so the whole pool comes back.
    assert len(response.hits) == pool_size


def test_recall_for_benchmark_budget_used_is_informational(tmp_path) -> None:
    """Test 10: `budget_used` reports the summed per-hit token estimate.

    The value is informational only; it does not limit how many hits are
    returned.
    """
    from iai_mcp.pipeline import recall_for_benchmark

    # Five records with 200-character surfaces, three hits requested.
    # The expected total of 150 is 3 hits at 50 tokens each, which
    # presumably reflects a 4-characters-per-token estimate.
    store, graph, records = _build_store_and_graph(
        tmp_path, n=5, surface_len=200,
    )

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=_flat_assignment(records),
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s10",
        k_hits=3,
    )

    assert len(response.hits) == 3
    assert response.budget_used == 150


def test_recall_for_benchmark_threads_mode_to_core(tmp_path) -> None:
    """D-02 mode plumbing: an explicit `mode='concept'` shows up as `cue_mode`."""
    from iai_mcp.pipeline import recall_for_benchmark

    requested_mode = "concept"
    store, graph, records = _build_store_and_graph(tmp_path, n=5)

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=_flat_assignment(records),
        rich_club=[],
        embedder=_FakeEmbedder(),
        cue="test",
        session_id="s-mode",
        k_hits=10,
        mode=requested_mode,
    )

    assert response.cue_mode == requested_mode


def test_recall_for_benchmark_signature_has_no_budget_tokens_param() -> None:
    """Signature check: `k_hits` and `mode` are present, `budget_tokens` is not."""
    import inspect
    from iai_mcp.pipeline import recall_for_benchmark

    params = inspect.signature(recall_for_benchmark).parameters

    for required in ("k_hits", "mode"):
        assert required in params

    assert "budget_tokens" not in params, (
        "recall_for_benchmark signature must NOT carry a budget_tokens "
        "parameter (D-07 contract split — the entry-point split exists so "
        "the two response shapes can never silently swap via an optional kwarg)."
    )


def test_recall_for_benchmark_default_k_hits_10() -> None:
    """`k_hits` defaults to 10 (the LongMemEval-S protocol convention)."""
    import inspect
    from iai_mcp.pipeline import recall_for_benchmark

    k_hits_param = inspect.signature(recall_for_benchmark).parameters["k_hits"]
    assert k_hits_param.default == 10