iai-mcp-opencode/tests/test_loss_qids_regression.py

"""Phase 8 redesign regression-fence tests (post-redesign port).

Two layers — synthetic fence (3 parametrized tests) ported to
recall_for_benchmark; real-data smoke ported to debug_pipeline_loss.py
(08-02 migrated this script).

  1. Synthetic fence (test_synthetic_*) — fast, no network, no HF cache;
     constructs degenerate cold-start fixtures (gate_coverage < 0.10) on
     small in-memory stores and asserts recall_for_benchmark R@5 >=
     retrieve_recall R@5 / R@10 >= R@10.

  2. Real-data smoke (test_real_qids_smoke) — env-gated on the HF cache
     being warm; subprocess-runs bench/lme500/debug_pipeline_loss.py
     against the FULL set of 7 R@5 loss-qids identified in
     the published LongMemEval-S bench report (extended from the 3-qid v1-trace set
     to fence the complete unit-level proxy on the construction
     host) and asserts every verdict reads 'no_loss'.

Fences the the published LongMemEval-S bench report regression: Y-X = -0.012 R@5 /
-0.030 R@10 driven by stage_2_community_gate verdicts. The 7 R@5
loss-qids and 16 R@10 loss-qids all share the same root cause
(Leiden 1-record-per-community on cold-start stores). redesign
(D-01 shared-cosine, gate-as-diagnostic, K_CANDIDATES=200,
D-07 entry-point split) closes the regression by reading the candidate
pool from cosine top-K instead of gate-restricted candidates.
"""
from __future__ import annotations

import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from uuid import UUID, uuid4

import pytest

os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")

_HF_CACHE = Path(
    os.environ.get("HF_HOME") or (Path.home() / ".cache" / "huggingface")
)
HAS_LONGMEMEVAL_CACHE = any(_HF_CACHE.rglob("longmemeval_s")) if _HF_CACHE.exists() else False
HAS_BGE_SMALL_CACHE = any(_HF_CACHE.rglob("*bge-small-en*")) if _HF_CACHE.exists() else False


def _make_record(content: str, session_id: str, role: str, embedding: list[float]):
    from iai_mcp.types import MemoryRecord
    now = datetime.now(timezone.utc)
    return MemoryRecord(
        id=uuid4(),
        tier="episodic",
        literal_surface=content,
        aaak_index="",
        embedding=embedding,
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=now,
        updated_at=now,
        tags=["lme_synthetic", f"role:{role}", f"session:{session_id}"],
        language="en",
    )


def _r_at_k_session_ids(retrieved_record_ids, id_to_session, gold_session_ids, k):
    retrieved_sessions = [id_to_session.get(rid, "?") for rid in retrieved_record_ids[:k]]
    return 1.0 if any(s in gold_session_ids for s in retrieved_sessions) else 0.0


@pytest.mark.skipif(
    not HAS_BGE_SMALL_CACHE,
    reason="bge-small-en-v1.5 model not cached locally; synthetic fence requires real embeddings",
)
@pytest.mark.parametrize(
    "n_haystack,n_gold_session,gold_session_count,cue_text,gold_text_template",
    [
        # single-session-user shape: small gold session in a haystack of distractors
        (60, 1, 4, "what did I tell you about my dog Rex on Tuesday?",
         "I have a dog named Rex who is a golden retriever and loves the park"),
        # multi-session shape: gold spread across multiple sessions
        (120, 3, 12, "tell me about the Python build error I was debugging",
         "The build error was traced to a missing __init__.py in the bench module"),
        # single-session-preference shape: low gold count, lots of distractor noise
        (80, 1, 3, "what coffee preference did I share?",
         "My favorite coffee is a single-origin Ethiopian pour-over with no milk"),
    ],
    ids=["single-session-user", "multi-session", "single-session-preference"],
)
def test_synthetic_pipeline_no_regression_vs_baseline(
    tmp_path,
    n_haystack,
    n_gold_session,
    gold_session_count,
    cue_text,
    gold_text_template,
):
    """Y (recall_for_benchmark) R@5 must be >= X (retrieve_recall) R@5 on
    the cold-start synthetic fixture that exercises gate_coverage < 0.10.

    redesign port: Wave 1 + Wave 2 split the OLD recall entry
    point into a contract pair — top-K retrieval lives behind
    recall_for_benchmark(k_hits=10), production answer-packing lives
    behind recall_for_response(budget_tokens). The benchmark prong (Y)
    is what the published LongMemEval-S bench measures, so we fence Y vs X under the
    benchmark-shape entry point.
    """
    import asyncio
    from iai_mcp.embed import embedder_for_store
    from iai_mcp.pipeline import recall_for_benchmark
    from iai_mcp.retrieve import build_runtime_graph, recall as retrieve_recall
    from iai_mcp.store import MemoryStore

    store = MemoryStore(path=tmp_path / "lancedb")
    asyncio.run(store.enable_async_writes(coalesce_ms=50, max_batch=128))
    embedder = embedder_for_store(store)

    id_to_session: dict[UUID, str] = {}
    gold_record_ids: set[UUID] = set()
    gold_session_ids: set[str] = set()

    # Insert gold session(s) — `gold_session_count` records per session.
    for gs_idx in range(n_gold_session):
        session_id = f"gold-{gs_idx:03d}"
        gold_session_ids.add(session_id)
        for k in range(gold_session_count):
            # Slightly varied gold text so each record has a distinct embedding
            # but they all cluster around the cue topic.
            content = f"{gold_text_template} (turn {k} session {gs_idx})"
            vec = embedder.embed(content)
            rec = _make_record(content, session_id, role="user", embedding=vec)
            store.insert(rec)
            id_to_session[rec.id] = session_id
            gold_record_ids.add(rec.id)

    # Insert distractor haystack — unrelated topics across many sessions
    # to force Leiden into many small communities (cold-start trigger).
    distractor_topics = [
        "I went to the grocery store today and bought apples",
        "The weather has been rainy all week long here",
        "I am learning to play the piano this year",
        "My favorite TV show is about cooking competitions",
        "The new garden tools arrived in the mail yesterday",
        "I read an interesting book about ancient Rome recently",
        "The car needs an oil change next month sometime",
        "I decided to repaint the bedroom walls light blue",
        "My friend recommended a great Italian restaurant nearby",
        "I found an old photograph from my college years today",
    ]
    for i in range(n_haystack):
        session_id = f"distractor-{i // 3:04d}"  # 3 turns per session
        content = distractor_topics[i % len(distractor_topics)] + f" (#{i})"
        vec = embedder.embed(content)
        rec = _make_record(content, session_id, role="user", embedding=vec)
        store.insert(rec)
        id_to_session[rec.id] = session_id

    asyncio.run(store.disable_async_writes())

    graph, assignment, rich_club = build_runtime_graph(store)

    # Sanity: this fixture MUST exercise the cold-start trigger.
    # If it doesn't (graph is too healthy / Leiden produced large communities),
    # the test would be vacuously green; assert we're actually testing the bug-class.
    cue_emb = embedder.embed(cue_text)

    # Baseline X
    resp_x = retrieve_recall(
        store=store,
        cue_embedding=cue_emb,
        cue_text=cue_text,
        session_id="phase8-fence-x",
        budget_tokens=1500,
        k_hits=10,
        k_anti=0,
    )
    x_record_ids = [h.record_id for h in resp_x.hits]
    r5_x = _r_at_k_session_ids(x_record_ids, id_to_session, gold_session_ids, 5)
    r10_x = _r_at_k_session_ids(x_record_ids, id_to_session, gold_session_ids, 10)

    # Pipeline Y — benchmark entry point (k_hits=10).
    resp_y = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=assignment,
        rich_club=rich_club,
        embedder=embedder,
        cue=cue_text,
        session_id="phase8-fence-y",
        k_hits=10,
        profile_state=None,
        turn=0,
        mode="concept",
    )
    y_record_ids = [h.record_id for h in resp_y.hits]
    r5_y = _r_at_k_session_ids(y_record_ids, id_to_session, gold_session_ids, 5)
    r10_y = _r_at_k_session_ids(y_record_ids, id_to_session, gold_session_ids, 10)

    # The fence: Y must not regress against X on this synthetic cold-start fixture.
    assert r5_y >= r5_x, (
        f"recall_for_benchmark R@5 ({r5_y}) regressed against retrieve_recall R@5 ({r5_x}); "
        f"this is exactly the the published LongMemEval-S bench report regression closed. "
        f"Y record_ids: {y_record_ids[:5]}; X record_ids: {x_record_ids[:5]}; "
        f"gold_sessions: {gold_session_ids}; n_gold_records: {len(gold_record_ids)}; "
        f"n_communities: {len(assignment.mid_regions)}"
    )
    assert r10_y >= r10_x, (
        f"recall_for_benchmark R@10 ({r10_y}) regressed against retrieve_recall R@10 ({r10_x})"
    )


# ============================================================================
# Layer 2: Real-data smoke
# ============================================================================
#
# Subprocess-runs bench/lme500/debug_pipeline_loss.py against the FULL
# set of 7 R@5 loss-qids identified in the published LongMemEval-S bench report and
# asserts every SUMMARY-table verdict reads 'no_loss' (post-redesign
# invariant; hard floor unit-level proxy).
#
# redesign port: the 3-qid v1-trace set
# ({726462e0, 06f04340, d3ab962e}) was the originally-traced subset.
# This test extends to the full 7 R@5 loss-qids per the phase scope so
# the unit-level proxy fences the complete hard floor on the
# construction host (08-PLAN-CHECK.md F2 option (a) — non-deferrable).
#
# This test is env-gated on the HuggingFace cache containing both the
# bge-small-en-v1.5 embedder weights and the longmemeval_s dataset
# (~150 MB total). On a cold cache it would download those at test
# time, so we skip rather than block CI.
#
# Wall-clock bound: each qid takes ~30-90s for embedder + dataset
# parse + per-row store + graph + recall. 7 qids = ~7-12 minutes total
# on Mac Studio M2 Max. The subprocess timeout is set to 1200s (20 min)
# so CI environments with slower disk / cold model cache have headroom.


@pytest.mark.skipif(
    not (HAS_LONGMEMEVAL_CACHE and HAS_BGE_SMALL_CACHE),
    reason="LongMemEval-S dataset or bge-small-en-v1.5 embedder not cached locally",
)
def test_real_qids_smoke_no_loss_verdict():
    """End-to-end smoke: 7 R@5 loss-qids must all read 'no_loss' post-redesign.

    Re-runs bench/lme500/debug_pipeline_loss.py against the full 7-qid
    R@5 loss set from the published LongMemEval-S bench report. redesign
    (D-01 shared-cosine + gate-as-diagnostic + K_CANDIDATES=200
    + entry-point split) every qid must surface gold inside top-10
    via the cosine pool, regardless of the categorical structure
    (Leiden 1-record-per-community on cold-start stores).
    """
    qids = [
        "726462e0",
        "06f04340",
        "38146c39",
        "d3ab962e",
        "8e91e7d9",
        "gpt4_b0863698",
        "9a707b82",
    ]
    repo_root = Path(__file__).resolve().parents[1]
    script = repo_root / "bench" / "lme500" / "debug_pipeline_loss.py"
    assert script.exists(), f"missing script: {script}"

    env = dict(os.environ)
    env.setdefault("PYTHONPATH", f"{repo_root / 'src'}:{repo_root}")
    env["TRANSFORMERS_VERBOSITY"] = "error"

    proc = subprocess.run(
        [sys.executable, str(script), *qids],
        cwd=repo_root,
        capture_output=True,
        text=True,
        timeout=1200,
        env=env,
    )
    stdout = proc.stdout or ""
    stderr = proc.stderr or ""
    assert proc.returncode == 0, (
        f"debug_pipeline_loss.py exited rc={proc.returncode}\n"
        f"--- stderr ---\n{stderr[-2000:]}"
    )

    # Parse the SUMMARY table at the end of stdout. Each row shape:
    #   <qid> <padded qtype> <padded verdict> <gate->reach> (of N)
    verdicts: dict[str, str] = {}
    for ln in stdout.splitlines():
        for q in qids:
            if ln.startswith(q):
                # tokens: [qid, qtype-words..., verdict, gate->reach, of, N]
                # The verdict column is positionally fixed (32-char field).
                # Split on whitespace and find the 'no_loss' / 'stage_*' token.
                for tok in ln.split():
                    if tok in (
                        "no_loss",
                        "stage_2_community_gate",
                        "stage_3_4_seeds_or_spread",
                        "stage_5_rank",
                        "trace_failed",
                    ):
                        verdicts[q] = tok
                        break
                break

    assert len(verdicts) == 7, (
        f"expected verdicts for all 7 qids; got {verdicts}\n"
        f"--- stdout tail ---\n{stdout[-3000:]}"
    )
    for qid in qids:
        assert verdicts[qid] == "no_loss", (
            f"qid={qid} expected 'no_loss' (post-redesign); "
            f"got {verdicts[qid]!r}. Full verdicts={verdicts}.\n"
            f"--- stdout tail ---\n{stdout[-3000:]}"
        )