iai-mcp-opencode/tests/test_sleep_consolidation_streaming.py

"""Plan 07.7-03 W3 — _tier0_schema_surfacing rewritten on iter_record_columns(["tags_json"]).

RED phase: tests 1+2 fail until ``sleep._tier0_schema_surfacing`` is rewritten
to call ``store.iter_record_columns(["tags_json"], batch_size=1024)`` instead
of ``store.all_records()``. Tests 3-7 lock pre-existing filter semantics that
the rewrite must preserve byte-for-byte (D-11 in CONTEXT.md is the exact
template — record-count floor, raw:/domain: filtering, count >= 3 floor,
defensive JSON parse).

Covered contracts (CONTEXT.md W3 slice):

  Architecture flip:
    1. ``_tier0_schema_surfacing`` calls ``iter_record_columns(["tags_json"], ...)``,
       not ``all_records()`` — verified via monkeypatched spies on both methods.

  Zero AES-GCM cost:
    2. Across the entire ``_tier0_schema_surfacing`` execution on a 16-record
       store, ``store._decrypt_for_record`` fires zero times — projection-only
       iteration skips encrypted columns entirely (literal_surface,
       provenance_json, profile_modulation_gain_json never touch disk).

  Filter semantics — byte-identical to pre-W3 (preserve all rules):
    3. ``raw:*`` and ``domain:*`` tags are filtered before counting (existing
       contract; new code must not regress).
    4. ``count >= 3`` per-tag floor preserved.
    5. ``len(records) < CLUSTER_MIN_SIZE`` global floor preserved (now expressed
       as ``record_count < CLUSTER_MIN_SIZE`` after single-pass iteration).
    6. Output dicts are byte-identical to the pre-W3 implementation on a
       deterministic 20-record fixture (compute expected via the same algorithm
       run inline against ``store.all_records()``).

  Defensive JSON parse:
    7. Malformed ``tags_json`` rows do NOT raise — defensive try/except absorbs
       JSONDecodeError and treats the row as having zero tags. Verified by
       monkeypatch-wrapping ``iter_record_columns`` to inject a malformed row
       AFTER the real rows; OLD code is unaffected (it does not call this
       method) so the test passes RED for the right reason.

Phase 07.6 plan-checker B-1 lesson: every test uses a real ``MemoryRecord``
dataclass via ``_make()`` — never a plain dict against attribute-access code.
"""
from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import MagicMock
from uuid import uuid4

import pytest

from iai_mcp.sleep import CLUSTER_MIN_SIZE, _tier0_schema_surfacing
from iai_mcp.store import MemoryStore
from iai_mcp.types import EMBED_DIM, MemoryRecord


# --------------------------------------------------------------------------- fixtures


@pytest.fixture(autouse=True)
def _isolated_keyring(monkeypatch: pytest.MonkeyPatch):
    """Back the ``keyring`` module with an in-memory dict for every test.

    ``get_password``, ``set_password`` and ``delete_password`` on the
    ``keyring`` module are replaced with closures over a private dict keyed by
    ``(service, user)``. The stated intent (mirroring
    tests/test_store_iter_records.py) is that key generation never touches
    the OS keychain inside CI.

    Yields:
        The backing dict, so a test can inspect or pre-seed stored secrets.
    """
    import keyring as _keyring

    vault: dict[tuple[str, str], str] = {}

    def _get(s, u):  # noqa: ANN001, ANN202
        # Unknown (service, user) pairs read back as None.
        return vault.get((s, u))

    def _set(s, u, p):  # noqa: ANN001, ANN202
        vault[(s, u)] = p

    def _delete(s, u):  # noqa: ANN001, ANN202
        # Deleting a missing entry is a silent no-op (returns None).
        return vault.pop((s, u), None)

    replacements = (
        ("get_password", _get),
        ("set_password", _set),
        ("delete_password", _delete),
    )
    for attr, stand_in in replacements:
        monkeypatch.setattr(_keyring, attr, stand_in)

    yield vault


def _make(
    text: str = "hello world",
    tier: str = "episodic",
    tags: list[str] | None = None,
    detail: int = 2,
    language: str = "en",
) -> MemoryRecord:
    """Build a real ``MemoryRecord`` for store fixtures (never a plain dict).

    Args:
        text: Stored as ``literal_surface``.
        tier: Stored as ``tier``.
        tags: Stored as ``tags``; ``None`` becomes a new empty list. A list
            passed by the caller is handed to the record as-is (not copied).
        detail: Stored as ``detail_level``; values of 3 or more also set
            ``never_decay``.
        language: Stored as ``language``.

    Returns:
        A record with a fresh random id, a constant ``0.1`` embedding of
        length ``EMBED_DIM``, and UTC timestamps taken at call time.
    """
    record_tags = [] if tags is None else tags
    flat_embedding = [0.1] * EMBED_DIM

    return MemoryRecord(
        # Fields driven by the caller's arguments.
        tier=tier,
        literal_surface=text,
        detail_level=detail,
        never_decay=detail >= 3,
        tags=record_tags,
        language=language,
        # Fixed values shared by every fixture record.
        id=uuid4(),
        aaak_index="",
        embedding=flat_embedding,
        community_id=None,
        centrality=0.0,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_merge=False,
        provenance=[],
        # Two independent clock reads, created_at first.
        created_at=datetime.now(timezone.utc),
        updated_at=datetime.now(timezone.utc),
    )


@pytest.fixture
def store(tmp_path: Path) -> MemoryStore:
    """Provide a ``MemoryStore`` rooted at ``<tmp_path>/lancedb``.

    ``tmp_path`` is unique per test, so each test gets its own store
    directory and records cannot leak between tests.
    """
    db_dir = tmp_path / "lancedb"
    return MemoryStore(path=db_dir)


def _populate_mixed_16(store: MemoryStore) -> None:
    """Insert the shared 16-record fixture into ``store``.

    Layout (inserted in this order):
      * 4 records tagged ``tag-a``
      * 5 records tagged ``tag-b``
      * 7 records carrying only ``raw:noise`` + ``domain:misc`` — tags the
        surfacing filter is expected to drop, so they add no candidates.
    """
    groups = (
        (4, "alpha", ("tag-a",)),
        (5, "beta", ("tag-b",)),
        (7, "gamma", ("raw:noise", "domain:misc")),
    )
    for copies, text, tag_names in groups:
        for _ in range(copies):
            # list(...) gives every record its own tag list object.
            store.insert(_make(text=text, tags=list(tag_names)))


# --------------------------------------------------------------------------- architecture flip


def test_tier0_schema_surfacing_uses_iter_record_columns_not_all_records(
    store: MemoryStore, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Architecture flip: the function reads through ``iter_record_columns``.

    ``store.all_records`` and ``store.iter_record_columns`` are both replaced
    with pass-through ``MagicMock`` spies, then ``_tier0_schema_surfacing``
    runs once against the 16-record fixture.

    * Pre-W3 the function materialises ``store.all_records()``, so the
      ``all_records`` spy records a call and the first assertion fails (RED).
    * Post-W3 ``iter_record_columns`` is called exactly once and
      ``all_records`` not at all (GREEN).

    The columns argument — whether passed positionally or as ``columns=`` —
    must be exactly ``["tags_json"]``.
    """
    _populate_mixed_16(store)

    # Build both spies around the genuine bound methods before patching
    # either attribute, so each spy still delegates to the real store.
    spies = {
        name: MagicMock(wraps=getattr(store, name))
        for name in ("all_records", "iter_record_columns")
    }
    for name, spy in spies.items():
        monkeypatch.setattr(store, name, spy)

    _tier0_schema_surfacing(store)

    all_records_calls = spies["all_records"].call_count
    assert all_records_calls == 0, (
        f"_tier0_schema_surfacing must NOT call store.all_records() post-W3; "
        f"got {all_records_calls} call(s)"
    )

    iter_calls = spies["iter_record_columns"].call_count
    assert iter_calls == 1, (
        f"_tier0_schema_surfacing must call store.iter_record_columns() exactly "
        f"once post-W3; got {iter_calls} call(s)"
    )

    # Defense-in-depth: the projection must name only tags_json. Accept the
    # columns argument in either positional or keyword form.
    recorded = spies["iter_record_columns"].call_args
    cols = recorded.args[0] if recorded.args else recorded.kwargs.get("columns")
    assert cols == ["tags_json"], (
        f"projection must be exactly ['tags_json'] (zero AES-GCM cost); "
        f"got columns={cols!r}"
    )


# --------------------------------------------------------------------------- zero-decrypt contract


def test_tier0_schema_surfacing_zero_decrypt_calls(
    store: MemoryStore, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Zero-decrypt contract: the W3 path never calls ``_decrypt_for_record``.

    ``store._decrypt_for_record`` is wrapped in a pass-through spy, the
    function runs once over the 16-record fixture, and the spy must have
    recorded no calls.

    Per the plan notes, the pre-W3 ``all_records()`` path decrypts the
    encrypted columns of every row, so this assertion fails RED on the old
    implementation; a ``tags_json``-only projection is expected to skip
    decryption entirely and pass GREEN.
    """
    _populate_mixed_16(store)

    spy_on_decrypt = MagicMock(wraps=store._decrypt_for_record)
    monkeypatch.setattr(store, "_decrypt_for_record", spy_on_decrypt)

    _tier0_schema_surfacing(store)

    observed = spy_on_decrypt.call_count
    assert observed == 0, (
        f"_tier0_schema_surfacing must NOT trigger ANY _decrypt_for_record "
        f"calls post-W3 (-16210 AES-GCM operations on the 8105-record "
        f"production store); got {observed} call(s)"
    )


# --------------------------------------------------------------------------- raw: / domain: filter


def test_tier0_schema_surfacing_filters_raw_and_domain_tags(
    store: MemoryStore,
) -> None:
    """``raw:*`` and ``domain:*`` tags never surface as candidates.

    Fixture: 5 records tagged ``raw:literal`` + ``tag-real`` followed by
    5 records tagged ``domain:foo`` + ``tag-real-2``. Only the two unprefixed
    tags may appear in the output, each with ``evidence_count`` 5 and
    ``confidence`` 0.5.
    """
    fixture = (
        ("r1", ("raw:literal", "tag-real")),
        ("r2", ("domain:foo", "tag-real-2")),
    )
    for text, tag_names in fixture:
        for _ in range(5):
            store.insert(_make(text=text, tags=list(tag_names)))

    candidates = _tier0_schema_surfacing(store)
    surfaced = {c["pattern"]: c for c in candidates}

    # The real tags surface; the filtered prefixes do not.
    for wanted in ("tag:tag-real", "tag:tag-real-2"):
        assert wanted in surfaced
    for banned in ("tag:raw:literal", "tag:domain:foo"):
        assert banned not in surfaced

    # Counts and confidence are unaffected by the filtered companions.
    for wanted in ("tag:tag-real", "tag:tag-real-2"):
        entry = surfaced[wanted]
        assert entry["evidence_count"] == 5
        assert entry["confidence"] == pytest.approx(0.5)


# --------------------------------------------------------------------------- count >= 3 floor


def test_tier0_schema_surfacing_floor_count_3(store: MemoryStore) -> None:
    """Per-tag floor: a tag needs at least 3 occurrences to surface.

    Fixture (8 records):
      * 3 x ``tag-a`` — exactly at the floor, must surface.
      * 3 x ``tag-b`` — exactly at the floor, must surface.
      * 2 x ``tag-c`` — one short of the floor, must NOT surface.

    The below-floor tag is what makes this a real boundary test: with only
    at-floor tags the test would still pass if the ``count >= 3`` filter
    were removed. Eight records keep the store above the global record-count
    floor, which is covered by a separate test.
    """
    for _ in range(3):
        store.insert(_make(text="a", tags=["tag-a"]))
    for _ in range(3):
        store.insert(_make(text="b", tags=["tag-b"]))
    # Two occurrences only — the negative side of the boundary.
    for _ in range(2):
        store.insert(_make(text="c", tags=["tag-c"]))

    candidates = _tier0_schema_surfacing(store)

    surfaced = {c["pattern"] for c in candidates}
    assert "tag:tag-c" not in surfaced, (
        f"a tag with only 2 occurrences must stay below the count >= 3 "
        f"floor; got candidates={candidates!r}"
    )
    assert len(candidates) == 2

    expected = sorted(
        [
            {"pattern": "tag:tag-a", "confidence": 0.3, "evidence_count": 3},
            {"pattern": "tag:tag-b", "confidence": 0.3, "evidence_count": 3},
        ],
        key=lambda d: d["pattern"],
    )
    actual = sorted(candidates, key=lambda d: d["pattern"])
    # Confidence is a float; use approx equality.
    for e, a in zip(expected, actual, strict=True):
        assert a["pattern"] == e["pattern"]
        assert a["evidence_count"] == e["evidence_count"]
        assert a["confidence"] == pytest.approx(e["confidence"])


# --------------------------------------------------------------------------- CLUSTER_MIN_SIZE global floor


def test_tier0_schema_surfacing_below_cluster_min_size_returns_empty(
    store: MemoryStore,
) -> None:
    """Global floor: fewer than ``CLUSTER_MIN_SIZE`` records yields ``[]``.

    Inserts ``CLUSTER_MIN_SIZE - 1`` records sharing one tag and expects an
    empty candidate list. The pre-W3 code expressed the guard as
    ``len(records) < CLUSTER_MIN_SIZE``; the post-W3 code is expected to
    apply the same guard to a running record count.

    NOTE(review): if ``CLUSTER_MIN_SIZE`` is 3 (as comments elsewhere in this
    file state), the single tag here occurs only twice, so the per-tag
    ``count >= 3`` floor would also produce ``[]``. This test therefore may
    not distinguish the global floor from the per-tag floor — confirm.
    """
    shortfall = CLUSTER_MIN_SIZE - 1
    for _ in range(shortfall):
        store.insert(_make(text="below-floor", tags=["any-tag"]))

    result = _tier0_schema_surfacing(store)

    assert result == [], (
        f"expected [] when record count ({shortfall}) is below "
        f"CLUSTER_MIN_SIZE ({CLUSTER_MIN_SIZE}); got {result!r}"
    )


# --------------------------------------------------------------------------- byte-identical-to-pre-W3


def test_tier0_schema_surfacing_byte_identical_to_pre_w3(
    store: MemoryStore,
) -> None:
    """Output parity with the pre-W3 algorithm on a fixed 20-record fixture.

    The reference result is recomputed inline from ``store.all_records()``
    using the old algorithm (skip ``raw:``/``domain:`` tags, count the rest,
    keep tags with ``count >= 3``, ``confidence = min(1.0, count / 10)``),
    then compared with the function's output after sorting both by pattern.

    Fixture:
      * 5 x ``["a"]``
      * 5 x ``["b"]``
      * 4 x ``["c"]``
      * 3 x ``["a", "raw:noise"]``   -> adds 3 to ``a``
      * 3 x ``["b", "domain:x"]``    -> adds 3 to ``b``

    Expected counts: a=8, b=8, c=4.
    """
    seed = (
        (5, "a", ("a",)),
        (5, "b", ("b",)),
        (4, "c", ("c",)),
        (3, "ar", ("a", "raw:noise")),
        (3, "bd", ("b", "domain:x")),
    )
    for copies, text, tag_names in seed:
        for _ in range(copies):
            store.insert(_make(text=text, tags=list(tag_names)))

    # Reference implementation: the pre-W3 algorithm over full records.
    tally: dict[str, int] = {}
    for record in store.all_records():
        for tag in record.tags or []:
            if not tag.startswith(("raw:", "domain:")):
                tally[tag] = tally.get(tag, 0) + 1

    expected = []
    for tag, count in tally.items():
        if count < 3:
            continue
        expected.append(
            {
                "pattern": f"tag:{tag}",
                "confidence": min(1.0, count / 10.0),
                "evidence_count": count,
            }
        )

    actual = _tier0_schema_surfacing(store)

    # Candidate order may differ between the two read paths, so compare
    # after sorting both sides on the pattern string.
    def by_name(entry):  # noqa: ANN001, ANN202
        return entry["pattern"]

    expected_sorted = sorted(expected, key=by_name)
    actual_sorted = sorted(actual, key=by_name)

    assert len(actual_sorted) == len(expected_sorted)
    for want, got in zip(expected_sorted, actual_sorted, strict=True):
        assert got["pattern"] == want["pattern"]
        assert got["evidence_count"] == want["evidence_count"]
        assert got["confidence"] == pytest.approx(want["confidence"])

    # Sanity: exactly a, b and c surface — neither raw:noise nor domain:x.
    assert {c["pattern"] for c in actual} == {"tag:a", "tag:b", "tag:c"}
    evidence = {c["pattern"]: c["evidence_count"] for c in actual}
    assert evidence["tag:a"] == 8
    assert evidence["tag:b"] == 8
    assert evidence["tag:c"] == 4


# --------------------------------------------------------------------------- defensive JSON parse


def test_tier0_schema_surfacing_handles_malformed_tags_json_gracefully(
    store: MemoryStore, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Defensive parse: a malformed ``tags_json`` row must not raise.

    Strategy: ``store.iter_record_columns`` is wrapped so that, after every
    real row, one extra row with unparseable JSON is yielded. The malformed
    row must contribute zero tags while the 5 valid ``tag-good`` records
    still surface with ``evidence_count`` 5 and ``confidence`` 0.5.

    The wrapper forwards ``*args, **kwargs`` untouched so the test does not
    depend on whether the function under test passes ``columns`` or
    ``batch_size`` positionally or by keyword.

    Pre-W3 code reads via ``store.all_records()`` and never sees the wrapper,
    so this test passes on both sides of the rewrite. The malformed row is
    injected at the iterator rather than written to the store because, per
    the original author's note, the pre-W3 row decoder parses ``tags_json``
    without a guard and would crash ``all_records()`` on read.
    """
    # 5 valid records — above the record-count floor and the per-tag floor.
    for _ in range(5):
        store.insert(_make(text="g", tags=["tag-good"]))

    # Capture the real bound method before patching so the wrapper delegates
    # to the genuine iterator rather than to itself.
    real_iter = store.iter_record_columns

    def iter_with_malformed_tail(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
        yield from real_iter(*args, **kwargs)
        # Malformed JSON — the defensive try/except in the function under
        # test must absorb this without raising.
        yield {"tags_json": "not valid json {{{"}

    monkeypatch.setattr(store, "iter_record_columns", iter_with_malformed_tail)

    # Must not raise.
    candidates = _tier0_schema_surfacing(store)

    # tag-good still surfaces (5 records, count=5, confidence=0.5).
    by_pattern = {c["pattern"]: c for c in candidates}
    assert "tag:tag-good" in by_pattern, (
        f"valid records' tag must still surface despite malformed-row tail; "
        f"got candidates={candidates!r}"
    )
    assert by_pattern["tag:tag-good"]["evidence_count"] == 5
    assert by_pattern["tag:tag-good"]["confidence"] == pytest.approx(0.5)


# ============================================================================
# Plan 07.7-04 W4-extended: run_heavy_consolidation single-materialisation invariant
# ============================================================================
#
# After CONTEXT.md amendment (2026-04-29 mid-execution), the W4 ≤1
# all_records() invariant on run_heavy_consolidation becomes ACHIEVABLE. The
# original Plan 04 scope was a sleep.py comment marker only; the amendment
# extends scope to migrate two `all_records()` callers in schema.py
# (induce_schemas_tier0 + persist_schema) to use iter_record_columns
# projection.
#
# Pre-D-26: 2 calls when only induce_schemas_tier0 fires; 3 calls when
# persist_schema also fires for an auto-status candidate.
# Post-D-26: 1 call total (the sleep.py:513 records_by_id materialisation
# kept by the W4 minimum-change branch per CONTEXT.md D-14/D-20).
#
# These tests ALSO lock the public contract of run_heavy_consolidation's
# return dict (test 3) — protects against drive-by changes during
# W4-extended editing.


@pytest.fixture
def _patch_schema_embedder(monkeypatch: pytest.MonkeyPatch):
    """Replace ``iai_mcp.embed.Embedder`` with a constant-vector fake.

    The fake ignores its constructor arguments and embeds every text as the
    first unit basis vector of length ``EMBED_DIM``. Per the original note
    (mirroring test_schema_dedup.py), this avoids paying the real embedder's
    load time when ``persist_schema`` embeds a schema summary.
    """
    from iai_mcp import embed as embed_mod

    # Template only — each embed() call hands back its own copy.
    unit_vector = [1.0] + [0.0] * (EMBED_DIM - 1)

    class _FakeEmbedder:
        DIM = EMBED_DIM
        DEFAULT_DIM = EMBED_DIM
        DEFAULT_MODEL_KEY = "fake"

        def __init__(self, *args, **kwargs):  # noqa: ANN001
            # Accept and discard whatever the real constructor takes.
            self.DIM = EMBED_DIM

        def embed(self, text: str) -> list[float]:
            return list(unit_vector)

        def embed_batch(self, texts):  # noqa: ANN001
            vectors = []
            for t in texts:
                vectors.append(self.embed(t))
            return vectors

    monkeypatch.setattr(embed_mod, "Embedder", _FakeEmbedder)
    yield


def _populate_for_heavy(store: MemoryStore) -> list[MemoryRecord]:
    """Insert 10 episodic records tagged ``meeting`` + ``notes``.

    Every record uses the same unit-basis embedding and differs only in its
    id, timestamps and ``literal_surface`` (``meeting-rec-<i>``). According
    to the original author's notes, ten co-tagged records are meant to clear
    the record-count floor, the per-tag floor and the auto-induction
    thresholds, so the heavy path runs schema induction through to
    ``persist_schema`` (thresholds live outside this file — verify there).

    Returns:
        The inserted records, in insertion order.
    """

    def _build(index: int) -> MemoryRecord:
        return MemoryRecord(
            id=uuid4(),
            tier="episodic",
            literal_surface=f"meeting-rec-{index}",
            aaak_index="",
            embedding=[1.0] + [0.0] * (EMBED_DIM - 1),
            community_id=None,
            centrality=0.0,
            detail_level=2,
            pinned=False,
            stability=0.0,
            difficulty=0.0,
            last_reviewed=None,
            never_decay=False,
            never_merge=False,
            provenance=[],
            created_at=datetime.now(timezone.utc),
            updated_at=datetime.now(timezone.utc),
            tags=["meeting", "notes"],
            language="en",
        )

    inserted: list[MemoryRecord] = []
    for position in range(10):
        record = _build(position)
        store.insert(record)
        inserted.append(record)
    return inserted


def test_run_heavy_consolidation_calls_all_records_at_most_once(
    store: MemoryStore,
    monkeypatch: pytest.MonkeyPatch,
    _patch_schema_embedder,
) -> None:
    """W4-extended invariant: at most one ``all_records()`` call per run.

    ``store.all_records`` is wrapped in a pass-through spy and one heavy
    consolidation runs (LLM disabled, no API key) over the 10-record fixture.

    Per the plan notes:
      * Pre-D-26: 2 or 3 calls — sleep.py:513 (``records_by_id``),
        schema.py:89 (``induce_schemas_tier0``) and, when an auto-status
        candidate is persisted, schema.py:267 (``persist_schema`` keeper
        scan). The ``<= 1`` assertion fails RED.
      * Post-D-26: 1 call — only the sleep.py:513 materialisation remains;
        the schema.py paths use ``iter_record_columns``. Passes GREEN.
    """
    from iai_mcp.guard import BudgetLedger, RateLimitLedger
    from iai_mcp.sleep import SleepConfig, run_heavy_consolidation

    _populate_for_heavy(store)

    # Install the spy before the ledgers are built and the run starts.
    all_records_spy = MagicMock(wraps=store.all_records)
    monkeypatch.setattr(store, "all_records", all_records_spy)

    run_heavy_consolidation(
        store,
        session_id="s-w4-inv",
        config=SleepConfig(llm_enabled=False),
        budget=BudgetLedger(store),
        rate=RateLimitLedger(store),
        has_api_key=False,
    )

    materialisations = all_records_spy.call_count
    assert materialisations <= 1, (
        f"D-13 invariant: run_heavy_consolidation must call store.all_records() "
        f"AT MOST ONCE per invocation; got {materialisations} call(s). "
        f"Pre-D-26 contributors: sleep.py:513 records_by_id (kept by W4), "
        f"schema.py:89 induce_schemas_tier0 (D-26-A target), "
        f"schema.py:267 persist_schema keeper scan (D-26-B target)."
    )


def test_run_heavy_consolidation_iter_record_columns_called_at_least_once(
    store: MemoryStore,
    monkeypatch: pytest.MonkeyPatch,
    _patch_schema_embedder,
) -> None:
    """Companion check: the heavy run really uses ``iter_record_columns``.

    The "at most one ``all_records()``" invariant alone would also be
    satisfied by an implementation that stopped reading records altogether.
    This test closes that gap by spying on ``store.iter_record_columns`` and
    requiring at least one call during a heavy consolidation.
    """
    from iai_mcp.guard import BudgetLedger, RateLimitLedger
    from iai_mcp.sleep import SleepConfig, run_heavy_consolidation

    _populate_for_heavy(store)

    projection_spy = MagicMock(wraps=store.iter_record_columns)
    monkeypatch.setattr(store, "iter_record_columns", projection_spy)

    run_heavy_consolidation(
        store,
        session_id="s-w4-iter",
        config=SleepConfig(llm_enabled=False),
        budget=BudgetLedger(store),
        rate=RateLimitLedger(store),
        has_api_key=False,
    )

    projection_calls = projection_spy.call_count
    assert projection_calls >= 1, (
        f"run_heavy_consolidation must call store.iter_record_columns() at "
        f"least once per invocation (W3 _tier0_schema_surfacing path + "
        f"post-D-26 schema.py paths); got {projection_calls} call(s)."
    )


def test_run_heavy_consolidation_returns_expected_keys(
    store: MemoryStore,
    _patch_schema_embedder,
) -> None:
    """Pin the shape of ``run_heavy_consolidation``'s return dict.

    The result must contain exactly the six keys listed below, with
    ``mode == "heavy"`` and ``tier`` equal to ``"tier0"`` or ``"tier1"``.
    Guards against accidental contract changes while the function body is
    being edited.
    """
    from iai_mcp.guard import BudgetLedger, RateLimitLedger
    from iai_mcp.sleep import SleepConfig, run_heavy_consolidation

    _populate_for_heavy(store)

    result = run_heavy_consolidation(
        store,
        session_id="s-w4-keys",
        config=SleepConfig(llm_enabled=False),
        budget=BudgetLedger(store),
        rate=RateLimitLedger(store),
        has_api_key=False,
    )

    expected_keys = {
        "mode",
        "tier",
        "summaries_created",
        "decay_result",
        "schema_candidates",
        "schemas_induced",
    }
    actual_keys = set(result.keys())
    assert actual_keys == expected_keys, (
        f"run_heavy_consolidation public contract: expected keys "
        f"{sorted(expected_keys)}; got {sorted(actual_keys)}"
    )

    assert result["mode"] == "heavy"
    assert result["tier"] in ("tier0", "tier1")