"""Tests for R1 — schema-pattern dedup in persist_schema.

Locked decisions covered (06-CONTEXT.md):
- persist_schema dedups by tag `pattern:{candidate.pattern}` against
  existing tier="semantic" records; reinforces schema_instance_of edges
  onto the keeper instead of inserting a duplicate row.
- new event kind `schema_reinforced` with payload
  `{schema_id, pattern, evidence_added, total_evidence}`; severity "info";
  source_ids `[keeper_id, *new_evidence_ids[:5]]`.
- single test file, pytest convention (`tmp_path` LanceDB root).

R1 acceptance (06-SPEC.md): N persist_schema calls for the same pattern
collapse to ONE schema record, with the keeper's incoming
`schema_instance_of` edge count equal to the cumulative distinct evidence
count across all calls.
"""
from __future__ import annotations

from datetime import datetime, timezone
from uuid import uuid4

import pytest

from iai_mcp.events import query_events
from iai_mcp.store import EDGES_TABLE, MemoryStore
from iai_mcp.types import EMBED_DIM, MemoryRecord


# ---------------------------------------------------------------- helpers


def _rec(
    *,
    text: str = "t",
    tags: list[str] | None = None,
    language: str = "en",
    tier: str = "episodic",
    detail_level: int = 2,
) -> MemoryRecord:
    now = datetime.now(timezone.utc)
    return MemoryRecord(
        id=uuid4(),
        tier=tier,
        literal_surface=text,
        aaak_index="",
        embedding=[1.0] + [0.0] * (EMBED_DIM - 1),
        community_id=None,
        centrality=0.0,
        detail_level=detail_level,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=now,
        updated_at=now,
        tags=list(tags or []),
        language=language,
    )


@pytest.fixture(autouse=True)
def _patch_embedder(monkeypatch):
    """Avoid loading bge-m3 during dedup tests — perf hygiene."""
    from iai_mcp import embed as embed_mod

    class _FakeEmbedder:
        DIM = EMBED_DIM
        DEFAULT_DIM = EMBED_DIM
        DEFAULT_MODEL_KEY = "fake"

        def __init__(self, *args, **kwargs):
            self.DIM = EMBED_DIM

        def embed(self, text: str) -> list[float]:
            return [1.0] + [0.0] * (EMBED_DIM - 1)

        def embed_batch(self, texts):
            return [self.embed(t) for t in texts]

    monkeypatch.setattr(embed_mod, "Embedder", _FakeEmbedder)
    yield


# ---------------------------------------------------------------- Task 1: events taxonomy + write-event smoke


def test_events_module_docstring_lists_schema_reinforced():
    """events.py module docstring documents the new `schema_reinforced` kind."""
    import iai_mcp.events as events_mod

    doc = events_mod.__doc__ or ""
    assert "schema_reinforced" in doc, (
        "events.py module docstring missing `schema_reinforced` taxonomy entry "
        "(Plan 06-01 D-10). Add a additions block after the "
        "section listing the new event kind, payload schema, and source_ids note."
    )


def test_write_event_accepts_schema_reinforced_kind(tmp_path):
    """schema_reinforced event round-trips through write_event + query_events."""
    from iai_mcp.events import write_event

    store = MemoryStore(path=tmp_path)
    keeper_id = uuid4()
    ev_id = uuid4()
    write_event(
        store,
        kind="schema_reinforced",
        data={
            "schema_id": str(keeper_id),
            "pattern": "tags:capture+role:user",
            "evidence_added": 1,
            "total_evidence": 5,
        },
        severity="info",
        source_ids=[keeper_id, ev_id],
    )
    rows = query_events(store, kind="schema_reinforced")
    assert len(rows) == 1
    row = rows[0]
    assert row["kind"] == "schema_reinforced"
    assert row["severity"] == "info"
    payload = row["data"]
    assert payload["pattern"] == "tags:capture+role:user"
    assert payload["evidence_added"] == 1
    assert payload["total_evidence"] == 5
    assert payload["schema_id"] == str(keeper_id)


# ---------------------------------------------------------------- Task 2: persist_schema dedup branch (R1)


def _seed_evidence(store: MemoryStore, n: int) -> list[MemoryRecord]:
    """Insert n fresh episodic evidence records (one per call iteration).

    Each record carries the canonical capture/role tags so a downstream
    induced schema for `tags:capture+role:user` traces back to genuine
    evidence. Returns the list in insertion order.
    """
    recs = [_rec(text=f"ev{i}", tags=["capture", "role:user"]) for i in range(n)]
    for r in recs:
        store.insert(r)
    return recs


def test_persist_schema_dedups_same_pattern(tmp_path):
    """R1: 10 persist_schema calls for the same pattern produce ONE schema record."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"
    pattern_tag = f"pattern:{pattern}"

    for _ in range(10):
        ev = _seed_evidence(store, 1)
        cand = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev[0].id],
            status="auto",
        )
        persist_schema(store, cand)

    schemas = [
        r for r in store.all_records()
        if r.tier == "semantic" and pattern_tag in (r.tags or [])
    ]
    assert len(schemas) == 1, (
        f"expected exactly one schema for pattern {pattern!r}, got {len(schemas)}"
    )


def test_persist_schema_reinforces_edges_on_dedup(tmp_path):
    """R1: schema_instance_of edge count to keeper == cumulative evidence count."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"
    pattern_tag = f"pattern:{pattern}"

    keeper_id = None
    cumulative_evidence = 0
    for _ in range(10):
        ev = _seed_evidence(store, 1)
        cand = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev[0].id],
            status="auto",
        )
        sid = persist_schema(store, cand)
        keeper_id = keeper_id or sid
        cumulative_evidence += 1

    # store.boost_edges canonicalises (src, dst) to a sorted tuple, so the
    # keeper appears in EITHER column depending on the string ordering of
    # the paired evidence UUID. OR-count both columns to recover the true
    # edge-incidence count (each edge row has the keeper in exactly one
    # column — no double-count).
    edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
    keeper_str = str(keeper_id)
    sio = edges_df[
        (edges_df["edge_type"] == "schema_instance_of")
        & ((edges_df["dst"] == keeper_str) | (edges_df["src"] == keeper_str))
    ]
    assert len(sio) == cumulative_evidence, (
        f"expected {cumulative_evidence} schema_instance_of edges incident on keeper, "
        f"got {len(sio)}"
    )

    # Sanity: exactly one keeper survives.
    keepers = [
        r for r in store.all_records()
        if r.tier == "semantic" and pattern_tag in (r.tags or [])
    ]
    assert len(keepers) == 1


def test_persist_schema_emits_schema_reinforced_event(tmp_path):
    """R1 + 9 reinforced events + 1 induction event after 10 calls."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"

    for _ in range(10):
        ev = _seed_evidence(store, 1)
        cand = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev[0].id],
            status="auto",
        )
        persist_schema(store, cand)

    induction_events = query_events(store, kind="schema_induction_run")
    reinforced_events = query_events(store, kind="schema_reinforced", limit=100)

    matching_inductions = [
        e for e in induction_events if e["data"].get("pattern") == pattern
    ]
    matching_reinforcements = [
        e for e in reinforced_events if e["data"].get("pattern") == pattern
    ]
    assert len(matching_inductions) == 1, (
        f"expected 1 schema_induction_run event, got {len(matching_inductions)}"
    )
    assert len(matching_reinforcements) == 9, (
        f"expected 9 schema_reinforced events, got {len(matching_reinforcements)}"
    )

    # query_events sorts newest first; the FIRST in the list is the most
    # recent reinforcement and must carry the highest total_evidence.
    payloads = [e["data"] for e in matching_reinforcements]
    for p in payloads:
        assert "schema_id" in p
        assert p["pattern"] == pattern
        assert isinstance(p["evidence_added"], int)
        assert isinstance(p["total_evidence"], int)
    totals = [p["total_evidence"] for p in payloads]
    # Newest first → totals should be monotonically non-increasing in list order.
    assert totals == sorted(totals, reverse=True), (
        f"total_evidence should grow over time; saw {totals}"
    )


def test_persist_schema_returns_keeper_id(tmp_path):
    """R1: persist_schema returns the SAME UUID across N calls for same pattern."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"

    returned_ids = []
    for _ in range(10):
        ev = _seed_evidence(store, 1)
        cand = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev[0].id],
            status="auto",
        )
        returned_ids.append(persist_schema(store, cand))

    first = returned_ids[0]
    assert all(rid == first for rid in returned_ids), (
        f"persist_schema should return the keeper id on every call; got {returned_ids}"
    )


def test_persist_schema_does_not_collapse_distinct_patterns(tmp_path):
    """R1 negative: distinct patterns produce distinct schema records."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)

    ev_a = _seed_evidence(store, 1)
    sid_a = persist_schema(
        store,
        SchemaCandidate(
            pattern="A",
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev_a[0].id],
            status="auto",
        ),
    )
    ev_b = _seed_evidence(store, 1)
    sid_b = persist_schema(
        store,
        SchemaCandidate(
            pattern="B",
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[ev_b[0].id],
            status="auto",
        ),
    )
    assert sid_a != sid_b

    schemas = [
        r for r in store.all_records()
        if r.tier == "semantic" and any(
            t in ("pattern:A", "pattern:B") for t in (r.tags or [])
        )
    ]
    assert len(schemas) == 2
    patterns = sorted(
        t.split(":", 1)[1]
        for r in schemas
        for t in r.tags
        if t.startswith("pattern:")
    )
    assert patterns == ["A", "B"]