iai-mcp-opencode/src/iai_mcp/schema.py

"""Schema induction (LEARN-03, D-18, D-21) -- Task 3.

D-18 (scheduling): dual-path schema surfacing.
- Primary: batch induction inside the heavy sleep cycle. Tier-1 Haiku
  extraction when `should_call_llm` permits, Tier-0 cooccurrence + TF-IDF
  fallback otherwise.
- Secondary: entropy-gated provisional schemas surfaced during
  `pipeline_recall` when score distribution entropy > 0.8 bits AND the
  cohesive community has >= 2 shared tags.

D-21 (thresholds, autism-aware):
- Auto-induct when co_occurrence >= 5 AND confidence >= 0.85.
- User-approval flag at co_occurrence in [3, 5) AND confidence in [0.65, 0.85).
- Below: discard.
- Exceptions preserved as first-class records (never absorbed).
- Abstraction level: concrete (Dawson-Mottron Raven's preference).

Schema records are first-class hubs:
- tier="semantic", detail_level=3 -> never_decay=True.
- schema_instance_of edges from evidence -> schema never decay.
- pipeline routing can prioritise schema records when pattern
  matches.
"""
from __future__ import annotations

import json
import os
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Iterable
from uuid import UUID, uuid4

from iai_mcp.events import write_event
from iai_mcp.guard import BudgetLedger, RateLimitLedger, should_call_llm
from iai_mcp.store import MemoryStore
from iai_mcp.types import MemoryRecord, SCHEMA_VERSION_CURRENT


# ---------------------------------------------------------------- constants

# D-21 auto-induct gate: induce_schemas_tier0 marks a tag pair "auto" only
# when BOTH its co-occurrence count and its confidence reach these floors.
AUTO_INDUCT_COOCCURRENCE: int = 5
AUTO_INDUCT_CONFIDENCE: float = 0.85
# D-21 user-approval gate: lower floors used for "pending_user_approval".
USER_APPROVAL_COOCCURRENCE: int = 3
USER_APPROVAL_CONFIDENCE: float = 0.65
# Cap on how many evidence ids induce_schemas_tier0 copies into a candidate.
MAX_EVIDENCE_PER_SCHEMA: int = 50
# provisional_schemas_for_recall returns no hints below this entropy (bits).
PROVISIONAL_ENTROPY_MIN: float = 0.8


# ---------------------------------------------------------------- candidate


@dataclass
class SchemaCandidate:
    """One schema candidate surfaced by induce_schemas_*.

    Built by the induction functions and consumed by ``persist_schema``,
    which reads ``pattern``, ``confidence``, ``evidence_count``,
    ``evidence_ids`` and ``status``.
    """

    # Pattern key; the Tier-0 path formats it as "tags:<tag>+<tag>".
    pattern: str
    # Heuristic score; the Tier-0 path computes min(1.0, count / 10).
    confidence: float
    # Number of supporting records (may exceed len(evidence_ids), which the
    # Tier-0 path truncates to MAX_EVIDENCE_PER_SCHEMA).
    evidence_count: int
    # Ids of the supporting records.
    evidence_ids: list[UUID] = field(default_factory=list)
    # NOTE(review): not populated by any code visible in this module --
    # presumably reserved for a later induction path; confirm before relying.
    domain: str | None = None
    # NOTE(review): likewise never populated here; the module docstring says
    # exceptions are kept as first-class records -- confirm intended producer.
    exceptions: list[UUID] = field(default_factory=list)
    status: str = "auto"   # "auto" | "pending_user_approval"


# ---------------------------------------------------------------- Tier-0 induction


def _tag_cooccurrence(records: Iterable) -> dict:
    """Bucket record ids by the tag pairs each record carries.

    Returns ``{frozenset({tag_a, tag_b}): [record_id, ...]}``. Every record
    contributes its id at most once per pair, in input order.

    Phase 07.7-04 D-26-A: accepts either ``MemoryRecord``-like objects
    (anything with a ``.tags`` attribute, plus ``.id``) or projected ``dict``
    rows carrying ``"id"`` and ``"tags_json"`` keys, as yielded by
    ``store.iter_record_columns(["id", "tags_json"])``. Dispatch is
    duck-typed on the presence of ``.tags``.

    Defensive handling (malformed input contributes zero pairs, never raises):
    - ``tags_json`` that fails to parse, or parses to something that is not a
      list/tuple/set, is treated as "no tags".
    - Non-string tag entries are ignored.
    - Tags starting with ``raw:`` or ``domain:`` are ignored.
    - Repeated tags within one record are collapsed, so a record can neither
      form a degenerate single-tag "pair" nor count twice for the same pair.
    - Dict rows with a missing or non-UUID ``id`` are skipped.

    String ids from dict rows are converted to ``UUID`` so callers always see
    UUID evidence ids for that input shape.
    """
    pairs: dict = {}
    for r in records:
        if hasattr(r, "tags"):
            # MemoryRecord path (back-compat for external/test callers).
            raw_tags = r.tags or []
            rid = r.id
        else:
            # Dict-row path (D-26-A migrated production path).
            tags_raw = r.get("tags_json") or "[]"
            try:
                raw_tags = json.loads(tags_raw)
            except (TypeError, ValueError):
                # ValueError covers json.JSONDecodeError and decode errors.
                raw_tags = []
            id_raw = r.get("id")
            if id_raw is None:
                continue
            try:
                rid = UUID(id_raw) if isinstance(id_raw, str) else id_raw
            except (ValueError, AttributeError):
                continue

        # Valid JSON is not necessarily a list ("null", "7", "{}"): anything
        # that is not a plain collection of tags counts as no tags at all.
        if not isinstance(raw_tags, (list, tuple, set, frozenset)):
            raw_tags = []

        # dict.fromkeys de-duplicates while keeping first-seen order.
        tags = list(
            dict.fromkeys(
                t
                for t in raw_tags
                if isinstance(t, str)
                and not t.startswith(("raw:", "domain:"))
            )
        )
        for i, first in enumerate(tags):
            for second in tags[i + 1:]:
                pairs.setdefault(frozenset((first, second)), []).append(rid)
    return pairs


def induce_schemas_tier0(store: MemoryStore) -> list[SchemaCandidate]:
    """D-18 Tier-0 path: tag co-occurrence only; no LLM call.

    Streams ``store.iter_record_columns(["id", "tags_json"], batch_size=1024)``
    straight into ``_tag_cooccurrence`` (Phase 07.7-04 D-26-A) so the rows are
    never materialised as a list and only the two projected columns are
    requested. The row count is tallied during that same pass; stores with
    fewer than 3 rows yield no candidates.

    For every tag pair, ``count`` is the number of evidence ids and
    ``confidence`` is ``min(1.0, count / 10)``. A pair becomes a candidate
    when it passes one of the gates:
    - status="auto": count >= AUTO_INDUCT_COOCCURRENCE AND
      confidence >= AUTO_INDUCT_CONFIDENCE
    - status="pending_user_approval": count in
      [USER_APPROVAL_COOCCURRENCE, AUTO_INDUCT_COOCCURRENCE) AND
      confidence >= USER_APPROVAL_CONFIDENCE
    Everything else is discarded.

    NOTE(review): because confidence is derived purely from count, the
    current constants make "auto" fire only from count >= 9, and the
    "pending_user_approval" gate can never pass (count 3-4 gives confidence
    0.3-0.4 < 0.65). Behaviour is left unchanged here; confirm whether the
    heuristic or the thresholds should be revised.

    Returns:
        Candidates in pair-discovery order, each carrying at most
        MAX_EVIDENCE_PER_SCHEMA evidence ids (``evidence_count`` keeps the
        untruncated count).
    """
    row_total = 0

    def _counted_rows():
        # Tally rows as they stream past so the size floor needs no
        # second scan and no intermediate list.
        nonlocal row_total
        for row in store.iter_record_columns(
            ["id", "tags_json"], batch_size=1024
        ):
            row_total += 1
            yield row

    pair_counts = _tag_cooccurrence(_counted_rows())
    if row_total < 3:
        return []

    candidates: list[SchemaCandidate] = []
    for pair, evidence in pair_counts.items():
        count = len(evidence)
        # Heuristic confidence: saturates toward 1.0 at 10+ evidence records.
        confidence = min(1.0, count / 10.0)
        if (
            count >= AUTO_INDUCT_COOCCURRENCE
            and confidence >= AUTO_INDUCT_CONFIDENCE
        ):
            status = "auto"
        elif (
            USER_APPROVAL_COOCCURRENCE <= count < AUTO_INDUCT_COOCCURRENCE
            and confidence >= USER_APPROVAL_CONFIDENCE
        ):
            status = "pending_user_approval"
        else:
            continue
        candidates.append(
            SchemaCandidate(
                # Sorted so the same pair always yields the same pattern key.
                pattern=f"tags:{'+'.join(sorted(pair))}",
                confidence=confidence,
                evidence_count=count,
                evidence_ids=list(evidence[:MAX_EVIDENCE_PER_SCHEMA]),
                status=status,
            )
        )
    return candidates


# ---------------------------------------------------------------- Tier-1 w/ D-GUARD


def induce_schemas_tier1(
    store: MemoryStore,
    budget: BudgetLedger,
    rate: RateLimitLedger,
    llm_enabled: bool = True,
) -> list[SchemaCandidate]:
    """D-18 Tier-1 path: Haiku extraction gated by the D-GUARD ladder.

    The Tier-1 branch is a scaffold: it never performs a real LLM call and
    always returns the result of ``induce_schemas_tier0(store)``.

    Flow:
    - ``should_call_llm`` is consulted with the ledgers, ``llm_enabled``,
      whether ``ANTHROPIC_API_KEY`` is set, and an estimated cost of 0.005.
    - Denied: a warning ``llm_health`` event carrying the denial reason is
      written.
    - Allowed: ``anthropic`` is imported lazily, a spend of 0.002 is recorded
      on ``budget``, and an info ``llm_health`` event is written. Any
      exception raised by that sequence (including a missing ``anthropic``
      package) is reported as a critical ``llm_health`` event instead of
      propagating.
    """
    api_key_present = bool(os.environ.get("ANTHROPIC_API_KEY"))
    allowed, deny_reason = should_call_llm(
        budget=budget,
        rate=rate,
        llm_enabled=llm_enabled,
        has_api_key=api_key_present,
        estimated_usd=0.005,
    )

    if allowed:
        # Tier-1 eligible -- scaffold only (Plan 02-04 wires real Batch API).
        scaffold_payload = {
            "component": "schema_induction",
            "tier": "haiku",
            "note": "Plan 02-04 wires real Batch API; 02-03 scaffolds only",
        }
        try:
            import anthropic  # noqa: F401 -- lazy import, raise-only if missing
            budget.record_spend(0.002, kind="schema_induction")
            write_event(
                store,
                kind="llm_health",
                data=scaffold_payload,
                severity="info",
            )
        except Exception as exc:
            write_event(
                store,
                kind="llm_health",
                data={"component": "schema_induction", "error": str(exc)},
                severity="critical",
            )
    else:
        # Some ladder step refused the call: record why, then fall back.
        write_event(
            store,
            kind="llm_health",
            data={
                "component": "schema_induction",
                "tier": "fallback",
                "reason": deny_reason,
            },
            severity="warning",
        )

    # Both outcomes end on the Tier-0 result.
    return induce_schemas_tier0(store)


# ---------------------------------------------------------------- persist


def _majority_language(evidence_ids: list[UUID], store: MemoryStore) -> str:
    """Return the plurality language tag among the evidence records.

    D-08a: schema hubs must carry the language of their source evidence, not
    a hardcoded 'en'. A user whose records are Russian would otherwise get
    schemas tagged 'en' that fail their own language='ru' filter at
    retrieval.

    Algorithm:
        - Look up each evidence id with ``store.get``; ids that resolve to
          ``None`` and records with an empty/None ``language`` are ignored.
        - Tally the remaining languages and return the most frequent one.

    Tie-break: when several languages share the top count, the one whose
    first occurrence comes EARLIEST in ``evidence_ids`` wins. This is
    deterministic because ``Counter`` keeps keys in first-insertion order and
    ``max`` returns the first maximal element it encounters.

    Returns:
        The winning language tag, or ``"en"`` when no evidence record
        supplies a language.
    """
    counts: Counter = Counter()
    for eid in evidence_ids:
        rec = store.get(eid)
        if rec is not None and rec.language:
            counts[rec.language] += 1
    if not counts:
        return "en"
    return max(counts, key=counts.__getitem__)


def _find_schema_keeper_id(store: MemoryStore, pattern_tag: str) -> UUID | None:
    """Return the id of the first semantic-tier row tagged ``pattern_tag``.

    Phase 07.7-04 D-26-B: scans
    ``store.iter_record_columns(["id", "tier", "tags_json"], batch_size=1024)``
    (only those three columns are requested) and stops at the first match.

    Rows are skipped individually -- never aborting the scan -- when their
    ``tags_json`` fails to parse or does not decode to a list, or when their
    ``id`` is missing or not a valid UUID. String ids are converted to
    ``UUID`` at this boundary.

    Returns ``None`` when nothing matches or when the scan itself raises, so
    the caller falls through to the insert path and a schema is never
    silently lost.
    """
    try:
        for row in store.iter_record_columns(
            ["id", "tier", "tags_json"], batch_size=1024
        ):
            if row.get("tier") != "semantic":
                continue
            tags_raw = row.get("tags_json") or "[]"
            try:
                tags = json.loads(tags_raw)
            except (TypeError, ValueError):
                tags = []
            # Valid JSON need not be a list ("null", "7"): a membership test
            # on such a value would raise and abort the whole scan, which in
            # turn would let a duplicate schema be inserted.
            if not isinstance(tags, list) or pattern_tag not in tags:
                continue
            id_raw = row.get("id")
            if id_raw is None:
                continue
            try:
                return UUID(id_raw) if isinstance(id_raw, str) else id_raw
            except (ValueError, AttributeError):
                continue
    except Exception:
        # Defensive: a failed scan means "no keeper found", not a crash.
        return None
    return None


def persist_schema(
    store: MemoryStore,
    candidate: SchemaCandidate,
) -> UUID:
    """Persist a schema candidate, de-duplicating by pattern.

    R1 (D-09 + D-10) pattern dedup: if a semantic-tier record already carries
    the tag ``pattern:{candidate.pattern}``, no new record is inserted.
    Instead the ``schema_instance_of`` edges from the candidate's evidence to
    that existing record are boosted, a ``schema_reinforced`` event is
    written, and the existing id is returned. This prevents every sleep cycle
    from inserting a fresh never-decay row for the same pattern.

    Otherwise a new schema record is inserted with:
    - tier="semantic", detail_level=3, never_decay=True
    - tags=["schema", <status>, f"pattern:{pattern}"]
    - language derived from the plurality language of the evidence
    - s5_trust_score=0.5
    - schema_version=SCHEMA_VERSION_CURRENT
    followed by one batched ``schema_instance_of`` edge boost from each
    evidence id to the new record and a ``schema_induction_run`` event.

    Args:
        store: Target memory store.
        candidate: Candidate to persist; ``pattern``, ``confidence``,
            ``evidence_count``, ``evidence_ids`` and ``status`` are read.

    Returns:
        The id of the existing (reinforced) or newly inserted schema record.
    """
    from iai_mcp.aaak import enforce_language_tagged, generate_aaak_index
    from iai_mcp.embed import embedder_for_store

    summary = (
        f"Schema: {candidate.pattern} (confidence={candidate.confidence:.2f})"
    )

    pattern_tag = f"pattern:{candidate.pattern}"
    existing_keeper_id = _find_schema_keeper_id(store, pattern_tag)

    if existing_keeper_id is not None:
        from iai_mcp.store import EDGES_TABLE

        # Reinforce schema_instance_of edges from each new evidence record
        # onto the existing keeper. Reuses the same delta formula as the
        # insert path (max(0.1, candidate.confidence)) for symmetry.
        delta = max(0.1, candidate.confidence)
        new_pairs = [(ev_id, existing_keeper_id) for ev_id in candidate.evidence_ids]
        if new_pairs:
            store.boost_edges(
                new_pairs,
                edge_type="schema_instance_of",
                delta=delta,
            )

        # Compute total_evidence after reinforcement: count
        # `schema_instance_of` edges incident on the keeper. Read via the
        # edges table to avoid trusting any in-memory cache.
        # Note: store.boost_edges canonicalises (src, dst) to a sorted
        # tuple, so the keeper appears in EITHER column depending on the
        # string ordering of the paired evidence UUID. OR-counting both
        # columns gives the true edge-incidence count (no double-count
        # since each edge row has the keeper in exactly one column).
        try:
            edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
            keeper_str = str(existing_keeper_id)
            total_evidence = int(
                ((edges_df["edge_type"] == "schema_instance_of")
                 & ((edges_df["dst"] == keeper_str)
                    | (edges_df["src"] == keeper_str))).sum()
            )
        except Exception:
            # Best-effort metric: fall back to the size of this batch.
            total_evidence = len(candidate.evidence_ids)

        write_event(
            store,
            kind="schema_reinforced",
            data={
                "schema_id": str(existing_keeper_id),
                "pattern": candidate.pattern,
                "evidence_added": len(candidate.evidence_ids),
                "total_evidence": total_evidence,
            },
            severity="info",
            source_ids=[existing_keeper_id, *candidate.evidence_ids[:5]],
        )
        return existing_keeper_id

    emb = embedder_for_store(store).embed(summary)
    now = datetime.now(timezone.utc)
    schema_id = uuid4()
    # Derive language from the plurality language of the evidence records,
    # not a hardcoded 'en', so language-filtered retrieval surfaces schema
    # hubs for non-English clusters as expected.
    derived_language = _majority_language(candidate.evidence_ids, store)
    schema_rec = MemoryRecord(
        id=schema_id,
        tier="semantic",
        literal_surface=summary,
        aaak_index="",
        embedding=emb,
        community_id=None,
        centrality=0.0,
        detail_level=3,
        pinned=False,
        stability=0.7,
        difficulty=0.3,
        last_reviewed=now,
        never_decay=True,
        never_merge=False,
        provenance=[
            {
                "ts": now.isoformat(),
                "cue": "schema_induction",
                "session_id": "system",
            }
        ],
        created_at=now,
        updated_at=now,
        tags=[
            "schema",
            candidate.status,
            f"pattern:{candidate.pattern}",
        ],
        language=derived_language,
        s5_trust_score=0.5,
        profile_modulation_gain={},
        schema_version=SCHEMA_VERSION_CURRENT,
    )
    enforce_language_tagged(schema_rec)
    # The index is generated from the fully populated record, so it is
    # filled in after construction.
    schema_rec.aaak_index = generate_aaak_index(schema_rec)
    store.insert(schema_rec)

    # R3: batch the schema_instance_of edges into ONE boost_edges call
    # rather than one call per evidence record.
    instance_pairs = [(ev_id, schema_id) for ev_id in candidate.evidence_ids]
    if instance_pairs:
        store.boost_edges(
            instance_pairs,
            edge_type="schema_instance_of",
            delta=max(0.1, candidate.confidence),
        )

    write_event(
        store,
        kind="schema_induction_run",
        data={
            "schema_id": str(schema_id),
            "pattern": candidate.pattern,
            "confidence": candidate.confidence,
            "evidence_count": candidate.evidence_count,
            "status": candidate.status,
        },
        severity="info",
        source_ids=[schema_id, *candidate.evidence_ids[:5]],
    )
    return schema_id


# ---------------------------------------------------------------- provisional


def provisional_schemas_for_recall(
    store: MemoryStore,
    hits: list,
    entropy_bits: float,
    records_cache: "dict | None" = None,
) -> list[dict]:
    """D-18 secondary path: surface provisional schema hints on high-entropy recalls.

    Returns hint dicts (shaped for ``RecallResponse.hints``), one per tag
    that appears in >= 2 of the hits, considering only the 3 most common
    tags. Each tag is counted at most once per hit, so a record that repeats
    a tag cannot qualify on its own. Tags starting with ``raw:`` or
    ``domain:`` and non-string tags are ignored.

    Args:
        store: Used only when ``records_cache`` is None, via
            ``store.all_records()``.
        hits: Recall hits; each must expose ``record_id``.
        entropy_bits: Entropy of the recall; below PROVISIONAL_ENTROPY_MIN
            no hints are produced.
        records_cache: Optional ``{record_id: record}`` mapping already built
            by the caller. When given, the hit records are looked up in it
            directly and the store is not scanned.

    Returns:
        A list of hint dicts, each listing up to 5 supporting hit ids as
        strings. Empty when the entropy is too low, there are fewer than 3
        hits, or the fallback ``store.all_records()`` call raises.
    """
    if entropy_bits < PROVISIONAL_ENTROPY_MIN or len(hits) < 3:
        return []

    hit_ids = {h.record_id for h in hits}
    if records_cache is not None:
        # Keyed lookups only: cost scales with the hits, not the cache size.
        by_id = {
            rid: records_cache[rid] for rid in hit_ids if rid in records_cache
        }
    else:
        try:
            all_recs = store.all_records()
        except Exception:
            # Hints are best-effort; a failed scan must not break recall.
            return []
        by_id = {r.id: r for r in all_recs if r.id in hit_ids}

    # One (record_id, distinct eligible tags) entry per resolvable hit, in
    # hit order. dict.fromkeys de-duplicates while keeping first-seen order,
    # which keeps the most_common tie ordering deterministic.
    tagged_hits: list[tuple] = []
    tag_count: Counter = Counter()
    for h in hits:
        rec = by_id.get(h.record_id)
        if rec is None:
            continue
        hit_tags = list(
            dict.fromkeys(
                t
                for t in (rec.tags or [])
                if isinstance(t, str)
                and not t.startswith(("raw:", "domain:"))
            )
        )
        tag_count.update(hit_tags)
        tagged_hits.append((h.record_id, hit_tags))

    provisional: list[dict] = []
    for tag, cnt in tag_count.most_common(3):
        if cnt < 2:
            continue
        source_ids = [
            str(rid) for rid, hit_tags in tagged_hits if tag in hit_tags
        ][:5]
        provisional.append(
            {
                "kind": "provisional_schema",
                "severity": "info",
                "source_ids": source_ids,
                "text": f"Potential schema: tag={tag} cnt={cnt}",
                "provisional": True,
                "entropy": entropy_bits,
            }
        )
    return provisional