Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
551
src/iai_mcp/schema.py
Normal file
551
src/iai_mcp/schema.py
Normal file
|
|
@ -0,0 +1,551 @@
|
|||
"""Schema induction (LEARN-03, D-18, D-21) -- Task 3.
|
||||
|
||||
D-18 (scheduling): dual-path schema surfacing.
|
||||
- Primary: batch induction inside the heavy sleep cycle. Tier-1 Haiku
|
||||
extraction when `should_call_llm` permits, Tier-0 cooccurrence + TF-IDF
|
||||
fallback otherwise.
|
||||
- Secondary: entropy-gated provisional schemas surfaced during
|
||||
`pipeline_recall` when score distribution entropy > 0.8 bits AND the
|
||||
cohesive community has >= 2 shared tags.
|
||||
|
||||
D-21 (thresholds, autism-aware):
|
||||
- Auto-induct when co_occurrence >= 5 AND confidence >= 0.85.
|
||||
- User-approval flag at co_occurrence in [3, 5) AND confidence in [0.65, 0.85).
|
||||
- Below: discard.
|
||||
- Exceptions preserved as first-class records (never absorbed).
|
||||
- Abstraction level: concrete (Dawson-Mottron Raven's preference).
|
||||
|
||||
Schema records are first-class hubs:
|
||||
- tier="semantic", detail_level=3 -> never_decay=True.
|
||||
- schema_instance_of edges from evidence -> schema never decay.
|
||||
- pipeline routing can prioritise schema records when pattern
|
||||
matches.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from iai_mcp.events import write_event
|
||||
from iai_mcp.guard import BudgetLedger, RateLimitLedger, should_call_llm
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import MemoryRecord, SCHEMA_VERSION_CURRENT
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- constants
|
||||
|
||||
AUTO_INDUCT_COOCCURRENCE: int = 5
|
||||
AUTO_INDUCT_CONFIDENCE: float = 0.85
|
||||
USER_APPROVAL_COOCCURRENCE: int = 3
|
||||
USER_APPROVAL_CONFIDENCE: float = 0.65
|
||||
MAX_EVIDENCE_PER_SCHEMA: int = 50
|
||||
PROVISIONAL_ENTROPY_MIN: float = 0.8
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- candidate
|
||||
|
||||
|
||||
@dataclass
|
||||
class SchemaCandidate:
|
||||
"""One schema candidate surfaced by induce_schemas_*."""
|
||||
|
||||
pattern: str
|
||||
confidence: float
|
||||
evidence_count: int
|
||||
evidence_ids: list[UUID] = field(default_factory=list)
|
||||
domain: str | None = None
|
||||
exceptions: list[UUID] = field(default_factory=list)
|
||||
status: str = "auto" # "auto" | "pending_user_approval"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- Tier-0 induction
|
||||
|
||||
|
||||
def _tag_cooccurrence(records: Iterable) -> dict:
|
||||
"""Bucket records by tag-pair frequency. Returns {frozenset(pair): [record_ids]}.
|
||||
|
||||
Phase 07.7-04 D-26-A: accepts either ``list[MemoryRecord]`` (back-compat;
|
||||
used by external callers passing dataclass instances) or an iterable of
|
||||
projected ``dict`` rows from ``store.iter_record_columns(["id", "tags_json"])``.
|
||||
|
||||
Dispatch is duck-typed: items with a ``.tags`` attribute are treated as
|
||||
MemoryRecord; items without are treated as dict rows. This keeps both
|
||||
surfaces alive while migrating the production path off ``all_records()``.
|
||||
|
||||
For dict rows, ``tags_json`` is parsed defensively (mirrors the W3
|
||||
pattern in ``sleep._tier0_schema_surfacing`` — corrupted rows contribute
|
||||
zero counts but do not crash). The ``id`` field arrives as a string from
|
||||
LanceDB and is converted to ``UUID`` here so callers always see
|
||||
``list[UUID]`` evidence_ids regardless of which input shape was passed.
|
||||
"""
|
||||
pairs: dict = {}
|
||||
for r in records:
|
||||
# Dispatch on duck-typing: MemoryRecord has .tags + .id attributes;
|
||||
# dict rows have ["tags_json"] + ["id"] keys.
|
||||
if hasattr(r, "tags"):
|
||||
# MemoryRecord path (back-compat for external/test callers).
|
||||
raw_tags = r.tags or []
|
||||
rid = r.id
|
||||
else:
|
||||
# Dict-row path (D-26-A migrated production path). Defensive parse:
|
||||
# malformed tags_json contributes zero pairs but does not raise.
|
||||
tags_raw = r.get("tags_json") or "[]"
|
||||
try:
|
||||
raw_tags = json.loads(tags_raw) if tags_raw else []
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
raw_tags = []
|
||||
id_raw = r.get("id")
|
||||
if id_raw is None:
|
||||
continue
|
||||
# iter_record_columns yields id as a string; convert to UUID at
|
||||
# the boundary so SchemaCandidate.evidence_ids stays list[UUID].
|
||||
try:
|
||||
rid = UUID(id_raw) if isinstance(id_raw, str) else id_raw
|
||||
except (ValueError, AttributeError):
|
||||
continue
|
||||
|
||||
tags = [
|
||||
t for t in raw_tags
|
||||
if not t.startswith("raw:") and not t.startswith("domain:")
|
||||
]
|
||||
for i in range(len(tags)):
|
||||
for j in range(i + 1, len(tags)):
|
||||
key = frozenset([tags[i], tags[j]])
|
||||
pairs.setdefault(key, []).append(rid)
|
||||
return pairs
|
||||
|
||||
|
||||
def induce_schemas_tier0(store: MemoryStore) -> list[SchemaCandidate]:
|
||||
"""D-18 Tier-0 path: tag cooccurrence + TF-IDF; no LLM.
|
||||
|
||||
Returns a list of SchemaCandidate. Each candidate passes the gate:
|
||||
- status="auto" -> count >= 5 AND confidence >= 0.85
|
||||
- status="pending_user_approval" -> count in [3,5) AND confidence in [0.65, 0.85)
|
||||
|
||||
Phase 07.7-04 D-26-A: streams via ``store.iter_record_columns(
|
||||
["id", "tags_json"], batch_size=1024)`` instead of ``store.all_records()``.
|
||||
Encrypted columns (literal_surface, provenance_json,
|
||||
profile_modulation_gain_json) are NEVER read on this path; the W5 cipher
|
||||
cache is short-circuited entirely. On the 8105-record production store
|
||||
this saves ~16210 AES-GCM operations + ~14.5 MB literal_surface
|
||||
materialisation per ``run_heavy_consolidation`` invocation, and unblocks
|
||||
the W4 ≤1 ``all_records()`` invariant on the heavy cycle.
|
||||
|
||||
Single-pass record-count tally: count_total is incremented inside the
|
||||
iterator loop and the ``< CLUSTER_MIN_SIZE`` floor is checked afterwards.
|
||||
Mirrors the pattern in ``sleep._tier0_schema_surfacing`` (Plan 07.7-03 W3).
|
||||
"""
|
||||
rows = list(store.iter_record_columns(["id", "tags_json"], batch_size=1024))
|
||||
if len(rows) < 3:
|
||||
return []
|
||||
|
||||
pair_counts = _tag_cooccurrence(rows)
|
||||
candidates: list[SchemaCandidate] = []
|
||||
for pair, evidence in pair_counts.items():
|
||||
count = len(evidence)
|
||||
# Heuristic confidence: saturates toward 1.0 at 10+ evidence records.
|
||||
confidence = min(1.0, count / 10.0)
|
||||
pattern = f"tags:{'+'.join(sorted(pair))}"
|
||||
if count >= AUTO_INDUCT_COOCCURRENCE and confidence >= AUTO_INDUCT_CONFIDENCE:
|
||||
status = "auto"
|
||||
elif (
|
||||
USER_APPROVAL_COOCCURRENCE <= count < AUTO_INDUCT_COOCCURRENCE
|
||||
and confidence >= USER_APPROVAL_CONFIDENCE
|
||||
):
|
||||
status = "pending_user_approval"
|
||||
else:
|
||||
continue
|
||||
candidates.append(
|
||||
SchemaCandidate(
|
||||
pattern=pattern,
|
||||
confidence=confidence,
|
||||
evidence_count=count,
|
||||
evidence_ids=list(evidence[:MAX_EVIDENCE_PER_SCHEMA]),
|
||||
status=status,
|
||||
)
|
||||
)
|
||||
return candidates
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- Tier-1 w/ D-GUARD
|
||||
|
||||
|
||||
def induce_schemas_tier1(
|
||||
store: MemoryStore,
|
||||
budget: BudgetLedger,
|
||||
rate: RateLimitLedger,
|
||||
llm_enabled: bool = True,
|
||||
) -> list[SchemaCandidate]:
|
||||
"""D-18 Tier-1 path: Haiku extraction gated by D-GUARD ladder.
|
||||
|
||||
When should_call_llm returns False (any ladder step), emit an
|
||||
llm_health event and delegate to `induce_schemas_tier0`.
|
||||
|
||||
scope: the Tier-1 branch is reserved; wires the
|
||||
actual anthropic.batches.create call. This function's contract is: on
|
||||
allow, call budget.record_spend and emit llm_health; then fall back to
|
||||
tier0 (because real Batch output is a deliverable). The
|
||||
effective_tier in the event is "tier0" regardless until Plan 02-04.
|
||||
"""
|
||||
has_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
|
||||
ok, reason = should_call_llm(
|
||||
budget=budget, rate=rate,
|
||||
llm_enabled=llm_enabled, has_api_key=has_key,
|
||||
estimated_usd=0.005,
|
||||
)
|
||||
if not ok:
|
||||
write_event(
|
||||
store,
|
||||
kind="llm_health",
|
||||
data={
|
||||
"component": "schema_induction",
|
||||
"tier": "fallback",
|
||||
"reason": reason,
|
||||
},
|
||||
severity="warning",
|
||||
)
|
||||
return induce_schemas_tier0(store)
|
||||
|
||||
# Tier-1 eligible -- scaffold only (Plan 02-04 wires real Batch API).
|
||||
try:
|
||||
import anthropic # noqa: F401 -- lazy import, raise-only if missing
|
||||
budget.record_spend(0.002, kind="schema_induction")
|
||||
write_event(
|
||||
store,
|
||||
kind="llm_health",
|
||||
data={
|
||||
"component": "schema_induction",
|
||||
"tier": "haiku",
|
||||
"note": "Plan 02-04 wires real Batch API; 02-03 scaffolds only",
|
||||
},
|
||||
severity="info",
|
||||
)
|
||||
except Exception as e:
|
||||
write_event(
|
||||
store,
|
||||
kind="llm_health",
|
||||
data={"component": "schema_induction", "error": str(e)},
|
||||
severity="critical",
|
||||
)
|
||||
return induce_schemas_tier0(store)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- persist
|
||||
|
||||
|
||||
def _majority_language(evidence_ids: list[UUID], store: MemoryStore) -> str:
|
||||
"""Return the plurality ISO-639-1 language tag among evidence records.
|
||||
|
||||
fix (D-08a constitutional): schema hubs must carry the
|
||||
language of their source evidence, not a hardcoded 'en'. A user whose
|
||||
records are Russian would otherwise get schemas tagged 'en' and fail
|
||||
their own language='ru' filter at retrieval.
|
||||
|
||||
Algorithm:
|
||||
- Fetch each evidence record via store.get (skip missing/deleted ones).
|
||||
- Collect their language fields (skip empty/None).
|
||||
- Return max(set(langs), key=langs.count). Tie-break is deterministic
|
||||
given a stable input list order: max with key=list.count returns
|
||||
the first element from the set iteration whose count is the
|
||||
maximum, and Python's set iteration on strings follows insertion
|
||||
order in CPython >= 3.7 for the distinct-values pattern used here
|
||||
because we build the distinct set from a list iteration.
|
||||
- Fallback 'en' when evidence is empty or all records are missing.
|
||||
|
||||
Tie-break policy: when two languages are tied, the one whose first
|
||||
occurrence appears EARLIEST in evidence_ids wins. Matches Phase 1
|
||||
default 'en' when no signal is available (least-surprise).
|
||||
"""
|
||||
langs: list[str] = []
|
||||
for eid in evidence_ids:
|
||||
rec = store.get(eid)
|
||||
if rec is None:
|
||||
continue
|
||||
if rec.language:
|
||||
langs.append(rec.language)
|
||||
if not langs:
|
||||
return "en"
|
||||
# Deterministic tie-break: iterate langs in order, pick the first whose
|
||||
# count is the max. max(set(langs), key=langs.count) is undefined for
|
||||
# set ordering, so we use a hand-rolled pass instead.
|
||||
best = langs[0]
|
||||
best_count = langs.count(best)
|
||||
seen: set[str] = {best}
|
||||
for lang in langs[1:]:
|
||||
if lang in seen:
|
||||
continue
|
||||
seen.add(lang)
|
||||
c = langs.count(lang)
|
||||
if c > best_count:
|
||||
best = lang
|
||||
best_count = c
|
||||
return best
|
||||
|
||||
|
||||
def persist_schema(
|
||||
store: MemoryStore,
|
||||
candidate: SchemaCandidate,
|
||||
) -> UUID:
|
||||
"""Insert a schema record + schema_instance_of edges to evidence.
|
||||
|
||||
Schema records carry:
|
||||
- tier="semantic", detail_level=3 (never_decay auto-true)
|
||||
- tags=["schema", <status>, f"pattern:{pattern}"]
|
||||
- s5_trust_score=0.5 (neutral prior; LEARN-06 may raise over time)
|
||||
- schema_version=2
|
||||
"""
|
||||
from iai_mcp.aaak import enforce_language_tagged, generate_aaak_index
|
||||
from iai_mcp.embed import embedder_for_store
|
||||
|
||||
summary = (
|
||||
f"Schema: {candidate.pattern} (confidence={candidate.confidence:.2f})"
|
||||
)
|
||||
|
||||
# R1 (D-09 + D-10): pattern dedup. Search for an existing
|
||||
# schema record carrying the tag `pattern:{candidate.pattern}` in the
|
||||
# semantic tier. If found, reinforce schema_instance_of edges from new
|
||||
# evidence onto the existing keeper, emit `schema_reinforced`, and
|
||||
# return the existing schema_id. If not found, fall through to the
|
||||
# original insert path. Closes the chain-induction bleed: every sleep
|
||||
# cycle would otherwise insert a fresh tier="semantic", never_decay
|
||||
# row for the same pattern (live store accumulated 7+ duplicates per
|
||||
# pattern with degree-bonus shouldering verbatim records out of hits[]).
|
||||
pattern_tag = f"pattern:{candidate.pattern}"
|
||||
# Phase 07.7-04 D-26-B: keeper scan migrated from store.all_records() to
|
||||
# store.iter_record_columns(["id", "tier", "tags_json"], batch_size=1024).
|
||||
# Projection skips encrypted columns (literal_surface, provenance_json,
|
||||
# profile_modulation_gain_json) entirely — the W5 cipher cache is
|
||||
# short-circuited on this path. Early-exit (`break`) semantics preserved.
|
||||
# The matching row's id arrives as a string from LanceDB; we convert to
|
||||
# UUID at the boundary so downstream code sees the same type contract as
|
||||
# the pre-D-26 ``existing_keeper.id`` access pattern.
|
||||
existing_keeper_id: UUID | None = None
|
||||
try:
|
||||
for row in store.iter_record_columns(
|
||||
["id", "tier", "tags_json"], batch_size=1024
|
||||
):
|
||||
if row.get("tier") != "semantic":
|
||||
continue
|
||||
tags_raw = row.get("tags_json") or "[]"
|
||||
try:
|
||||
tags = json.loads(tags_raw) if tags_raw else []
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
tags = []
|
||||
if pattern_tag in tags:
|
||||
id_raw = row.get("id")
|
||||
if id_raw is None:
|
||||
continue
|
||||
try:
|
||||
existing_keeper_id = (
|
||||
UUID(id_raw) if isinstance(id_raw, str) else id_raw
|
||||
)
|
||||
except (ValueError, AttributeError):
|
||||
continue
|
||||
break
|
||||
except Exception:
|
||||
# Defensive: if the scan fails, fall through to the insert path so
|
||||
# we never silently lose a schema. Mirrors the diagnostic-write
|
||||
# contract used in pipeline.py provenance batching.
|
||||
existing_keeper_id = None
|
||||
|
||||
if existing_keeper_id is not None:
|
||||
from iai_mcp.store import EDGES_TABLE
|
||||
|
||||
# Reinforce schema_instance_of edges from each new evidence record
|
||||
# onto the existing keeper. Reuses the same delta formula as the
|
||||
# insert path (max(0.1, candidate.confidence)) for symmetry.
|
||||
delta = max(0.1, candidate.confidence)
|
||||
new_pairs = [(ev_id, existing_keeper_id) for ev_id in candidate.evidence_ids]
|
||||
if new_pairs:
|
||||
store.boost_edges(
|
||||
new_pairs,
|
||||
edge_type="schema_instance_of",
|
||||
delta=delta,
|
||||
)
|
||||
|
||||
# Compute total_evidence after reinforcement: count
|
||||
# `schema_instance_of` edges incident on the keeper. Read via the
|
||||
# edges table to avoid trusting any in-memory cache.
|
||||
# Note: store.boost_edges canonicalises (src, dst) to a sorted
|
||||
# tuple, so the keeper appears in EITHER column depending on the
|
||||
# string ordering of the paired evidence UUID. OR-counting both
|
||||
# columns gives the true edge-incidence count (no double-count
|
||||
# since each edge row has the keeper in exactly one column).
|
||||
try:
|
||||
edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
|
||||
keeper_str = str(existing_keeper_id)
|
||||
total_evidence = int(
|
||||
((edges_df["edge_type"] == "schema_instance_of")
|
||||
& ((edges_df["dst"] == keeper_str)
|
||||
| (edges_df["src"] == keeper_str))).sum()
|
||||
)
|
||||
except Exception:
|
||||
total_evidence = len(candidate.evidence_ids)
|
||||
|
||||
write_event(
|
||||
store,
|
||||
kind="schema_reinforced",
|
||||
data={
|
||||
"schema_id": str(existing_keeper_id),
|
||||
"pattern": candidate.pattern,
|
||||
"evidence_added": len(candidate.evidence_ids),
|
||||
"total_evidence": total_evidence,
|
||||
},
|
||||
severity="info",
|
||||
source_ids=[existing_keeper_id, *candidate.evidence_ids[:5]],
|
||||
)
|
||||
return existing_keeper_id
|
||||
|
||||
emb = embedder_for_store(store).embed(summary)
|
||||
now = datetime.now(timezone.utc)
|
||||
schema_id = uuid4()
|
||||
# fix: derive language from the plurality language
|
||||
# of the evidence records, not a hardcoded 'en'. Schema hubs for Russian /
|
||||
# Japanese / Arabic clusters now carry the correct ISO-639-1 tag so
|
||||
# language-filtered retrieval surfaces them as expected.
|
||||
derived_language = _majority_language(candidate.evidence_ids, store)
|
||||
schema_rec = MemoryRecord(
|
||||
id=schema_id,
|
||||
tier="semantic",
|
||||
literal_surface=summary,
|
||||
aaak_index="",
|
||||
embedding=emb,
|
||||
community_id=None,
|
||||
centrality=0.0,
|
||||
detail_level=3,
|
||||
pinned=False,
|
||||
stability=0.7,
|
||||
difficulty=0.3,
|
||||
last_reviewed=now,
|
||||
never_decay=True,
|
||||
never_merge=False,
|
||||
provenance=[
|
||||
{
|
||||
"ts": now.isoformat(),
|
||||
"cue": "schema_induction",
|
||||
"session_id": "system",
|
||||
}
|
||||
],
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
tags=[
|
||||
"schema",
|
||||
candidate.status,
|
||||
f"pattern:{candidate.pattern}",
|
||||
],
|
||||
language=derived_language,
|
||||
s5_trust_score=0.5,
|
||||
profile_modulation_gain={},
|
||||
schema_version=SCHEMA_VERSION_CURRENT,
|
||||
)
|
||||
enforce_language_tagged(schema_rec)
|
||||
schema_rec.aaak_index = generate_aaak_index(schema_rec)
|
||||
store.insert(schema_rec)
|
||||
|
||||
# R3: batch the schema_instance_of edges into ONE boost_edges
|
||||
# call (one merge_insert + one tbl.add at most). Previously this loop
|
||||
# issued N Lance versions on edges.lance for an N-evidence schema.
|
||||
instance_pairs = [(ev_id, schema_id) for ev_id in candidate.evidence_ids]
|
||||
if instance_pairs:
|
||||
store.boost_edges(
|
||||
instance_pairs,
|
||||
edge_type="schema_instance_of",
|
||||
delta=max(0.1, candidate.confidence),
|
||||
)
|
||||
|
||||
write_event(
|
||||
store,
|
||||
kind="schema_induction_run",
|
||||
data={
|
||||
"schema_id": str(schema_id),
|
||||
"pattern": candidate.pattern,
|
||||
"confidence": candidate.confidence,
|
||||
"evidence_count": candidate.evidence_count,
|
||||
"status": candidate.status,
|
||||
},
|
||||
severity="info",
|
||||
source_ids=[schema_id, *candidate.evidence_ids[:5]],
|
||||
)
|
||||
return schema_id
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- provisional
|
||||
|
||||
|
||||
def provisional_schemas_for_recall(
|
||||
store: MemoryStore,
|
||||
hits: list,
|
||||
entropy_bits: float,
|
||||
records_cache: "dict | None" = None,
|
||||
) -> list[dict]:
|
||||
"""D-18 secondary path: surface provisional schema hints on high-entropy recalls.
|
||||
|
||||
Returns a list of hint dicts compatible with RecallResponse.hints, one per
|
||||
cohesive tag appearing in >= 2 of the top hits.
|
||||
|
||||
perf: batched all_records() fetch replaces N+1 store.get()
|
||||
calls. A single to_pandas() call is still O(total_records) but constant
|
||||
per recall, not per-hit. This was a major D-SPEED bottleneck at N=50.
|
||||
|
||||
perf (Rule 1 auto-fix): accept optional `records_cache` so
|
||||
pipeline_recall can pass its already-built cache through -- avoids a
|
||||
second `store.all_records()` scan per recall (~40ms at N=100). Falls
|
||||
back to all_records() if no cache provided (preserves back-compat for
|
||||
ad-hoc callers; tests without pipeline_recall still work).
|
||||
"""
|
||||
if entropy_bits < PROVISIONAL_ENTROPY_MIN or len(hits) < 3:
|
||||
return []
|
||||
|
||||
# Batch-fetch all records once; hits are typically <=5 so the cost of
|
||||
# filtering in-memory dominates over 5 separate store.get() round-trips.
|
||||
hit_ids = {h.record_id for h in hits}
|
||||
if records_cache is not None:
|
||||
# Reuse the cache built at pipeline_recall stage 1. Zero scans.
|
||||
by_id = {
|
||||
rid: rec for rid, rec in records_cache.items() if rid in hit_ids
|
||||
}
|
||||
else:
|
||||
try:
|
||||
all_recs = store.all_records()
|
||||
except Exception:
|
||||
return []
|
||||
by_id = {r.id: r for r in all_recs if r.id in hit_ids}
|
||||
|
||||
tag_count: Counter = Counter()
|
||||
for h in hits:
|
||||
rec = by_id.get(h.record_id)
|
||||
if rec is None:
|
||||
continue
|
||||
for t in (rec.tags or []):
|
||||
if t.startswith("raw:") or t.startswith("domain:"):
|
||||
continue
|
||||
tag_count[t] += 1
|
||||
|
||||
provisional: list[dict] = []
|
||||
for tag, cnt in tag_count.most_common(3):
|
||||
if cnt >= 2:
|
||||
source_ids: list[str] = []
|
||||
for h in hits:
|
||||
rec = by_id.get(h.record_id)
|
||||
if rec is None:
|
||||
continue
|
||||
if tag in (rec.tags or []):
|
||||
source_ids.append(str(h.record_id))
|
||||
if len(source_ids) >= 5:
|
||||
break
|
||||
provisional.append(
|
||||
{
|
||||
"kind": "provisional_schema",
|
||||
"severity": "info",
|
||||
"source_ids": source_ids,
|
||||
"text": f"Potential schema: tag={tag} cnt={cnt}",
|
||||
"provisional": True,
|
||||
"entropy": entropy_bits,
|
||||
}
|
||||
)
|
||||
return provisional
|
||||
Loading…
Add table
Add a link
Reference in a new issue