Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.csv
+++ b/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.csv
--- a/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.json
+++ b/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.json
@ -0,0 +1,250 @@
+{
+  "env": {
+    "cpu_brand": "Apple M2 Max",
+    "cpu_cores_physical": 12,
+    "ram_gb": "64.0",
+    "os": "Darwin",
+    "os_version": "25.3.0",
+    "python_version": "3.12.13",
+    "iai_mcp_git_sha": "9c61a18",
+    "iai_mcp_git_dirty": true,
+    "lance_version": "unknown",
+    "lancedb_version": "0.30.2",
+    "pyarrow_version": "23.0.1",
+    "sentence_transformers_version": "5.4.1",
+    "embedder_model": "bge-small-en-v1.5",
+    "seed_list": [
+      13,
+      42,
+      137
+    ],
+    "iai_mcp_store": "/private/tmp/iai-mcp-bench-claude/store",
+    "wall_clock_start_utc": "2026-05-03T01:10:24.783110+00:00",
+    "scale": "honest",
+    "n_sessions": 1000,
+    "n_probes_pre": 250,
+    "n_probes_post": 250,
+    "n_slices": [
+      0,
+      1
+    ],
+    "k_hits": 10,
+    "a_threshold": 0.98,
+    "candidate_pool_size": 200,
+    "bootstrap_resamples": 10000,
+    "floor_mode": "relaxed",
+    "wall_clock_duration_seconds": 5328.49
+  },
+  "summary": {
+    "per_cell": [
+      {
+        "seed": 13,
+        "n_slice": 0,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.272,
+          "rr_at_1_cosine": 0.272
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.912,
+          "mean_anti_hits_count": 1.904
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.692,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      },
+      {
+        "seed": 13,
+        "n_slice": 1,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.272,
+          "rr_at_1_cosine": 0.272
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.912,
+          "mean_anti_hits_count": 1.904
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.692,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      },
+      {
+        "seed": 42,
+        "n_slice": 0,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.264,
+          "rr_at_1_cosine": 0.264
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.892,
+          "mean_anti_hits_count": 2.16
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.708,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      },
+      {
+        "seed": 42,
+        "n_slice": 1,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.264,
+          "rr_at_1_cosine": 0.264
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.892,
+          "mean_anti_hits_count": 2.16
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.708,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      },
+      {
+        "seed": 137,
+        "n_slice": 0,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.292,
+          "rr_at_1_cosine": 0.292
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.868,
+          "mean_anti_hits_count": 2.2
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.74,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      },
+      {
+        "seed": 137,
+        "n_slice": 1,
+        "n_b_probes": 250,
+        "n_a_probes": 250,
+        "metric_b": {
+          "delta_mrr_point": 0.0,
+          "delta_mrr_ci_lo": 0.0,
+          "delta_mrr_ci_hi": 0.0,
+          "wilcoxon_p": null,
+          "max_rank_regression": 0,
+          "rr_at_1_pipeline": 0.292,
+          "rr_at_1_cosine": 0.292
+        },
+        "metric_b_revised": {
+          "hint_emission_rate": 1.0,
+          "anti_hits_coverage": 0.868,
+          "mean_anti_hits_count": 2.2
+        },
+        "metric_a": {
+          "hit_at_k_pipeline": 1.0,
+          "hit_at_k_cosine": 0.74,
+          "k": 10,
+          "catastrophic_floor_violations": 0
+        }
+      }
+    ],
+    "cross_seed": {
+      "n_0": {
+        "delta_mrr_mean": 0.0,
+        "delta_mrr_stdev": 0.0,
+        "delta_mrr_min": 0.0,
+        "delta_mrr_max": 0.0,
+        "robust": false
+      },
+      "n_1": {
+        "delta_mrr_mean": 0.0,
+        "delta_mrr_stdev": 0.0,
+        "delta_mrr_min": 0.0,
+        "delta_mrr_max": 0.0,
+        "robust": false
+      }
+    },
+    "gates": {
+      "per_cell": {
+        "seed13_n0": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        },
+        "seed13_n1": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        },
+        "seed42_n0": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        },
+        "seed42_n1": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        },
+        "seed137_n0": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        },
+        "seed137_n1": {
+          "gate_a": true,
+          "gate_b_classical": false,
+          "gate_b_contract": true
+        }
+      },
+      "cross_seed_robust": false,
+      "overall_pass": true
+    }
+  }
+}
--- a/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.md
+++ b/bench/results/contradiction_longitudinal_20260503T011024Z-seeds13-42-137-scale_honest.md
@ -0,0 +1,63 @@
+# Contradiction-longitudinal falsifiability bench — PASS
+
+**Run ID:** 20260503T011024Z-seeds13-42-137-scale_honest
+**Duration:** 5328.5s
+
+## Environment
+
+| Field | Value |
+|---|---|
+| `cpu_brand` | Apple M2 Max |
+| `cpu_cores_physical` | 12 |
+| `ram_gb` | 64.0 |
+| `os` | Darwin |
+| `os_version` | 25.3.0 |
+| `python_version` | 3.12.13 |
+| `iai_mcp_git_sha` | (pre-release) |
+| `iai_mcp_git_dirty` | True |
+| `lance_version` | unknown |
+| `lancedb_version` | 0.30.2 |
+| `pyarrow_version` | 23.0.1 |
+| `sentence_transformers_version` | 5.4.1 |
+| `embedder_model` | bge-small-en-v1.5 |
+| `seed_list` | [13, 42, 137] |
+| `iai_mcp_store` | /private/tmp/iai-mcp-bench-claude/store |
+| `wall_clock_start_utc` | 2026-05-03T01:10:24.783110+00:00 |
+| `scale` | honest |
+| `n_sessions` | 1000 |
+| `n_probes_pre` | 250 |
+| `n_probes_post` | 250 |
+| `n_slices` | [0, 1] |
+| `k_hits` | 10 |
+| `a_threshold` | 0.98 |
+| `candidate_pool_size` | 200 |
+| `bootstrap_resamples` | 10000 |
+| `floor_mode` | relaxed |
+| `wall_clock_duration_seconds` | 5328.49 |
+
+## Cross-seed (B robustness)
+
+| N slice | ΔMRR mean | stdev | min | max | robust? |
+|---|---|---|---|---|---|
+| n_0 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
+| n_1 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
+
+## Per-cell detail
+
+| seed | N | A hit@k (pipe / cos) | A floor | B-class ΔMRR (CI) | B-contract hint% / anti-hits% | gate A | gate B-class | gate B-contract |
+|---|---|---|---|---|---|---|---|---|
+| 13 | 0 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
+| 13 | 1 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
+| 42 | 0 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
+| 42 | 1 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
+| 137 | 0 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
+| 137 | 1 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
+
+**Cross-seed robust gate (B-classical only):** FAIL (expected: B-class is not the architectural promise)
+**Overall verdict (uses gate_a + gate_b_contract):** PASS
+
+## Notes on metric design
+
+- **Metric A (verbatim preserved)** tests REQUIREMENTS.md — the system's promise that contradiction = reconsolidation, never overwrite. Pipeline beating cosine here = real architectural advantage.
+- **Metric B-classical (rank current above cosine)** tests an expectation that does NOT appear in any design doc. Per REQUIREMENTS.md + 02-CONTEXT.md, the system uses dual-route + inhibitory edges + hints, not rerank. Expect ΔMRR ≈ 0; this is a feature, not a bug.
+- **Metric B-contract (s4_contradiction hint OR anti_hits ≥80%)** tests what the system actually promises (REQUIREMENTS.md MEM-08, dual-route). Cosine cannot do either; pipeline either signals contradictions or it doesn't.