Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
Areg Noya 2026-05-06 01:04:47 -07:00
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions

View file

@ -0,0 +1,250 @@
{
"env": {
"cpu_brand": "Apple M2 Max",
"cpu_cores_physical": 12,
"ram_gb": "64.0",
"os": "Darwin",
"os_version": "25.3.0",
"python_version": "3.12.13",
"iai_mcp_git_sha": "9c61a18",
"iai_mcp_git_dirty": true,
"lance_version": "unknown",
"lancedb_version": "0.30.2",
"pyarrow_version": "23.0.1",
"sentence_transformers_version": "5.4.1",
"embedder_model": "bge-small-en-v1.5",
"seed_list": [
13,
42,
137
],
"iai_mcp_store": "/private/tmp/iai-mcp-bench-claude/store",
"wall_clock_start_utc": "2026-05-03T01:10:24.783110+00:00",
"scale": "honest",
"n_sessions": 1000,
"n_probes_pre": 250,
"n_probes_post": 250,
"n_slices": [
0,
1
],
"k_hits": 10,
"a_threshold": 0.98,
"candidate_pool_size": 200,
"bootstrap_resamples": 10000,
"floor_mode": "relaxed",
"wall_clock_duration_seconds": 5328.49
},
"summary": {
"per_cell": [
{
"seed": 13,
"n_slice": 0,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.272,
"rr_at_1_cosine": 0.272
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.912,
"mean_anti_hits_count": 1.904
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.692,
"k": 10,
"catastrophic_floor_violations": 0
}
},
{
"seed": 13,
"n_slice": 1,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.272,
"rr_at_1_cosine": 0.272
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.912,
"mean_anti_hits_count": 1.904
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.692,
"k": 10,
"catastrophic_floor_violations": 0
}
},
{
"seed": 42,
"n_slice": 0,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.264,
"rr_at_1_cosine": 0.264
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.892,
"mean_anti_hits_count": 2.16
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.708,
"k": 10,
"catastrophic_floor_violations": 0
}
},
{
"seed": 42,
"n_slice": 1,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.264,
"rr_at_1_cosine": 0.264
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.892,
"mean_anti_hits_count": 2.16
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.708,
"k": 10,
"catastrophic_floor_violations": 0
}
},
{
"seed": 137,
"n_slice": 0,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.292,
"rr_at_1_cosine": 0.292
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.868,
"mean_anti_hits_count": 2.2
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.74,
"k": 10,
"catastrophic_floor_violations": 0
}
},
{
"seed": 137,
"n_slice": 1,
"n_b_probes": 250,
"n_a_probes": 250,
"metric_b": {
"delta_mrr_point": 0.0,
"delta_mrr_ci_lo": 0.0,
"delta_mrr_ci_hi": 0.0,
"wilcoxon_p": null,
"max_rank_regression": 0,
"rr_at_1_pipeline": 0.292,
"rr_at_1_cosine": 0.292
},
"metric_b_revised": {
"hint_emission_rate": 1.0,
"anti_hits_coverage": 0.868,
"mean_anti_hits_count": 2.2
},
"metric_a": {
"hit_at_k_pipeline": 1.0,
"hit_at_k_cosine": 0.74,
"k": 10,
"catastrophic_floor_violations": 0
}
}
],
"cross_seed": {
"n_0": {
"delta_mrr_mean": 0.0,
"delta_mrr_stdev": 0.0,
"delta_mrr_min": 0.0,
"delta_mrr_max": 0.0,
"robust": false
},
"n_1": {
"delta_mrr_mean": 0.0,
"delta_mrr_stdev": 0.0,
"delta_mrr_min": 0.0,
"delta_mrr_max": 0.0,
"robust": false
}
},
"gates": {
"per_cell": {
"seed13_n0": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
},
"seed13_n1": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
},
"seed42_n0": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
},
"seed42_n1": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
},
"seed137_n0": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
},
"seed137_n1": {
"gate_a": true,
"gate_b_classical": false,
"gate_b_contract": true
}
},
"cross_seed_robust": false,
"overall_pass": true
}
}
}

View file

@ -0,0 +1,63 @@
# Contradiction-longitudinal falsifiability bench — PASS
**Run ID:** 20260503T011024Z-seeds13-42-137-scale_honest
**Duration:** 5328.5s
## Environment
| Field | Value |
|---|---|
| `cpu_brand` | Apple M2 Max |
| `cpu_cores_physical` | 12 |
| `ram_gb` | 64.0 |
| `os` | Darwin |
| `os_version` | 25.3.0 |
| `python_version` | 3.12.13 |
| `iai_mcp_git_sha` | (pre-release) |
| `iai_mcp_git_dirty` | True |
| `lance_version` | unknown |
| `lancedb_version` | 0.30.2 |
| `pyarrow_version` | 23.0.1 |
| `sentence_transformers_version` | 5.4.1 |
| `embedder_model` | bge-small-en-v1.5 |
| `seed_list` | [13, 42, 137] |
| `iai_mcp_store` | /private/tmp/iai-mcp-bench-claude/store |
| `wall_clock_start_utc` | 2026-05-03T01:10:24.783110+00:00 |
| `scale` | honest |
| `n_sessions` | 1000 |
| `n_probes_pre` | 250 |
| `n_probes_post` | 250 |
| `n_slices` | [0, 1] |
| `k_hits` | 10 |
| `a_threshold` | 0.98 |
| `candidate_pool_size` | 200 |
| `bootstrap_resamples` | 10000 |
| `floor_mode` | relaxed |
| `wall_clock_duration_seconds` | 5328.49 |
## Cross-seed (B robustness)
| N slice | ΔMRR mean | stdev | min | max | robust? |
|---|---|---|---|---|---|
| n_0 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
| n_1 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | NO |
## Per-cell detail
| seed | N | A hit@k (pipe / cos) | A floor | B-class ΔMRR (CI) | B-contract hint% / anti-hits% | gate A | gate B-class | gate B-contract |
|---|---|---|---|---|---|---|---|---|
| 13 | 0 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
| 13 | 1 | 1.000 / 0.692 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.912 | PASS | FAIL | PASS |
| 42 | 0 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
| 42 | 1 | 1.000 / 0.708 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.892 | PASS | FAIL | PASS |
| 137 | 0 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
| 137 | 1 | 1.000 / 0.740 | 0 | 0.0000 (0.0000, 0.0000) | 1.000 / 0.868 | PASS | FAIL | PASS |
**Cross-seed robust gate (B-classical only):** FAIL (expected: B-class is not the architectural promise)
**Overall verdict (uses gate_a + gate_b_contract):** PASS
## Notes on metric design
- **Metric A (verbatim preserved)** tests REQUIREMENTS.md — the system's promise that contradiction = reconsolidation, never overwrite. Pipeline beating cosine here = real architectural advantage.
- **Metric B-classical (rank current above cosine)** tests an expectation that does NOT appear in any design doc. Per REQUIREMENTS.md + 02-CONTEXT.md, the system uses dual-route + inhibitory edges + hints, not rerank. Expect ΔMRR ≈ 0; this is a feature, not a bug.
- **Metric B-contract (s4_contradiction hint OR anti_hits ≥80%)** tests what the system actually promises (REQUIREMENTS.md MEM-08, dual-route). Cosine cannot do either; pipeline either signals contradictions or it doesn't.