Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
80 lines
2.9 KiB
Python
80 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Contradiction-longitudinal falsifiability bench (skeleton + pre-registered criteria).
|
|
|
|
**Do not run on the construction host by default** — this module is meant for a
|
|
dedicated bench machine with an isolated ``IAI_MCP_STORE`` and optional GPU.
|
|
|
|
Pre-registered pass criteria:
|
|
- **Metric B (post-flip):** cues issued after session ``t_0`` (contradiction +
|
|
consolidation window simulated) must rank the *current* winning fact above
|
|
flat cosine-only retrieval on the same store slice.
|
|
- **Metric A (historical verbatim):** probes asking for superseded wording must
|
|
still surface the archived surface (verbatim MEM-06), not the post-flip fact alone.
|
|
- **Regression gate:** pipeline score on B must beat cosine baseline; A must not
|
|
collapse below a configured verbatim hit threshold.
|
|
|
|
This file loads :file:`fixtures/contradiction_longitudinal.jsonl` (synthetic JSONL
|
|
rows: ``session``, ``text``, optional ``probe`` / ``expects``) and documents the
|
|
evaluation harness contract. A full implementation wires:
|
|
|
|
1. Fixture loader → ``MemoryStore`` inserts per session order.
|
|
2. Explicit ``memory_contradict`` (or edge-equivalent) at ``t_0``.
|
|
3. Optional sleep/consolidation tick simulation (bench-only knobs).
|
|
4. Two eval slices: ``pre_flip_cues`` vs ``post_flip_cues`` with separated metrics.
|
|
|
|
Exit code 0 only when all gates pass; non-zero on any failure. Until the harness
|
|
is completed, ``main()`` prints the criteria and exits with code 2 to avoid a
|
|
silent green run::
|
|
|
|
python bench/contradiction_longitudinal.py --fixture bench/fixtures/contradiction_longitudinal.jsonl
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def load_rows(path: Path) -> list[dict]:
    """Load a JSONL fixture file into a list of parsed rows.

    Every non-blank line of *path* is parsed as one JSON document. Blank and
    whitespace-only lines are skipped. The file is decoded as ``utf-8-sig`` so
    that a leading UTF-8 byte-order mark (commonly added by Windows editors)
    is dropped instead of making the first row fail to parse; files without a
    BOM decode exactly as plain UTF-8.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed rows, in file order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open(encoding="utf-8-sig") as f:
        # Strip first so whitespace-only lines are treated as blank.
        stripped = (line.strip() for line in f)
        return [json.loads(text) for text in stripped if text]
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Load the fixture, print the pre-registered criteria, and return 2.

    This is the stub entry point: it parses ``--fixture``, loads the rows via
    ``load_rows``, and prints a JSON summary (row count, fixture path, status
    and the pass criteria) to stdout. No evaluation is performed.

    Args:
        argv: Command-line arguments without the program name. ``None`` lets
            ``argparse`` read ``sys.argv``.

    Returns:
        Always ``2`` so that a stub run is never mistaken for a passing
        (exit code 0) bench run.

    Raises:
        OSError: If the fixture file cannot be opened.
        json.JSONDecodeError: If the fixture contains an invalid JSON line.
    """
    # ``__doc__`` is None when docstrings are stripped (``python -OO``); fall
    # back to no description instead of crashing before arguments are parsed.
    summary = __doc__.split("\n\n")[0] if __doc__ else None
    parser = argparse.ArgumentParser(description=summary)
    parser.add_argument(
        "--fixture",
        type=Path,
        default=Path(__file__).resolve().parent / "fixtures" / "contradiction_longitudinal.jsonl",
        help="JSONL fixture to load (default: %(default)s)",
    )
    args = parser.parse_args(argv)
    rows = load_rows(args.fixture)

    report = {
        "loaded_rows": len(rows),
        "fixture": str(args.fixture),
        "status": "harness_stub",
        "criteria": [
            "B: post-flip cues — pipeline beats flat cosine",
            "A: historical verbatim probes — superseded text still retrievable",
            "No regression: B gain without A collapse",
        ],
    }
    print(json.dumps(report, indent=2))
    # Stub: full eval is intentionally absent so CI never runs heavy retrieval.
    return 2
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|