From 3d51a3d8aefd1c531528fd13dec8846c8c4c1f72 Mon Sep 17 00:00:00 2001 From: pitboss Date: Sun, 17 May 2026 00:05:12 -0500 Subject: [PATCH] [pitboss/grind] deferred session-0001 (20260517T044708Z-e058) --- src/dynamic/lang/c.rs | 63 ++++++++++++- src/dynamic/lang/cpp.rs | 58 +++++++++++- tests/eval_corpus/tabulate.py | 54 +++++++++++ tests/eval_corpus/test_tabulate_regression.py | 91 +++++++++++++++++++ 4 files changed, 264 insertions(+), 2 deletions(-) diff --git a/src/dynamic/lang/c.rs b/src/dynamic/lang/c.rs index 4570acbb..cb3bab74 100644 --- a/src/dynamic/lang/c.rs +++ b/src/dynamic/lang/c.rs @@ -379,11 +379,22 @@ impl LangEmitter for CEmitter { /// Phase 26 — C chain-step harness. /// +/// Splices the C probe shim ([`probe_shim`]) ahead of a minimal driver +/// that reads `NYX_PREV_OUTPUT` and forwards it on stdout. The shim's +/// static functions (`__nyx_probe`, `__nyx_install_crash_guard`, +/// `__nyx_stub_sql_record`, `__nyx_stub_http_record`) become callable +/// from a future sink-rewrite pass without bringing in another +/// translation unit. Unreferenced shim helpers stay quiet under +/// default `cc` flags — `-Wunused-function` is not on the warning +/// baseline so dead helpers do not fail the build. +/// /// Shell-wraps `cc` + run so the compiled binary actually executes after /// the build completes — `ChainStepHarness.command` models a single /// process, so the build-then-run sequence must collapse to one `sh -c`. fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { - let source = "#include \n#include \n\nint main(void) {\n const char *prev = getenv(\"NYX_PREV_OUTPUT\");\n if (prev) fputs(prev, stdout);\n return 0;\n}\n".to_owned(); + let shim = probe_shim(); + let driver = "\nint main(void) {\n const char *prev = getenv(\"NYX_PREV_OUTPUT\");\n if (prev) fputs(prev, stdout);\n return 0;\n}\n"; + let source = format!("{shim}{driver}"); ChainStepHarness { source, filename: "step.c".to_owned(), @@ -853,4 +864,54 @@ mod tests { let mk = h.extra_files.iter().find(|(n, _)| n == "Makefile").expect("Makefile must be staged"); assert!(mk.1.contains("nyx_harness: main.c entry.c")); } + + #[test] + fn chain_step_splices_probe_shim_for_composite_reverify() { + // Phase 26 follow-up: C chain_step now splices the probe shim + // ahead of the driver so a chain step that terminates at a sink + // can drive the `__nyx_probe` channel directly. Asserts the + // shim banner is present and lands before `int main`, that + // `__nyx_install_crash_guard` is reachable from the spliced + // source, that `prev_output` rides through `extra_env`, and + // that the build-then-run command stays in one `sh -c` so the + // sandbox sees a single process. + let step = chain_step(Some(b"prev-output")); + assert!( + step.source.contains("__nyx_probe shim (Phase 06"), + "probe_shim banner missing from chain step source", + ); + assert!( + step.source.contains("static void __nyx_install_crash_guard("), + "install_crash_guard missing from chain step source", + ); + let shim_pos = step + .source + .find("__nyx_probe shim (Phase 06") + .expect("shim banner"); + let main_pos = step.source.find("int main(void)").expect("main fn"); + assert!( + shim_pos < main_pos, + "shim must be spliced before int main: shim={shim_pos} main={main_pos}", + ); + assert_eq!(step.filename, "step.c"); + assert_eq!( + step.command, + vec![ + "sh".to_owned(), + "-c".to_owned(), + "cc step.c -o step && ./step".to_owned(), + ], + ); + assert!( + step.extra_env + .iter() + .any(|(k, v)| k == ChainStepHarness::PREV_OUTPUT_ENV && v == "prev-output"), + "prev_output must be threaded through extra_env, got {:?}", + step.extra_env, + ); + assert!( + step.extra_files.is_empty(), + "C chain step needs no companion build manifest; `cc` is self-sufficient", + ); + } } diff --git a/src/dynamic/lang/cpp.rs b/src/dynamic/lang/cpp.rs index 8e9cc8f6..56051655 100644 --- a/src/dynamic/lang/cpp.rs +++ b/src/dynamic/lang/cpp.rs @@ -332,10 +332,18 @@ impl LangEmitter for CppEmitter { /// Phase 26 — C++ chain-step harness. /// +/// Splices the C++ probe shim ([`probe_shim`]) ahead of a minimal driver +/// that reads `NYX_PREV_OUTPUT` and forwards it on stdout. Same +/// rationale as the C sibling: the inline shim helpers become callable +/// from a future sink-rewrite pass without a separate translation unit; +/// unreferenced inline functions stay quiet under default `c++` flags. +/// /// Shell-wraps `c++` + run so the compiled binary actually executes /// after the build completes (see C-side commentary for the rationale). fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { - let source = "#include \n#include \n\nint main() {\n const char *prev = std::getenv(\"NYX_PREV_OUTPUT\");\n if (prev) std::fputs(prev, stdout);\n return 0;\n}\n".to_owned(); + let shim = probe_shim(); + let driver = "\nint main() {\n const char *prev = std::getenv(\"NYX_PREV_OUTPUT\");\n if (prev) std::fputs(prev, stdout);\n return 0;\n}\n"; + let source = format!("{shim}{driver}"); ChainStepHarness { source, filename: "step.cpp".to_owned(), @@ -725,4 +733,52 @@ mod tests { let mk = h.extra_files.iter().find(|(n, _)| n == "CMakeLists.txt").expect("CMakeLists.txt must be staged"); assert!(mk.1.contains("add_executable(nyx_harness main.cpp)")); } + + #[test] + fn chain_step_splices_probe_shim_for_composite_reverify() { + // Phase 26 follow-up: C++ chain_step now splices the probe shim + // ahead of the driver so a chain step that terminates at a sink + // can drive the `__nyx_probe` channel directly. Asserts the + // shim banner is present and lands before `int main`, that + // `__nyx_install_crash_guard` is reachable, prev_output rides + // through `extra_env`, and build-then-run stays one `sh -c`. + let step = chain_step(Some(b"prev-output")); + assert!( + step.source.contains("__nyx_probe shim (Phase 06"), + "probe_shim banner missing from chain step source", + ); + assert!( + step.source.contains("inline void __nyx_install_crash_guard("), + "install_crash_guard missing from chain step source", + ); + let shim_pos = step + .source + .find("__nyx_probe shim (Phase 06") + .expect("shim banner"); + let main_pos = step.source.find("int main()").expect("main fn"); + assert!( + shim_pos < main_pos, + "shim must be spliced before int main: shim={shim_pos} main={main_pos}", + ); + assert_eq!(step.filename, "step.cpp"); + assert_eq!( + step.command, + vec![ + "sh".to_owned(), + "-c".to_owned(), + "c++ step.cpp -o step && ./step".to_owned(), + ], + ); + assert!( + step.extra_env + .iter() + .any(|(k, v)| k == ChainStepHarness::PREV_OUTPUT_ENV && v == "prev-output"), + "prev_output must be threaded through extra_env, got {:?}", + step.extra_env, + ); + assert!( + step.extra_files.is_empty(), + "C++ chain step needs no companion build manifest; `c++` is self-sufficient", + ); + } } diff --git a/tests/eval_corpus/tabulate.py b/tests/eval_corpus/tabulate.py index 8ad3e2c4..d022337b 100644 --- a/tests/eval_corpus/tabulate.py +++ b/tests/eval_corpus/tabulate.py @@ -317,6 +317,19 @@ def main() -> int: p.add_argument("--ground-truth", default="", help="ground truth JSON") p.add_argument("--inhouse", action="store_true") p.add_argument("--append", required=True, help="results accumulator JSON") + p.add_argument( + "--manual-triage", + default="", + help=( + "path to a manual-triage JSON file (list of " + "{path, line, cap, vuln: bool}). Confirmed findings matching a " + "`vuln: false` entry are stamped with `wrong: true` before " + "tabulation so the per-cell False-Confirmed budget becomes " + "non-vacuous without depending on the host's `nyx verify-feedback` " + "log. Matching uses LINE_TOLERANCE (=5) — line == 0 in the triage " + "entry matches any line." + ), + ) p.add_argument( "--budget", default="", @@ -332,6 +345,47 @@ def main() -> int: scan_data = load_json(args.scan) findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", []) + # ── Manual-triage stamping (Phase 31 follow-up) ─────────────────────── + # Cross-reference Confirmed rows against a manual-triage file before + # tabulation. Each `vuln: false` entry whose `(path, cap)` matches a + # Confirmed finding (with LINE_TOLERANCE, or any line when triage + # entry's `line == 0`) stamps `wrong: true` on the finding's + # `dynamic_verdict`, which the existing wrong_confirmed counter picks + # up below. Decouples the False-Confirmed budget from the host-local + # `nyx verify-feedback` log so CI on a fresh eval corpus can still + # gate the headline target. + if args.manual_triage and Path(args.manual_triage).exists(): + triage = load_json(args.manual_triage) + not_vuln: list[dict] = [] + for entry in triage if isinstance(triage, list) else []: + if entry.get("vuln") is False: + not_vuln.append({ + "path": entry.get("path", ""), + "line": entry.get("line", 0), + "cap": entry.get("cap", ""), + }) + used: set[int] = set() + for f in findings: + ev = f.get("evidence") or {} + dv = ev.get("dynamic_verdict") or {} + if dv.get("status") != "Confirmed": + continue + f_path = f.get("path", "") + f_line = f.get("line", 0) + f_cap = cap_of(f) + for idx, entry in enumerate(not_vuln): + if idx in used: + continue + if (entry["path"] == f_path + and entry["cap"] == f_cap + and (entry["line"] == 0 + or abs(entry["line"] - f_line) <= LINE_TOLERANCE)): + used.add(idx) + dv["wrong"] = True + ev["dynamic_verdict"] = dv + f["evidence"] = ev + break + # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed, # wrong_confirmed, stable_replays, total}} cells: dict[tuple[str, str], dict] = defaultdict( diff --git a/tests/eval_corpus/test_tabulate_regression.py b/tests/eval_corpus/test_tabulate_regression.py index cdad3ba6..53d5541d 100644 --- a/tests/eval_corpus/test_tabulate_regression.py +++ b/tests/eval_corpus/test_tabulate_regression.py @@ -199,6 +199,95 @@ def test_diff_passes_on_improvement(tmp: Path) -> None: assert "no regressions" in proc.stdout, proc.stdout +def test_manual_triage_stamps_wrong_confirmed(tmp: Path) -> None: + # Phase 31 follow-up: --manual-triage should cross-reference Confirmed + # findings against a list of {path, line, cap, vuln: false} entries + # and stamp `wrong: true` so the per-cell wrong_confirmed counter + # becomes non-vacuous without the host's verify-feedback log. + # + # Confirmed at line 10 matches the triage's vuln:false at line 12 + # (within LINE_TOLERANCE=5). Confirmed at line 100 does not match + # any triage entry, so wrong_confirmed stays at 1 / 2 Confirmed. + scan = tmp / "scan.json" + write_json( + scan, + { + "findings": [ + python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"), + python_finding(SINK_BIT_SQL, "app.py", 100, "Confirmed"), + ] + }, + ) + triage = tmp / "triage.json" + write_json( + triage, + [ + {"path": "app.py", "line": 12, "cap": "sqli", "vuln": False}, + ], + ) + append = tmp / "results.json" + write_json(append, []) + proc = run_tabulate( + "--label", "triage-test", + "--scan", str(scan), + "--inhouse", + "--append", str(append), + "--manual-triage", str(triage), + ) + assert proc.returncode == 0, ( + f"manual-triage run must succeed without budget, got {proc.returncode}\n" + f"stdout: {proc.stdout}\nstderr: {proc.stderr}" + ) + results = json.loads(append.read_text()) + cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]} + sqli_py = cells.get(("sqli", "python")) + assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}" + assert sqli_py["confirmed"] == 2, sqli_py + assert sqli_py["wrong_confirmed"] == 1, ( + "exactly one Confirmed finding must be stamped wrong via the triage match; " + f"got {sqli_py}" + ) + + +def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None: + # Triage entries with `vuln: true` are ground-truth-positive markers, + # not False-Confirmed evidence. --manual-triage must leave them alone + # so a real Confirmed-on-vuln-true row does not get downgraded. + scan = tmp / "scan.json" + write_json( + scan, + { + "findings": [ + python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"), + ] + }, + ) + triage = tmp / "triage.json" + write_json( + triage, + [ + {"path": "app.py", "line": 10, "cap": "sqli", "vuln": True}, + ], + ) + append = tmp / "results.json" + write_json(append, []) + proc = run_tabulate( + "--label", "triage-true-test", + "--scan", str(scan), + "--inhouse", + "--append", str(append), + "--manual-triage", str(triage), + ) + assert proc.returncode == 0 + results = json.loads(append.read_text()) + cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]} + sqli_py = cells[("sqli", "python")] + assert sqli_py["confirmed"] == 1 + assert sqli_py["wrong_confirmed"] == 0, ( + f"vuln:true triage rows must not stamp wrong; got {sqli_py}" + ) + + def test_budget_malformed_exits_3(tmp: Path) -> None: bad = tmp / "bad.toml" bad.write_text("[default]\nunsupported_rate = not_a_number\n") @@ -226,6 +315,8 @@ def main() -> int: test_budget_fails_when_unsupported_exceeds, test_diff_fails_on_regression, test_diff_passes_on_improvement, + test_manual_triage_stamps_wrong_confirmed, + test_manual_triage_ignores_vuln_true_entries, test_budget_malformed_exits_3, ): sub = tmp / fn.__name__