mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-15 20:05:13 +02:00
[pitboss/grind] deferred session-0001 (20260517T044708Z-e058)
This commit is contained in:
parent
6189c4a4c5
commit
3d51a3d8ae
4 changed files with 264 additions and 2 deletions
|
|
@ -317,6 +317,19 @@ def main() -> int:
|
|||
p.add_argument("--ground-truth", default="", help="ground truth JSON")
|
||||
p.add_argument("--inhouse", action="store_true")
|
||||
p.add_argument("--append", required=True, help="results accumulator JSON")
|
||||
p.add_argument(
|
||||
"--manual-triage",
|
||||
default="",
|
||||
help=(
|
||||
"path to a manual-triage JSON file (list of "
|
||||
"{path, line, cap, vuln: bool}). Confirmed findings matching a "
|
||||
"`vuln: false` entry are stamped with `wrong: true` before "
|
||||
"tabulation so the per-cell False-Confirmed budget becomes "
|
||||
"non-vacuous without depending on the host's `nyx verify-feedback` "
|
||||
"log. Matching uses LINE_TOLERANCE (=5) — line == 0 in the triage "
|
||||
"entry matches any line."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
|
|
@ -332,6 +345,47 @@ def main() -> int:
|
|||
scan_data = load_json(args.scan)
|
||||
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
|
||||
|
||||
# ── Manual-triage stamping (Phase 31 follow-up) ───────────────────────
|
||||
# Cross-reference Confirmed rows against a manual-triage file before
|
||||
# tabulation. Each `vuln: false` entry whose `(path, cap)` matches a
|
||||
# Confirmed finding (with LINE_TOLERANCE, or any line when triage
|
||||
# entry's `line == 0`) stamps `wrong: true` on the finding's
|
||||
# `dynamic_verdict`, which the existing wrong_confirmed counter picks
|
||||
# up below. Decouples the False-Confirmed budget from the host-local
|
||||
# `nyx verify-feedback` log so CI on a fresh eval corpus can still
|
||||
# gate the headline target.
|
||||
if args.manual_triage and Path(args.manual_triage).exists():
|
||||
triage = load_json(args.manual_triage)
|
||||
not_vuln: list[dict] = []
|
||||
for entry in triage if isinstance(triage, list) else []:
|
||||
if entry.get("vuln") is False:
|
||||
not_vuln.append({
|
||||
"path": entry.get("path", ""),
|
||||
"line": entry.get("line", 0),
|
||||
"cap": entry.get("cap", ""),
|
||||
})
|
||||
used: set[int] = set()
|
||||
for f in findings:
|
||||
ev = f.get("evidence") or {}
|
||||
dv = ev.get("dynamic_verdict") or {}
|
||||
if dv.get("status") != "Confirmed":
|
||||
continue
|
||||
f_path = f.get("path", "")
|
||||
f_line = f.get("line", 0)
|
||||
f_cap = cap_of(f)
|
||||
for idx, entry in enumerate(not_vuln):
|
||||
if idx in used:
|
||||
continue
|
||||
if (entry["path"] == f_path
|
||||
and entry["cap"] == f_cap
|
||||
and (entry["line"] == 0
|
||||
or abs(entry["line"] - f_line) <= LINE_TOLERANCE)):
|
||||
used.add(idx)
|
||||
dv["wrong"] = True
|
||||
ev["dynamic_verdict"] = dv
|
||||
f["evidence"] = ev
|
||||
break
|
||||
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
|
||||
# wrong_confirmed, stable_replays, total}}
|
||||
cells: dict[tuple[str, str], dict] = defaultdict(
|
||||
|
|
|
|||
|
|
@ -199,6 +199,95 @@ def test_diff_passes_on_improvement(tmp: Path) -> None:
|
|||
assert "no regressions" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_manual_triage_stamps_wrong_confirmed(tmp: Path) -> None:
    # Phase 31 follow-up. With --manual-triage, Confirmed findings are
    # cross-referenced against {path, line, cap, vuln: false} rows and the
    # matching ones are stamped `wrong: true`, so the per-cell
    # wrong_confirmed counter is meaningful even without the host's
    # verify-feedback log.
    #
    # Fixture: two Confirmed SQL findings in app.py, at lines 10 and 100.
    # The single vuln:false triage row sits at line 12, which is within
    # LINE_TOLERANCE (=5) of the first finding only. Expected tally:
    # confirmed == 2, wrong_confirmed == 1.
    scan_path = tmp / "scan.json"
    triage_path = tmp / "triage.json"
    results_path = tmp / "results.json"

    confirmed_findings = [
        python_finding(SINK_BIT_SQL, "app.py", line_no, "Confirmed")
        for line_no in (10, 100)
    ]
    write_json(scan_path, {"findings": confirmed_findings})
    write_json(
        triage_path,
        [{"path": "app.py", "line": 12, "cap": "sqli", "vuln": False}],
    )
    # The accumulator starts empty, so the run under test is the last
    # (and only) record after tabulation.
    write_json(results_path, [])

    cli_args = [
        "--label", "triage-test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--manual-triage", str(triage_path),
    ]
    proc = run_tabulate(*cli_args)
    assert proc.returncode == 0, (
        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )

    latest_run = json.loads(results_path.read_text())[-1]
    by_cell = {
        (cell["cap"], cell["lang"]): cell for cell in latest_run["cells"]
    }
    sqli_py = by_cell.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(by_cell)}"
    assert sqli_py["confirmed"] == 2, sqli_py
    assert sqli_py["wrong_confirmed"] == 1, (
        "exactly one Confirmed finding must be stamped wrong via the triage match; "
        f"got {sqli_py}"
    )
|
||||
|
||||
|
||||
def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None:
    # Triage entries with `vuln: true` are ground-truth-positive markers,
    # not False-Confirmed evidence. --manual-triage must leave them alone
    # so a real Confirmed-on-vuln-true row does not get downgraded.
    #
    # Fixture: one Confirmed SQL finding at app.py:10 and one triage row
    # at the exact same path/line/cap, but with `vuln: true`. Expected
    # tally: confirmed == 1, wrong_confirmed == 0.
    scan = tmp / "scan.json"
    write_json(
        scan,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"),
            ]
        },
    )
    triage = tmp / "triage.json"
    write_json(
        triage,
        [
            {"path": "app.py", "line": 10, "cap": "sqli", "vuln": True},
        ],
    )
    append = tmp / "results.json"
    write_json(append, [])
    proc = run_tabulate(
        "--label", "triage-true-test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--manual-triage", str(triage),
    )
    # Surface the subprocess output on failure; a bare returncode assert
    # gives no clue why the tabulator exited non-zero.
    assert proc.returncode == 0, (
        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    results = json.loads(append.read_text())
    cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]}
    # Use .get() plus an assert so a missing cell is reported as a test
    # failure with the available cells listed, not as a raw KeyError.
    sqli_py = cells.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
    assert sqli_py["confirmed"] == 1, sqli_py
    assert sqli_py["wrong_confirmed"] == 0, (
        f"vuln:true triage rows must not stamp wrong; got {sqli_py}"
    )
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
|
||||
bad = tmp / "bad.toml"
|
||||
bad.write_text("[default]\nunsupported_rate = not_a_number\n")
|
||||
|
|
@ -226,6 +315,8 @@ def main() -> int:
|
|||
test_budget_fails_when_unsupported_exceeds,
|
||||
test_diff_fails_on_regression,
|
||||
test_diff_passes_on_improvement,
|
||||
test_manual_triage_stamps_wrong_confirmed,
|
||||
test_manual_triage_ignores_vuln_true_entries,
|
||||
test_budget_malformed_exits_3,
|
||||
):
|
||||
sub = tmp / fn.__name__
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue