introduce ground-truth converters for OWASP and SARD datasets

This commit is contained in:
elipeter 2026-05-12 16:16:26 -04:00
parent e62fddb82a
commit 5909fa8c5d
14 changed files with 16779 additions and 369 deletions

View file

@ -104,6 +104,7 @@ mod parity_tests {
},
project_root: None,
db_path: None,
verify_all_confidence: false,
}
}
@ -116,6 +117,7 @@ mod parity_tests {
},
project_root: None,
db_path: None,
verify_all_confidence: false,
}
}

View file

@ -58,17 +58,15 @@ mod escape_tests {
backend: SandboxBackend::Docker,
env_passthrough: vec![],
output_limit: 65536,
oob_listener: None,
}
}
/// Minimal no-op payload (escape scripts ignore NYX_PAYLOAD).
fn noop_payload() -> nyx_scanner::dynamic::corpus::Payload {
nyx_scanner::dynamic::corpus::Payload {
bytes: b"",
label: "escape-noop",
oracle: nyx_scanner::dynamic::corpus::Oracle::ExitStatus(1),
is_benign: true,
}
/// Minimal no-op payload bytes (escape scripts ignore NYX_PAYLOAD).
/// `sandbox::run` takes `&[u8]` directly; the CuratedPayload struct lives
/// one level up in the runner.
fn noop_payload() -> &'static [u8] {
b""
}
/// Copy a directory tree into a destination (creating it if needed).

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Convert OWASP Benchmark v1.2 expectedresults-*.csv into nyx ground-truth JSON.
Source: `expectedresults-1.2beta.csv` shipped in the BenchmarkJava repo.
Output: list of `{path, line, cap, vuln}` records, where:
- `path` is the absolute path to the BenchmarkTest*.java under --corpus-dir.
- `line` is 0 (CSV does not pin a line; tabulate treats a ground-truth line of 0 as matching any finding line, so LINE_TOLERANCE is not applied).
- `cap` is a nyx cap label mapped from the OWASP category column.
- `vuln` is True for `real vulnerability == true`, else False.
Usage:
tests/eval_corpus/owasp_gt_convert.py \\
--corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \\
--output tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
"""
import argparse
import csv
import json
import sys
from pathlib import Path
# OWASP Benchmark category (second CSV column) -> nyx cap label.
# Rows whose category is absent from this table are counted as skipped.
OWASP_TO_NYX_CAP = {
    "cmdi": "cmdi",
    "crypto": "crypto",
    "hash": "crypto",
    "ldapi": "ldap_injection",
    "pathtraver": "path_traversal",
    "securecookie": "auth",
    "sqli": "sqli",
    "trustbound": "xss",
    "weakrand": "crypto",
    "xpathi": "xpath_injection",
    "xss": "xss",
}


def main() -> int:
    """Convert the OWASP expected-results CSV into a ground-truth JSON file.

    Reads ``--corpus-dir``, ``--output`` and the optional ``--csv`` override
    from the command line. The first CSV row is treated as a header and
    dropped. Each remaining row with at least three columns
    (test name, category, real-vulnerability flag) becomes one
    ``{"path", "line", "cap", "vuln"}`` record, provided the category is in
    ``OWASP_TO_NYX_CAP`` and ``<name>.java`` exists in the testcode directory;
    otherwise the row is counted as skipped.

    Returns:
        0 on success, 1 when the CSV file or the Java testcode directory
        cannot be found (the reason is printed to stderr).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True,
                   help="Path to BenchmarkJava clone root.")
    p.add_argument("--output", required=True,
                   help="Output ground-truth JSON path.")
    p.add_argument("--csv", default="",
                   help="Override CSV path (default: <corpus-dir>/expectedresults-1.2beta.csv).")
    args = p.parse_args()

    corpus = Path(args.corpus_dir).expanduser().resolve()
    # Expand "~" in the override as well, matching --corpus-dir and --output.
    if args.csv:
        csv_path = Path(args.csv).expanduser()
    else:
        csv_path = corpus / "expectedresults-1.2beta.csv"
    # is_file() rather than exists(): a directory here would only fail later in open().
    if not csv_path.is_file():
        print(f"error: csv not found: {csv_path}", file=sys.stderr)
        return 1

    java_root = corpus / "src" / "main" / "java" / "org" / "owasp" / "benchmark" / "testcode"
    if not java_root.is_dir():
        print(f"error: java testcode dir not found: {java_root}", file=sys.stderr)
        return 1

    records: list[dict] = []
    skipped = 0
    # newline="" lets the csv module do its own line-ending handling.
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row
        for row in reader:
            if len(row) < 3:
                # Blank or truncated line: ignored, not counted as skipped.
                continue
            name, category, real_vuln = row[0].strip(), row[1].strip(), row[2].strip().lower()
            cap = OWASP_TO_NYX_CAP.get(category)
            if cap is None:
                skipped += 1
                continue
            java_file = java_root / f"{name}.java"
            if not java_file.exists():
                skipped += 1
                continue
            records.append({
                "path": str(java_file),
                "line": 0,  # the CSV carries no line information
                "cap": cap,
                "vuln": real_vuln == "true",
            })

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f"  vulns: {vuln_count}")
    print(f"  non-vuln: {len(records) - vuln_count}")
    print(f"  skipped: {skipped}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -147,7 +147,23 @@ fi
# ── Emit summary table ────────────────────────────────────────────────────────
info ""
info "Results written to: $RESULTS_JSON"
python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON" \
|| { info "report.py not available; raw results at $RESULTS_JSON"; exit 0; }
[[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
info "report.py not available; raw results at $RESULTS_JSON"
exit 0
fi
set +e
python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON"
REPORT_RC=$?
set -e
# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1).
if [[ $REPORT_RC -eq 2 ]]; then
exit 2
elif [[ $REPORT_RC -ne 0 ]]; then
info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
exit 1
fi
exit 0

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Convert NIST SARD manifest XML into nyx ground-truth JSON.
SARD ships per-test-case `manifest.xml` files alongside source. Each
`<testcase>` lists one or more `<file path="">` entries with optional
`<flaw line="" name="CWE-XXX_…"/>` children.
Output schema (consumed by tabulate.py):
list of {"path", "line", "cap", "vuln"} records.
Usage:
tests/eval_corpus/sard_gt_convert.py \\
--corpus-dir ~/.cache/nyx/eval_corpus/nist_sard \\
--output tests/eval_corpus/ground_truth/nist_sard.json
"""
import argparse
import json
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
CWE_TO_NYX_CAP = {
"20": "validation",
"22": "path_traversal",
"78": "cmdi",
"79": "xss",
"89": "sqli",
"90": "ldap_injection",
"91": "xpath_injection",
"94": "cmdi",
"113": "header_injection",
"117": "header_injection",
"190": "memory",
"200": "data_exfil",
"287": "auth",
"295": "crypto",
"311": "crypto",
"327": "crypto",
"328": "crypto",
"330": "crypto",
"352": "auth",
"434": "path_traversal",
"476": "memory",
"502": "deserialize",
"601": "redirect",
"611": "xxe",
"643": "xpath_injection",
"798": "crypto",
"918": "ssrf",
}
CWE_RE = re.compile(r"CWE[-_](\d+)", re.IGNORECASE)
def cap_for_flaw(name: str) -> str | None:
m = CWE_RE.search(name or "")
if not m:
return None
return CWE_TO_NYX_CAP.get(m.group(1))
def main() -> int:
    """Walk ``--corpus-dir`` for ``manifest.xml`` files and write ground-truth JSON.

    For every ``<file path="...">`` under a ``<testcase>``:

    * an empty ``path`` is ignored;
    * a path that does not exist relative to the manifest's directory is
      counted as a skipped file;
    * a file without ``<flaw>``/``<mixed>`` descendants yields one non-vuln
      record with cap ``"other"`` and line 0;
    * otherwise each flaw yields one vuln record, unless ``cap_for_flaw``
      returns ``None`` for its name, in which case it is counted as a
      skipped cap.

    Manifests that cannot be read or parsed are reported on stderr and
    skipped.

    Returns:
        0 on success, 1 when ``--corpus-dir`` is not a directory.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True)
    p.add_argument("--output", required=True)
    args = p.parse_args()

    root = Path(args.corpus_dir).expanduser().resolve()
    if not root.is_dir():
        print(f"error: corpus dir not found: {root}", file=sys.stderr)
        return 1

    records: list[dict] = []
    skipped_files = 0
    skipped_caps = 0
    # rglob gives no ordering guarantee; sort so repeated runs over the same
    # corpus emit records in the same order.
    for manifest in sorted(root.rglob("manifest.xml")):
        try:
            tree = ET.parse(manifest)
        except (ET.ParseError, OSError) as e:
            # One unreadable or malformed manifest must not abort the whole run.
            print(f"warn: parse failed {manifest}: {e}", file=sys.stderr)
            continue
        for tc in tree.iter("testcase"):
            for fnode in tc.iter("file"):
                rel = fnode.get("path") or ""
                if not rel:
                    continue
                # File paths are resolved relative to the manifest's directory.
                abs_path = (manifest.parent / rel).resolve()
                if not abs_path.exists():
                    skipped_files += 1
                    continue
                flaws = list(fnode.iter("flaw")) + list(fnode.iter("mixed"))
                if not flaws:
                    # No flaw annotation: record the file as a known-clean entry.
                    records.append({
                        "path": str(abs_path),
                        "line": 0,
                        "cap": "other",
                        "vuln": False,
                    })
                    continue
                for flaw in flaws:
                    cap = cap_for_flaw(flaw.get("name", ""))
                    if cap is None:
                        skipped_caps += 1
                        continue
                    try:
                        line = int(flaw.get("line", "0") or 0)
                    except ValueError:
                        # Non-numeric line attribute: fall back to "no line".
                        line = 0
                    records.append({
                        "path": str(abs_path),
                        "line": line,
                        "cap": cap,
                        "vuln": True,
                    })

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f"  vulns: {vuln_count}")
    print(f"  non-vuln: {len(records) - vuln_count}")
    print(f"  skipped (file): {skipped_files}")
    print(f"  skipped (cap): {skipped_caps}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -19,25 +19,46 @@ from pathlib import Path
LINE_TOLERANCE = 5
_CAP_PREFIX_TABLE = [
("taint.path_traversal", "path_traversal"),
("taint.sql", "sqli"),
("taint.xss", "xss"),
("taint.ssrf", "ssrf"),
("taint.cmdi", "cmdi"),
("taint.deserialize", "deserialize"),
("taint.redirect", "redirect"),
("taint.xxe", "xxe"),
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
_CAP_BIT_TABLE = [
(1 << 5, "path_traversal"), # FILE_IO
(1 << 6, "fmt_string"),
(1 << 7, "sqli"), # SQL_QUERY
(1 << 8, "deserialize"),
(1 << 9, "ssrf"),
(1 << 10, "cmdi"), # CODE_EXEC
(1 << 11, "crypto"),
(1 << 12, "unauthorized_id"),
(1 << 13, "data_exfil"),
(1 << 14, "ldap_injection"),
(1 << 15, "xpath_injection"),
(1 << 16, "header_injection"),
(1 << 17, "redirect"), # OPEN_REDIRECT
(1 << 18, "xss"), # SSTI (template_injection); also covers XSS sinks
(1 << 19, "xxe"),
(1 << 20, "prototype_pollution"),
]
# Substring → cap lookup for rule IDs. Order matters: most specific first.
_CAP_RULE_TABLE = [
("path_traversal", "path_traversal"),
("sqli", "sqli"),
("xss", "xss"),
("ssrf", "ssrf"),
("cmdi", "cmdi"),
("deserialize", "deserialize"),
("redirect", "redirect"),
("xxe", "xxe"),
("auth", "auth"),
("taint", "taint"),
("sql", "sqli"),
("xss", "xss"),
("ssrf", "ssrf"),
("cmdi", "cmdi"),
("cmd_exec", "cmdi"),
("code_exec", "cmdi"),
("deser", "deserialize"),
("unserialize", "deserialize"),
("redirect", "redirect"),
("xxe", "xxe"),
("template", "xss"),
("auth", "auth"),
("memory", "memory"),
("crypto", "crypto"),
("data-exfil", "data_exfil"),
("data_exfil", "data_exfil"),
("header", "header_injection"),
]
@ -47,9 +68,18 @@ def load_json(path: str) -> object:
def cap_of(finding: dict) -> str:
rule = finding.get("rule_id", "").lower()
for prefix, cap in _CAP_PREFIX_TABLE:
if rule.startswith(prefix):
# 1. Prefer evidence.sink_caps bitmask — the engine's own classification.
ev = finding.get("evidence", {}) or {}
sink_caps = ev.get("sink_caps")
if isinstance(sink_caps, int) and sink_caps:
for bit, name in _CAP_BIT_TABLE:
if sink_caps & bit:
return name
# 2. Fall back to rule id substring (e.g. py.cmdi.os_system, java.deser.readobject).
rid = (finding.get("id") or "").lower()
head = rid.split(" ", 1)[0]
for needle, cap in _CAP_RULE_TABLE:
if needle in head:
return cap
return "other"
@ -122,8 +152,9 @@ def main() -> int:
for idx, gt_entry in enumerate(gt_true):
if (gt_entry["path"] == f_path
and gt_entry["cap"] == f_cap
and abs(gt_entry["line"] - f_line) <= LINE_TOLERANCE
and idx not in matched_gt):
and idx not in matched_gt
and (gt_entry["line"] == 0
or abs(gt_entry["line"] - f_line) <= LINE_TOLERANCE)):
matched_idx = idx
break
if matched_idx is not None: