feat(ssa): optimize branch condition handling via constant folding, enhance precision for taint analysis, and expand OWASP Benchmark support

This commit is contained in:
elipeter 2026-06-02 13:41:45 -05:00
parent ec76c9e08f
commit 9c99f6c6a9
22 changed files with 1020 additions and 17 deletions

View file

@ -55,6 +55,19 @@ _CAP_BIT_TABLE = [
(1 << 20, "prototype_pollution"),
]
# Static lens (see --static): SHELL_ESCAPE (1<<2) is the command-injection sink
# cap for *every* language (`grep SHELL_ESCAPE src/labels/` — all Sink uses are
# command-exec; CODE_EXEC=1<<10 is the eval/code-exec variant, also cmdi). In a
# normal `nyx scan` (no dynamic confirmation) a Java cmdi finding carries only
# SHELL_ESCAPE; the SHELL_ESCAPE→CODE_EXEC remap that buckets it as cmdi is gated
# on VerifyStatus::Confirmed (src/commands/scan.rs), so with 0 confirmations the
# default table leaves these in "other" and the cmdi cell reads 0/0/N. The
# static lens appends SHELL_ESCAPE→cmdi at the LOWEST priority (after every other
# bit) so a SHELL_ESCAPE-only finding buckets as cmdi while a finding that also
# carries a higher-priority sink bit (e.g. FILE_IO) keeps its existing bucket.
# Opt-in via --static so the default confirmed-recall bucketing is byte-identical.
_CAP_BIT_TABLE_STATIC = [*_CAP_BIT_TABLE, (1 << 2, "cmdi")]  # SHELL_ESCAPE, lowest priority
# Substring → cap lookup for rule IDs. Order matters: most specific first.
_CAP_RULE_TABLE = [
("path_traversal", "path_traversal"),
@ -83,12 +96,13 @@ def load_json(path: str) -> object:
return json.load(f)
def cap_of(finding: dict) -> str:
def cap_of(finding: dict, static_lens: bool = False) -> str:
# 1. Prefer evidence.sink_caps bitmask — the engine's own classification.
ev = finding.get("evidence", {}) or {}
sink_caps = ev.get("sink_caps")
if isinstance(sink_caps, int) and sink_caps:
for bit, name in _CAP_BIT_TABLE:
table = _CAP_BIT_TABLE_STATIC if static_lens else _CAP_BIT_TABLE
for bit, name in table:
if sink_caps & bit:
return name
# 2. Fall back to rule id substring (e.g. py.cmdi.os_system, java.deser.readobject).
@ -383,6 +397,20 @@ def main() -> int:
default="",
help="path to a previous results JSON; fail on monotonic-improvement regression",
)
p.add_argument(
"--static",
action="store_true",
help=(
"static lens: bucket SHELL_ESCAPE (1<<2) findings as cmdi even when "
"they are unconfirmed. Java (and other) command-exec sinks carry "
"SHELL_ESCAPE and only get remapped to CODE_EXEC on dynamic Confirm; "
"without this flag, an env with 0 confirmations reads the cmdi cell "
"as 0/0/N regardless of static quality. SHELL_ESCAPE is the "
"command-injection sink cap for every language, so this is sound "
"globally; it is opt-in only so the default confirmed-recall "
"bucketing stays byte-identical."
),
)
args = p.parse_args()
lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()}
@ -418,7 +446,7 @@ def main() -> int:
continue
f_path = f.get("path", "")
f_line = f.get("line", 0)
f_cap = cap_of(f)
f_cap = cap_of(f, static_lens=args.static)
for idx, entry in enumerate(not_vuln):
if idx in used:
continue
@ -455,7 +483,7 @@ def main() -> int:
)
for f in findings:
cap = cap_of(f)
cap = cap_of(f, static_lens=args.static)
lang = lang_of(f)
key = (cap, lang)
ev = f.get("evidence", {}) or {}
@ -501,7 +529,7 @@ def main() -> int:
for f in findings:
f_path = f.get("path", "")
f_line = f.get("line", 0)
f_cap = cap_of(f)
f_cap = cap_of(f, static_lens=args.static)
cap = f_cap
lang = lang_of(f)
cell_key = (cap, lang)

View file

@ -46,6 +46,8 @@ def write_json(path: Path, data: object) -> None:
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
SINK_BIT_SQL = 1 << 7 # SQL_QUERY
SINK_BIT_CMDI = 1 << 10 # CODE_EXEC
SINK_BIT_SHELL = 1 << 2 # SHELL_ESCAPE (Java/other command-exec sink)
SINK_BIT_FILE = 1 << 5 # FILE_IO (path_traversal)
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
@ -353,6 +355,91 @@ def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
assert all(lang != "javascript" for _cap, lang in cells), cells
def test_static_lens_buckets_shell_escape_as_cmdi(tmp: Path) -> None:
    """A SHELL_ESCAPE-only finding is "other" by default and "cmdi" with --static.

    The same scan and ground truth are tabulated twice: once with the default
    lens and once with ``--static``. Only the bucket of the single finding is
    expected to differ between the two runs.
    """
    cmdi_java = ("cmdi", "java")
    other_java = ("other", "java")

    # One vulnerable cmdi ground-truth entry for the Java file.
    gt_path = tmp / "gt.json"
    write_json(
        gt_path,
        [{"path": "testcode/Cmd.java", "line": 0, "cap": "cmdi", "vuln": True}],
    )

    # The id "taint-unsanitised-flow" contains no cap substring, so only the
    # sink_caps bitmask (SHELL_ESCAPE alone) can classify this finding.
    finding = {
        "path": "/x/testcode/Cmd.java",
        "line": 10,
        "col": 0,
        "id": "taint-unsanitised-flow",
        "evidence": {
            "sink_caps": SINK_BIT_SHELL,
            "dynamic_verdict": {"status": "NotConfirmed"},
        },
    }
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": [finding]})

    def tabulate_cells(out_name: str, *lens_flags: str) -> dict:
        # Start from an empty results file, run the tabulator, and index the
        # cells of the last appended record by (cap, lang).
        out_path = tmp / out_name
        write_json(out_path, [])
        result = run_tabulate(
            "--label", "owasp",
            "--scan", str(scan_path),
            "--ground-truth", str(gt_path),
            *lens_flags,
            "--append", str(out_path),
        )
        assert result.returncode == 0, result.stdout + result.stderr
        latest = json.loads(out_path.read_text())[-1]
        return {(cell["cap"], cell["lang"]): cell for cell in latest["cells"]}

    # Default lens: the ground-truth positive is a pure FN in the cmdi cell and
    # the finding itself sits in "other".
    cells = tabulate_cells("default.json")
    assert cmdi_java in cells, cells
    assert cells[cmdi_java]["tp"] == 0, cells
    assert cells[cmdi_java]["fn"] == 1, cells[cmdi_java]
    assert other_java in cells, f"SHELL_ESCAPE must bucket as other by default: {list(cells)}"

    # Static lens: the same finding now matches the cmdi ground truth.
    cells = tabulate_cells("static.json", "--static")
    cmdi_cell = cells[cmdi_java]
    assert cmdi_cell["tp"] == 1 and cmdi_cell["fn"] == 0, cmdi_cell
    assert other_java not in cells, f"static lens must reclaim the other-bucketed finding: {list(cells)}"
def test_static_lens_preserves_higher_priority_bits(tmp: Path) -> None:
    """A FILE_IO|SHELL_ESCAPE finding stays in path_traversal with or without --static."""
    scan_path = tmp / "scan.json"
    mixed_bits = SINK_BIT_FILE | SINK_BIT_SHELL
    write_json(
        scan_path,
        {"findings": [python_finding(mixed_bits, "B.java", 10, "NotConfirmed")]},
    )

    # Run once per lens; each run appends to its own results file
    # (out0.json for the default lens, out1.json for --static).
    for run_idx, flag in enumerate(([], ["--static"])):
        out_path = tmp / f"out{run_idx}.json"
        write_json(out_path, [])
        result = run_tabulate(
            "--label", "x",
            "--scan", str(scan_path),
            "--inhouse",
            "--append", str(out_path),
            *flag,
        )
        assert result.returncode == 0, result.stdout + result.stderr

        latest = json.loads(out_path.read_text())[-1]
        caps = set()
        for cell in latest["cells"]:
            caps.add(cell["cap"])
        assert caps == {"path_traversal"}, f"flag={flag}: {caps}"
def test_budget_malformed_exits_3(tmp: Path) -> None:
bad = tmp / "bad.toml"
bad.write_text("[default]\nunsupported_rate = not_a_number\n")
@ -661,6 +748,8 @@ def main() -> int:
test_manual_triage_stamps_wrong_confirmed,
test_manual_triage_ignores_vuln_true_entries,
test_lang_filter_scopes_findings_and_gt,
test_static_lens_buckets_shell_escape_as_cmdi,
test_static_lens_preserves_higher_priority_bits,
test_budget_malformed_exits_3,
test_relative_gt_path_suffix_matches_absolute_finding,
test_unmatched_gt_positive_lands_in_lang_cell,

View file

@ -0,0 +1,19 @@
{
"description": "Dead-branch constant condition (OWASP Benchmark cmdi non-vulnerable shape). `(7*42) - num > 200` with num=86 is 208 > 200 — always true — so `bar` is the constant string and the `else bar = param` arm is statically dead. The constant-branch fold (src/ssa/const_prop.rs::fold_constant_branches) evaluates the captured CondArith tree, prunes the dead edge, and drops the tainted phi operand AND neutralises the dead block so copy-prop cannot alias `bar`<->`param`. Result: `r.exec(cmd + bar)` carries no taint. Asserts NO taint finding fires (strict_unexpected promotes any taint-unsanitised-flow to a hard failure).",
"tags": [
"taint",
"cmdi",
"servlet",
"runtime",
"dead-branch",
"const-fold",
"precision"
],
"modes": [
"full"
],
"strict_unexpected": [
"taint-unsanitised-flow"
],
"expected": []
}

View file

@ -0,0 +1,27 @@
import java.io.*;
import javax.servlet.http.*;
// Dead-branch constant condition (OWASP Benchmark cmdi non-vulnerable shape).
// The guard `(7*42) - num > 200` is `294 - 86 = 208 > 200`, i.e. ALWAYS true,
// so `bar` is provably the constant string and the tainted `else` arm
// (`bar = param`) is unreachable. The constant-branch fold
// (`fold_constant_branches`) must prune the dead edge and drop the tainted
// phi operand so `r.exec(cmd + bar)` carries no attacker data — NO finding.
public class DeadBranchConstSafe extends HttpServlet {
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException {
        String param = request.getHeader("vector"); // taint source: request header value
        String bar;
        int num = 86;
        if ((7 * 42) - num > 200) { // 294 - 86 = 208 > 200: the guard is constant-true
            bar = "This_should_always_happen"; // live arm: bar is a constant string
        } else {
            bar = param; // dead arm: the only assignment that would taint bar
        }
        String cmd = "echo ";
        Runtime r = Runtime.getRuntime();
        Process p = r.exec(cmd + bar); // sink: bar can only hold the constant here
    }
}

View file

@ -0,0 +1,32 @@
{
"description": "Dead-branch constant condition with VULNERABLE polarity. `(500/42) + num > 200` is `11 + 196 = 207 > 200` (integer division) — always true — and the TRUE arm assigns the tainted `param`, so the reachable branch carries taint and only the `else bar = \"...\"` arm is dead. The constant-branch fold must prune the DEAD else edge while keeping the live `bar = param`, so the command-injection finding at `r.exec(cmd + bar)` MUST still fire. Zero-false-negative guard: it proves the fold never prunes the reachable (tainted) arm.",
"tags": [
"taint",
"cmdi",
"servlet",
"runtime",
"dead-branch",
"const-fold",
"no-false-negative"
],
"modes": [
"full"
],
"strict_unexpected": [
"taint-unsanitised-flow"
],
"expected": [
{
"rule_id": "taint-unsanitised-flow",
"severity": "HIGH",
"must_match": true,
"line_range": [
26,
26
],
"evidence_contains": [],
"notes": "request.getHeader (line 15) flows into bar on the always-taken true arm (line 21), then into r.exec at line 26. Exactly one finding survives.",
"max_count": 1
}
]
}

View file

@ -0,0 +1,28 @@
import java.io.*;
import javax.servlet.http.*;
// Dead-branch constant condition, VULNERABLE polarity (OWASP Benchmark cmdi
// vulnerable shape). The guard `(500/42) + num > 200` is `11 + 196 = 207 > 200`
// using integer division — ALWAYS true — and the TRUE arm assigns the tainted
// `param`. So the live branch carries taint and the `else bar = "never"` arm is
// dead. The constant-branch fold must prune the DEAD (else) edge and keep the
// reachable tainted `bar = param`, so `r.exec(cmd + bar)` MUST still fire. This
// is the zero-false-negative guard: the fold must never prune the live arm.
public class DeadBranchParamVuln extends HttpServlet {
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException {
        String param = request.getHeader("vector"); // taint source: request header value
        String bar;
        int num = 196;
        if ((500 / 42) + num > 200) { // integer division: 11 + 196 = 207 > 200, constant-true
            bar = param; // live arm: bar receives the tainted header value
        } else {
            bar = "This_should_never_happen"; // dead arm: never assigned at runtime
        }
        String cmd = "echo ";
        Runtime r = Runtime.getRuntime();
        Process p = r.exec(cmd + bar); // sink: tainted bar is concatenated into the command
    }
}

View file

@ -0,0 +1,29 @@
{
"description": "HttpServletRequest parameter flows through a List into ProcessBuilder.command(argList) — command injection via the setter form (list attached separately from the constructor, then pb.start()). This is the dominant OWASP Benchmark cmdi shape; resolved via type-qualified ProcessBuilder.command sink on the typed receiver plus container-element taint on the argument list.",
"tags": [
"taint",
"cmdi",
"servlet",
"container"
],
"modes": [
"full"
],
"strict_unexpected": [
"taint-unsanitised-flow"
],
"expected": [
{
"rule_id": "taint-unsanitised-flow",
"severity": "HIGH",
"must_match": true,
"line_range": [
16,
16
],
"evidence_contains": [],
"notes": "request.getParameter (line 8) is concatenated into a list element (argList.add at line 13), the list is attached to ProcessBuilder via pb.command(argList) at line 16, and executed by pb.start() at line 17. The type-qualified ProcessBuilder.command sink fires at line 16 on the tainted container argument. Exactly one finding survives.",
"max_count": 1
}
]
}

View file

@ -0,0 +1,19 @@
import java.io.*;
import java.util.*;
import javax.servlet.http.*;
public class ProcessCommandHandler extends HttpServlet {
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException {
        String param = request.getParameter("vector"); // taint source: request parameter
        List<String> argList = new ArrayList<String>();
        argList.add("sh");
        argList.add("-c");
        argList.add("echo " + param); // tainted value becomes an element of the argument list
        ProcessBuilder pb = new ProcessBuilder();
        pb.command(argList); // setter form: the list is attached after construction
        pb.start(); // starts the process built from argList
    }
}

View file

@ -0,0 +1,30 @@
{
"description": "HttpServletRequest header flows into a String[] env array passed to a split-receiver Runtime.exec — command injection via the `Runtime r = Runtime.getRuntime(); ... r.exec(cmd, argsEnv)` shape (the dominant remaining OWASP Benchmark cmdi form). The callee text at the sink is `r.exec`, which does not suffix-match the flat `Runtime.exec` rule; resolution depends on the receiver `r` carrying TypeKind::Runtime (from the `Runtime.getRuntime()` factory / the `Runtime` declared type) so the type-qualified resolver rewrites `r.exec` → `Runtime.exec`. Taint is in the env array (arg 1), so no payload-arg restriction may be applied.",
"tags": [
"taint",
"cmdi",
"servlet",
"runtime",
"split-receiver"
],
"modes": [
"full"
],
"strict_unexpected": [
"taint-unsanitised-flow"
],
"expected": [
{
"rule_id": "taint-unsanitised-flow",
"severity": "HIGH",
"must_match": true,
"line_range": [
16,
16
],
"evidence_contains": [],
"notes": "request.getHeader (line 7) flows into the env array element argsEnv (line 15), which is passed as arg 1 of r.exec at line 16. The receiver r is typed Runtime via Runtime.getRuntime() (line 13), so the type-qualified Runtime.exec sink fires at the split-receiver call. Exactly one finding survives.",
"max_count": 1
}
]
}

View file

@ -0,0 +1,18 @@
import java.io.*;
import javax.servlet.http.*;
public class RuntimeSplitReceiverHandler extends HttpServlet {
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws IOException {
        String param = request.getHeader("vector"); // taint source: request header value
        // Split-receiver Runtime.exec: the receiver is bound to a local in
        // one statement, then exec is called on it in another. The OWASP
        // Benchmark cmdi shape places the tainted data in the environment
        // array (arg 1), not the command (arg 0).
        Runtime r = Runtime.getRuntime(); // receiver bound here, used two statements later
        String[] args = { "/bin/sh", "-c", "echo nyx" }; // command array is fully constant
        String[] argsEnv = { "TAINT=" + param }; // tainted value lives in the env array
        r.exec(args, argsEnv); // sink: exec(cmdarray, envp) with tainted envp
    }
}