mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
feat(ssa): optimize branch condition handling via constant folding, enhance precision for taint analysis, and expand OWASP Benchmark support
This commit is contained in:
parent
ec76c9e08f
commit
9c99f6c6a9
22 changed files with 1020 additions and 17 deletions
|
|
@ -3997,3 +3997,94 @@ function outer(obj, x, y) {
|
|||
let (mline, _) = method_site.span.expect("method span populated");
|
||||
assert_eq!(mline, 4, "obj.method(x) on line 4");
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
// Constant-branch fold: CondArith capture + evaluation
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// `CondArith::eval`/`eval_bool` must fold the two OWASP-Benchmark
|
||||
/// arithmetic guard shapes to a definite boolean, using integer
|
||||
/// (truncating) division, and must return `None` — never a wrong fold —
|
||||
/// for any undefined operation or unresolved variable.
|
||||
#[test]
|
||||
fn cond_arith_eval_is_sound() {
|
||||
use crate::cfg::{BinOp, CondArith, CondVal};
|
||||
let lit = |n| Box::new(CondArith::Lit(n));
|
||||
let var = |s: &str| Box::new(CondArith::Var(s.to_string()));
|
||||
let bin = |op, l, r| Box::new(CondArith::Bin(op, l, r));
|
||||
|
||||
// num = 86 resolver.
|
||||
let r86 = |name: &str| if name == "num" { Some(86) } else { None };
|
||||
// (7*42) - num > 200 → 208 > 200 → true.
|
||||
let shape1 = CondArith::Bin(
|
||||
BinOp::Gt,
|
||||
bin(BinOp::Sub, bin(BinOp::Mul, lit(7), lit(42)), var("num")),
|
||||
lit(200),
|
||||
);
|
||||
assert_eq!(shape1.eval_bool(&r86), Some(true));
|
||||
|
||||
// (500/42) + num > 200 → 11 + 196 = 207 > 200 → true (integer div).
|
||||
let r196 = |name: &str| if name == "num" { Some(196) } else { None };
|
||||
let shape2 = CondArith::Bin(
|
||||
BinOp::Gt,
|
||||
bin(BinOp::Add, bin(BinOp::Div, lit(500), lit(42)), var("num")),
|
||||
lit(200),
|
||||
);
|
||||
assert_eq!(shape2.eval_bool(&r196), Some(true));
|
||||
// Integer division truncates toward zero (500/42 == 11, not ~11.9).
|
||||
assert_eq!(
|
||||
CondArith::Bin(BinOp::Div, lit(500), lit(42)).eval(&r86),
|
||||
Some(CondVal::Int(11))
|
||||
);
|
||||
|
||||
// Unresolved variable → None (no prune).
|
||||
let none = |_: &str| None;
|
||||
assert_eq!(shape1.eval_bool(&none), None);
|
||||
|
||||
// Division / modulo by zero → None (never a wrong fold).
|
||||
assert_eq!(CondArith::Bin(BinOp::Div, lit(1), lit(0)).eval(&r86), None);
|
||||
assert_eq!(CondArith::Bin(BinOp::Mod, lit(1), lit(0)).eval(&r86), None);
|
||||
|
||||
// Arithmetic overflow → None.
|
||||
assert_eq!(
|
||||
CondArith::Bin(BinOp::Mul, lit(i64::MAX), lit(2)).eval(&r86),
|
||||
None
|
||||
);
|
||||
|
||||
// Bare integer at the top level is not a branch condition → eval_bool None.
|
||||
assert_eq!(CondArith::Lit(1).eval_bool(&r86), None);
|
||||
|
||||
// Comparing a boolean sub-result as an integer operand → None.
|
||||
let cmp = bin(BinOp::Gt, lit(2), lit(1)); // yields Bool
|
||||
assert_eq!(CondArith::Bin(BinOp::Add, cmp, lit(1)).eval(&r86), None);
|
||||
}
|
||||
|
||||
/// The CFG builder must capture a pure integer-arithmetic comparison as a
|
||||
/// `CondArith` on the `If` node, and must refuse (None) any condition that
|
||||
/// touches a call / field access / string.
|
||||
#[test]
|
||||
fn build_cond_arith_captures_pure_int_comparison() {
|
||||
let ts_lang = Language::from(tree_sitter_java::LANGUAGE);
|
||||
let src = br#"
|
||||
class C {
|
||||
void m(int num, String s) {
|
||||
if ((7 * 42) - num > 200) { foo(); }
|
||||
if (s.length() > 200) { bar(); }
|
||||
}
|
||||
}
|
||||
"#;
|
||||
let (cfg, _entry) = parse_and_build(src, "java", ts_lang);
|
||||
let ifs = if_nodes(&cfg);
|
||||
let arith: Vec<_> = ifs.iter().filter_map(|&n| cfg[n].cond_arith.clone()).collect();
|
||||
|
||||
// Exactly one If condition is a pure int-arith comparison; the
|
||||
// `s.length() > 200` one must NOT be captured (it contains a call).
|
||||
assert_eq!(
|
||||
arith.len(),
|
||||
1,
|
||||
"only the pure int comparison should yield a CondArith, got {arith:?}"
|
||||
);
|
||||
// It folds to a definite bool once `num` is known constant.
|
||||
let r = |name: &str| if name == "num" { Some(86) } else { None };
|
||||
assert_eq!(arith[0].eval_bool(&r), Some(true));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1198,10 +1198,14 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
|||
| "string_content"
|
||||
| "string_fragment" => !has_string_interpolation(node),
|
||||
|
||||
// Numbers
|
||||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
|
||||
true
|
||||
}
|
||||
// Numbers. Java's grammar uses radix-tagged kinds
|
||||
// (`decimal_integer_literal`, `hex_integer_literal`, …) rather than a
|
||||
// bare `integer`, so `int num = 86;` would otherwise miss this arm and
|
||||
// lower to `Const(None)` (Varying) instead of `Const("86")`.
|
||||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number"
|
||||
| "decimal_integer_literal" | "hex_integer_literal" | "octal_integer_literal"
|
||||
| "binary_integer_literal" | "decimal_floating_point_literal"
|
||||
| "hex_floating_point_literal" => true,
|
||||
|
||||
// Booleans / null / nil / none
|
||||
"true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean"
|
||||
|
|
|
|||
307
src/cfg/mod.rs
307
src/cfg/mod.rs
|
|
@ -431,6 +431,129 @@ pub enum BinOp {
|
|||
GtEq,
|
||||
}
|
||||
|
||||
impl BinOp {
|
||||
/// True for the six comparison operators (result is a boolean 0/1).
|
||||
pub fn is_comparison(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
BinOp::Eq | BinOp::NotEq | BinOp::Lt | BinOp::LtEq | BinOp::Gt | BinOp::GtEq
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A branch condition captured as a pure integer-arithmetic + comparison
|
||||
/// expression tree at CFG-build time (where the real tree-sitter AST is
|
||||
/// available, so operator precedence and parentheses are correct by
|
||||
/// construction — no text re-parsing downstream).
|
||||
///
|
||||
/// Built only when *every* leaf is an integer literal or a plain identifier
|
||||
/// and *every* interior node is an arithmetic / comparison / bitwise operator,
|
||||
/// a unary `-`, or a parenthesis. Any call, field access, string, container,
|
||||
/// or compound-boolean (`&&` / `||`) subtree makes the builder return `None`
|
||||
/// for the whole condition. Identifiers are stored by name and resolved to
|
||||
/// their constant SSA value at fold time
|
||||
/// ([`crate::ssa::const_prop::fold_constant_branches`]); the actual numeric
|
||||
/// evaluation is shared in [`CondArith::eval`].
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum CondArith {
|
||||
/// Integer literal.
|
||||
Lit(i64),
|
||||
/// Identifier — resolved to a constant integer at fold time, else unknown.
|
||||
Var(String),
|
||||
/// Unary integer negation: `-x`.
|
||||
Neg(Box<CondArith>),
|
||||
/// Binary arithmetic / bitwise / comparison.
|
||||
Bin(BinOp, Box<CondArith>, Box<CondArith>),
|
||||
}
|
||||
|
||||
/// Result of folding a [`CondArith`] against a constant environment.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum CondVal {
|
||||
Int(i64),
|
||||
Bool(bool),
|
||||
}
|
||||
|
||||
impl CondArith {
|
||||
/// Evaluate against a variable→constant-integer resolver. Returns `None`
|
||||
/// the moment anything is non-constant or an operation is undefined
|
||||
/// (division/modulo by zero, arithmetic overflow, type mismatch), so a
|
||||
/// caller can only ever prune on a *definite* result. All integer
|
||||
/// arithmetic is checked; overflow yields `None` rather than a wrapped
|
||||
/// value, which keeps the fold sound across the i32/i64 gap.
|
||||
pub fn eval(&self, resolve: &impl Fn(&str) -> Option<i64>) -> Option<CondVal> {
|
||||
match self {
|
||||
CondArith::Lit(n) => Some(CondVal::Int(*n)),
|
||||
CondArith::Var(name) => resolve(name).map(CondVal::Int),
|
||||
CondArith::Neg(inner) => match inner.eval(resolve)? {
|
||||
CondVal::Int(n) => n.checked_neg().map(CondVal::Int),
|
||||
CondVal::Bool(_) => None,
|
||||
},
|
||||
CondArith::Bin(op, l, r) => {
|
||||
let lhs = match l.eval(resolve)? {
|
||||
CondVal::Int(n) => n,
|
||||
CondVal::Bool(_) => return None,
|
||||
};
|
||||
let rhs = match r.eval(resolve)? {
|
||||
CondVal::Int(n) => n,
|
||||
CondVal::Bool(_) => return None,
|
||||
};
|
||||
let arith = |v: Option<i64>| v.map(CondVal::Int);
|
||||
match op {
|
||||
BinOp::Add => arith(lhs.checked_add(rhs)),
|
||||
BinOp::Sub => arith(lhs.checked_sub(rhs)),
|
||||
BinOp::Mul => arith(lhs.checked_mul(rhs)),
|
||||
// Java/Rust integer division and modulo both truncate
|
||||
// toward zero; `checked_*` rejects div-by-zero and
|
||||
// i64::MIN / -1 overflow.
|
||||
BinOp::Div => arith(lhs.checked_div(rhs)),
|
||||
BinOp::Mod => arith(lhs.checked_rem(rhs)),
|
||||
BinOp::BitAnd => arith(Some(lhs & rhs)),
|
||||
BinOp::BitOr => arith(Some(lhs | rhs)),
|
||||
BinOp::BitXor => arith(Some(lhs ^ rhs)),
|
||||
BinOp::LeftShift => {
|
||||
u32::try_from(rhs).ok().and_then(|s| lhs.checked_shl(s)).map(CondVal::Int)
|
||||
}
|
||||
BinOp::RightShift => {
|
||||
u32::try_from(rhs).ok().and_then(|s| lhs.checked_shr(s)).map(CondVal::Int)
|
||||
}
|
||||
BinOp::Eq => Some(CondVal::Bool(lhs == rhs)),
|
||||
BinOp::NotEq => Some(CondVal::Bool(lhs != rhs)),
|
||||
BinOp::Lt => Some(CondVal::Bool(lhs < rhs)),
|
||||
BinOp::LtEq => Some(CondVal::Bool(lhs <= rhs)),
|
||||
BinOp::Gt => Some(CondVal::Bool(lhs > rhs)),
|
||||
BinOp::GtEq => Some(CondVal::Bool(lhs >= rhs)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate to a definite boolean, or `None`. The top-level node must be a
|
||||
/// comparison (a bare integer is not a branch condition we fold).
|
||||
pub fn eval_bool(&self, resolve: &impl Fn(&str) -> Option<i64>) -> Option<bool> {
|
||||
match self.eval(resolve)? {
|
||||
CondVal::Bool(b) => Some(b),
|
||||
CondVal::Int(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect every identifier name referenced by the tree.
|
||||
pub fn collect_vars(&self, out: &mut Vec<String>) {
|
||||
match self {
|
||||
CondArith::Lit(_) => {}
|
||||
CondArith::Var(name) => {
|
||||
if !out.iter().any(|v| v == name) {
|
||||
out.push(name.clone());
|
||||
}
|
||||
}
|
||||
CondArith::Neg(inner) => inner.collect_vars(out),
|
||||
CondArith::Bin(_, l, r) => {
|
||||
l.collect_vars(out);
|
||||
r.collect_vars(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Call-related metadata for CFG nodes.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CallMeta {
|
||||
|
|
@ -662,6 +785,17 @@ pub struct NodeInfo {
|
|||
pub condition_vars: Vec<String>,
|
||||
/// For If nodes: whether the condition has a leading negation (`!` / `not`).
|
||||
pub condition_negated: bool,
|
||||
/// For If / conditional (ternary) nodes: the condition as a pure
|
||||
/// integer-arithmetic + comparison expression tree, when the whole
|
||||
/// condition is built only from integer literals, identifiers, arithmetic
|
||||
/// / comparison operators, and parentheses. `None` for any condition that
|
||||
/// touches a call, field access, string, compound boolean (`&&`/`||`), or
|
||||
/// any shape this evaluator cannot prove constant. Consumed by
|
||||
/// [`crate::ssa::const_prop::fold_constant_branches`] to prune branches
|
||||
/// whose condition folds to a definite boolean once its variables are
|
||||
/// resolved to constants — closing the synthetic "dead branch keeps the
|
||||
/// tainted phi operand alive" false positive without any text re-parsing.
|
||||
pub cond_arith: Option<CondArith>,
|
||||
/// True when this is a Call node whose argument list contains only
|
||||
/// syntactic literal values (strings, numbers, booleans, null/nil,
|
||||
/// arrays/lists/tuples of literals). Also true for zero-argument calls
|
||||
|
|
@ -1065,7 +1199,7 @@ fn extract_condition_raw<'a>(
|
|||
ast: Node<'a>,
|
||||
lang: &str,
|
||||
code: &'a [u8],
|
||||
) -> (Option<String>, Vec<String>, bool) {
|
||||
) -> (Option<String>, Vec<String>, bool, Option<CondArith>) {
|
||||
// 1. Find the condition subtree.
|
||||
let cond_node = ast.child_by_field_name("condition").or_else(|| {
|
||||
// Rust `if_expression` uses positional children: the condition is
|
||||
|
|
@ -1085,7 +1219,7 @@ fn extract_condition_raw<'a>(
|
|||
});
|
||||
|
||||
let Some(cond) = cond_node else {
|
||||
return (None, Vec::new(), false);
|
||||
return (None, Vec::new(), false, None);
|
||||
};
|
||||
|
||||
// 2. Detect leading negation (`!expr`, `not expr`, Ruby `unless`).
|
||||
|
|
@ -1103,7 +1237,20 @@ fn extract_condition_raw<'a>(
|
|||
let text = text_of(cond, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
|
||||
(text, vars, negated)
|
||||
// 5. Capture the pure integer-arithmetic + comparison tree (for constant
|
||||
// branch folding). Built from the FULL condition node `cond` (not the
|
||||
// negation-stripped `inner`) so the folded boolean matches the
|
||||
// Branch terminator's `true_blk = cond-true` semantics directly. Ruby
|
||||
// `unless` swaps the True/False edges in the CFG builder (lines
|
||||
// ~5029), so the branch polarity would be inverted — skip it to stay
|
||||
// sound (`unless` with a constant arithmetic guard is negligible).
|
||||
let cond_arith = if ast.kind() == "unless" {
|
||||
None
|
||||
} else {
|
||||
build_cond_arith(cond, lang, code, 0)
|
||||
};
|
||||
|
||||
(text, vars, negated, cond_arith)
|
||||
}
|
||||
|
||||
/// Detect leading negation and return the inner expression.
|
||||
|
|
@ -1241,6 +1388,155 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
|
|||
None
|
||||
}
|
||||
|
||||
/// Parse an integer literal node to its `i64` value, honouring hex / octal /
|
||||
/// binary radix prefixes and Java/Rust digit separators (`1_000`). Returns
|
||||
/// `None` for floats, non-literals, or values that overflow `i64`.
|
||||
fn parse_int_literal(node: Node, code: &[u8]) -> Option<i64> {
|
||||
let kind = node.kind();
|
||||
let is_int = matches!(
|
||||
kind,
|
||||
"integer"
|
||||
| "integer_literal"
|
||||
| "int_literal"
|
||||
| "number"
|
||||
| "number_literal"
|
||||
| "decimal_integer_literal"
|
||||
| "hex_integer_literal"
|
||||
| "octal_integer_literal"
|
||||
| "binary_integer_literal"
|
||||
);
|
||||
if !is_int {
|
||||
return None;
|
||||
}
|
||||
let raw = std::str::from_utf8(&code[node.byte_range()]).ok()?.trim();
|
||||
// Strip Java long suffix and digit separators.
|
||||
let cleaned: String = raw
|
||||
.trim_end_matches(['l', 'L'])
|
||||
.chars()
|
||||
.filter(|c| *c != '_')
|
||||
.collect();
|
||||
if let Ok(v) = cleaned.parse::<i64>() {
|
||||
return Some(v);
|
||||
}
|
||||
if let Some(h) = cleaned.strip_prefix("0x").or_else(|| cleaned.strip_prefix("0X")) {
|
||||
return i64::from_str_radix(h, 16).ok();
|
||||
}
|
||||
if let Some(o) = cleaned.strip_prefix("0o").or_else(|| cleaned.strip_prefix("0O")) {
|
||||
return i64::from_str_radix(o, 8).ok();
|
||||
}
|
||||
if let Some(b) = cleaned.strip_prefix("0b").or_else(|| cleaned.strip_prefix("0B")) {
|
||||
return i64::from_str_radix(b, 2).ok();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Map the operator token of a binary expression node to a [`BinOp`].
|
||||
/// Scans for the single anonymous operator child (operands are named).
|
||||
/// Returns `None` for boolean operators (`&&` / `||`), assignment, or any
|
||||
/// token not in the arithmetic / bitwise / comparison set — those make the
|
||||
/// enclosing [`CondArith`] build bail.
|
||||
fn binary_op_token(node: Node) -> Option<BinOp> {
|
||||
let mut cursor = node.walk();
|
||||
for child in node.children(&mut cursor) {
|
||||
if child.is_named() {
|
||||
continue;
|
||||
}
|
||||
return match child.kind() {
|
||||
"+" => Some(BinOp::Add),
|
||||
"-" => Some(BinOp::Sub),
|
||||
"*" => Some(BinOp::Mul),
|
||||
"/" => Some(BinOp::Div),
|
||||
"%" => Some(BinOp::Mod),
|
||||
"&" => Some(BinOp::BitAnd),
|
||||
"|" => Some(BinOp::BitOr),
|
||||
"^" => Some(BinOp::BitXor),
|
||||
"<<" => Some(BinOp::LeftShift),
|
||||
">>" => Some(BinOp::RightShift),
|
||||
"==" | "===" => Some(BinOp::Eq),
|
||||
"!=" | "!==" => Some(BinOp::NotEq),
|
||||
"<" => Some(BinOp::Lt),
|
||||
"<=" => Some(BinOp::LtEq),
|
||||
">" => Some(BinOp::Gt),
|
||||
">=" => Some(BinOp::GtEq),
|
||||
_ => None,
|
||||
};
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Build a [`CondArith`] tree from a condition AST subtree, or `None` if the
|
||||
/// condition is not a pure integer-arithmetic + comparison expression. Uses
|
||||
/// the real tree-sitter node so operator precedence and parentheses are
|
||||
/// already encoded in the tree shape — no text parsing. Conservative by
|
||||
/// construction: any unrecognised node kind (call, field access, string,
|
||||
/// boolean `&&`/`||`, unary `!`) returns `None`, which disables folding for
|
||||
/// that branch (never a wrong fold). Depth-bounded to guard against
|
||||
/// pathological nesting.
|
||||
fn build_cond_arith(node: Node, lang: &str, code: &[u8], depth: u32) -> Option<CondArith> {
|
||||
if depth > 64 {
|
||||
return None;
|
||||
}
|
||||
let kind = node.kind();
|
||||
|
||||
// Unwrap parentheses (transparent to value).
|
||||
if matches!(kind, "parenthesized_expression" | "parenthesized" | "parenthesized_statement") {
|
||||
let inner = node.named_child(0)?;
|
||||
return build_cond_arith(inner, lang, code, depth + 1);
|
||||
}
|
||||
|
||||
if let Some(n) = parse_int_literal(node, code) {
|
||||
return Some(CondArith::Lit(n));
|
||||
}
|
||||
|
||||
// Bare identifier (reject dotted paths / field access — those are not
|
||||
// captured here; only a plain local whose const value we can resolve).
|
||||
if matches!(kind, "identifier" | "simple_identifier") {
|
||||
let name = text_of(node, code)?;
|
||||
if !name.is_empty()
|
||||
&& name.chars().all(|c| c.is_alphanumeric() || c == '_' || c == '$')
|
||||
{
|
||||
return Some(CondArith::Var(name));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
// Unary `-` only (boolean `!` / `not` is intentionally unsupported: its
|
||||
// operand would be a boolean, which `CondArith::eval` rejects, so folding
|
||||
// a negated condition is left to the conservative `None` path).
|
||||
if matches!(
|
||||
kind,
|
||||
"unary_expression" | "unary_operator" | "prefix_unary_expression" | "unary"
|
||||
) {
|
||||
let operand = node.named_child(0)?;
|
||||
let mut cursor = node.walk();
|
||||
let is_neg = node
|
||||
.children(&mut cursor)
|
||||
.any(|c| !c.is_named() && c.kind() == "-");
|
||||
if is_neg {
|
||||
return Some(CondArith::Neg(Box::new(build_cond_arith(
|
||||
operand,
|
||||
lang,
|
||||
code,
|
||||
depth + 1,
|
||||
)?)));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
// Binary arithmetic / comparison: exactly two operands + one operator.
|
||||
if is_binary_expr_kind(kind, lang) {
|
||||
if node.named_child_count() != 2 {
|
||||
return None; // chained comparison (Python `a < b < c`) etc.
|
||||
}
|
||||
let op = binary_op_token(node)?;
|
||||
let lhs = build_cond_arith(node.named_child(0)?, lang, code, depth + 1)?;
|
||||
let rhs = build_cond_arith(node.named_child(1)?, lang, code, depth + 1)?;
|
||||
return Some(CondArith::Bin(op, Box::new(lhs), Box::new(rhs)));
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Find the RHS value node of an assignment-like AST node (variable declarator,
|
||||
/// lexical declaration, assignment expression). Used by helpers that need to
|
||||
/// inspect what an identifier is being initialized to.
|
||||
|
|
@ -3231,11 +3527,11 @@ pub(super) fn push_node<'a>(
|
|||
};
|
||||
|
||||
// Extract condition metadata for If nodes.
|
||||
let (condition_text, condition_vars, condition_negated) =
|
||||
let (condition_text, condition_vars, condition_negated, cond_arith) =
|
||||
if matches!(lookup(lang, ast.kind()), Kind::If) {
|
||||
extract_condition_raw(ast, lang, code)
|
||||
} else {
|
||||
(None, Vec::new(), false)
|
||||
(None, Vec::new(), false, None)
|
||||
};
|
||||
|
||||
// Extract per-argument identifiers for Call nodes.
|
||||
|
|
@ -3512,6 +3808,7 @@ pub(super) fn push_node<'a>(
|
|||
condition_text,
|
||||
condition_vars,
|
||||
condition_negated,
|
||||
cond_arith,
|
||||
all_args_literal,
|
||||
catch_param: false,
|
||||
arg_callees,
|
||||
|
|
|
|||
|
|
@ -231,6 +231,13 @@ fn type_kind_index(kind: &TypeKind) -> u32 {
|
|||
| TypeKind::GormDb
|
||||
| TypeKind::SqlxDb
|
||||
| TypeKind::HibernateSession => 3,
|
||||
// ProcessBuilder participates only in the type-qualified callee
|
||||
// resolver via `label_prefix()`; no dedicated bitset slot, share
|
||||
// the Object index like the other receiver-only TypeKinds.
|
||||
TypeKind::ProcessBuilder => 3,
|
||||
// Runtime is likewise a type-qualified-resolver-only receiver kind
|
||||
// (`Runtime.exec`); no dedicated bitset slot, share the Object index.
|
||||
TypeKind::Runtime => 3,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -275,6 +275,14 @@ pub fn class_name_to_type_kind(name: &str) -> Option<TypeKind> {
|
|||
// type-qualified resolution to `Template.process`, the SSTI
|
||||
// sink defined in `labels/java.rs`.
|
||||
"Template" => Some(TypeKind::Template),
|
||||
// `java.lang.Runtime` declared receiver type. Routes the
|
||||
// split-receiver shape `Runtime r = Runtime.getRuntime(); ...
|
||||
// r.exec(...)` through type-qualified resolution to
|
||||
// `Runtime.exec` (the only `Runtime.*` rule, always SHELL_ESCAPE),
|
||||
// complementing the `constructor_type` factory route for
|
||||
// `Runtime.getRuntime()`. No benign `Runtime.exec` exists, so
|
||||
// typing any `Runtime`-declared receiver carries no FP risk.
|
||||
"Runtime" => Some(TypeKind::Runtime),
|
||||
// Python qualified type names.
|
||||
// Only covers raw lowered names from isinstance(). The lowering in lower.rs
|
||||
// extracts the literal type text: isinstance(x, requests.Session) produces
|
||||
|
|
|
|||
|
|
@ -124,6 +124,23 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// `ProcessBuilder.command(argList)` — the dominant OWASP Benchmark
|
||||
// command-injection shape builds an argument `List<String>`, attaches it
|
||||
// via `pb.command(argList)`, then runs `pb.start()`. The argument list is
|
||||
// a separate channel from the constructor, so the flat `ProcessBuilder`
|
||||
// constructor sink above never sees the tainted args. This rule fires
|
||||
// only via type-qualified resolution: the receiver `pb` must carry a
|
||||
// `TypeKind::ProcessBuilder` fact (set by `constructor_type` for
|
||||
// `new ProcessBuilder(...)`), so the resolver rewrites `pb.command(...)` →
|
||||
// `ProcessBuilder.command`. Case-sensitive and receiver-typed to avoid
|
||||
// colliding with the many unrelated `.command(...)` methods (CLI builders,
|
||||
// JCommander, picocli, Swing actions). The payload is restricted to arg 0
|
||||
// (the command list) via `type_qualified_sink_payload_args`.
|
||||
LabelRule {
|
||||
matchers: &["ProcessBuilder.command"],
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: true,
|
||||
},
|
||||
LabelRule {
|
||||
matchers: &["executeQuery", "executeUpdate"],
|
||||
label: DataLabel::Sink(Cap::SQL_QUERY),
|
||||
|
|
|
|||
|
|
@ -1496,7 +1496,11 @@ pub fn type_qualified_sink_payload_args(qualified_callee: &str) -> Option<&'stat
|
|||
| "TypeOrmRepo.createQueryBuilder"
|
||||
| "TypeOrmManager.query"
|
||||
| "TypeOrmManager.createQueryBuilder"
|
||||
| "MikroOrmEm.execute" => Some(&[0]),
|
||||
| "MikroOrmEm.execute"
|
||||
// `ProcessBuilder.command(argList)` — arg 0 is the command list;
|
||||
// any later positional args are not part of the v1 shape. Restrict
|
||||
// sink-taint scanning to arg 0 so receiver / unrelated args don't fire.
|
||||
| "ProcessBuilder.command" => Some(&[0]),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1202,6 +1202,8 @@ fn type_kind_tag(k: &TypeKind) -> String {
|
|||
TypeKind::GormDb => "GormDb".into(),
|
||||
TypeKind::SqlxDb => "SqlxDb".into(),
|
||||
TypeKind::HibernateSession => "HibernateSession".into(),
|
||||
TypeKind::ProcessBuilder => "ProcessBuilder".into(),
|
||||
TypeKind::Runtime => "Runtime".into(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -624,6 +624,192 @@ pub fn apply_const_prop(body: &mut SsaBody, result: &ConstPropResult) -> usize {
|
|||
pruned
|
||||
}
|
||||
|
||||
/// Resolve a condition variable name to the SSA value reaching `block`.
|
||||
///
|
||||
/// Mirrors `constraint::lower::resolve_single_var` (the established resolver
|
||||
/// for branch-condition variables): prefer the highest-indexed definition in
|
||||
/// the branch block itself, else the highest-indexed definition elsewhere.
|
||||
/// Kept local to avoid a `ssa → constraint` dependency cycle (constraint
|
||||
/// already depends on ssa).
|
||||
fn resolve_const_var(body: &SsaBody, var_name: &str, block: BlockId) -> Option<SsaValue> {
|
||||
let mut best_in_block: Option<SsaValue> = None;
|
||||
let mut best_outside: Option<SsaValue> = None;
|
||||
for (idx, vd) in body.value_defs.iter().enumerate() {
|
||||
if vd.var_name.as_deref() != Some(var_name) {
|
||||
continue;
|
||||
}
|
||||
let v = SsaValue(idx as u32);
|
||||
if vd.block == block {
|
||||
best_in_block = Some(match best_in_block {
|
||||
Some(existing) if existing.0 > v.0 => existing,
|
||||
_ => v,
|
||||
});
|
||||
} else {
|
||||
best_outside = Some(match best_outside {
|
||||
Some(existing) if existing.0 > v.0 => existing,
|
||||
_ => v,
|
||||
});
|
||||
}
|
||||
}
|
||||
best_in_block.or(best_outside)
|
||||
}
|
||||
|
||||
/// Fold branch conditions that are pure integer-arithmetic comparisons over
|
||||
/// constant operands, pruning the statically-dead edge.
|
||||
///
|
||||
/// Complements [`apply_const_prop`], which only folds a condition that lowers
|
||||
/// to a single SSA boolean value. An arithmetic comparison condition such as
|
||||
/// `(7*42) - num > 200` is **never** an SSA value — condition nodes lower to
|
||||
/// `Nop` and the comparison is held structurally on the branch terminator — so
|
||||
/// SCCP cannot reach it. This pass instead evaluates the
|
||||
/// [`crate::cfg::CondArith`] tree captured at CFG-build time, resolving each
|
||||
/// variable to its const-propagated integer.
|
||||
///
|
||||
/// Sound by construction:
|
||||
/// * A branch is pruned only when its `CondArith` evaluates to a **definite**
|
||||
/// boolean — every variable bound to a known integer constant and every
|
||||
/// operation defined (no div-by-zero / overflow). `None`/`Varying` leaves
|
||||
/// both edges intact.
|
||||
/// * After the terminator is rewritten to `Goto(taken)` and the dead edge is
|
||||
/// dropped (symmetrically, preserving pred/succ consistency), every phi
|
||||
/// operand whose predecessor is no longer reachable from entry is removed.
|
||||
/// That last step is what actually drops the dead-branch operand from a
|
||||
/// merge phi like `bar = phi(then: "const", else: param)` — without it the
|
||||
/// taint engine's phi fallback would still read the tainted `param` from
|
||||
/// the joined entry state.
|
||||
///
|
||||
/// Returns the number of branches pruned.
|
||||
pub fn fold_constant_branches(
|
||||
body: &mut SsaBody,
|
||||
cfg: &crate::cfg::Cfg,
|
||||
const_values: &HashMap<SsaValue, ConstLattice>,
|
||||
) -> usize {
|
||||
use crate::ssa::ir::Terminator;
|
||||
|
||||
// 1. Collect definite fold decisions: (branch_block_idx, taken, untaken).
|
||||
let mut prune_ops: Vec<(usize, BlockId, BlockId)> = Vec::new();
|
||||
for (block_idx, block) in body.blocks.iter().enumerate() {
|
||||
let Terminator::Branch {
|
||||
cond,
|
||||
true_blk,
|
||||
false_blk,
|
||||
..
|
||||
} = &block.terminator
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
// Degenerate `cond ? X : X` (both edges to one block): nothing to prune.
|
||||
if true_blk == false_blk {
|
||||
continue;
|
||||
}
|
||||
let Some(cond_info) = cfg.node_weight(*cond) else {
|
||||
continue;
|
||||
};
|
||||
let Some(arith) = cond_info.cond_arith.as_ref() else {
|
||||
continue;
|
||||
};
|
||||
let branch_block = block.id;
|
||||
let resolve = |name: &str| -> Option<i64> {
|
||||
let v = resolve_const_var(body, name, branch_block)?;
|
||||
match const_values.get(&v) {
|
||||
Some(ConstLattice::Int(n)) => Some(*n),
|
||||
_ => None,
|
||||
}
|
||||
};
|
||||
match arith.eval_bool(&resolve) {
|
||||
Some(true) => prune_ops.push((block_idx, *true_blk, *false_blk)),
|
||||
Some(false) => prune_ops.push((block_idx, *false_blk, *true_blk)),
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
let pruned = prune_ops.len();
|
||||
if pruned == 0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 2. Rewrite terminators + drop the dead edge (symmetrically).
|
||||
for &(block_idx, taken, untaken) in &prune_ops {
|
||||
let pred_id = body.blocks[block_idx].id;
|
||||
body.blocks[block_idx].terminator = Terminator::Goto(taken);
|
||||
body.blocks[block_idx].succs.retain(|s| *s != untaken);
|
||||
let untaken_idx = untaken.0 as usize;
|
||||
if untaken_idx < body.blocks.len() {
|
||||
body.blocks[untaken_idx].preds.retain(|p| *p != pred_id);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Recompute reachability from entry over the (now-pruned) succ edges.
|
||||
let n = body.blocks.len();
|
||||
let mut reachable = vec![false; n];
|
||||
let mut stack = vec![body.entry];
|
||||
if (body.entry.0 as usize) < n {
|
||||
reachable[body.entry.0 as usize] = true;
|
||||
}
|
||||
while let Some(b) = stack.pop() {
|
||||
let bidx = b.0 as usize;
|
||||
if bidx >= n {
|
||||
continue;
|
||||
}
|
||||
// Clone succs to avoid borrow conflict with `reachable`.
|
||||
let succs: SmallVec<[BlockId; 2]> = body.blocks[bidx].succs.clone();
|
||||
for s in succs {
|
||||
let sidx = s.0 as usize;
|
||||
if sidx < n && !reachable[sidx] {
|
||||
reachable[sidx] = true;
|
||||
stack.push(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Reachable blocks: drop the now-dead predecessor. Removing the phi
|
||||
// operand from the merge block is what stops the tainted dead-branch
|
||||
// value feeding the phi; removing the pred keeps pred/succ symmetric
|
||||
// with step 5's succ clearing. Operands from still-reachable
|
||||
// predecessors are untouched, so no live flow is lost.
|
||||
for (bidx, block) in body.blocks.iter_mut().enumerate() {
|
||||
if !reachable[bidx] {
|
||||
continue;
|
||||
}
|
||||
block.preds.retain(|p| {
|
||||
let pidx = p.0 as usize;
|
||||
pidx < n && reachable[pidx]
|
||||
});
|
||||
for phi in &mut block.phis {
|
||||
if let SsaOp::Phi(operands) = &mut phi.op {
|
||||
operands.retain(|(pred, _)| {
|
||||
let pidx = pred.0 as usize;
|
||||
pidx < n && reachable[pidx]
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Unreachable blocks: neutralise them so the *later* optimiser passes
|
||||
// (copy-prop, base-alias grouping, type-facts, points-to) and the taint
|
||||
// transfer never observe their dead instructions. This is the
|
||||
// load-bearing step for precision: a dead `else bar = param` would
|
||||
// otherwise make copy-prop alias `bar`↔`param`, and
|
||||
// `propagate_taint_to_aliases` would then poison the *surviving const*
|
||||
// `bar` with `param`'s (still-reachable) taint — defeating the whole
|
||||
// prune. Each instruction is rewritten to `Nop` (value + cfg_node
|
||||
// preserved so `value_defs` coverage holds), the terminator to
|
||||
// `Unreachable`, and the block is fully disconnected.
|
||||
for (bidx, block) in body.blocks.iter_mut().enumerate() {
|
||||
if reachable[bidx] {
|
||||
continue;
|
||||
}
|
||||
for inst in block.phis.iter_mut().chain(block.body.iter_mut()) {
|
||||
inst.op = SsaOp::Nop;
|
||||
}
|
||||
block.terminator = Terminator::Unreachable;
|
||||
block.succs.clear();
|
||||
block.preds.clear();
|
||||
}
|
||||
|
||||
pruned
|
||||
}
|
||||
|
||||
/// Collect module aliases from `require()` calls in the SSA body.
|
||||
///
|
||||
/// Detects patterns like `const http = require("http")` and propagates
|
||||
|
|
|
|||
|
|
@ -101,7 +101,12 @@ pub fn optimize_ssa_with_param_types(
|
|||
) -> OptimizeResult {
|
||||
// 1. Constant propagation (SCCP)
|
||||
let cp = const_prop::const_propagate(body);
|
||||
let branches_pruned = const_prop::apply_const_prop(body, &cp);
|
||||
let mut branches_pruned = const_prop::apply_const_prop(body, &cp);
|
||||
// 1b. Fold pure integer-arithmetic comparison branch conditions that SCCP
|
||||
// cannot reach (the comparison is held on the terminator, not an SSA
|
||||
// value). Prunes statically-dead edges + their merge-phi operands so a
|
||||
// dead `else bar = param` stops feeding a tainted operand into the phi.
|
||||
branches_pruned += const_prop::fold_constant_branches(body, cfg, &cp.values);
|
||||
|
||||
// 2. Copy propagation
|
||||
let (copies_eliminated, copy_map) = copy_prop::copy_propagate(body, cfg);
|
||||
|
|
|
|||
|
|
@ -261,6 +261,33 @@ pub enum TypeKind {
|
|||
/// arbitrary-receiver-name shape (`sess`, `hibernateSession`, etc.)
|
||||
/// via type-qualified resolution.
|
||||
HibernateSession,
|
||||
/// A `java.lang.ProcessBuilder` instance produced by
|
||||
/// `new ProcessBuilder(...)`. The dominant OWASP Benchmark
|
||||
/// command-injection shape builds an argument `List<String>`, attaches
|
||||
/// it via `pb.command(argList)`, then runs it with `pb.start()`. The
|
||||
/// argument list is a separate channel from the constructor, so the
|
||||
/// flat `ProcessBuilder` constructor sink never sees the tainted args.
|
||||
/// Mapping the receiver to this TypeKind lets the type-qualified
|
||||
/// resolver rewrite `pb.command(argList)` → `ProcessBuilder.command`
|
||||
/// against the flat SHELL_ESCAPE rule in `labels/java.rs`, so tainted
|
||||
/// list contents reaching the command builder are caught at the
|
||||
/// `command(...)` call site.
|
||||
ProcessBuilder,
|
||||
/// A `java.lang.Runtime` instance produced by the static factory
|
||||
/// `Runtime.getRuntime()`. The dominant OWASP Benchmark
|
||||
/// command-injection shape splits the receiver across statements:
|
||||
/// `Runtime r = Runtime.getRuntime(); ... r.exec(args, argsEnv)`. The
|
||||
/// callee text at the sink is `r.exec`, which does not suffix-match the
|
||||
/// flat `Runtime.exec` rule in `labels/java.rs` (the chained
|
||||
/// `Runtime.getRuntime().exec(...)` form fires only because its callee
|
||||
/// text literally contains `Runtime`). Mapping the receiver `r` to
|
||||
/// this TypeKind lets the type-qualified resolver rewrite `r.exec(...)`
|
||||
/// → `Runtime.exec` against the flat SHELL_ESCAPE rule, so tainted data
|
||||
/// reaching the split-receiver exec is caught. No payload-arg
|
||||
/// restriction: `Runtime.exec` overloads place the tainted data in
|
||||
/// either the command (arg 0) or the environment array (arg 1), so the
|
||||
/// default all-args sink scan must cover every position.
|
||||
Runtime,
|
||||
}
|
||||
|
||||
/// structural carrier for a recognised DTO type. Maps
|
||||
|
|
@ -318,6 +345,8 @@ impl TypeKind {
|
|||
Self::GormDb => Some("GormDb"),
|
||||
Self::SqlxDb => Some("SqlxDb"),
|
||||
Self::HibernateSession => Some("HibernateSession"),
|
||||
Self::ProcessBuilder => Some("ProcessBuilder"),
|
||||
Self::Runtime => Some("Runtime"),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
@ -708,6 +737,18 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
|
|||
"openSession" | "getCurrentSession" | "openStatelessSession" => {
|
||||
Some(TypeKind::HibernateSession)
|
||||
}
|
||||
// `new ProcessBuilder(...)` — the receiver's `command(argList)`
|
||||
// setter is a command-injection sink for the list contents.
|
||||
// Type-qualified resolution rewrites `pb.command(...)` →
|
||||
// `ProcessBuilder.command` against the flat SHELL_ESCAPE rule.
|
||||
"ProcessBuilder" => Some(TypeKind::ProcessBuilder),
|
||||
// `Runtime.getRuntime()` — the static factory returns the
|
||||
// singleton `java.lang.Runtime`. Gating on `callee.contains
|
||||
// ("Runtime")` keeps an unrelated `foo.getRuntime()` method from
|
||||
// being mistyped. Type-qualified resolution rewrites the
|
||||
// split-receiver `r.exec(...)` → `Runtime.exec` against the flat
|
||||
// SHELL_ESCAPE rule.
|
||||
"getRuntime" if callee.contains("Runtime") => Some(TypeKind::Runtime),
|
||||
_ => None,
|
||||
},
|
||||
Lang::JavaScript | Lang::TypeScript => {
|
||||
|
|
|
|||
|
|
@ -1929,7 +1929,7 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
|
||||
for (func_name, func_entry) in &func_entries {
|
||||
let formal_params = lookup_formal_params(local_summaries, func_name);
|
||||
let func_ssa = match crate::ssa::lower_to_ssa_with_params(
|
||||
let mut func_ssa = match crate::ssa::lower_to_ssa_with_params(
|
||||
cfg,
|
||||
*func_entry,
|
||||
Some(func_name),
|
||||
|
|
@ -1939,6 +1939,9 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
Ok(ssa) => ssa,
|
||||
Err(_) => continue,
|
||||
};
|
||||
// Match the `_from_bodies` path: prune dead constant branches before
|
||||
// the summary probe (see `prefold_dead_branches_for_summary`).
|
||||
prefold_dead_branches_for_summary(&mut func_ssa, cfg);
|
||||
|
||||
// `formal_params` is authoritative even when it is empty. SSA lowering
|
||||
// also emits Param ops for external captures; counting those as arity
|
||||
|
|
@ -2019,6 +2022,22 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
/// name overloads with different arity, and anonymous bodies at distinct
|
||||
/// source spans all get distinct keys.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
/// Prune definite-constant dead branches on a freshly-lowered body *before*
|
||||
/// its interprocedural summary is extracted.
|
||||
///
|
||||
/// Summary extraction ([`ssa_transfer::extract_ssa_func_summary`]) runs on the
|
||||
/// pre-optimisation SSA, so without this a helper whose body returns a constant
|
||||
/// only because a dead `else x = param` branch is never taken would still emit
|
||||
/// a `param → return` transform — re-tainting the caller's `bar =
|
||||
/// helper(param)` and defeating the in-body branch fold. Only
|
||||
/// [`crate::ssa::const_prop::fold_constant_branches`] is applied (no copy-prop /
|
||||
/// DCE), so the change is limited to provably-dead arithmetic-comparison
|
||||
/// branches; the body's value numbering is otherwise untouched.
|
||||
fn prefold_dead_branches_for_summary(func_ssa: &mut crate::ssa::SsaBody, cfg: &crate::cfg::Cfg) {
|
||||
let cp = crate::ssa::const_prop::const_propagate(func_ssa);
|
||||
crate::ssa::const_prop::fold_constant_branches(func_ssa, cfg, &cp.values);
|
||||
}
|
||||
|
||||
pub(crate) fn lower_all_functions_from_bodies(
|
||||
file_cfg: &FileCfg,
|
||||
lang: Lang,
|
||||
|
|
@ -2108,6 +2127,9 @@ fn lower_all_functions_from_bodies_inner(
|
|||
Err(_) => continue,
|
||||
};
|
||||
perf_lower_record(0, _t_lower.elapsed().as_micros());
|
||||
// Prune dead constant branches before the summary probe so a helper's
|
||||
// dead `else x = param` does not surface as a spurious param→return.
|
||||
prefold_dead_branches_for_summary(&mut func_ssa, &body.graph);
|
||||
|
||||
let param_count = if !formal_params.is_empty() {
|
||||
formal_params.len()
|
||||
|
|
|
|||
|
|
@ -55,6 +55,19 @@ _CAP_BIT_TABLE = [
|
|||
(1 << 20, "prototype_pollution"),
|
||||
]
|
||||
|
||||
# Static lens (see --static): SHELL_ESCAPE (1<<2) is the command-injection sink
|
||||
# cap for *every* language (`grep SHELL_ESCAPE src/labels/` — all Sink uses are
|
||||
# command-exec; CODE_EXEC=1<<10 is the eval/code-exec variant, also cmdi). In a
|
||||
# normal `nyx scan` (no dynamic confirmation) a Java cmdi finding carries only
|
||||
# SHELL_ESCAPE; the SHELL_ESCAPE→CODE_EXEC remap that buckets it as cmdi is gated
|
||||
# on VerifyStatus::Confirmed (src/commands/scan.rs), so with 0 confirmations the
|
||||
# default table leaves these in "other" and the cmdi cell reads 0/0/N. The
|
||||
# static lens appends SHELL_ESCAPE→cmdi at the LOWEST priority (after every other
|
||||
# bit) so a SHELL_ESCAPE-only finding buckets as cmdi while a finding that also
|
||||
# carries a higher-priority sink bit (e.g. FILE_IO) keeps its existing bucket.
|
||||
# Opt-in via --static so the default confirmed-recall bucketing is byte-identical.
|
||||
_CAP_BIT_TABLE_STATIC = _CAP_BIT_TABLE + [(1 << 2, "cmdi")] # SHELL_ESCAPE
|
||||
|
||||
# Substring → cap lookup for rule IDs. Order matters: most specific first.
|
||||
_CAP_RULE_TABLE = [
|
||||
("path_traversal", "path_traversal"),
|
||||
|
|
@ -83,12 +96,13 @@ def load_json(path: str) -> object:
|
|||
return json.load(f)
|
||||
|
||||
|
||||
def cap_of(finding: dict) -> str:
|
||||
def cap_of(finding: dict, static_lens: bool = False) -> str:
|
||||
# 1. Prefer evidence.sink_caps bitmask — the engine's own classification.
|
||||
ev = finding.get("evidence", {}) or {}
|
||||
sink_caps = ev.get("sink_caps")
|
||||
if isinstance(sink_caps, int) and sink_caps:
|
||||
for bit, name in _CAP_BIT_TABLE:
|
||||
table = _CAP_BIT_TABLE_STATIC if static_lens else _CAP_BIT_TABLE
|
||||
for bit, name in table:
|
||||
if sink_caps & bit:
|
||||
return name
|
||||
# 2. Fall back to rule id substring (e.g. py.cmdi.os_system, java.deser.readobject).
|
||||
|
|
@ -383,6 +397,20 @@ def main() -> int:
|
|||
default="",
|
||||
help="path to a previous results JSON; fail on monotonic-improvement regression",
|
||||
)
|
||||
p.add_argument(
|
||||
"--static",
|
||||
action="store_true",
|
||||
help=(
|
||||
"static lens: bucket SHELL_ESCAPE (1<<2) findings as cmdi even when "
|
||||
"they are unconfirmed. Java (and other) command-exec sinks carry "
|
||||
"SHELL_ESCAPE and only get remapped to CODE_EXEC on dynamic Confirm; "
|
||||
"without this flag, an env with 0 confirmations reads the cmdi cell "
|
||||
"as 0/0/N regardless of static quality. SHELL_ESCAPE is the "
|
||||
"command-injection sink cap for every language, so this is sound "
|
||||
"globally; it is opt-in only so the default confirmed-recall "
|
||||
"bucketing stays byte-identical."
|
||||
),
|
||||
)
|
||||
args = p.parse_args()
|
||||
lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()}
|
||||
|
||||
|
|
@ -418,7 +446,7 @@ def main() -> int:
|
|||
continue
|
||||
f_path = f.get("path", "")
|
||||
f_line = f.get("line", 0)
|
||||
f_cap = cap_of(f)
|
||||
f_cap = cap_of(f, static_lens=args.static)
|
||||
for idx, entry in enumerate(not_vuln):
|
||||
if idx in used:
|
||||
continue
|
||||
|
|
@ -455,7 +483,7 @@ def main() -> int:
|
|||
)
|
||||
|
||||
for f in findings:
|
||||
cap = cap_of(f)
|
||||
cap = cap_of(f, static_lens=args.static)
|
||||
lang = lang_of(f)
|
||||
key = (cap, lang)
|
||||
ev = f.get("evidence", {}) or {}
|
||||
|
|
@ -501,7 +529,7 @@ def main() -> int:
|
|||
for f in findings:
|
||||
f_path = f.get("path", "")
|
||||
f_line = f.get("line", 0)
|
||||
f_cap = cap_of(f)
|
||||
f_cap = cap_of(f, static_lens=args.static)
|
||||
cap = f_cap
|
||||
lang = lang_of(f)
|
||||
cell_key = (cap, lang)
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@ def write_json(path: Path, data: object) -> None:
|
|||
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
|
||||
SINK_BIT_SQL = 1 << 7 # SQL_QUERY
|
||||
SINK_BIT_CMDI = 1 << 10 # CODE_EXEC
|
||||
SINK_BIT_SHELL = 1 << 2 # SHELL_ESCAPE (Java/other command-exec sink)
|
||||
SINK_BIT_FILE = 1 << 5 # FILE_IO (path_traversal)
|
||||
|
||||
|
||||
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
|
||||
|
|
@ -353,6 +355,91 @@ def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
|
|||
assert all(lang != "javascript" for _cap, lang in cells), cells
|
||||
|
||||
|
||||
def test_static_lens_buckets_shell_escape_as_cmdi(tmp: Path) -> None:
|
||||
# Caveat-1 fix: in an env with 0 dynamic confirmations a Java command-exec
|
||||
# finding carries only SHELL_ESCAPE (1<<2), which the default bit table
|
||||
# leaves in "other" — so the cmdi cell reads 0 TP / N FN regardless of
|
||||
# static quality. --static appends SHELL_ESCAPE→cmdi so static recall is
|
||||
# measurable without dynamic confirmation.
|
||||
gt = tmp / "gt.json"
|
||||
write_json(
|
||||
gt,
|
||||
[{"path": "testcode/Cmd.java", "line": 0, "cap": "cmdi", "vuln": True}],
|
||||
)
|
||||
# Real Java taint findings carry id "taint-unsanitised-flow" (no cap
|
||||
# substring), so the rule-id fallback yields "other" — not the sqli/cmdi
|
||||
# the hand-crafted python_finding id would imply.
|
||||
java_cmdi = {
|
||||
"path": "/x/testcode/Cmd.java",
|
||||
"line": 10,
|
||||
"col": 0,
|
||||
"id": "taint-unsanitised-flow",
|
||||
"evidence": {"sink_caps": SINK_BIT_SHELL, "dynamic_verdict": {"status": "NotConfirmed"}},
|
||||
}
|
||||
scan = tmp / "scan.json"
|
||||
write_json(scan, {"findings": [java_cmdi]})
|
||||
|
||||
# Default lens: the finding buckets as "other", so cmdi shows the GT
|
||||
# positive as a pure FN (recall 0) — the measurement gap.
|
||||
default = tmp / "default.json"
|
||||
write_json(default, [])
|
||||
proc = run_tabulate(
|
||||
"--label", "owasp",
|
||||
"--scan", str(scan),
|
||||
"--ground-truth", str(gt),
|
||||
"--append", str(default),
|
||||
)
|
||||
assert proc.returncode == 0, proc.stdout + proc.stderr
|
||||
cells = {(c["cap"], c["lang"]): c for c in json.loads(default.read_text())[-1]["cells"]}
|
||||
assert ("cmdi", "java") in cells and cells[("cmdi", "java")]["tp"] == 0, cells
|
||||
assert cells[("cmdi", "java")]["fn"] == 1, cells[("cmdi", "java")]
|
||||
assert ("other", "java") in cells, f"SHELL_ESCAPE must bucket as other by default: {list(cells)}"
|
||||
|
||||
# Static lens: the finding buckets as cmdi → recall measurable (TP=1, FN=0).
|
||||
static = tmp / "static.json"
|
||||
write_json(static, [])
|
||||
proc = run_tabulate(
|
||||
"--label", "owasp",
|
||||
"--scan", str(scan),
|
||||
"--ground-truth", str(gt),
|
||||
"--static",
|
||||
"--append", str(static),
|
||||
)
|
||||
assert proc.returncode == 0, proc.stdout + proc.stderr
|
||||
cells = {(c["cap"], c["lang"]): c for c in json.loads(static.read_text())[-1]["cells"]}
|
||||
cmdi = cells[("cmdi", "java")]
|
||||
assert cmdi["tp"] == 1 and cmdi["fn"] == 0, cmdi
|
||||
assert ("other", "java") not in cells, f"static lens must reclaim the other-bucketed finding: {list(cells)}"
|
||||
|
||||
|
||||
def test_static_lens_preserves_higher_priority_bits(tmp: Path) -> None:
|
||||
# A finding carrying BOTH FILE_IO and SHELL_ESCAPE must keep bucketing as
|
||||
# path_traversal under the static lens (SHELL_ESCAPE is appended at lowest
|
||||
# priority), so the static lens never steals a finding from a non-cmdi cell.
|
||||
scan = tmp / "scan.json"
|
||||
write_json(
|
||||
scan,
|
||||
{
|
||||
"findings": [
|
||||
python_finding(SINK_BIT_FILE | SINK_BIT_SHELL, "B.java", 10, "NotConfirmed"),
|
||||
]
|
||||
},
|
||||
)
|
||||
for flag in ([], ["--static"]):
|
||||
append = tmp / f"out{len(flag)}.json"
|
||||
write_json(append, [])
|
||||
proc = run_tabulate(
|
||||
"--label", "x",
|
||||
"--scan", str(scan),
|
||||
"--inhouse",
|
||||
"--append", str(append),
|
||||
*flag,
|
||||
)
|
||||
assert proc.returncode == 0, proc.stdout + proc.stderr
|
||||
caps = {c["cap"] for c in json.loads(append.read_text())[-1]["cells"]}
|
||||
assert caps == {"path_traversal"}, f"flag={flag}: {caps}"
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
|
||||
bad = tmp / "bad.toml"
|
||||
bad.write_text("[default]\nunsupported_rate = not_a_number\n")
|
||||
|
|
@ -661,6 +748,8 @@ def main() -> int:
|
|||
test_manual_triage_stamps_wrong_confirmed,
|
||||
test_manual_triage_ignores_vuln_true_entries,
|
||||
test_lang_filter_scopes_findings_and_gt,
|
||||
test_static_lens_buckets_shell_escape_as_cmdi,
|
||||
test_static_lens_preserves_higher_priority_bits,
|
||||
test_budget_malformed_exits_3,
|
||||
test_relative_gt_path_suffix_matches_absolute_finding,
|
||||
test_unmatched_gt_positive_lands_in_lang_cell,
|
||||
|
|
|
|||
19
tests/fixtures/real_world/java/taint/cmdi_deadbranch_const_safe.expect.json
vendored
Normal file
19
tests/fixtures/real_world/java/taint/cmdi_deadbranch_const_safe.expect.json
vendored
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"description": "Dead-branch constant condition (OWASP Benchmark cmdi non-vulnerable shape). `(7*42) - num > 200` with num=86 is 208 > 200 — always true — so `bar` is the constant string and the `else bar = param` arm is statically dead. The constant-branch fold (src/ssa/const_prop.rs::fold_constant_branches) evaluates the captured CondArith tree, prunes the dead edge, and drops the tainted phi operand AND neutralises the dead block so copy-prop cannot alias `bar`<->`param`. Result: `r.exec(cmd + bar)` carries no taint. Asserts NO taint finding fires (strict_unexpected promotes any taint-unsanitised-flow to a hard failure).",
|
||||
"tags": [
|
||||
"taint",
|
||||
"cmdi",
|
||||
"servlet",
|
||||
"runtime",
|
||||
"dead-branch",
|
||||
"const-fold",
|
||||
"precision"
|
||||
],
|
||||
"modes": [
|
||||
"full"
|
||||
],
|
||||
"strict_unexpected": [
|
||||
"taint-unsanitised-flow"
|
||||
],
|
||||
"expected": []
|
||||
}
|
||||
27
tests/fixtures/real_world/java/taint/cmdi_deadbranch_const_safe.java
vendored
Normal file
27
tests/fixtures/real_world/java/taint/cmdi_deadbranch_const_safe.java
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
import java.io.*;
|
||||
import javax.servlet.http.*;
|
||||
|
||||
// Dead-branch constant condition (OWASP Benchmark cmdi non-vulnerable shape).
|
||||
// The guard `(7*42) - num > 200` is `294 - 86 = 208 > 200`, i.e. ALWAYS true,
|
||||
// so `bar` is provably the constant string and the tainted `else` arm
|
||||
// (`bar = param`) is unreachable. The constant-branch fold
|
||||
// (`fold_constant_branches`) must prune the dead edge and drop the tainted
|
||||
// phi operand so `r.exec(cmd + bar)` carries no attacker data — NO finding.
|
||||
public class DeadBranchConstSafe extends HttpServlet {
|
||||
protected void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
String param = request.getHeader("vector");
|
||||
|
||||
String bar;
|
||||
int num = 86;
|
||||
if ((7 * 42) - num > 200) {
|
||||
bar = "This_should_always_happen";
|
||||
} else {
|
||||
bar = param;
|
||||
}
|
||||
|
||||
String cmd = "echo ";
|
||||
Runtime r = Runtime.getRuntime();
|
||||
Process p = r.exec(cmd + bar);
|
||||
}
|
||||
}
|
||||
32
tests/fixtures/real_world/java/taint/cmdi_deadbranch_param_vuln.expect.json
vendored
Normal file
32
tests/fixtures/real_world/java/taint/cmdi_deadbranch_param_vuln.expect.json
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"description": "Dead-branch constant condition with VULNERABLE polarity. `(500/42) + num > 200` is `11 + 196 = 207 > 200` (integer division) — always true — and the TRUE arm assigns the tainted `param`, so the reachable branch carries taint and only the `else bar = \"...\"` arm is dead. The constant-branch fold must prune the DEAD else edge while keeping the live `bar = param`, so the command-injection finding at `r.exec(cmd + bar)` MUST still fire. Zero-false-negative guard: it proves the fold never prunes the reachable (tainted) arm.",
|
||||
"tags": [
|
||||
"taint",
|
||||
"cmdi",
|
||||
"servlet",
|
||||
"runtime",
|
||||
"dead-branch",
|
||||
"const-fold",
|
||||
"no-false-negative"
|
||||
],
|
||||
"modes": [
|
||||
"full"
|
||||
],
|
||||
"strict_unexpected": [
|
||||
"taint-unsanitised-flow"
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"rule_id": "taint-unsanitised-flow",
|
||||
"severity": "HIGH",
|
||||
"must_match": true,
|
||||
"line_range": [
|
||||
26,
|
||||
26
|
||||
],
|
||||
"evidence_contains": [],
|
||||
"notes": "request.getHeader (line 15) flows into bar on the always-taken true arm (line 21), then into r.exec at line 26. Exactly one finding survives.",
|
||||
"max_count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
28
tests/fixtures/real_world/java/taint/cmdi_deadbranch_param_vuln.java
vendored
Normal file
28
tests/fixtures/real_world/java/taint/cmdi_deadbranch_param_vuln.java
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import java.io.*;
|
||||
import javax.servlet.http.*;
|
||||
|
||||
// Dead-branch constant condition, VULNERABLE polarity (OWASP Benchmark cmdi
|
||||
// vulnerable shape). The guard `(500/42) + num > 200` is `11 + 196 = 207 > 200`
|
||||
// using integer division — ALWAYS true — and the TRUE arm assigns the tainted
|
||||
// `param`. So the live branch carries taint and the `else bar = "never"` arm is
|
||||
// dead. The constant-branch fold must prune the DEAD (else) edge and keep the
|
||||
// reachable tainted `bar = param`, so `r.exec(cmd + bar)` MUST still fire. This
|
||||
// is the zero-false-negative guard: the fold must never prune the live arm.
|
||||
public class DeadBranchParamVuln extends HttpServlet {
|
||||
protected void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
String param = request.getHeader("vector");
|
||||
|
||||
String bar;
|
||||
int num = 196;
|
||||
if ((500 / 42) + num > 200) {
|
||||
bar = param;
|
||||
} else {
|
||||
bar = "This_should_never_happen";
|
||||
}
|
||||
|
||||
String cmd = "echo ";
|
||||
Runtime r = Runtime.getRuntime();
|
||||
Process p = r.exec(cmd + bar);
|
||||
}
|
||||
}
|
||||
29
tests/fixtures/real_world/java/taint/cmdi_processbuilder_command.expect.json
vendored
Normal file
29
tests/fixtures/real_world/java/taint/cmdi_processbuilder_command.expect.json
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
{
|
||||
"description": "HttpServletRequest parameter flows through a List into ProcessBuilder.command(argList) — command injection via the setter form (list attached separately from the constructor, then pb.start()). This is the dominant OWASP Benchmark cmdi shape; resolved via type-qualified ProcessBuilder.command sink on the typed receiver plus container-element taint on the argument list.",
|
||||
"tags": [
|
||||
"taint",
|
||||
"cmdi",
|
||||
"servlet",
|
||||
"container"
|
||||
],
|
||||
"modes": [
|
||||
"full"
|
||||
],
|
||||
"strict_unexpected": [
|
||||
"taint-unsanitised-flow"
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"rule_id": "taint-unsanitised-flow",
|
||||
"severity": "HIGH",
|
||||
"must_match": true,
|
||||
"line_range": [
|
||||
16,
|
||||
16
|
||||
],
|
||||
"evidence_contains": [],
|
||||
"notes": "request.getParameter (line 8) is concatenated into a list element (argList.add at line 13), the list is attached to ProcessBuilder via pb.command(argList) at line 16, and executed by pb.start() at line 17. The type-qualified ProcessBuilder.command sink fires at line 16 on the tainted container argument. Exactly one finding survives.",
|
||||
"max_count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
19
tests/fixtures/real_world/java/taint/cmdi_processbuilder_command.java
vendored
Normal file
19
tests/fixtures/real_world/java/taint/cmdi_processbuilder_command.java
vendored
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
import java.io.*;
|
||||
import java.util.*;
|
||||
import javax.servlet.http.*;
|
||||
|
||||
public class ProcessCommandHandler extends HttpServlet {
|
||||
protected void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
String param = request.getParameter("vector");
|
||||
|
||||
List<String> argList = new ArrayList<String>();
|
||||
argList.add("sh");
|
||||
argList.add("-c");
|
||||
argList.add("echo " + param);
|
||||
|
||||
ProcessBuilder pb = new ProcessBuilder();
|
||||
pb.command(argList);
|
||||
pb.start();
|
||||
}
|
||||
}
|
||||
30
tests/fixtures/real_world/java/taint/cmdi_runtime_split_receiver.expect.json
vendored
Normal file
30
tests/fixtures/real_world/java/taint/cmdi_runtime_split_receiver.expect.json
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"description": "HttpServletRequest header flows into a String[] env array passed to a split-receiver Runtime.exec — command injection via the `Runtime r = Runtime.getRuntime(); ... r.exec(cmd, argsEnv)` shape (the dominant remaining OWASP Benchmark cmdi form). The callee text at the sink is `r.exec`, which does not suffix-match the flat `Runtime.exec` rule; resolution depends on the receiver `r` carrying TypeKind::Runtime (from the `Runtime.getRuntime()` factory / the `Runtime` declared type) so the type-qualified resolver rewrites `r.exec` → `Runtime.exec`. Taint is in the env array (arg 1), so no payload-arg restriction may be applied.",
|
||||
"tags": [
|
||||
"taint",
|
||||
"cmdi",
|
||||
"servlet",
|
||||
"runtime",
|
||||
"split-receiver"
|
||||
],
|
||||
"modes": [
|
||||
"full"
|
||||
],
|
||||
"strict_unexpected": [
|
||||
"taint-unsanitised-flow"
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"rule_id": "taint-unsanitised-flow",
|
||||
"severity": "HIGH",
|
||||
"must_match": true,
|
||||
"line_range": [
|
||||
16,
|
||||
16
|
||||
],
|
||||
"evidence_contains": [],
|
||||
"notes": "request.getHeader (line 7) flows into the env array element argsEnv (line 15), which is passed as arg 1 of r.exec at line 16. The receiver r is typed Runtime via Runtime.getRuntime() (line 13), so the type-qualified Runtime.exec sink fires at the split-receiver call. Exactly one finding survives.",
|
||||
"max_count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
18
tests/fixtures/real_world/java/taint/cmdi_runtime_split_receiver.java
vendored
Normal file
18
tests/fixtures/real_world/java/taint/cmdi_runtime_split_receiver.java
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import java.io.*;
|
||||
import javax.servlet.http.*;
|
||||
|
||||
public class RuntimeSplitReceiverHandler extends HttpServlet {
|
||||
protected void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
String param = request.getHeader("vector");
|
||||
|
||||
// Split-receiver Runtime.exec: the receiver is bound to a local in
|
||||
// one statement, then exec is called on it in another. The OWASP
|
||||
// Benchmark cmdi shape places the tainted data in the environment
|
||||
// array (arg 1), not the command (arg 0).
|
||||
Runtime r = Runtime.getRuntime();
|
||||
String[] args = { "/bin/sh", "-c", "echo nyx" };
|
||||
String[] argsEnv = { "TAINT=" + param };
|
||||
r.exec(args, argsEnv);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue