mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-18 20:15:14 +02:00
feat(ssa): optimize branch condition handling via constant folding, enhance precision for taint analysis, and expand OWASP Benchmark support
This commit is contained in:
parent
ec76c9e08f
commit
9c99f6c6a9
22 changed files with 1020 additions and 17 deletions
|
|
@ -3997,3 +3997,94 @@ function outer(obj, x, y) {
|
|||
let (mline, _) = method_site.span.expect("method span populated");
|
||||
assert_eq!(mline, 4, "obj.method(x) on line 4");
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
// Constant-branch fold: CondArith capture + evaluation
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Soundness contract for `CondArith::eval` / `eval_bool`: the two
/// OWASP-Benchmark arithmetic guard shapes fold to a definite boolean using
/// integer (truncating) division, and any undefined operation or unresolved
/// variable yields `None` — never a wrong fold.
#[test]
fn cond_arith_eval_is_sound() {
    use crate::cfg::{BinOp, CondArith, CondVal};

    // Boxed-node constructors so the expression trees below stay readable.
    fn lit(n: i64) -> Box<CondArith> {
        Box::new(CondArith::Lit(n))
    }
    fn var(name: &str) -> Box<CondArith> {
        Box::new(CondArith::Var(name.to_string()))
    }
    fn bin(op: BinOp, lhs: Box<CondArith>, rhs: Box<CondArith>) -> Box<CondArith> {
        Box::new(CondArith::Bin(op, lhs, rhs))
    }

    // Resolver factory: binds only `num`; every other name stays unknown.
    let num_is = |value: i64| move |name: &str| (name == "num").then_some(value);
    let r86 = num_is(86);
    let r196 = num_is(196);
    let unbound = |_: &str| -> Option<i64> { None };

    // Shape 1: (7*42) - num > 200, num = 86 → 208 > 200 → true.
    let minuend = bin(BinOp::Mul, lit(7), lit(42));
    let shape1 = CondArith::Bin(BinOp::Gt, bin(BinOp::Sub, minuend, var("num")), lit(200));
    assert_eq!(shape1.eval_bool(&r86), Some(true));

    // Shape 2: (500/42) + num > 200, num = 196 → 11 + 196 = 207 > 200 → true.
    let quotient = bin(BinOp::Div, lit(500), lit(42));
    let shape2 = CondArith::Bin(BinOp::Gt, bin(BinOp::Add, quotient, var("num")), lit(200));
    assert_eq!(shape2.eval_bool(&r196), Some(true));

    // The division itself truncates toward zero: 500/42 is 11, not ~11.9.
    let plain_div = CondArith::Bin(BinOp::Div, lit(500), lit(42));
    assert_eq!(plain_div.eval(&r86), Some(CondVal::Int(11)));

    // A variable nobody can resolve blocks the fold (no prune).
    assert_eq!(shape1.eval_bool(&unbound), None);

    // Division / modulo by zero is undefined → None.
    let div_by_zero = CondArith::Bin(BinOp::Div, lit(1), lit(0));
    let mod_by_zero = CondArith::Bin(BinOp::Mod, lit(1), lit(0));
    assert_eq!(div_by_zero.eval(&r86), None);
    assert_eq!(mod_by_zero.eval(&r86), None);

    // Arithmetic overflow is rejected rather than wrapped.
    let overflowing = CondArith::Bin(BinOp::Mul, lit(i64::MAX), lit(2));
    assert_eq!(overflowing.eval(&r86), None);

    // A bare integer at the top level is not a branch condition.
    assert_eq!(CondArith::Lit(1).eval_bool(&r86), None);

    // A boolean sub-result used as an integer operand is a type mismatch.
    let cmp = bin(BinOp::Gt, lit(2), lit(1)); // evaluates to a Bool
    let bool_plus_int = CondArith::Bin(BinOp::Add, cmp, lit(1));
    assert_eq!(bool_plus_int.eval(&r86), None);
}
|
||||
|
||||
/// Capture contract of the CFG builder: a pure integer-arithmetic comparison
/// is recorded as a `CondArith` on its `If` node, while a condition touching
/// a call / field access / string is refused (left as `None`).
#[test]
fn build_cond_arith_captures_pure_int_comparison() {
    let ts_lang = Language::from(tree_sitter_java::LANGUAGE);
    let src = br#"
class C {
void m(int num, String s) {
if ((7 * 42) - num > 200) { foo(); }
if (s.length() > 200) { bar(); }
}
}
"#;
    let (cfg, _entry) = parse_and_build(src, "java", ts_lang);

    // Gather the captured arithmetic tree of every If node that has one.
    let ifs = if_nodes(&cfg);
    let mut arith = Vec::new();
    for &node in ifs.iter() {
        if let Some(tree) = cfg[node].cond_arith.clone() {
            arith.push(tree);
        }
    }

    // Only `(7 * 42) - num > 200` qualifies; `s.length() > 200` contains a
    // call and therefore must NOT be captured.
    assert_eq!(
        arith.len(),
        1,
        "only the pure int comparison should yield a CondArith, got {arith:?}"
    );

    // With `num` bound to a constant the captured tree folds to a definite bool.
    let r = |name: &str| (name == "num").then_some(86_i64);
    assert_eq!(arith[0].eval_bool(&r), Some(true));
}
|
||||
|
|
|
|||
|
|
@ -1198,10 +1198,14 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
|||
| "string_content"
|
||||
| "string_fragment" => !has_string_interpolation(node),
|
||||
|
||||
// Numbers
|
||||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
|
||||
true
|
||||
}
|
||||
// Numbers. Java's grammar uses radix-tagged kinds
|
||||
// (`decimal_integer_literal`, `hex_integer_literal`, …) rather than a
|
||||
// bare `integer`, so `int num = 86;` would otherwise miss this arm and
|
||||
// lower to `Const(None)` (Varying) instead of `Const("86")`.
|
||||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number"
|
||||
| "decimal_integer_literal" | "hex_integer_literal" | "octal_integer_literal"
|
||||
| "binary_integer_literal" | "decimal_floating_point_literal"
|
||||
| "hex_floating_point_literal" => true,
|
||||
|
||||
// Booleans / null / nil / none
|
||||
"true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean"
|
||||
|
|
|
|||
307
src/cfg/mod.rs
307
src/cfg/mod.rs
|
|
@ -431,6 +431,129 @@ pub enum BinOp {
|
|||
GtEq,
|
||||
}
|
||||
|
||||
impl BinOp {
|
||||
/// True for the six comparison operators (result is a boolean 0/1).
|
||||
pub fn is_comparison(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
BinOp::Eq | BinOp::NotEq | BinOp::Lt | BinOp::LtEq | BinOp::Gt | BinOp::GtEq
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A branch condition captured as a pure integer-arithmetic + comparison
|
||||
/// expression tree at CFG-build time (where the real tree-sitter AST is
|
||||
/// available, so operator precedence and parentheses are correct by
|
||||
/// construction — no text re-parsing downstream).
|
||||
///
|
||||
/// Built only when *every* leaf is an integer literal or a plain identifier
|
||||
/// and *every* interior node is an arithmetic / comparison / bitwise operator,
|
||||
/// a unary `-`, or a parenthesis. Any call, field access, string, container,
|
||||
/// or compound-boolean (`&&` / `||`) subtree makes the builder return `None`
|
||||
/// for the whole condition. Identifiers are stored by name and resolved to
|
||||
/// their constant SSA value at fold time
|
||||
/// ([`crate::ssa::const_prop::fold_constant_branches`]); the actual numeric
|
||||
/// evaluation is shared in [`CondArith::eval`].
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum CondArith {
|
||||
/// Integer literal.
|
||||
Lit(i64),
|
||||
/// Identifier — resolved to a constant integer at fold time, else unknown.
|
||||
Var(String),
|
||||
/// Unary integer negation: `-x`.
|
||||
Neg(Box<CondArith>),
|
||||
/// Binary arithmetic / bitwise / comparison.
|
||||
Bin(BinOp, Box<CondArith>, Box<CondArith>),
|
||||
}
|
||||
|
||||
/// Value obtained by folding a [`CondArith`] against a constant environment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CondVal {
    /// Integer result (literal, variable, negation, arithmetic or bitwise operator).
    Int(i64),
    /// Boolean result of a comparison operator.
    Bool(bool),
}
|
||||
|
||||
impl CondArith {
|
||||
/// Evaluate against a variable→constant-integer resolver. Returns `None`
|
||||
/// the moment anything is non-constant or an operation is undefined
|
||||
/// (division/modulo by zero, arithmetic overflow, type mismatch), so a
|
||||
/// caller can only ever prune on a *definite* result. All integer
|
||||
/// arithmetic is checked; overflow yields `None` rather than a wrapped
|
||||
/// value, which keeps the fold sound across the i32/i64 gap.
|
||||
pub fn eval(&self, resolve: &impl Fn(&str) -> Option<i64>) -> Option<CondVal> {
|
||||
match self {
|
||||
CondArith::Lit(n) => Some(CondVal::Int(*n)),
|
||||
CondArith::Var(name) => resolve(name).map(CondVal::Int),
|
||||
CondArith::Neg(inner) => match inner.eval(resolve)? {
|
||||
CondVal::Int(n) => n.checked_neg().map(CondVal::Int),
|
||||
CondVal::Bool(_) => None,
|
||||
},
|
||||
CondArith::Bin(op, l, r) => {
|
||||
let lhs = match l.eval(resolve)? {
|
||||
CondVal::Int(n) => n,
|
||||
CondVal::Bool(_) => return None,
|
||||
};
|
||||
let rhs = match r.eval(resolve)? {
|
||||
CondVal::Int(n) => n,
|
||||
CondVal::Bool(_) => return None,
|
||||
};
|
||||
let arith = |v: Option<i64>| v.map(CondVal::Int);
|
||||
match op {
|
||||
BinOp::Add => arith(lhs.checked_add(rhs)),
|
||||
BinOp::Sub => arith(lhs.checked_sub(rhs)),
|
||||
BinOp::Mul => arith(lhs.checked_mul(rhs)),
|
||||
// Java/Rust integer division and modulo both truncate
|
||||
// toward zero; `checked_*` rejects div-by-zero and
|
||||
// i64::MIN / -1 overflow.
|
||||
BinOp::Div => arith(lhs.checked_div(rhs)),
|
||||
BinOp::Mod => arith(lhs.checked_rem(rhs)),
|
||||
BinOp::BitAnd => arith(Some(lhs & rhs)),
|
||||
BinOp::BitOr => arith(Some(lhs | rhs)),
|
||||
BinOp::BitXor => arith(Some(lhs ^ rhs)),
|
||||
BinOp::LeftShift => {
|
||||
u32::try_from(rhs).ok().and_then(|s| lhs.checked_shl(s)).map(CondVal::Int)
|
||||
}
|
||||
BinOp::RightShift => {
|
||||
u32::try_from(rhs).ok().and_then(|s| lhs.checked_shr(s)).map(CondVal::Int)
|
||||
}
|
||||
BinOp::Eq => Some(CondVal::Bool(lhs == rhs)),
|
||||
BinOp::NotEq => Some(CondVal::Bool(lhs != rhs)),
|
||||
BinOp::Lt => Some(CondVal::Bool(lhs < rhs)),
|
||||
BinOp::LtEq => Some(CondVal::Bool(lhs <= rhs)),
|
||||
BinOp::Gt => Some(CondVal::Bool(lhs > rhs)),
|
||||
BinOp::GtEq => Some(CondVal::Bool(lhs >= rhs)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate to a definite boolean, or `None`. The top-level node must be a
|
||||
/// comparison (a bare integer is not a branch condition we fold).
|
||||
pub fn eval_bool(&self, resolve: &impl Fn(&str) -> Option<i64>) -> Option<bool> {
|
||||
match self.eval(resolve)? {
|
||||
CondVal::Bool(b) => Some(b),
|
||||
CondVal::Int(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect every identifier name referenced by the tree.
|
||||
pub fn collect_vars(&self, out: &mut Vec<String>) {
|
||||
match self {
|
||||
CondArith::Lit(_) => {}
|
||||
CondArith::Var(name) => {
|
||||
if !out.iter().any(|v| v == name) {
|
||||
out.push(name.clone());
|
||||
}
|
||||
}
|
||||
CondArith::Neg(inner) => inner.collect_vars(out),
|
||||
CondArith::Bin(_, l, r) => {
|
||||
l.collect_vars(out);
|
||||
r.collect_vars(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Call-related metadata for CFG nodes.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CallMeta {
|
||||
|
|
@ -662,6 +785,17 @@ pub struct NodeInfo {
|
|||
pub condition_vars: Vec<String>,
|
||||
/// For If nodes: whether the condition has a leading negation (`!` / `not`).
|
||||
pub condition_negated: bool,
|
||||
/// For If / conditional (ternary) nodes: the condition as a pure
|
||||
/// integer-arithmetic + comparison expression tree, when the whole
|
||||
/// condition is built only from integer literals, identifiers, arithmetic
|
||||
/// / comparison operators, and parentheses. `None` for any condition that
|
||||
/// touches a call, field access, string, compound boolean (`&&`/`||`), or
|
||||
/// any shape this evaluator cannot prove constant. Consumed by
|
||||
/// [`crate::ssa::const_prop::fold_constant_branches`] to prune branches
|
||||
/// whose condition folds to a definite boolean once its variables are
|
||||
/// resolved to constants — closing the synthetic "dead branch keeps the
|
||||
/// tainted phi operand alive" false positive without any text re-parsing.
|
||||
pub cond_arith: Option<CondArith>,
|
||||
/// True when this is a Call node whose argument list contains only
|
||||
/// syntactic literal values (strings, numbers, booleans, null/nil,
|
||||
/// arrays/lists/tuples of literals). Also true for zero-argument calls
|
||||
|
|
@ -1065,7 +1199,7 @@ fn extract_condition_raw<'a>(
|
|||
ast: Node<'a>,
|
||||
lang: &str,
|
||||
code: &'a [u8],
|
||||
) -> (Option<String>, Vec<String>, bool) {
|
||||
) -> (Option<String>, Vec<String>, bool, Option<CondArith>) {
|
||||
// 1. Find the condition subtree.
|
||||
let cond_node = ast.child_by_field_name("condition").or_else(|| {
|
||||
// Rust `if_expression` uses positional children: the condition is
|
||||
|
|
@ -1085,7 +1219,7 @@ fn extract_condition_raw<'a>(
|
|||
});
|
||||
|
||||
let Some(cond) = cond_node else {
|
||||
return (None, Vec::new(), false);
|
||||
return (None, Vec::new(), false, None);
|
||||
};
|
||||
|
||||
// 2. Detect leading negation (`!expr`, `not expr`, Ruby `unless`).
|
||||
|
|
@ -1103,7 +1237,20 @@ fn extract_condition_raw<'a>(
|
|||
let text = text_of(cond, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
|
||||
(text, vars, negated)
|
||||
// 5. Capture the pure integer-arithmetic + comparison tree (for constant
|
||||
// branch folding). Built from the FULL condition node `cond` (not the
|
||||
// negation-stripped `inner`) so the folded boolean matches the
|
||||
// Branch terminator's `true_blk = cond-true` semantics directly. Ruby
|
||||
// `unless` swaps the True/False edges in the CFG builder (lines
|
||||
// ~5029), so the branch polarity would be inverted — skip it to stay
|
||||
// sound (`unless` with a constant arithmetic guard is negligible).
|
||||
let cond_arith = if ast.kind() == "unless" {
|
||||
None
|
||||
} else {
|
||||
build_cond_arith(cond, lang, code, 0)
|
||||
};
|
||||
|
||||
(text, vars, negated, cond_arith)
|
||||
}
|
||||
|
||||
/// Detect leading negation and return the inner expression.
|
||||
|
|
@ -1241,6 +1388,155 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
|
|||
None
|
||||
}
|
||||
|
||||
/// Parse an integer literal node to its `i64` value, honouring hex / octal /
|
||||
/// binary radix prefixes and Java/Rust digit separators (`1_000`). Returns
|
||||
/// `None` for floats, non-literals, or values that overflow `i64`.
|
||||
fn parse_int_literal(node: Node, code: &[u8]) -> Option<i64> {
|
||||
let kind = node.kind();
|
||||
let is_int = matches!(
|
||||
kind,
|
||||
"integer"
|
||||
| "integer_literal"
|
||||
| "int_literal"
|
||||
| "number"
|
||||
| "number_literal"
|
||||
| "decimal_integer_literal"
|
||||
| "hex_integer_literal"
|
||||
| "octal_integer_literal"
|
||||
| "binary_integer_literal"
|
||||
);
|
||||
if !is_int {
|
||||
return None;
|
||||
}
|
||||
let raw = std::str::from_utf8(&code[node.byte_range()]).ok()?.trim();
|
||||
// Strip Java long suffix and digit separators.
|
||||
let cleaned: String = raw
|
||||
.trim_end_matches(['l', 'L'])
|
||||
.chars()
|
||||
.filter(|c| *c != '_')
|
||||
.collect();
|
||||
if let Ok(v) = cleaned.parse::<i64>() {
|
||||
return Some(v);
|
||||
}
|
||||
if let Some(h) = cleaned.strip_prefix("0x").or_else(|| cleaned.strip_prefix("0X")) {
|
||||
return i64::from_str_radix(h, 16).ok();
|
||||
}
|
||||
if let Some(o) = cleaned.strip_prefix("0o").or_else(|| cleaned.strip_prefix("0O")) {
|
||||
return i64::from_str_radix(o, 8).ok();
|
||||
}
|
||||
if let Some(b) = cleaned.strip_prefix("0b").or_else(|| cleaned.strip_prefix("0B")) {
|
||||
return i64::from_str_radix(b, 2).ok();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Map the operator token of a binary expression node to a [`BinOp`].
|
||||
/// Scans for the single anonymous operator child (operands are named).
|
||||
/// Returns `None` for boolean operators (`&&` / `||`), assignment, or any
|
||||
/// token not in the arithmetic / bitwise / comparison set — those make the
|
||||
/// enclosing [`CondArith`] build bail.
|
||||
fn binary_op_token(node: Node) -> Option<BinOp> {
|
||||
let mut cursor = node.walk();
|
||||
for child in node.children(&mut cursor) {
|
||||
if child.is_named() {
|
||||
continue;
|
||||
}
|
||||
return match child.kind() {
|
||||
"+" => Some(BinOp::Add),
|
||||
"-" => Some(BinOp::Sub),
|
||||
"*" => Some(BinOp::Mul),
|
||||
"/" => Some(BinOp::Div),
|
||||
"%" => Some(BinOp::Mod),
|
||||
"&" => Some(BinOp::BitAnd),
|
||||
"|" => Some(BinOp::BitOr),
|
||||
"^" => Some(BinOp::BitXor),
|
||||
"<<" => Some(BinOp::LeftShift),
|
||||
">>" => Some(BinOp::RightShift),
|
||||
"==" | "===" => Some(BinOp::Eq),
|
||||
"!=" | "!==" => Some(BinOp::NotEq),
|
||||
"<" => Some(BinOp::Lt),
|
||||
"<=" => Some(BinOp::LtEq),
|
||||
">" => Some(BinOp::Gt),
|
||||
">=" => Some(BinOp::GtEq),
|
||||
_ => None,
|
||||
};
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Build a [`CondArith`] tree from a condition AST subtree, or `None` if the
|
||||
/// condition is not a pure integer-arithmetic + comparison expression. Uses
|
||||
/// the real tree-sitter node so operator precedence and parentheses are
|
||||
/// already encoded in the tree shape — no text parsing. Conservative by
|
||||
/// construction: any unrecognised node kind (call, field access, string,
|
||||
/// boolean `&&`/`||`, unary `!`) returns `None`, which disables folding for
|
||||
/// that branch (never a wrong fold). Depth-bounded to guard against
|
||||
/// pathological nesting.
|
||||
fn build_cond_arith(node: Node, lang: &str, code: &[u8], depth: u32) -> Option<CondArith> {
|
||||
if depth > 64 {
|
||||
return None;
|
||||
}
|
||||
let kind = node.kind();
|
||||
|
||||
// Unwrap parentheses (transparent to value).
|
||||
if matches!(kind, "parenthesized_expression" | "parenthesized" | "parenthesized_statement") {
|
||||
let inner = node.named_child(0)?;
|
||||
return build_cond_arith(inner, lang, code, depth + 1);
|
||||
}
|
||||
|
||||
if let Some(n) = parse_int_literal(node, code) {
|
||||
return Some(CondArith::Lit(n));
|
||||
}
|
||||
|
||||
// Bare identifier (reject dotted paths / field access — those are not
|
||||
// captured here; only a plain local whose const value we can resolve).
|
||||
if matches!(kind, "identifier" | "simple_identifier") {
|
||||
let name = text_of(node, code)?;
|
||||
if !name.is_empty()
|
||||
&& name.chars().all(|c| c.is_alphanumeric() || c == '_' || c == '$')
|
||||
{
|
||||
return Some(CondArith::Var(name));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
// Unary `-` only (boolean `!` / `not` is intentionally unsupported: its
|
||||
// operand would be a boolean, which `CondArith::eval` rejects, so folding
|
||||
// a negated condition is left to the conservative `None` path).
|
||||
if matches!(
|
||||
kind,
|
||||
"unary_expression" | "unary_operator" | "prefix_unary_expression" | "unary"
|
||||
) {
|
||||
let operand = node.named_child(0)?;
|
||||
let mut cursor = node.walk();
|
||||
let is_neg = node
|
||||
.children(&mut cursor)
|
||||
.any(|c| !c.is_named() && c.kind() == "-");
|
||||
if is_neg {
|
||||
return Some(CondArith::Neg(Box::new(build_cond_arith(
|
||||
operand,
|
||||
lang,
|
||||
code,
|
||||
depth + 1,
|
||||
)?)));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
// Binary arithmetic / comparison: exactly two operands + one operator.
|
||||
if is_binary_expr_kind(kind, lang) {
|
||||
if node.named_child_count() != 2 {
|
||||
return None; // chained comparison (Python `a < b < c`) etc.
|
||||
}
|
||||
let op = binary_op_token(node)?;
|
||||
let lhs = build_cond_arith(node.named_child(0)?, lang, code, depth + 1)?;
|
||||
let rhs = build_cond_arith(node.named_child(1)?, lang, code, depth + 1)?;
|
||||
return Some(CondArith::Bin(op, Box::new(lhs), Box::new(rhs)));
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Find the RHS value node of an assignment-like AST node (variable declarator,
|
||||
/// lexical declaration, assignment expression). Used by helpers that need to
|
||||
/// inspect what an identifier is being initialized to.
|
||||
|
|
@ -3231,11 +3527,11 @@ pub(super) fn push_node<'a>(
|
|||
};
|
||||
|
||||
// Extract condition metadata for If nodes.
|
||||
let (condition_text, condition_vars, condition_negated) =
|
||||
let (condition_text, condition_vars, condition_negated, cond_arith) =
|
||||
if matches!(lookup(lang, ast.kind()), Kind::If) {
|
||||
extract_condition_raw(ast, lang, code)
|
||||
} else {
|
||||
(None, Vec::new(), false)
|
||||
(None, Vec::new(), false, None)
|
||||
};
|
||||
|
||||
// Extract per-argument identifiers for Call nodes.
|
||||
|
|
@ -3512,6 +3808,7 @@ pub(super) fn push_node<'a>(
|
|||
condition_text,
|
||||
condition_vars,
|
||||
condition_negated,
|
||||
cond_arith,
|
||||
all_args_literal,
|
||||
catch_param: false,
|
||||
arg_callees,
|
||||
|
|
|
|||
|
|
@ -231,6 +231,13 @@ fn type_kind_index(kind: &TypeKind) -> u32 {
|
|||
| TypeKind::GormDb
|
||||
| TypeKind::SqlxDb
|
||||
| TypeKind::HibernateSession => 3,
|
||||
// ProcessBuilder participates only in the type-qualified callee
|
||||
// resolver via `label_prefix()`; no dedicated bitset slot, share
|
||||
// the Object index like the other receiver-only TypeKinds.
|
||||
TypeKind::ProcessBuilder => 3,
|
||||
// Runtime is likewise a type-qualified-resolver-only receiver kind
|
||||
// (`Runtime.exec`); no dedicated bitset slot, share the Object index.
|
||||
TypeKind::Runtime => 3,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -275,6 +275,14 @@ pub fn class_name_to_type_kind(name: &str) -> Option<TypeKind> {
|
|||
// type-qualified resolution to `Template.process`, the SSTI
|
||||
// sink defined in `labels/java.rs`.
|
||||
"Template" => Some(TypeKind::Template),
|
||||
// `java.lang.Runtime` declared receiver type. Routes the
|
||||
// split-receiver shape `Runtime r = Runtime.getRuntime(); ...
|
||||
// r.exec(...)` through type-qualified resolution to
|
||||
// `Runtime.exec` (the only `Runtime.*` rule, always SHELL_ESCAPE),
|
||||
// complementing the `constructor_type` factory route for
|
||||
// `Runtime.getRuntime()`. No benign `Runtime.exec` exists, so
|
||||
// typing any `Runtime`-declared receiver carries no FP risk.
|
||||
"Runtime" => Some(TypeKind::Runtime),
|
||||
// Python qualified type names.
|
||||
// Only covers raw lowered names from isinstance(). The lowering in lower.rs
|
||||
// extracts the literal type text: isinstance(x, requests.Session) produces
|
||||
|
|
|
|||
|
|
@ -124,6 +124,23 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// `ProcessBuilder.command(argList)` — the dominant OWASP Benchmark
|
||||
// command-injection shape builds an argument `List<String>`, attaches it
|
||||
// via `pb.command(argList)`, then runs `pb.start()`. The argument list is
|
||||
// a separate channel from the constructor, so the flat `ProcessBuilder`
|
||||
// constructor sink above never sees the tainted args. This rule fires
|
||||
// only via type-qualified resolution: the receiver `pb` must carry a
|
||||
// `TypeKind::ProcessBuilder` fact (set by `constructor_type` for
|
||||
// `new ProcessBuilder(...)`), so the resolver rewrites `pb.command(...)` →
|
||||
// `ProcessBuilder.command`. Case-sensitive and receiver-typed to avoid
|
||||
// colliding with the many unrelated `.command(...)` methods (CLI builders,
|
||||
// JCommander, picocli, Swing actions). The payload is restricted to arg 0
|
||||
// (the command list) via `type_qualified_sink_payload_args`.
|
||||
LabelRule {
|
||||
matchers: &["ProcessBuilder.command"],
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: true,
|
||||
},
|
||||
LabelRule {
|
||||
matchers: &["executeQuery", "executeUpdate"],
|
||||
label: DataLabel::Sink(Cap::SQL_QUERY),
|
||||
|
|
|
|||
|
|
@ -1496,7 +1496,11 @@ pub fn type_qualified_sink_payload_args(qualified_callee: &str) -> Option<&'stat
|
|||
| "TypeOrmRepo.createQueryBuilder"
|
||||
| "TypeOrmManager.query"
|
||||
| "TypeOrmManager.createQueryBuilder"
|
||||
| "MikroOrmEm.execute" => Some(&[0]),
|
||||
| "MikroOrmEm.execute"
|
||||
// `ProcessBuilder.command(argList)` — arg 0 is the command list;
|
||||
// any later positional args are not part of the v1 shape. Restrict
|
||||
// sink-taint scanning to arg 0 so receiver / unrelated args don't fire.
|
||||
| "ProcessBuilder.command" => Some(&[0]),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1202,6 +1202,8 @@ fn type_kind_tag(k: &TypeKind) -> String {
|
|||
TypeKind::GormDb => "GormDb".into(),
|
||||
TypeKind::SqlxDb => "SqlxDb".into(),
|
||||
TypeKind::HibernateSession => "HibernateSession".into(),
|
||||
TypeKind::ProcessBuilder => "ProcessBuilder".into(),
|
||||
TypeKind::Runtime => "Runtime".into(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -624,6 +624,192 @@ pub fn apply_const_prop(body: &mut SsaBody, result: &ConstPropResult) -> usize {
|
|||
pruned
|
||||
}
|
||||
|
||||
/// Resolve a condition variable name to the SSA value reaching `block`.
|
||||
///
|
||||
/// Mirrors `constraint::lower::resolve_single_var` (the established resolver
|
||||
/// for branch-condition variables): prefer the highest-indexed definition in
|
||||
/// the branch block itself, else the highest-indexed definition elsewhere.
|
||||
/// Kept local to avoid a `ssa → constraint` dependency cycle (constraint
|
||||
/// already depends on ssa).
|
||||
fn resolve_const_var(body: &SsaBody, var_name: &str, block: BlockId) -> Option<SsaValue> {
|
||||
let mut best_in_block: Option<SsaValue> = None;
|
||||
let mut best_outside: Option<SsaValue> = None;
|
||||
for (idx, vd) in body.value_defs.iter().enumerate() {
|
||||
if vd.var_name.as_deref() != Some(var_name) {
|
||||
continue;
|
||||
}
|
||||
let v = SsaValue(idx as u32);
|
||||
if vd.block == block {
|
||||
best_in_block = Some(match best_in_block {
|
||||
Some(existing) if existing.0 > v.0 => existing,
|
||||
_ => v,
|
||||
});
|
||||
} else {
|
||||
best_outside = Some(match best_outside {
|
||||
Some(existing) if existing.0 > v.0 => existing,
|
||||
_ => v,
|
||||
});
|
||||
}
|
||||
}
|
||||
best_in_block.or(best_outside)
|
||||
}
|
||||
|
||||
/// Fold branch conditions that are pure integer-arithmetic comparisons over
|
||||
/// constant operands, pruning the statically-dead edge.
|
||||
///
|
||||
/// Complements [`apply_const_prop`], which only folds a condition that lowers
|
||||
/// to a single SSA boolean value. An arithmetic comparison condition such as
|
||||
/// `(7*42) - num > 200` is **never** an SSA value — condition nodes lower to
|
||||
/// `Nop` and the comparison is held structurally on the branch terminator — so
|
||||
/// SCCP cannot reach it. This pass instead evaluates the
|
||||
/// [`crate::cfg::CondArith`] tree captured at CFG-build time, resolving each
|
||||
/// variable to its const-propagated integer.
|
||||
///
|
||||
/// Sound by construction:
|
||||
/// * A branch is pruned only when its `CondArith` evaluates to a **definite**
|
||||
/// boolean — every variable bound to a known integer constant and every
|
||||
/// operation defined (no div-by-zero / overflow). `None`/`Varying` leaves
|
||||
/// both edges intact.
|
||||
/// * After the terminator is rewritten to `Goto(taken)` and the dead edge is
|
||||
/// dropped (symmetrically, preserving pred/succ consistency), every phi
|
||||
/// operand whose predecessor is no longer reachable from entry is removed.
|
||||
/// That last step is what actually drops the dead-branch operand from a
|
||||
/// merge phi like `bar = phi(then: "const", else: param)` — without it the
|
||||
/// taint engine's phi fallback would still read the tainted `param` from
|
||||
/// the joined entry state.
|
||||
///
|
||||
/// Returns the number of branches pruned.
|
||||
pub fn fold_constant_branches(
|
||||
body: &mut SsaBody,
|
||||
cfg: &crate::cfg::Cfg,
|
||||
const_values: &HashMap<SsaValue, ConstLattice>,
|
||||
) -> usize {
|
||||
use crate::ssa::ir::Terminator;
|
||||
|
||||
// 1. Collect definite fold decisions: (branch_block_idx, taken, untaken).
|
||||
let mut prune_ops: Vec<(usize, BlockId, BlockId)> = Vec::new();
|
||||
for (block_idx, block) in body.blocks.iter().enumerate() {
|
||||
let Terminator::Branch {
|
||||
cond,
|
||||
true_blk,
|
||||
false_blk,
|
||||
..
|
||||
} = &block.terminator
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
// Degenerate `cond ? X : X` (both edges to one block): nothing to prune.
|
||||
if true_blk == false_blk {
|
||||
continue;
|
||||
}
|
||||
let Some(cond_info) = cfg.node_weight(*cond) else {
|
||||
continue;
|
||||
};
|
||||
let Some(arith) = cond_info.cond_arith.as_ref() else {
|
||||
continue;
|
||||
};
|
||||
let branch_block = block.id;
|
||||
let resolve = |name: &str| -> Option<i64> {
|
||||
let v = resolve_const_var(body, name, branch_block)?;
|
||||
match const_values.get(&v) {
|
||||
Some(ConstLattice::Int(n)) => Some(*n),
|
||||
_ => None,
|
||||
}
|
||||
};
|
||||
match arith.eval_bool(&resolve) {
|
||||
Some(true) => prune_ops.push((block_idx, *true_blk, *false_blk)),
|
||||
Some(false) => prune_ops.push((block_idx, *false_blk, *true_blk)),
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
let pruned = prune_ops.len();
|
||||
if pruned == 0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 2. Rewrite terminators + drop the dead edge (symmetrically).
|
||||
for &(block_idx, taken, untaken) in &prune_ops {
|
||||
let pred_id = body.blocks[block_idx].id;
|
||||
body.blocks[block_idx].terminator = Terminator::Goto(taken);
|
||||
body.blocks[block_idx].succs.retain(|s| *s != untaken);
|
||||
let untaken_idx = untaken.0 as usize;
|
||||
if untaken_idx < body.blocks.len() {
|
||||
body.blocks[untaken_idx].preds.retain(|p| *p != pred_id);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Recompute reachability from entry over the (now-pruned) succ edges.
|
||||
let n = body.blocks.len();
|
||||
let mut reachable = vec![false; n];
|
||||
let mut stack = vec![body.entry];
|
||||
if (body.entry.0 as usize) < n {
|
||||
reachable[body.entry.0 as usize] = true;
|
||||
}
|
||||
while let Some(b) = stack.pop() {
|
||||
let bidx = b.0 as usize;
|
||||
if bidx >= n {
|
||||
continue;
|
||||
}
|
||||
// Clone succs to avoid borrow conflict with `reachable`.
|
||||
let succs: SmallVec<[BlockId; 2]> = body.blocks[bidx].succs.clone();
|
||||
for s in succs {
|
||||
let sidx = s.0 as usize;
|
||||
if sidx < n && !reachable[sidx] {
|
||||
reachable[sidx] = true;
|
||||
stack.push(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Reachable blocks: drop the now-dead predecessor. Removing the phi
|
||||
// operand from the merge block is what stops the tainted dead-branch
|
||||
// value feeding the phi; removing the pred keeps pred/succ symmetric
|
||||
// with step 5's succ clearing. Operands from still-reachable
|
||||
// predecessors are untouched, so no live flow is lost.
|
||||
for (bidx, block) in body.blocks.iter_mut().enumerate() {
|
||||
if !reachable[bidx] {
|
||||
continue;
|
||||
}
|
||||
block.preds.retain(|p| {
|
||||
let pidx = p.0 as usize;
|
||||
pidx < n && reachable[pidx]
|
||||
});
|
||||
for phi in &mut block.phis {
|
||||
if let SsaOp::Phi(operands) = &mut phi.op {
|
||||
operands.retain(|(pred, _)| {
|
||||
let pidx = pred.0 as usize;
|
||||
pidx < n && reachable[pidx]
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Unreachable blocks: neutralise them so the *later* optimiser passes
|
||||
// (copy-prop, base-alias grouping, type-facts, points-to) and the taint
|
||||
// transfer never observe their dead instructions. This is the
|
||||
// load-bearing step for precision: a dead `else bar = param` would
|
||||
// otherwise make copy-prop alias `bar`↔`param`, and
|
||||
// `propagate_taint_to_aliases` would then poison the *surviving const*
|
||||
// `bar` with `param`'s (still-reachable) taint — defeating the whole
|
||||
// prune. Each instruction is rewritten to `Nop` (value + cfg_node
|
||||
// preserved so `value_defs` coverage holds), the terminator to
|
||||
// `Unreachable`, and the block is fully disconnected.
|
||||
for (bidx, block) in body.blocks.iter_mut().enumerate() {
|
||||
if reachable[bidx] {
|
||||
continue;
|
||||
}
|
||||
for inst in block.phis.iter_mut().chain(block.body.iter_mut()) {
|
||||
inst.op = SsaOp::Nop;
|
||||
}
|
||||
block.terminator = Terminator::Unreachable;
|
||||
block.succs.clear();
|
||||
block.preds.clear();
|
||||
}
|
||||
|
||||
pruned
|
||||
}
|
||||
|
||||
/// Collect module aliases from `require()` calls in the SSA body.
|
||||
///
|
||||
/// Detects patterns like `const http = require("http")` and propagates
|
||||
|
|
|
|||
|
|
@ -101,7 +101,12 @@ pub fn optimize_ssa_with_param_types(
|
|||
) -> OptimizeResult {
|
||||
// 1. Constant propagation (SCCP)
|
||||
let cp = const_prop::const_propagate(body);
|
||||
let branches_pruned = const_prop::apply_const_prop(body, &cp);
|
||||
let mut branches_pruned = const_prop::apply_const_prop(body, &cp);
|
||||
// 1b. Fold pure integer-arithmetic comparison branch conditions that SCCP
|
||||
// cannot reach (the comparison is held on the terminator, not an SSA
|
||||
// value). Prunes statically-dead edges + their merge-phi operands so a
|
||||
// dead `else bar = param` stops feeding a tainted operand into the phi.
|
||||
branches_pruned += const_prop::fold_constant_branches(body, cfg, &cp.values);
|
||||
|
||||
// 2. Copy propagation
|
||||
let (copies_eliminated, copy_map) = copy_prop::copy_propagate(body, cfg);
|
||||
|
|
|
|||
|
|
@ -261,6 +261,33 @@ pub enum TypeKind {
|
|||
/// arbitrary-receiver-name shape (`sess`, `hibernateSession`, etc.)
|
||||
/// via type-qualified resolution.
|
||||
HibernateSession,
|
||||
/// A `java.lang.ProcessBuilder` instance produced by
|
||||
/// `new ProcessBuilder(...)`. The dominant OWASP Benchmark
|
||||
/// command-injection shape builds an argument `List<String>`, attaches
|
||||
/// it via `pb.command(argList)`, then runs it with `pb.start()`. The
|
||||
/// argument list is a separate channel from the constructor, so the
|
||||
/// flat `ProcessBuilder` constructor sink never sees the tainted args.
|
||||
/// Mapping the receiver to this TypeKind lets the type-qualified
|
||||
/// resolver rewrite `pb.command(argList)` → `ProcessBuilder.command`
|
||||
/// against the flat SHELL_ESCAPE rule in `labels/java.rs`, so tainted
|
||||
/// list contents reaching the command builder are caught at the
|
||||
/// `command(...)` call site.
|
||||
ProcessBuilder,
|
||||
/// A `java.lang.Runtime` instance produced by the static factory
|
||||
/// `Runtime.getRuntime()`. The dominant OWASP Benchmark
|
||||
/// command-injection shape splits the receiver across statements:
|
||||
/// `Runtime r = Runtime.getRuntime(); ... r.exec(args, argsEnv)`. The
|
||||
/// callee text at the sink is `r.exec`, which does not suffix-match the
|
||||
/// flat `Runtime.exec` rule in `labels/java.rs` (the chained
|
||||
/// `Runtime.getRuntime().exec(...)` form fires only because its callee
|
||||
/// text literally contains `Runtime`). Mapping the receiver `r` to
|
||||
/// this TypeKind lets the type-qualified resolver rewrite `r.exec(...)`
|
||||
/// → `Runtime.exec` against the flat SHELL_ESCAPE rule, so tainted data
|
||||
/// reaching the split-receiver exec is caught. No payload-arg
|
||||
/// restriction: `Runtime.exec` overloads place the tainted data in
|
||||
/// either the command (arg 0) or the environment array (arg 1), so the
|
||||
/// default all-args sink scan must cover every position.
|
||||
Runtime,
|
||||
}
|
||||
|
||||
/// structural carrier for a recognised DTO type. Maps
|
||||
|
|
@ -318,6 +345,8 @@ impl TypeKind {
|
|||
Self::GormDb => Some("GormDb"),
|
||||
Self::SqlxDb => Some("SqlxDb"),
|
||||
Self::HibernateSession => Some("HibernateSession"),
|
||||
Self::ProcessBuilder => Some("ProcessBuilder"),
|
||||
Self::Runtime => Some("Runtime"),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
@ -708,6 +737,18 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
|
|||
"openSession" | "getCurrentSession" | "openStatelessSession" => {
|
||||
Some(TypeKind::HibernateSession)
|
||||
}
|
||||
// `new ProcessBuilder(...)` — the receiver's `command(argList)`
|
||||
// setter is a command-injection sink for the list contents.
|
||||
// Type-qualified resolution rewrites `pb.command(...)` →
|
||||
// `ProcessBuilder.command` against the flat SHELL_ESCAPE rule.
|
||||
"ProcessBuilder" => Some(TypeKind::ProcessBuilder),
|
||||
// `Runtime.getRuntime()` — the static factory returns the
|
||||
// singleton `java.lang.Runtime`. Gating on `callee.contains
|
||||
// ("Runtime")` keeps an unrelated `foo.getRuntime()` method from
|
||||
// being mistyped. Type-qualified resolution rewrites the
|
||||
// split-receiver `r.exec(...)` → `Runtime.exec` against the flat
|
||||
// SHELL_ESCAPE rule.
|
||||
"getRuntime" if callee.contains("Runtime") => Some(TypeKind::Runtime),
|
||||
_ => None,
|
||||
},
|
||||
Lang::JavaScript | Lang::TypeScript => {
|
||||
|
|
|
|||
|
|
@ -1929,7 +1929,7 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
|
||||
for (func_name, func_entry) in &func_entries {
|
||||
let formal_params = lookup_formal_params(local_summaries, func_name);
|
||||
let func_ssa = match crate::ssa::lower_to_ssa_with_params(
|
||||
let mut func_ssa = match crate::ssa::lower_to_ssa_with_params(
|
||||
cfg,
|
||||
*func_entry,
|
||||
Some(func_name),
|
||||
|
|
@ -1939,6 +1939,9 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
Ok(ssa) => ssa,
|
||||
Err(_) => continue,
|
||||
};
|
||||
// Match the `_from_bodies` path: prune dead constant branches before
|
||||
// the summary probe (see `prefold_dead_branches_for_summary`).
|
||||
prefold_dead_branches_for_summary(&mut func_ssa, cfg);
|
||||
|
||||
// `formal_params` is authoritative even when it is empty. SSA lowering
|
||||
// also emits Param ops for external captures; counting those as arity
|
||||
|
|
@ -2019,6 +2022,22 @@ pub(crate) fn extract_intra_file_ssa_summaries(
|
|||
/// name overloads with different arity, and anonymous bodies at distinct
|
||||
/// source spans all get distinct keys.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
/// Prune definite-constant dead branches on a freshly-lowered body *before*
|
||||
/// its interprocedural summary is extracted.
|
||||
///
|
||||
/// Summary extraction ([`ssa_transfer::extract_ssa_func_summary`]) runs on the
|
||||
/// pre-optimisation SSA, so without this a helper whose body returns a constant
|
||||
/// only because a dead `else x = param` branch is never taken would still emit
|
||||
/// a `param → return` transform — re-tainting the caller's `bar =
|
||||
/// helper(param)` and defeating the in-body branch fold. Only
|
||||
/// [`crate::ssa::const_prop::fold_constant_branches`] is applied (no copy-prop /
|
||||
/// DCE), so the change is limited to provably-dead arithmetic-comparison
|
||||
/// branches; the body's value numbering is otherwise untouched.
|
||||
fn prefold_dead_branches_for_summary(func_ssa: &mut crate::ssa::SsaBody, cfg: &crate::cfg::Cfg) {
|
||||
let cp = crate::ssa::const_prop::const_propagate(func_ssa);
|
||||
crate::ssa::const_prop::fold_constant_branches(func_ssa, cfg, &cp.values);
|
||||
}
|
||||
|
||||
pub(crate) fn lower_all_functions_from_bodies(
|
||||
file_cfg: &FileCfg,
|
||||
lang: Lang,
|
||||
|
|
@ -2108,6 +2127,9 @@ fn lower_all_functions_from_bodies_inner(
|
|||
Err(_) => continue,
|
||||
};
|
||||
perf_lower_record(0, _t_lower.elapsed().as_micros());
|
||||
// Prune dead constant branches before the summary probe so a helper's
|
||||
// dead `else x = param` does not surface as a spurious param→return.
|
||||
prefold_dead_branches_for_summary(&mut func_ssa, &body.graph);
|
||||
|
||||
let param_count = if !formal_params.is_empty() {
|
||||
formal_params.len()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue