#![allow(clippy::collapsible_if)] // ─── PredicateKind ─────────────────────────────────────────────────────────── /// Classification of what an if-condition tests. /// /// Determined by heuristic analysis of the raw condition text. /// Classification is conservative: prefer [`Unknown`](PredicateKind::Unknown) /// over a wrong guess. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum PredicateKind { /// `x.is_none()`, `x == null`, `x == nil`, `x is None` NullCheck, /// `x.is_empty()`, `x.len() == 0`, `x == ""` EmptyCheck, /// `x.is_err()`, `x.is_ok()`, `err != nil` ErrorCheck, /// Call to a validation/guard function: `validate(x)`, `is_safe(x)` ValidationCall, /// Call to a sanitizer function: `sanitize(x)`, `escape(x)` SanitizerCall, /// Allowlist/membership check: `.includes(x)`, `x in ALLOWED`, `in_array(x, ...)` AllowlistCheck, /// Type-check guard: `typeof x`, `isinstance(x, int)`, `is_numeric(x)` TypeCheck, /// Negative-validation of shell metacharacters: /// `x.contains(";")`, `x.match(/[;|&]/)`, `";" in x`, etc. /// /// The **true branch is the REJECT path** (early return / panic / throw) /// and the **false branch is the validated path**. Use inverted polarity /// when applying branch predicates. ShellMetaValidated, /// Bounded-length rejection: `x.len() > N` / `x.length < N` with N >= 2. /// /// Commonly paired with `ShellMetaValidated` in OR-chain rejection /// idioms (`if x.len() > MAX || x.contains(";") { reject }`). Counts as /// a dominator guard for `cfg-unguarded-sink` purposes, but intentionally /// does **not** mark variables as validated, the rejection direction is /// ambiguous from the condition alone (a `.len() > 5 { sink(x) }` /// gate is a precondition, not a rejection). BoundedLength, /// Comparison operators: `x == 5`, `x > threshold` Comparison, /// Generic boolean test, cannot classify further. 
Unknown, } /// Single-character shell metacharacters that a rejection check commonly /// guards against before constructing a shell command. /// /// Presence of any of these in user input is sufficient to enable shell /// injection, so rejecting input that contains them is a real sanitizer. /// `"foo"` or other non-metachar needles don't qualify, a rejection of /// those is business logic, not security. const SHELL_METACHARS: &[&str] = &[";", "|", "&", "`", "$", ">", "<", "\n", "\r", "\0"]; /// Check whether `text` matches a shell-metachar rejection idiom. /// /// Recognizes: /// - Rust / Java / Go: `x.contains("")` /// - JS / TS: `x.includes("")` /// - Python: `"" in x` /// - Ruby: `x.include?("")` /// - Regex form: `x.match(/[;|&]/)` / `re.search(r"[;|&]", x)` with a /// character class containing only metacharacters. /// /// Returns `false` if the needle is a non-metachar literal or cannot be /// extracted, falls through to broader classification. fn is_shell_metachar_rejection(text: &str) -> bool { // Method-call form: `.contains(…)` / `.includes(…)` / `.include?(…)` for method in [".contains(", ".includes(", ".include?("] { if let Some(idx) = text.find(method) { let args_start = idx + method.len(); if let Some(needle) = extract_first_string_arg(&text[args_start..]) { if SHELL_METACHARS.contains(&needle.as_str()) { return true; } } } } // Python membership form: `"" in x` (but not `x in ALLOWED`) if let Some(needle) = extract_python_in_needle(text) { if SHELL_METACHARS.contains(&needle.as_str()) { return true; } } // Regex character-class form: `.match(/[;|&]/)` / `re.search(r"[…]", …)` if is_metachar_regex_class(text) { return true; } false } /// Extract the first string literal argument from a slice starting just after /// an opening `(` in a call expression. Returns the raw inner text of the /// literal (without surrounding quotes). /// /// Handles `"..."`, `'...'`, and simple escapes `\"`, `\'`, `\\`. 
fn extract_first_string_arg(after_open: &str) -> Option { let bytes = after_open.as_bytes(); let mut i = 0; while i < bytes.len() && bytes[i].is_ascii_whitespace() { i += 1; } if i >= bytes.len() { return None; } let quote = bytes[i]; if quote != b'"' && quote != b'\'' { return None; } i += 1; let mut out = Vec::new(); while i < bytes.len() { let b = bytes[i]; if b == b'\\' && i + 1 < bytes.len() { match bytes[i + 1] { b'n' => out.push(b'\n'), b'r' => out.push(b'\r'), b't' => out.push(b'\t'), b'0' => out.push(b'\0'), c => out.push(c), } i += 2; continue; } if b == quote { return String::from_utf8(out).ok(); } out.push(b); i += 1; } None } /// For Python `"" in x` (needle on the left side of ` in `), return /// the needle. Returns `None` for `x in ALLOWED` (identifier on the left) , /// that is an allowlist check, not a rejection. fn extract_python_in_needle(text: &str) -> Option { let pos = text.find(" in ")?; let left = text[..pos].trim(); // Strip leading `!` / `not` for rejection contexts let left = left.strip_prefix('!').unwrap_or(left).trim(); let bytes = left.as_bytes(); let quote = *bytes.first()?; if quote != b'"' && quote != b'\'' { return None; } if bytes.last() != Some("e) || bytes.len() < 2 { return None; } let inner = &left[1..left.len() - 1]; Some(inner.to_string()) } /// Detect regex character classes that contain only shell metacharacters: /// `[;|&]`, `[;&`$]`, etc. Missing: escape-class metacharacters inside the /// class (e.g. `[\n]`), conservative, returns false there. fn is_metachar_regex_class(text: &str) -> bool { // Find `[` followed by content and `]`, anywhere in the text. 
let mut rest = text; while let Some(open) = rest.find('[') { let after = &rest[open + 1..]; if let Some(close) = after.find(']') { let inner = &after[..close]; if !inner.is_empty() && inner .chars() .all(|c| SHELL_METACHARS.iter().any(|m| m.starts_with(c))) { return true; } rest = &after[close + 1..]; } else { break; } } false } /// Check whether `text` looks like a bounded-length rejection: /// `x.len() > N`, `x.len() < N`, `x.length >= N`, etc. where `N` is an /// integer literal >= 2. Excludes `> 0` / `>= 1` / `< 1`, those are /// non-empty checks, which are not length-bound validations. fn is_bounded_length_check(lower: &str) -> bool { const PROBES: &[&str] = &[ ".len()", ".length", // JS/TS/Java `.length` property (no parens) ]; for probe in PROBES { let mut rest = lower; while let Some(pos) = rest.find(probe) { let after = &rest[pos + probe.len()..]; // Skip the optional `()` that `.length` never has but `.len` does. let after = after.trim_start(); let after = after.strip_prefix("()").unwrap_or(after); let after = after.trim_start(); for op in [">=", "<=", ">", "<"] { if let Some(tail) = after.strip_prefix(op) { let tail = tail.trim_start(); if let Some(n) = parse_leading_uint(tail) { if n >= 2 { return true; } } break; } } rest = &rest[pos + probe.len()..]; } } false } /// Parse a leading non-negative integer literal (decimal only). fn parse_leading_uint(s: &str) -> Option { let mut n: u64 = 0; let mut any = false; for c in s.chars() { if let Some(d) = c.to_digit(10) { n = n.checked_mul(10)?.checked_add(d as u64)?; any = true; } else { break; } } any.then_some(n) } /// Classify a raw condition text into a [`PredicateKind`]. /// /// # Rules /// /// - Empty/None text → [`Unknown`](PredicateKind::Unknown). /// - `ValidationCall` / `SanitizerCall` require a `(` in the text **and** a /// matching callee token. This avoids misclassifying comparisons like /// `x_valid == true`. /// - Prefers [`Unknown`](PredicateKind::Unknown) over false positives. 
pub fn classify_condition(text: &str) -> PredicateKind { if text.is_empty() { return PredicateKind::Unknown; } let lower = text.to_ascii_lowercase(); // ── Error checks (before null checks: `err != nil` is an error check, // not a null check, even though it contains `!= nil`) ────────────── if lower.contains("is_err") || lower.contains("is_ok") || lower.contains("err != nil") || lower.contains("err == nil") || lower.contains("error != nil") || lower.contains("error == nil") { return PredicateKind::ErrorCheck; } // ── Null checks ────────────────────────────────────────────────────── if lower.contains("is_none") || lower.contains("is_some") || lower.contains("== none") || lower.contains("!= none") || lower.contains("is none") || lower.contains("is not none") || lower.contains("== null") || lower.contains("!= null") || lower.contains("=== null") || lower.contains("!== null") || lower.contains("== nil") || lower.contains("!= nil") { return PredicateKind::NullCheck; } // ── Empty checks ───────────────────────────────────────────────────── if lower.contains("is_empty") || lower.contains(".len() == 0") || lower.contains(".len() != 0") || lower.contains(".length == 0") || lower.contains(".length === 0") || lower.contains(".length != 0") || lower.contains(".length !== 0") || lower.contains("== \"\"") || lower.contains("== ''") { return PredicateKind::EmptyCheck; } // ── Shell-metachar negative validation ─────────────────────────────── // // Matched BEFORE AllowlistCheck so that `x.contains(";")` is recognized // as a rejection idiom rather than a membership test. Checked on the // raw (non-lowercased) text so metacharacter comparisons stay // case-accurate, `;` / `|` / `&` have no case. 
if is_shell_metachar_rejection(text) { return PredicateKind::ShellMetaValidated; } // ── Allowlist / membership checks ──────────────────────────────────── if lower.contains(".includes(") || lower.contains(".include?(") || lower.contains(".contains(") || lower.contains(".indexof(") || lower.contains(".has(") || lower.contains("in_array(") || lower.contains(" in ") || (lower.contains('[') && !lower.contains('(')) { return PredicateKind::AllowlistCheck; } // ── Type-check guards ────────────────────────────────────────────── if lower.contains("typeof ") || lower.contains("isinstance(") || lower.contains(" instanceof ") || lower.contains(".matches(") || lower.contains("is_numeric(") || lower.contains("is_int(") || lower.contains("is_string(") || lower.contains("is_float(") || lower.contains("ctype_") || lower.contains(".is_a?(") || lower.contains(".kind_of?(") // Rust character-class validation: `.chars().all(|c| c.is_ascii_*())` // and similar per-character validations. Presence of `is_ascii_` // inside an `.all(…)` / `.iter().all(…)` call is a strong validation // signal equivalent to a TypeCheck. || (lower.contains(".all(") && lower.contains("is_ascii_")) || (lower.contains(".all(") && lower.contains("is_alphanumeric")) || (lower.contains(".all(") && lower.contains("is_numeric(")) { return PredicateKind::TypeCheck; } // ── Bounded-length rejection ───────────────────────────────────────── // // `.len() > N` / `.length < N` with N >= 2. Pairs with // ShellMetaValidated in OR-chain rejection patterns. Kept as its own // kind (not TypeCheck) because the rejection direction is ambiguous: a // `.len() > 5 { sink(x) }` gate is a precondition, not a rejection, so // marking condition vars as validated on the true branch would silence // legitimate findings. `cfg-unguarded-sink` still treats this as a // dominator guard (structural intent), just without SSA-level validation. 
if is_bounded_length_check(&lower) { return PredicateKind::BoundedLength; } // ── Call-based kinds (require `(` to be present) ───────────────────── if lower.contains('(') { // Strip leading wrappers (parens, `!`, whitespace) before locating // the callee token. Without this, idiomatic forms like // `(!validate(x))` (TypeScript / JS) or `not validate(x)` (Python) // produce an empty `callee_part` and the classifier misses // ValidationCall, defeating downstream validated-must propagation. let trimmed = lower.trim_start_matches(['(', '!', ' ', '\t']); // Strip a leading `not ` keyword (Python boolean not) plus surrounding // whitespace. Without this, `not validate_no_dotdot(raw)` skips // ValidationCall classification and validation never propagates. let trimmed = trimmed.strip_prefix("not ").unwrap_or(trimmed).trim(); // Extract a rough callee token: everything before the first `(` // that looks like an identifier (letters, digits, underscores, dots). let callee_part = trimmed.split('(').next().unwrap_or(""); // Take the last segment (after `.` or `::`) as the bare name. let bare = callee_part .rsplit(['.', ':']) .next() .unwrap_or(callee_part) .trim(); // Validation if bare.contains("valid") || bare.contains("check") || bare.contains("verify") || bare.starts_with("is_safe") || bare.starts_with("is_authorized") || bare.starts_with("is_authenticated") { return PredicateKind::ValidationCall; } // Regex / pattern allowlist `.test(value)` / `.match(value)` calls // where the receiver name carries a regex or pattern marker. The // standard JS / TS / Python / Java / Ruby / Go regex APIs all expose a // boolean test method; the success arm (true) means `value` matches the // pattern. Conservative on receiver names so non-regex methods like // `obj.test(x)` (test runner), `db.test(...)` (test column) etc. don't // get pulled in. Motivated by Payload CVE-2026-25544 // (`if (!SAFE_STRING_REGEX.test(value)) throw …;`). 
if (bare == "test" || bare == "match" || bare == "matches") && let Some(dot_pos) = callee_part.rfind('.') { let receiver = &callee_part[..dot_pos]; let receiver_lower = receiver.to_ascii_lowercase(); if receiver_lower.contains("regex") || receiver_lower.contains("pattern") { return PredicateKind::ValidationCall; } } // Sanitizer if bare.contains("sanitiz") || bare.contains("escape") || bare.contains("encode") { return PredicateKind::SanitizerCall; } } // ── Comparison operators ───────────────────────────────────────────── if lower.contains("==") || lower.contains("!=") || lower.contains(">=") || lower.contains("<=") || lower.contains(" > ") || lower.contains(" < ") { return PredicateKind::Comparison; } PredicateKind::Unknown } /// Classify a condition AND extract the specific validated variable target. /// /// For `ValidationCall`/`SanitizerCall`, tries to extract the first argument /// or method receiver as the validated variable: /// - `validate(x, ...)` → target = `"x"` /// - `x.validate(...)` → target = `"x"` /// /// When target extraction fails on a multi-argument call (e.g., /// `validate(expr, limit)` where `expr` is not a plain identifier), the /// validator's effect is opaque: we can't tell which argument is being /// checked. Returning the original kind with `None` target would cause /// upstream code to over-validate (mark every `condition_var` as validated). /// Instead, we fall back to `PredicateKind::Unknown`, safer to assume the /// validator did nothing than to assume it validated every variable in the /// condition. Single-argument calls retain `(kind, None)` so downstream code /// can still use the predicate-summary bit tracking. 
pub fn classify_condition_with_target(text: &str) -> (PredicateKind, Option) { let kind = classify_condition(text); match kind { PredicateKind::ValidationCall | PredicateKind::SanitizerCall => { if let Some(target) = extract_validation_target(text) { (kind, Some(target)) } else if count_call_args(text).map(|n| n > 1).unwrap_or(false) { (PredicateKind::Unknown, None) } else { (kind, None) } } PredicateKind::AllowlistCheck => { let target = extract_allowlist_target(text); (kind, target) } PredicateKind::TypeCheck => { let target = extract_type_check_target(text); (kind, target) } PredicateKind::ShellMetaValidated => { // The receiver of `.contains(…)` / `.includes(…)` is the value // being validated. Reuses the validation extractor which already // handles `x.method(arg)` → `"x"`. let target = extract_validation_target(text); (kind, target) } PredicateKind::Comparison => { // `x === '/login'`, `x == 5`, `null != obj`, when exactly one // side is a literal, extract the identifier side as the target. // Downstream `apply_branch_predicates` uses this to mark the // variable as `validated_may` on the true (equal) branch. let target = extract_comparison_target(text); (kind, target) } _ => (kind, None), } } /// Extract the identifier side of an equality/inequality comparison where /// exactly one side is a scalar literal. /// /// Examples: /// - `x === '/login'` → `Some("x")` /// - `x !== 5` → `Some("x")` /// - `null != obj` → `Some("obj")` /// - `x === y` → `None` (neither side is a literal) /// - `'a' == 'b'` → `None` (both sides are literals) /// - `obj.field == 3` → `None` (not a bare identifier) /// /// Best-effort text analysis, kept conservative to avoid false validation. fn extract_comparison_target(text: &str) -> Option { let trimmed = text.trim(); // Find the operator token. Check longer forms first so `===` doesn't // match as `==` with a trailing `=`. 
for op in &["===", "!==", "==", "!="] { if let Some(pos) = trimmed.find(op) { let left = trimmed[..pos].trim(); let right = trimmed[pos + op.len()..].trim(); let left_is_ident = is_identifier(left); let right_is_ident = is_identifier(right); let left_is_lit = is_comparison_literal(left); let right_is_lit = is_comparison_literal(right); return match (left_is_ident, right_is_ident, left_is_lit, right_is_lit) { (true, _, false, true) => Some(left.to_string()), (_, true, true, false) => Some(right.to_string()), _ => None, }; } } None } /// Test whether `s` is a scalar literal for comparison-target extraction. /// Accepts string literals (single/double/backtick quoted), numeric literals, /// and the null/undefined/nil/true/false tokens. fn is_comparison_literal(s: &str) -> bool { let s = s.trim(); if s.is_empty() { return false; } // String literal: delimited by matching quotes. let bytes = s.as_bytes(); if bytes.len() >= 2 { let first = bytes[0]; let last = bytes[bytes.len() - 1]; if (first == b'"' || first == b'\'' || first == b'`') && first == last { return true; } } // Keyword literal tokens. if matches!(s, "null" | "undefined" | "nil" | "None" | "true" | "false") { return true; } // Numeric literal: optional sign + digits, optional decimal point. let mut chars = s.chars(); let first = chars.next().unwrap(); let rest_start = if first == '-' || first == '+' { match chars.next() { Some(c) => c, None => return false, } } else { first }; if !rest_start.is_ascii_digit() { return false; } s.chars() .skip(if first == '-' || first == '+' { 1 } else { 0 }) .all(|c| c.is_ascii_digit() || c == '.' || c == '_') } /// Count positional arguments in a call-shaped condition text. /// /// Returns `None` when the text does not look like a call (no `(`). Returns /// `Some(0)` for a call with empty argument list. Respects paren/bracket/brace /// nesting so `f(g(a, b), c)` counts as 2 top-level args. /// /// Best-effort, operates on source text, not an AST. 
Used by /// `classify_condition_with_target` to distinguish single-arg vs multi-arg /// validator calls when target extraction fails. fn count_call_args(text: &str) -> Option { let trimmed = text.trim(); let trimmed = trimmed.strip_prefix('!').unwrap_or(trimmed).trim(); let paren_pos = trimmed.find('(')?; let args_part = &trimmed[paren_pos + 1..]; let args_inner = args_part .trim_end() .strip_suffix(')') .unwrap_or(args_part) .trim(); if args_inner.is_empty() { return Some(0); } let mut count = 1usize; let mut depth: i32 = 0; for ch in args_inner.chars() { match ch { '(' | '[' | '{' => depth += 1, ')' | ']' | '}' => depth -= 1, ',' if depth == 0 => count += 1, _ => {} } } Some(count) } /// Extract the first top-level argument from `args_part`, the substring /// immediately following the open paren of a call expression. Walks /// paren/bracket/brace depth and skips quoted strings so nested calls and /// punctuation inside string literals do not confuse the scan. Returns /// the trimmed argument substring up to the first top-level `,` or /// matching `)`, or `None` when no balanced close paren is found. /// /// Robust against trailing wrapper parens such as /// `(!ALLOWED.includes(cmd))` where naïve `strip_suffix(')')` would leave /// `cmd)` and lose the argument. 
fn first_call_arg(args_part: &str) -> Option<&str> { let bytes = args_part.as_bytes(); let mut depth: usize = 1; let mut end: Option = None; let mut first_comma: Option = None; let mut i = 0; while i < bytes.len() { let b = bytes[i]; match b { b'(' | b'[' | b'{' => depth += 1, b')' | b']' | b'}' => { depth -= 1; if depth == 0 { end = Some(i); break; } } b',' if depth == 1 && first_comma.is_none() => first_comma = Some(i), b'"' | b'\'' => { let quote = b; i += 1; while i < bytes.len() { if bytes[i] == b'\\' && i + 1 < bytes.len() { i += 2; continue; } if bytes[i] == quote { break; } i += 1; } } _ => {} } i += 1; } let end = end?; let cut = first_comma.unwrap_or(end); Some(args_part[..cut].trim()) } /// Extract the validated variable from a condition text. /// /// Handles two patterns: /// - Function call: `validate(x, ...)` → `"x"` /// - Method call: `x.validate(...)` → `"x"` fn extract_validation_target(text: &str) -> Option { let trimmed = text.trim(); // Strip leading wrappers (parens, `!`, `not `) so idiomatic forms like // `(!validate(x))` (TS/JS) and `not validate(x)` (Python) are reachable. let trimmed = trimmed.trim_start_matches(['(', '!', ' ', '\t']); let trimmed = trimmed.strip_prefix("not ").unwrap_or(trimmed).trim(); // Find the first `(` which separates callee from args let paren_pos = trimmed.find('(')?; let callee_part = &trimmed[..paren_pos]; let args_part = &trimmed[paren_pos + 1..]; // Check for method call pattern: `x.method(...)` or `x.method_name(...)` if let Some(dot_pos) = callee_part.rfind('.') { let receiver = callee_part[..dot_pos].trim(); let method = callee_part[dot_pos + 1..].trim().to_ascii_lowercase(); // Regex-allowlist `.test(value)` / `.match(value)` / `.matches(value)`: // the validated target is the call's first argument, not the regex // receiver. Without this special case, branch narrowing would mark // the regex itself as validated and leave the user input alone. 
if matches!(method.as_str(), "test" | "match" | "matches") && let Some(first_arg) = first_call_arg(args_part) { let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim(); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } if !receiver.is_empty() && is_identifier(receiver) { return Some(receiver.to_string()); } } // Function call pattern: `func(x, ...)`, extract first argument with // balanced-paren scan so trailing wrapper parens (`(validate(x))`) do // not corrupt the argument substring. let first_arg = first_call_arg(args_part)?; // Strip reference operators (e.g. `&x` → `x`) let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim(); if !first_arg.is_empty() && is_identifier(first_arg) { Some(first_arg.to_string()) } else { None } } /// Extract the target variable from an allowlist/membership check. /// /// Handles: /// - `.includes(cmd)` → `cmd` (first argument) /// - `in_array($cmd, $allowed)` → `cmd` (first arg, strip `$`) /// - `cmd not in ALLOWED` / `cmd in ALLOWED` → `cmd` (left of ` in `) /// - `allowed[cmd]` → `cmd` (inside brackets) fn extract_allowlist_target(text: &str) -> Option { let trimmed = text.trim(); let lower = trimmed.to_ascii_lowercase(); // Method call pattern: something.includes(arg) / .contains(arg) / .has(arg) / .indexof(arg) for method in &[ ".includes(", ".include?(", ".contains(", ".indexof(", ".has(", ] { if let Some(pos) = lower.find(method) { let args_start = pos + method.len(); let args_part = &trimmed[args_start..]; if let Some(first_arg) = first_call_arg(args_part) { let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } } } // in_array($cmd, $allowed) → cmd if let Some(pos) = lower.find("in_array(") { let args_start = pos + "in_array(".len(); let args_part = &trimmed[args_start..]; if let Some(first_arg) = first_call_arg(args_part) { let first_arg = 
first_arg.strip_prefix('$').unwrap_or(first_arg); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } } // Python `in` operator: `cmd in ALLOWED` / `cmd not in ALLOWED` if lower.contains(" in ") { // Find the leftmost ` in `, everything before it is the target expression // Handle `not in` by looking for ` not in ` first let target_part = if let Some(pos) = lower.find(" not in ") { &trimmed[..pos] } else if let Some(pos) = lower.find(" in ") { &trimmed[..pos] } else { return None; }; let target = target_part.trim(); let target = target.strip_prefix('!').unwrap_or(target).trim(); let target = target.strip_prefix('$').unwrap_or(target); if !target.is_empty() && is_identifier(target) { return Some(target.to_string()); } } // Go map lookup: `allowed[cmd]` if let Some(open) = trimmed.find('[') { if let Some(close) = trimmed.find(']') { if close > open + 1 { let inner = trimmed[open + 1..close].trim(); let inner = inner.strip_prefix('$').unwrap_or(inner); if !inner.is_empty() && is_identifier(inner) { return Some(inner.to_string()); } } } } None } /// Extract the target variable from a type-check guard. 
/// /// Handles: /// - `typeof input !== 'number'` → `input` (word after `typeof`) /// - `isinstance(user_id, int)` → `user_id` (first arg) /// - `input.matches("\\d+")` → `input` (receiver) /// - `is_numeric($id)` → `id` (first arg, strip `$`) fn extract_type_check_target(text: &str) -> Option { let trimmed = text.trim(); let lower = trimmed.to_ascii_lowercase(); // typeof: `typeof input !== 'number'` if let Some(pos) = lower.find("typeof ") { let after = &trimmed[pos + "typeof ".len()..]; // The target is the next identifier-like word let target: String = after .chars() .take_while(|c| c.is_alphanumeric() || *c == '_') .collect(); if !target.is_empty() { return Some(target); } } // isinstance(user_id, int) → user_id if let Some(pos) = lower.find("isinstance(") { let args_start = pos + "isinstance(".len(); let args_part = &trimmed[args_start..]; let inner = args_part.strip_suffix(')').unwrap_or(args_part); let first_arg = inner.split(',').next()?.trim(); let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } // Java/TS instanceof: "x instanceof String" → "x" if let Some(pos) = lower.find(" instanceof ") { let var_part = trimmed[..pos].trim(); if !var_part.is_empty() && is_identifier(var_part) { return Some(var_part.to_string()); } } // .matches("...") → receiver if let Some(pos) = lower.find(".matches(") { let receiver = trimmed[..pos].trim(); let receiver = receiver.strip_prefix('!').unwrap_or(receiver).trim(); if !receiver.is_empty() && is_identifier(receiver) { return Some(receiver.to_string()); } } // PHP type checks: is_numeric($id), is_int($x), is_string($x), is_float($x) for func in &["is_numeric(", "is_int(", "is_string(", "is_float("] { if let Some(pos) = lower.find(func) { let args_start = pos + func.len(); let args_part = &trimmed[args_start..]; let inner = args_part.strip_suffix(')').unwrap_or(args_part); let first_arg = 
inner.split(',').next()?.trim(); let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } } // Ruby type checks: user_id.is_a?(Integer), x.kind_of?(String) → receiver for method in &[".is_a?(", ".kind_of?("] { if let Some(pos) = lower.find(method) { let receiver = trimmed[..pos].trim(); let receiver = receiver.strip_prefix('!').unwrap_or(receiver).trim(); if !receiver.is_empty() && is_identifier(receiver) { return Some(receiver.to_string()); } } } // ctype_ functions: ctype_digit($x) if let Some(pos) = lower.find("ctype_") { // Find the `(` after ctype_xxx if let Some(paren_pos) = trimmed[pos..].find('(') { let args_start = pos + paren_pos + 1; let args_part = &trimmed[args_start..]; let inner = args_part.strip_suffix(')').unwrap_or(args_part); let first_arg = inner.split(',').next()?.trim(); let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); if !first_arg.is_empty() && is_identifier(first_arg) { return Some(first_arg.to_string()); } } } None } /// Check if a string is a simple identifier (letters, digits, underscores, dots). 
///
/// The empty string and anything beginning with an ASCII digit are rejected.
/// Dots may appear anywhere, so dotted paths such as `req.body` are accepted.
fn is_identifier(s: &str) -> bool {
    let allowed = |c: char| c.is_alphanumeric() || matches!(c, '_' | '.');
    match s.chars().next() {
        None => false,
        Some(first) if first.is_ascii_digit() => false,
        Some(_) => s.chars().all(allowed),
    }
}

// ─── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every condition in `conditions` classifies as `expected`.
    #[track_caller]
    fn expect_kind(conditions: &[&str], expected: PredicateKind) {
        for &condition in conditions {
            assert_eq!(classify_condition(condition), expected);
        }
    }

    /// Assert both the kind and the extracted target of a single condition.
    #[track_caller]
    fn expect_target(
        condition: &str,
        expected_kind: PredicateKind,
        expected_target: Option<&str>,
    ) {
        let (kind, target) = classify_condition_with_target(condition);
        assert_eq!(kind, expected_kind);
        assert_eq!(target.as_deref(), expected_target);
    }

    // ── classify_condition ────────────────────────────────────────────────

    #[test]
    fn classify_empty_is_unknown() {
        expect_kind(&[""], PredicateKind::Unknown);
    }

    #[test]
    fn classify_null_checks() {
        expect_kind(
            &[
                "x.is_none()",
                "x == null",
                "x != nil",
                "x is None",
                "x === null",
            ],
            PredicateKind::NullCheck,
        );
    }

    #[test]
    fn classify_error_checks() {
        expect_kind(
            &["x.is_err()", "err != nil", "x.is_ok()"],
            PredicateKind::ErrorCheck,
        );
    }

    #[test]
    fn classify_empty_checks() {
        expect_kind(
            &["x.is_empty()", "x.len() == 0", "x.length === 0"],
            PredicateKind::EmptyCheck,
        );
    }

    #[test]
    fn classify_validation_call() {
        expect_kind(
            &[
                "validate(x)",
                "is_safe(input)",
                "check_auth(req)",
                "input.verify(sig)",
            ],
            PredicateKind::ValidationCall,
        );
    }

    #[test]
    fn classify_validation_requires_paren() {
        // Without `(` call syntax, a validation-sounding name such as
        // `x_valid` must not be classified as ValidationCall.
        expect_kind(&["x_valid == true"], PredicateKind::Comparison);
        expect_kind(&["is_valid && ready"], PredicateKind::Unknown);
    }

    #[test]
    fn classify_sanitizer_call() {
        expect_kind(
            &["sanitize(x)", "html_escape(s)", "url_encode(path)"],
            PredicateKind::SanitizerCall,
        );
    }

    #[test]
    fn classify_comparison() {
        expect_kind(&["x == 5", "x != y", "a >= b"], PredicateKind::Comparison);
    }

    #[test]
    fn classify_unknown_fallback() {
        expect_kind(&["flag", "a && b"], PredicateKind::Unknown);
    }

    // ── classify_condition_with_target ────────────────────────────────────

    #[test]
    fn target_function_call_first_arg() {
        expect_target(
            "validate(x, config)",
            PredicateKind::ValidationCall,
            Some("x"),
        );
    }

    #[test]
    fn target_method_call_receiver() {
        expect_target("x.isValid()", PredicateKind::ValidationCall, Some("x"));
    }

    #[test]
    fn target_sanitizer_first_arg() {
        expect_target(
            "sanitize(input)",
            PredicateKind::SanitizerCall,
            Some("input"),
        );
    }

    #[test]
    fn target_negated_validation() {
        expect_target("!validate(&x)", PredicateKind::ValidationCall, Some("x"));
    }

    /// A regex `.test(value)` call is a ValidationCall whose validated target
    /// is the call argument rather than the regex receiver. Pinned because
    /// method calls default to the receiver-as-target heuristic. Motivated by
    /// Payload CVE-2026-25544
    /// (`if (!SAFE_STRING_REGEX.test(value)) throw …;`).
    #[test]
    fn target_regex_test_first_arg() {
        expect_target(
            "!SAFE_STRING_REGEX.test(value)",
            PredicateKind::ValidationCall,
            Some("value"),
        );
    }

    #[test]
    fn target_regex_test_pattern_receiver() {
        expect_target(
            "ALLOWED_PATTERN.test(s)",
            PredicateKind::ValidationCall,
            Some("s"),
        );
    }

    /// A receiver name carrying no regex/pattern marker must NOT be treated
    /// as a validator: `obj.test(x)` is a test runner, not a regex.
    #[test]
    fn target_test_non_regex_receiver_is_not_validation() {
        expect_kind(&["obj.test(value)"], PredicateKind::Unknown);
    }

    #[test]
    fn target_comparison_extracts_identifier_side() {
        expect_target("x == 5", PredicateKind::Comparison, Some("x"));
    }

    #[test]
    fn target_comparison_strict_equality_with_string() {
        expect_target("x === '/login'", PredicateKind::Comparison, Some("x"));
    }

    #[test]
    fn target_comparison_literal_on_left() {
        expect_target("null != obj", PredicateKind::Comparison, Some("obj"));
    }

    #[test]
    fn target_comparison_both_identifiers_returns_none() {
        expect_target("x === y", PredicateKind::Comparison, None);
    }

    #[test]
    fn target_comparison_both_literals_returns_none() {
        expect_target("'a' == 'b'", PredicateKind::Comparison, None);
    }

    #[test]
    fn target_check_auth_first_arg() {
        expect_target(
            "check_auth(req)",
            PredicateKind::ValidationCall,
            Some("req"),
        );
    }

    #[test]
    fn target_method_with_args() {
        expect_target(
            "input.verify(sig)",
            PredicateKind::ValidationCall,
            Some("input"),
        );
    }

    #[test]
    fn target_multi_arg_fallback_opaque_expr_is_unknown() {
        // The first arg of `validate(x + 1, y)` is an expression rather than
        // an identifier, so target extraction fails. Because the call has
        // several args, the kind falls back to Unknown instead of letting
        // upstream validate every condition var.
        expect_target("validate(x + 1, y)", PredicateKind::Unknown, None);
    }

    #[test]
    fn target_single_arg_fallback_preserves_kind() {
        // Single-arg call whose target cannot be extracted: the original kind
        // is kept so the predicate-summary bit can still be set. There is no
        // over-validation risk because only one var is in scope.
        expect_target("validate(x + 1)", PredicateKind::ValidationCall, None);
    }

    #[test]
    fn count_call_args_basic() {
        assert_eq!(super::count_call_args("f(a, b, c)"), Some(3));
        assert_eq!(super::count_call_args("f(a)"), Some(1));
        assert_eq!(super::count_call_args("f()"), Some(0));
        assert_eq!(super::count_call_args("f(g(x, y), z)"), Some(2));
        assert_eq!(super::count_call_args("not_a_call"), None);
    }

    // ── AllowlistCheck classification ─────────────────────────────────────

    #[test]
    fn classify_allowlist_includes() {
        expect_kind(&["ALLOWED.includes(cmd)"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn classify_allowlist_in_array() {
        expect_kind(&["in_array($cmd, $allowed)"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn classify_allowlist_python_not_in() {
        expect_kind(&["cmd not in ALLOWED"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn classify_allowlist_python_in() {
        expect_kind(&["cmd in ALLOWED"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn classify_allowlist_map_lookup() {
        expect_kind(&["allowed[cmd]"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn classify_allowlist_contains() {
        expect_kind(
            &["whitelist.contains(value)"],
            PredicateKind::AllowlistCheck,
        );
    }

    #[test]
    fn classify_allowlist_has() {
        expect_kind(&["allowedSet.has(key)"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn extract_allowlist_target_negated_paren_wrapper() {
        // Tree-sitter records the if-condition as `(!ALLOWED.includes(cmd))`,
        // surrounding parens included. A naïve `strip_suffix(')')` left
        // `cmd)`, which `is_identifier` rejected because of the trailing `)`,
        // dropping the structural guard used for `cfg-unguarded-sink`
        // suppression. The balanced-paren scan must return `Some("cmd")`.
        expect_target(
            "(!ALLOWED.includes(cmd))",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    #[test]
    fn extract_allowlist_target_java_contains_paren_wrapper() {
        expect_target(
            "(!ALLOWED.contains(cmd))",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    #[test]
    fn extract_allowlist_target_in_array_paren_wrapper() {
        expect_target(
            "(!in_array($cmd, $allowed))",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    // ── TypeCheck classification ──────────────────────────────────────────

    #[test]
    fn classify_type_check_typeof() {
        expect_kind(&["typeof input !== 'number'"], PredicateKind::TypeCheck);
    }

    #[test]
    fn classify_type_check_isinstance() {
        expect_kind(&["isinstance(user_id, int)"], PredicateKind::TypeCheck);
    }

    #[test]
    fn classify_type_check_matches() {
        expect_kind(&["input.matches(\"\\\\d+\")"], PredicateKind::TypeCheck);
    }

    #[test]
    fn classify_type_check_is_numeric() {
        expect_kind(&["is_numeric($id)"], PredicateKind::TypeCheck);
    }

    #[test]
    fn classify_type_check_is_int() {
        expect_kind(&["is_int($x)"], PredicateKind::TypeCheck);
    }

    #[test]
    fn classify_type_check_ctype() {
        expect_kind(&["ctype_digit($x)"], PredicateKind::TypeCheck);
    }

    // ── Allowlist target extraction ───────────────────────────────────────

    #[test]
    fn target_allowlist_includes() {
        expect_target(
            "ALLOWED.includes(cmd)",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    #[test]
    fn target_allowlist_in_array() {
        expect_target(
            "in_array($cmd, $allowed)",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    #[test]
    fn target_allowlist_python_in() {
        expect_target("cmd in ALLOWED", PredicateKind::AllowlistCheck, Some("cmd"));
    }

    #[test]
    fn target_allowlist_python_not_in() {
        expect_target(
            "cmd not in ALLOWED",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    #[test]
    fn target_allowlist_map_lookup() {
        expect_target("allowed[cmd]", PredicateKind::AllowlistCheck, Some("cmd"));
    }

    // ── TypeCheck target extraction ───────────────────────────────────────

    #[test]
    fn target_type_check_typeof() {
        expect_target(
            "typeof input !== 'number'",
            PredicateKind::TypeCheck,
            Some("input"),
        );
    }

    #[test]
    fn target_type_check_isinstance() {
        expect_target(
            "isinstance(user_id, int)",
            PredicateKind::TypeCheck,
            Some("user_id"),
        );
    }

    #[test]
    fn target_type_check_matches() {
        expect_target(
            "input.matches(\"\\\\d+\")",
            PredicateKind::TypeCheck,
            Some("input"),
        );
    }

    #[test]
    fn target_type_check_is_numeric() {
        expect_target("is_numeric($id)", PredicateKind::TypeCheck, Some("id"));
    }

    #[test]
    fn target_type_check_ctype() {
        expect_target("ctype_digit($x)", PredicateKind::TypeCheck, Some("x"));
    }

    #[test]
    fn classify_type_check_is_a() {
        expect_kind(&["user_id.is_a?(Integer)"], PredicateKind::TypeCheck);
    }

    #[test]
    fn target_type_check_is_a() {
        expect_target(
            "user_id.is_a?(Integer)",
            PredicateKind::TypeCheck,
            Some("user_id"),
        );
    }

    #[test]
    fn classify_allowlist_include_question() {
        expect_kind(&["ALLOWED.include?(cmd)"], PredicateKind::AllowlistCheck);
    }

    #[test]
    fn target_allowlist_include_question() {
        expect_target(
            "ALLOWED.include?(cmd)",
            PredicateKind::AllowlistCheck,
            Some("cmd"),
        );
    }

    // ── instanceof classification and target ──────────────────────────────

    #[test]
    fn classify_instanceof_is_type_check() {
        expect_kind(&["x instanceof String"], PredicateKind::TypeCheck);
    }

    #[test]
    fn target_instanceof_x_string() {
        expect_target("x instanceof String", PredicateKind::TypeCheck, Some("x"));
    }

    #[test]
    fn target_instanceof_obj_integer() {
        expect_target(
            "obj instanceof Integer",
            PredicateKind::TypeCheck,
            Some("obj"),
        );
    }

    // ── ShellMetaValidated classification ─────────────────────────────────

    #[test]
    fn classify_shell_metachar_contains_rust() {
        expect_kind(
            &[
                "input.contains(\";\")",
                "cmd.contains(\"|\")",
                "s.contains(\"&\")",
                "s.contains(\"`\")",
                "s.contains(\"$\")",
            ],
            PredicateKind::ShellMetaValidated,
        );
    }

    #[test]
    fn classify_shell_metachar_includes_js() {
        expect_kind(
            &["input.includes(';')", "cmd.includes(\"|\")"],
            PredicateKind::ShellMetaValidated,
        );
    }

    #[test]
    fn classify_shell_metachar_include_question_ruby() {
        expect_kind(&["cmd.include?(\";\")"], PredicateKind::ShellMetaValidated);
    }

    #[test]
    fn classify_shell_metachar_python_in() {
        expect_kind(
            &["\";\" in cmd", "'|' in cmd"],
            PredicateKind::ShellMetaValidated,
        );
    }

    #[test]
    fn classify_shell_metachar_regex_class() {
        expect_kind(
            &["cmd.match(/[;|&]/)", "re.search(\"[;|&]\", cmd)"],
            PredicateKind::ShellMetaValidated,
        );
    }

    #[test]
    fn classify_non_metachar_contains_stays_allowlist() {
        // A non-metachar needle such as `x.contains("foo")` must NOT be
        // credited as a shell-metachar rejection; it keeps the existing
        // AllowlistCheck behavior.
        expect_kind(
            &[
                "input.contains(\"foo\")",
                "path.contains(\"..\")",
                "name.contains(\"admin\")",
            ],
            PredicateKind::AllowlistCheck,
        );
    }

    #[test]
    fn classify_allowlist_membership_unaffected() {
        // `x in ALLOWED` (identifier on the left) remains AllowlistCheck.
        // Only a quoted metachar on the LEFT of ` in ` triggers ShellMeta.
        expect_kind(
            &["cmd in ALLOWED", "cmd not in ALLOWED"],
            PredicateKind::AllowlistCheck,
        );
    }

    #[test]
    fn target_shell_metachar_receiver() {
        expect_target(
            "input.contains(\";\")",
            PredicateKind::ShellMetaValidated,
            Some("input"),
        );
    }

    // ── BoundedLength classification ──────────────────────────────────────

    #[test]
    fn classify_bounded_length_rust_len() {
        expect_kind(
            &["input.len() > 100", "s.len() >= 256", "s.len() < 4096"],
            PredicateKind::BoundedLength,
        );
    }

    #[test]
    fn classify_bounded_length_js_length() {
        expect_kind(&["input.length > 100"], PredicateKind::BoundedLength);
    }

    #[test]
    fn classify_non_empty_len_stays_comparison() {
        // `.len() > 0` is a non-empty check, NOT a bounded-length validation,
        // so it must fall through to Comparison.
        expect_kind(
            &["input.len() > 0", "s.len() >= 1"],
            PredicateKind::Comparison,
        );
    }

    // ── Helper sanity ─────────────────────────────────────────────────────

    #[test]
    fn shell_metachar_rejection_detects_common_chars() {
        for m in [";", "|", "&", "`", "$", ">", "<"] {
            let text = format!("x.contains(\"{m}\")");
            assert!(
                is_shell_metachar_rejection(&text),
                "should detect metachar {m:?} in {text:?}"
            );
        }
    }

    #[test]
    fn shell_metachar_rejection_rejects_non_metachar() {
        for text in [
            "x.contains(\"foo\")",
            "x.contains(\"admin\")",
            "x.contains(\"..\")",
        ] {
            assert!(!is_shell_metachar_rejection(text));
        }
    }

    #[test]
    fn shell_metachar_rejection_handles_escapes() {
        assert!(is_shell_metachar_rejection("x.contains(\"\\n\")"));
    }

    #[test]
    fn bounded_length_rejects_zero_and_one() {
        for text in ["x.len() > 0", "x.len() >= 1", "x.len() < 1"] {
            assert!(!is_bounded_length_check(text));
        }
    }

    #[test]
    fn bounded_length_accepts_small_bounds() {
        for text in ["x.len() > 2", "x.len() <= 256"] {
            assert!(is_bounded_length_check(text));
        }
    }
}