Improved path traversal detection and enhanced sink classification logic

This commit is contained in:
Eli Peter 2026-05-02 03:36:14 -04:00 committed by GitHub
parent 58f1794a4e
commit 3c89bddbf2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
56 changed files with 3989 additions and 345 deletions

View file

@ -207,6 +207,34 @@ impl PathFact {
!self.is_bottom && self.dotdot == Tri::No && self.absolute == Tri::No
}
/// Relaxed safety predicate used to suppress FILE_IO path-traversal
/// findings: returns `true` when this fact proves the path cannot leave
/// a trusted region.
///
/// A bottom fact, or any fact whose `dotdot` axis is not `Tri::No`, is
/// never safe. Beyond that, either of the following is enough:
///
/// * `absolute == Tri::No` — the relative, `..`-free shape that
///   [`is_path_safe`] also accepts; such a path cannot be redirected to
///   an attacker-chosen absolute location.
/// * `prefix_lock.is_some()` — the path was checked against some prefix
///   by a `starts_with`-style guard, typically after canonicalisation
///   (`File.expand_path` / `realpath` / `fs::canonicalize`). The locked
///   prefix may be the opaque sentinel [`OPAQUE_PREFIX_LOCK`]; only the
///   existence of the lock matters here.
///
/// The second case is what distinguishes this predicate from
/// [`is_path_safe`]. It accepts the patched shape of rswag
/// CVE-2023-38337 (`File.expand_path(File.join(root, p))` followed by
/// `start_with? root`) and the matching Python
/// (`os.path.realpath` + `.startswith(root)`) and JS
/// (`path.resolve` + `.startsWith(root)`) idioms, which yield absolute
/// paths yet are sound against `..` traversal.
pub fn is_path_traversal_safe(&self) -> bool {
    let dotdot_free = !self.is_bottom && self.dotdot == Tri::No;
    let anchored = self.absolute == Tri::No || self.prefix_lock.is_some();
    dotdot_free && anchored
}
/// True iff the fact has a prefix lock equal to or contained under
/// `root`. Used by sink-suppression to confirm that a path derived
/// from a locked root is provably still under that root.
@ -391,6 +419,16 @@ pub enum PathAssertion {
None,
}
/// Sentinel prefix stored in a [`PathFact::prefix_lock`] when a
/// `starts_with`-style guard is recognised but its argument is not a
/// string literal (a method call, a field access, or a root configured
/// by the application).
///
/// [`classify_path_assertion`] returns `PathAssertion::PrefixLock`
/// carrying this value for such guards. The exact prefix bytes are
/// unknown in that situation, and the sink-suppression layer does not
/// need them: the structural invariant "verified rooted under SOME
/// prefix" is enough. Together with a `dotdot=No` proof obtained from
/// canonicalisation or `..`-rejection, an opaque prefix lock shows that
/// the path stays inside a trusted region.
pub const OPAQUE_PREFIX_LOCK: &str = "__nyx_opaque_prefix__";
/// Recognise a Rust path-rejection branch idiom from the raw condition text.
///
/// Accepts both atomic conditions (`x.contains("..")`) and multi-clause
@ -449,6 +487,22 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec
out
}
/// Reports whether at least one top-level `||` clause of `text` is the
/// Go `!filepath.IsLocal(<expr>)` idiom.
///
/// For that idiom the leading `!` is already consumed by
/// [`classify_path_rejection_axes`] when it reports the safe arm.
/// Callers use the answer to decide how to treat AST-level negation
/// (`condition_negated`):
///
/// * `true` — the classifier has already accounted for the negation.
/// * `false` — the caller still has to flip the safe-arm polarity
///   itself, as for polarity-blind atoms like `!path.contains("..")`.
pub(crate) fn cond_has_pre_negated_islocal_clause(text: &str) -> bool {
    split_top_level_or(text)
        .iter()
        .any(|clause| has_negated_filepath_is_local(clause.trim()))
}
/// Detect `!filepath.IsLocal(<expr>)`, Go's idiomatic path-traversal
/// guard. Whitespace-tolerant: `! filepath.IsLocal(`, `!filepath . IsLocal(`,
/// etc. Used by [`classify_path_rejection_axes`] to inject both
@ -651,19 +705,39 @@ fn split_top_level_or(text: &str) -> smallvec::SmallVec<[&str; 4]> {
out
}
/// Recognise a Rust path-positive-assertion branch idiom.
/// Recognise a path-positive-assertion branch idiom (language-agnostic).
///
/// Returns:
///
/// * `PrefixLock(<literal>)` when the condition is a `starts_with`-style
/// call with a literal prefix of length ≥ 2. Sibling single-character
/// prefixes (`"/"`, `"\\"`) are absolute-axis rejections, not locks.
/// * `PrefixLock(`[`OPAQUE_PREFIX_LOCK`]`)` when the call has a
/// non-empty, *non-literal* argument (method call, field access, local
/// variable). The opaque marker certifies the structural invariant
/// "verified rooted under some prefix" without committing to bytes,
/// which is exactly what FILE_IO sink-suppression needs to combine with
/// a `dotdot=No` proof — the upstream code path
/// `File.expand_path(...) + start_with?(<config_root>)` is the
/// motivating example.
/// * `None` otherwise.
pub fn classify_path_assertion(text: &str) -> PathAssertion {
let trimmed = text.trim();
if let Some(needle) = extract_starts_with_arg(trimmed) {
// Positive assertion: a literal-prefix `starts_with` on a locked
// root. Sibling slash ("/") and backslash ("\\") are also
// classified as rejections above; prefix-lock only fires when the
// prefix is multi-character (i.e. carries real locking info).
if needle.len() >= 2 {
return PathAssertion::PrefixLock(needle);
match extract_starts_with_arg(trimmed) {
Some(needle) if needle.len() >= 2 => PathAssertion::PrefixLock(needle),
// Single-char literal (`"/"`, `"\\"`) is an absolute-axis
// rejection idiom handled by `classify_path_rejection_axes`, not
// a positive prefix-lock — fall through to None.
Some(_) => PathAssertion::None,
// No literal recovered: check for a non-literal argument
// (method call, field access, configured root) and attach the
// opaque marker so the structural "verified rooted under SOME
// prefix" invariant is recorded for downstream sink suppression.
None if has_starts_with_call_with_nonempty_arg(trimmed) => {
PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string())
}
None => PathAssertion::None,
}
PathAssertion::None
}
/// Recognise a *structural* one-argument enum-variant constructor.
@ -1136,6 +1210,69 @@ fn extract_starts_with_arg(text: &str) -> Option<String> {
None
}
/// Detect a `starts_with`-style call with a non-empty argument, where the
/// argument is *not* recovered as a string literal by
/// [`extract_starts_with_arg`] (so it's a method call, field access, local
/// variable, etc.). Used by [`classify_path_assertion`] to attach an
/// opaque prefix-lock when the application validates with a configured
/// root rather than an inline string literal.
///
/// Whitespace-tolerant. Conservative: returns `false` for any shape where
/// the argument cannot be confirmed non-empty.
fn has_starts_with_call_with_nonempty_arg(text: &str) -> bool {
// Method-call forms with parens. The argument-presence check is
// simple: after the opening `(`, the first non-whitespace byte must
// not be `)` (empty arg list).
for method in [
".starts_with(",
".start_with?(",
".startsWith(",
".startswith(",
] {
if let Some(idx) = text.find(method) {
let after = &text[idx + method.len()..];
if first_non_ws_byte(after).is_some_and(|b| b != b')') {
return true;
}
}
}
// Ruby paren-less call: `r.start_with? <expr>`. Tree-sitter still
// serialises the source text verbatim, so a space (or tab) follows
// the `?`. Require a non-empty, non-clause-terminator token after.
if let Some(idx) = text.find(".start_with?") {
let rest = &text[idx + ".start_with?".len()..];
// Skip the `(` form (already covered above) and any whitespace.
let after = rest.trim_start();
if !after.is_empty() {
let first = after.as_bytes()[0];
// `(` belongs to the parenthesised form; clause terminators
// (`&&` / `||` / `)` / `]` / `;` / `,`) mean the call has no
// arguments at this position.
if !matches!(first, b'(' | b'&' | b'|' | b')' | b']' | b';' | b',') {
return true;
}
}
}
// Go free-function form `strings.HasPrefix(<recv>, <prefix>)`. The
// second argument must exist and be non-empty.
if let Some(idx) = text.find("strings.HasPrefix(") {
let inner = &text[idx + "strings.HasPrefix(".len()..];
if let Some(comma_idx) = top_level_comma(inner) {
let after_comma = inner[comma_idx + 1..].trim_start();
if !after_comma.is_empty() && !after_comma.starts_with(')') {
return true;
}
}
}
false
}
/// Yields the first byte of `text` that is not ASCII whitespace.
///
/// Returns `None` when `text` is empty or consists solely of ASCII
/// whitespace.
fn first_non_ws_byte(text: &str) -> Option<u8> {
    let remainder = text.trim_start_matches(|ch: char| ch.is_ascii_whitespace());
    remainder.bytes().next()
}
/// Find the index of the first top-level `,` in a slice (depth 0, ignoring
/// commas inside nested parentheses, brackets, braces, or string literals).
/// Returns `None` if no top-level comma is present.
@ -1716,6 +1853,109 @@ mod tests {
);
}
#[test]
fn assertion_opaque_prefix_lock_method_call_arg() {
    // Patched shape of rswag CVE-2023-38337: a paren-less `start_with?`
    // whose argument is a method call returning the configured root. The
    // analyser cannot know the prefix bytes, so the classifier must
    // record the "rooted under SOME prefix" invariant with the opaque
    // marker.
    let condition = "filename.start_with? @config.resolve_swagger_root(env)";
    let expected = PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string());
    assert_eq!(classify_path_assertion(condition), expected);
}
#[test]
fn assertion_opaque_prefix_lock_paren_method_call() {
    // Parenthesised Ruby form with a non-literal argument:
    // `r.start_with?(some_root)`.
    let condition = "filename.start_with?(@config.root)";
    let expected = PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string());
    assert_eq!(classify_path_assertion(condition), expected);
}
#[test]
fn assertion_opaque_prefix_lock_python_startswith() {
    // Python idiom `os.path.realpath(p).startswith(safe_root)`: the
    // argument `safe_root` is a local variable rather than a literal, so
    // the opaque marker is expected.
    let condition = "p.startswith(safe_root)";
    let expected = PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string());
    assert_eq!(classify_path_assertion(condition), expected);
}
#[test]
fn assertion_opaque_prefix_lock_js_starts_with() {
    // JavaScript `startsWith` with an identifier argument yields the
    // opaque prefix lock.
    let condition = "resolved.startsWith(uploadsDir)";
    let expected = PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string());
    assert_eq!(classify_path_assertion(condition), expected);
}
#[test]
fn assertion_opaque_prefix_lock_go_hasprefix() {
    // Go free-function form: the prefix is the second argument and is a
    // plain identifier, so the opaque prefix lock is expected.
    let condition = "strings.HasPrefix(p, safeRoot)";
    let expected = PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string());
    assert_eq!(classify_path_assertion(condition), expected);
}
#[test]
fn assertion_no_lock_on_empty_arg() {
    // A degenerate call with an empty argument list (`r.starts_with()`)
    // proves nothing and must not yield a lock.
    let classified = classify_path_assertion("r.starts_with()");
    assert_eq!(classified, PathAssertion::None);
}
#[test]
fn is_path_traversal_safe_relative_dotdot_free() {
    // Both axes cleared (no `..`, not absolute): the relaxed predicate
    // accepts the fact.
    let fact = PathFact::default()
        .with_dotdot_cleared()
        .with_absolute_cleared();
    let verdict = fact.is_path_traversal_safe();
    assert!(verdict);
}
#[test]
fn is_path_traversal_safe_canonicalised_with_prefix_lock() {
    // Models the `File.expand_path + start_with?(root)` shape, which ends
    // up as dotdot=No, absolute=Yes, prefix_lock=Some. The relaxed
    // predicate should accept it even though the strict `is_path_safe`
    // rejects it.
    //
    // The lock uses the shared `OPAQUE_PREFIX_LOCK` constant rather than
    // a copy of its literal, so the test keeps tracking the sentinel if
    // its value ever changes.
    let mut fact = PathFact::default()
        .with_dotdot_cleared()
        .with_prefix_lock(OPAQUE_PREFIX_LOCK);
    assert!(
        !fact.is_path_safe(),
        "absolute axis still Maybe blocks strict"
    );
    // Setting absolute=Yes via expand_path-style transfer:
    fact.absolute = Tri::Yes;
    assert!(
        !fact.is_path_safe(),
        "absolute=Yes blocks strict predicate"
    );
    assert!(
        fact.is_path_traversal_safe(),
        "prefix_lock + dotdot=No is sufficient under relaxed predicate"
    );
}
#[test]
fn is_path_traversal_safe_rejects_dotdot_maybe() {
    // A prefix lock alone is not enough: the `dotdot` axis was never
    // cleared (still Maybe), so the relaxed predicate must reject.
    let locked_only = PathFact::default().with_prefix_lock("/var/app/");
    let verdict = locked_only.is_path_traversal_safe();
    assert!(!verdict);
}
#[test]
fn is_path_traversal_safe_rejects_absolute_without_lock() {
    // `..`-free but absolute, with no prefix lock recorded: the relaxed
    // predicate must reject.
    let mut absolute_unlocked = PathFact::default().with_dotdot_cleared();
    absolute_unlocked.absolute = Tri::Yes;
    let verdict = absolute_unlocked.is_path_traversal_safe();
    assert!(!verdict);
}
#[test]
fn is_path_traversal_safe_rejects_bottom() {
    // The bottom fact is never reported as traversal-safe.
    let bottom = PathFact::bottom();
    assert!(!bottom.is_path_traversal_safe());
}
#[test]
fn primitive_canonicalize_normalises() {
let f = classify_path_primitive("fs::canonicalize", &PathFact::top()).unwrap();

1267
src/ast.rs

File diff suppressed because it is too large Load diff

View file

@ -85,6 +85,77 @@ fn inner_call_override_narrows_classification_span() {
);
}
/// Regression guard for the inner-call fallback in `push_node` when the
/// statement node is itself a call.
///
/// Ruby (like any language without an `expression_statement` wrapper)
/// hands `push_node` a node with `ast.kind() == "call"`
/// (`Kind::CallMethod`) for statement-position calls. The fallback's kind
/// gate therefore has to admit `Kind::CallFn | Kind::CallMethod |
/// Kind::CallMacro`. Otherwise an unclassified outer wrapper around a
/// sink — `YAML.safe_load(File.read(filename))`,
/// `String.new(File.read(x))`, `JSON.parse(File.read(x))`, i.e. the
/// chain-style wrappers found in real Ruby helpers — drops the inner
/// sink's classification. Cross-function summary extraction then misses
/// the wrapper's `param_to_sink`, and callers further down silently lose
/// detection.
///
/// Covers CVE-2023-38337 (rswag-api
/// `parse_file → load_yaml/load_json → File.read` chain) and
/// CVE-2021-21288 (CarrierWave `download → OpenURI.open_uri`).
#[test]
fn ruby_inner_call_fallback_classifies_wrapper_around_file_read() {
    let source = b"def f(x)\n YAML.safe_load(File.read(x))\nend\n";
    let grammar = Language::from(tree_sitter_ruby::LANGUAGE);
    let (graph, _entry) = parse_and_build(source, "ruby", grammar);

    // `YAML.safe_load(...)` carries no label of its own, so the fallback
    // has to descend into the argument list and adopt the inner
    // `File.read(x)` callee together with its Sink(FILE_IO) label.
    let file_read_node = graph
        .node_indices()
        .find(|&idx| graph[idx].call.callee.as_deref() == Some("File.read"))
        .expect(
            "inner-call fallback should override the outer YAML.safe_load callee with File.read",
        );
    let node = &graph[file_read_node];

    let has_file_io_sink = node.taint.labels.iter().any(
        |label| matches!(label, DataLabel::Sink(caps) if caps.contains(crate::labels::Cap::FILE_IO)),
    );
    assert!(
        has_file_io_sink,
        "wrapper-around-File.read node must carry the FILE_IO sink label"
    );

    // The wrapping callee text must survive in `outer_callee` so that
    // cross-function summary lookup can still resolve the wrapper.
    assert_eq!(
        node.call.outer_callee.as_deref(),
        Some("YAML.safe_load"),
        "outer_callee must preserve the original wrapping callee"
    );
}
/// Companion regression guard for the bare-function wrapper shape
/// (`outer(File.read(x))`).
///
/// Exercises the `Kind::CallFn` arm of the fallback's kind gate, which is
/// where free-function calls without a method receiver end up.
#[test]
fn ruby_inner_call_fallback_classifies_bare_outer_around_file_read() {
    let source = b"def f(x)\n outer(File.read(x))\nend\n";
    let grammar = Language::from(tree_sitter_ruby::LANGUAGE);
    let (graph, _entry) = parse_and_build(source, "ruby", grammar);

    let file_read_node = graph
        .node_indices()
        .find(|&idx| graph[idx].call.callee.as_deref() == Some("File.read"))
        .expect("inner-call fallback must override `outer` callee with File.read");
    let node = &graph[file_read_node];

    let has_file_io_sink = node.taint.labels.iter().any(
        |label| matches!(label, DataLabel::Sink(caps) if caps.contains(crate::labels::Cap::FILE_IO)),
    );
    assert!(
        has_file_io_sink,
        "wrapper-around-File.read node must carry FILE_IO sink label"
    );
}
/// `classification_span()` must fall back to `ast.span` when no narrower
/// sub-expression was recorded, so existing structural code paths keep
/// working unchanged for nodes whose classification applies to the whole

View file

@ -1681,12 +1681,31 @@ pub(super) fn push_node<'a>(
// When the callee is overridden, save the original for container ops
// (e.g. `parts.add(req.getParameter(...))`, callee becomes
// "req.getParameter" but outer_callee preserves "parts.add").
//
// Statement-level calls in languages without a separate
// `expression_statement` wrapper (Ruby, where `body_statement` directly
// contains the call AST node) reach `push_node` with `ast.kind() ==
// "call"` (`Kind::CallMethod`) rather than `Kind::CallWrapper`. Without
// including the call kinds in the gate, an unclassified outer wrapper
// around a sink (e.g. `YAML.safe_load(File.read(filename))` or
// `String.new(File.read(x))`) loses the inner sink's classification
// entirely — the outer call becomes a non-sink node, and the inner call
// is not emitted as a standalone CFG node because it sits inside the
// outer's `argument_list`. Cross-function summary extraction then
// misses the `param_to_sink` for the wrapper helper, breaking detection
// of every chain-style sink wrapper used in real Ruby CVEs (rswag
// CVE-2023-38337, the Marshal/JSON/YAML-of-File.read pattern, etc.).
let mut outer_callee: Option<String> = None;
let mut inner_callee_span: Option<(usize, usize)> = None;
if labels.is_empty()
&& matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment | Kind::Return
Kind::CallWrapper
| Kind::Assignment
| Kind::Return
| Kind::CallFn
| Kind::CallMethod
| Kind::CallMacro
)
&& let Some((inner_text, inner_label, inner_span)) =
find_classifiable_inner_call(ast, lang, code, extra)

View file

@ -576,6 +576,7 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
|| cl.contains("form")
|| cl.contains("query")
|| cl.contains("params")
|| cl.contains("param")
|| cl.contains("input")
|| cl.contains("body")
|| cl.contains("location")
@ -1691,6 +1692,16 @@ mod tests {
assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
}
#[test]
fn classify_ruby_openuri_open_uri_is_ssrf_sink() {
    // `OpenURI.open_uri` is the low-level URI fetcher that `URI.open`
    // delegates to; CarrierWave, Paperclip and similar gems send their
    // SSRF-vulnerable downloads through it directly. Regression guard
    // for CVE-2021-21288 (CarrierWave).
    let expected = Some(DataLabel::Sink(Cap::SSRF));
    assert_eq!(classify("ruby", "OpenURI.open_uri", None), expected);
}
#[test]
fn unpack_matcher_strips_exact_sigil() {
let (m, exact) = unpack_matcher(b"=open");

View file

@ -127,11 +127,15 @@ pub static RULES: &[LabelRule] = &[
},
// URI.open is the network-capable Kernel#open wrapper, more specific than
// plain `open` (excluded to avoid file I/O false positives).
// OpenURI.open_uri is the canonical low-level URI fetcher that URI.open
// delegates to — every SSRF-vulnerable Ruby download helper (CarrierWave
// pre-2.1.1 / 1.3.2, Paperclip, etc.) ultimately reaches it.
LabelRule {
matchers: &[
"Net::HTTP.get",
"Net::HTTP.post",
"URI.open",
"OpenURI.open_uri",
"HTTParty.get",
"HTTParty.post",
],

View file

@ -255,6 +255,7 @@ pub const PATTERNS: &[Pattern] = &[
confidence: Confidence::High,
},
// ── Tier A: Hardcoded fallback secret ──────────────────────────────
// Empty-string fallback (`|| ""`) is excluded — see typescript.rs for rationale.
Pattern {
id: "js.secrets.fallback_secret",
description: "Environment variable with secret-like name has hardcoded fallback value",
@ -266,7 +267,7 @@ pub const PATTERNS: &[Pattern] = &[
property: (property_identifier) @key
(#match? @key "(?i)(secret|password|key|token)"))
operator: "||"
right: (string) @fallback)
right: (string) @fallback (#match? @fallback "[^\"']"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,

View file

@ -244,6 +244,10 @@ pub const PATTERNS: &[Pattern] = &[
confidence: Confidence::High,
},
// ── Tier A: Hardcoded fallback secret ──────────────────────────────
// The `(#match? @fallback "[^\"']")` predicate excludes empty-string
// fallbacks (`process.env.X || ""`), which are the dominant FP shape
// in production TypeScript: developers write `|| ""` to satisfy the
// non-undefined string type without committing a real secret.
Pattern {
id: "ts.secrets.fallback_secret",
description: "Environment variable with secret-like name has hardcoded fallback value",
@ -255,7 +259,7 @@ pub const PATTERNS: &[Pattern] = &[
property: (property_identifier) @key
(#match? @key "(?i)(secret|password|key|token)"))
operator: "||"
right: (string) @fallback)
right: (string) @fallback (#match? @fallback "[^\"']"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,

View file

@ -1037,6 +1037,75 @@ pub fn detect_replace_sanitizer(
}
}
/// Recognises a replace-based sanitiser directly at a call site, using
/// only the call's syntactic string-literal arguments.
///
/// SSA transfer uses this to treat replace-style shell/HTML/SQL escapers
/// as sanitisers without needing a label rule for each pattern.
///
/// `arg_string_literals` holds one entry per call argument: `Some(text)`
/// when the argument is a concrete string literal, `None` otherwise.
///
/// Returns the sanitised capabilities when all of the following hold:
/// * `callee` is a recognised replace method for `lang` (see
///   `pattern_arg_position`),
/// * the pattern argument is a string literal,
/// * [`detect_replace_sanitizer`] recognises the pattern/replacement
///   pair, reports the replace as global, and reports a non-empty
///   capability set.
///
/// A replacement argument that is missing or not a literal is treated as
/// the empty string. Non-global replaces (e.g. JS `s.replace(";", "")`,
/// which only rewrites the first occurrence) yield `None`, because a
/// partial replacement gives no sanitiser-strength guarantee at the call
/// site.
pub fn detect_call_site_replace_sanitizer(
    callee: &str,
    lang: Lang,
    arg_string_literals: &[Option<String>],
) -> Option<Cap> {
    let pattern_idx = pattern_arg_position(callee, lang)?;
    let literal_at = |idx: usize| arg_string_literals.get(idx).and_then(Option::as_deref);

    let pattern = literal_at(pattern_idx)?;
    // The replacement sits immediately after the pattern in every
    // supported call shape.
    let replacement = literal_at(pattern_idx + 1).unwrap_or("");

    detect_replace_sanitizer(pattern, replacement, callee, lang)
        .filter(|found| found.is_global && !found.sanitized_caps.is_empty())
        .map(|found| found.sanitized_caps)
}
/// Returns the zero-based index of the *pattern* argument for a
/// recognised replace call in `lang`, or `None` when `callee` is not a
/// replace call this module understands.
///
/// Method-style languages are matched on the bare method name; Go and
/// PHP are matched on the full callee text. Python's `sub` is accepted
/// only when the full callee is `re.sub`. Go's pattern is at index 1;
/// every other recognised call has it at index 0.
fn pattern_arg_position(callee: &str, lang: Lang) -> Option<usize> {
    let method = bare_method_name(callee);
    let position = match (lang, method) {
        (Lang::JavaScript | Lang::TypeScript | Lang::Java, "replace" | "replaceAll") => 0,
        (Lang::Python, "replace") => 0,
        (Lang::Python, "sub") if callee == "re.sub" => 0,
        (Lang::Ruby, "gsub" | "sub") => 0,
        (Lang::Rust, "replace" | "replacen") => 0,
        (Lang::Go, _) if matches!(callee, "strings.Replace" | "strings.ReplaceAll") => 1,
        (Lang::Php, _) if callee == "str_replace" => 0,
        _ => return None,
    };
    Some(position)
}
/// Determine whether a replace call is global (replaces all occurrences).
fn is_global_replace(callee: &str, lang: Lang) -> bool {
let method = bare_method_name(callee);

View file

@ -566,6 +566,57 @@ fn count_call_args(text: &str) -> Option<usize> {
Some(count)
}
/// Returns the first top-level argument of a call, given `args_part`:
/// the text that starts immediately *after* the call's opening paren.
///
/// The scan tracks paren/bracket/brace nesting and steps over quoted
/// strings (honouring backslash escapes), so nested calls and punctuation
/// inside string literals do not end the argument early. The result is
/// the trimmed text up to the first top-level `,`, or up to the paren
/// that closes the call when there is no such comma. `None` is returned
/// when that closing paren is never found.
///
/// Because the closing paren is located by balance rather than by
/// position, trailing wrapper parens are harmless: for
/// `(!ALLOWED.includes(cmd))` the argument text is `cmd))`, and the
/// result is `cmd` rather than the `cmd)` that a plain
/// `strip_suffix(')')` would leave behind.
fn first_call_arg(args_part: &str) -> Option<&str> {
    // The call's own opening paren has already been consumed by the
    // caller, so nesting starts at one.
    let mut nesting: usize = 1;
    let mut comma_at: Option<usize> = None;
    let mut close_at: Option<usize> = None;
    // `Some(q)` while inside a string literal opened by quote byte `q`.
    let mut open_quote: Option<u8> = None;
    // Set right after a backslash inside a string: the next byte is
    // consumed unconditionally.
    let mut escaped = false;

    for (pos, byte) in args_part.bytes().enumerate() {
        if let Some(quote) = open_quote {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == quote {
                open_quote = None;
            }
            continue;
        }
        match byte {
            b'"' | b'\'' => open_quote = Some(byte),
            b'(' | b'[' | b'{' => nesting += 1,
            b')' | b']' | b'}' => {
                nesting -= 1;
                if nesting == 0 {
                    close_at = Some(pos);
                    break;
                }
            }
            b',' if nesting == 1 && comma_at.is_none() => comma_at = Some(pos),
            _ => {}
        }
    }

    // Without a balanced close there is no argument to report, even if a
    // top-level comma was seen.
    let close_at = close_at?;
    let stop = comma_at.unwrap_or(close_at);
    Some(args_part[..stop].trim())
}
/// Extract the validated variable from a condition text.
///
/// Handles two patterns:
@ -592,11 +643,10 @@ fn extract_validation_target(text: &str) -> Option<String> {
}
}
// Function call pattern: `func(x, ...)`, extract first argument
// Strip closing paren if present
let args_inner = args_part.trim_end().strip_suffix(')').unwrap_or(args_part);
// Take text up to first comma (first argument)
let first_arg = args_inner.split(',').next()?.trim();
// Function call pattern: `func(x, ...)`, extract first argument with
// balanced-paren scan so trailing wrapper parens (`(validate(x))`) do
// not corrupt the argument substring.
let first_arg = first_call_arg(args_part)?;
// Strip reference operators (e.g. `&x` → `x`)
let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim();
@ -630,11 +680,11 @@ fn extract_allowlist_target(text: &str) -> Option<String> {
if let Some(pos) = lower.find(method) {
let args_start = pos + method.len();
let args_part = &trimmed[args_start..];
let inner = args_part.strip_suffix(')').unwrap_or(args_part);
let first_arg = inner.split(',').next()?.trim();
let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg);
if !first_arg.is_empty() && is_identifier(first_arg) {
return Some(first_arg.to_string());
if let Some(first_arg) = first_call_arg(args_part) {
let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg);
if !first_arg.is_empty() && is_identifier(first_arg) {
return Some(first_arg.to_string());
}
}
}
}
@ -643,11 +693,11 @@ fn extract_allowlist_target(text: &str) -> Option<String> {
if let Some(pos) = lower.find("in_array(") {
let args_start = pos + "in_array(".len();
let args_part = &trimmed[args_start..];
let inner = args_part.strip_suffix(')').unwrap_or(args_part);
let first_arg = inner.split(',').next()?.trim();
let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg);
if !first_arg.is_empty() && is_identifier(first_arg) {
return Some(first_arg.to_string());
if let Some(first_arg) = first_call_arg(args_part) {
let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg);
if !first_arg.is_empty() && is_identifier(first_arg) {
return Some(first_arg.to_string());
}
}
}
@ -1063,6 +1113,32 @@ mod tests {
);
}
#[test]
fn extract_allowlist_target_negated_paren_wrapper() {
    // Tree-sitter hands over the if-condition with its surrounding
    // parens: `(!ALLOWED.includes(cmd))`. A plain `strip_suffix(')')`
    // used to leave `cmd)`, which `is_identifier` rejected because of the
    // trailing `)`, so the structural guard behind `cfg-unguarded-sink`
    // suppression was lost. With the balanced-paren scan the target must
    // be `Some("cmd")`.
    let classified = classify_condition_with_target("(!ALLOWED.includes(cmd))");
    assert_eq!(classified.0, PredicateKind::AllowlistCheck);
    assert_eq!(classified.1.as_deref(), Some("cmd"));
}
#[test]
fn extract_allowlist_target_java_contains_paren_wrapper() {
    // `.contains(...)` variant of the paren-wrapped negated allowlist
    // check: the target must still be recovered as `cmd`.
    let classified = classify_condition_with_target("(!ALLOWED.contains(cmd))");
    assert_eq!(classified.0, PredicateKind::AllowlistCheck);
    assert_eq!(classified.1.as_deref(), Some("cmd"));
}
#[test]
fn extract_allowlist_target_in_array_paren_wrapper() {
    // `in_array($cmd, $allowed)` inside wrapper parens: the first
    // argument is extracted and reported without its `$` sigil.
    let classified = classify_condition_with_target("(!in_array($cmd, $allowed))");
    assert_eq!(classified.0, PredicateKind::AllowlistCheck);
    assert_eq!(classified.1.as_deref(), Some("cmd"));
}
// ── TypeCheck classification ──────────────────────────────────────
#[test]

View file

@ -296,16 +296,16 @@ pub fn ssa_events_to_findings(
crate::taint::ssa_transfer::state::record_all_validated_span(span);
// Mirror the path-safety pathway: when the SSA engine has
// already proved every tainted input to a privileged
// FILE_IO sink passed through validation, publish the sink
// span so the state-analysis pass suppresses
// `state-unauthed-access` on the same span. Trust here
// matches the trust the engine already extends when
// dropping the taint flow finding. Scoped to FILE_IO sinks
// because that is the only sink class state-unauthed-access
// currently fires on; broadening would risk stretching
// validator-name heuristics into unrelated finding classes.
if event.sink_caps.intersects(Cap::FILE_IO) {
// already proved every tainted input to a privileged sink
// passed through validation, publish the sink span so the
// state-analysis pass suppresses `state-unauthed-access`
// on the same span. Trust here matches the trust the
// engine already extends when dropping the taint flow
// finding. Covers the privileged sink classes
// [`is_privileged_sink`] keys on (FILE_IO + SHELL_ESCAPE);
// broadening past those would stretch the validator-trust
// heuristic into unrelated finding classes.
if event.sink_caps.intersects(Cap::FILE_IO | Cap::SHELL_ESCAPE) {
crate::taint::ssa_transfer::state::record_path_safe_suppressed_span(span);
}
continue;

View file

@ -987,6 +987,7 @@ fn compute_succ_states(
&effective_vars,
ssa,
Some(transfer.interner),
effective_negated,
);
// Validation-call err-check narrowing. When the condition
@ -1522,7 +1523,13 @@ fn resolve_var_to_ssa_value(var_name: &str, ssa: &SsaBody, block: BlockId) -> Op
/// variables) and updates its [`PathFact`] according to the classified
/// rejection / assertion idiom.
///
/// Gated on `transfer.lang == Lang::Rust` by the caller.
/// `negated` reflects the effective negation of `cond_text`: when true,
/// the condition's surface form is `!<cond_text>` (or `not <cond_text>`)
/// and the True/False successor states correspond to the *rejection* /
/// *surviving* arms inverted relative to the unwrapped condition. The
/// narrowing functions are written against the unwrapped condition; this
/// flag lets the caller route prefix-lock / rejection-axis narrowing to
/// the arm where the unwrapped condition holds.
#[cfg(test)]
fn apply_path_fact_branch_narrowing(
true_state: &mut SsaTaintState,
@ -1538,6 +1545,7 @@ fn apply_path_fact_branch_narrowing(
effective_vars,
ssa,
None,
false,
);
}
@ -1548,10 +1556,12 @@ fn apply_path_fact_branch_narrowing_with_interner(
effective_vars: &[String],
ssa: &SsaBody,
interner: Option<&SymbolInterner>,
negated: bool,
) {
use crate::abstract_interp::PathFact;
use crate::abstract_interp::path_domain::{
PathAssertion, PathRejection, classify_path_assertion, classify_path_rejection_axes,
cond_has_pre_negated_islocal_clause,
};
let rejection_axes = classify_path_rejection_axes(cond_text);
@ -1561,24 +1571,44 @@ fn apply_path_fact_branch_narrowing_with_interner(
return;
}
// Mark validated_may on the false branch when a path-rejection
// Resolve the "safe arm" for the rejection axes.
//
// `classify_path_rejection_axes` reports axes that hold on the FALSE
// branch of `cond_text` AS WRITTEN, with one exception: the
// `!filepath.IsLocal(...)` Go idiom is matched at the clause level
// and the classifier consumes the leading `!` itself (the safe arm
// remains the FALSE branch of the whole condition).
//
// For polarity-blind atoms like `!path.contains("..")`, the
// classifier ignores the leading `!` and still extracts `..`. In
// that shape, AST detects the unary `!` and sets
// `condition_negated = true`, but the rejection axis's *true* safe
// arm is the TRUE branch of the whole condition. So when
// `negated == true` AND no clause is the pre-negated IsLocal idiom,
// flip the narrow target.
let rejection_pre_negated = cond_has_pre_negated_islocal_clause(cond_text);
let rejection_safe_is_true = negated && !rejection_pre_negated;
// Mark validated_may on the safe arm when a path-rejection
// pattern fires. Mirrors the AllowlistCheck quirk that already
// marks validated on the rejection-arm via `apply_branch_predicates`
// for languages whose `.contains(...)` / membership idiom hits the
// AllowlistCheck classifier, but normalises behaviour for shapes
// like C `strstr(path, "..") != NULL` that hit the NullCheck arm
// first and never get a chance to mark validation through the
// allowlist path. Once the path-rejection classifier has accepted
// the condition, the false branch (where the sink is reached after
// the rejection-arm terminates) is the validated arm by
// construction.
// allowlist path.
if !rejection_axes.is_empty()
&& let Some(intern) = interner
{
let safe_state: &mut SsaTaintState = if rejection_safe_is_true {
&mut *true_state
} else {
&mut *false_state
};
for var in effective_vars {
if let Some(sym) = intern.get(var) {
false_state.validated_may.insert(sym);
false_state.validated_must.insert(sym);
safe_state.validated_may.insert(sym);
safe_state.validated_must.insert(sym);
}
}
}
@ -1632,15 +1662,47 @@ fn apply_path_fact_branch_narrowing_with_interner(
}
};
// Apply rejection axes to the safe arm. The rejection classifier
// (`has_negated_filepath_is_local` + `classify_path_rejection_atom`)
// reports axes that hold on the FALSE branch of `cond_text` AS
// WRITTEN, with one exception: the `!filepath.IsLocal(...)` Go idiom
// is matched at the clause level and the classifier consumes the
// leading `!` itself (safe arm remains the FALSE branch).
//
// For polarity-blind atoms like `!path.contains("..")` the classifier
// ignores the leading `!` but AST-level negation flips the safe arm
// to TRUE. Use the same `rejection_safe_is_true` resolution as the
// validated-marker block above so soundness is consistent.
let rejection_state: &mut SsaTaintState = if rejection_safe_is_true {
&mut *true_state
} else {
&mut *false_state
};
for v in &targets {
if let Some(ref mut abs) = false_state.abstract_state {
if let Some(ref mut abs) = rejection_state.abstract_state {
let mut av = abs.get(*v);
narrow_false(&mut av.path);
if !av.is_top() {
abs.set(*v, av);
}
}
if let Some(ref mut abs) = true_state.abstract_state {
}
// Apply prefix-lock assertion to the cond-holds branch. Unlike the
// rejection classifier, `classify_path_assertion` is naive about
// leading negation — it just searches cond_text for a
// `starts_with`-like substring. When `condition_negated` is true
// (e.g. `if !target.startsWith(ROOT) { return; }`) the assertion
// actually holds on the *false* CFG edge, where the sink is reached.
// Flip the destination state in that case so the lock attaches to
// the surviving block.
let assertion_state = if negated {
&mut *false_state
} else {
&mut *true_state
};
for v in &targets {
if let Some(ref mut abs) = assertion_state.abstract_state {
let mut av = abs.get(*v);
narrow_true(&mut av.path);
if !av.is_top() {
@ -3024,6 +3086,80 @@ pub(super) fn transfer_inst(
return;
}
// Chain-wrapper sanitiser detection. Computed up-front so
// both the container-element-write hook and the outer-
// callee taint suppression block below can consult it.
// Walks `info.arg_callees` for the chain shape
// `outer(... wrapper(<source>) ...)`, collecting any
// sanitiser caps the wrapper's summary or label exposes.
// The set is empty when there is no chain wrapper or when
// none of the wrappers expose sanitisation.
//
// Argument attribution: when `find_classifiable_inner_call`
// overrode the callee to an inner Source, the source can be
// either (a) a direct argument call (`outer(escape(x),
// source())`) or (b) nested inside one wrapper
// (`outer(escape(source(x)))`). Crediting any wrapper's
// sanitizer caps when the source sits in a different argument
// position would suppress real taint flow.
//
// * `source_arg_pos = Some(N)` — the source call is the
// immediate callee of arg N (`arg_callees[N] == callee`).
// No other-arg wrapper can sanitize it. Credit nothing.
// * `source_arg_pos = None` — the source is nested inside
// some arg's wrapper. Credit only when exactly one arg
// has a sanitizing wrapper, since that one must be the
// parent of the nested source. Multiple sanitizing
// wrappers across different positions is ambiguous; stay
// conservative and credit nothing.
let caller_func_for_chain = info.ast.enclosing_func.as_deref().unwrap_or("");
let mut chain_wrapper_sanitizer_caps = Cap::empty();
if !info.arg_callees.is_empty() {
let source_arg_pos = info
.arg_callees
.iter()
.position(|c| c.as_deref() == Some(callee.as_str()));
let mut per_arg_sanitizer_caps: SmallVec<[Cap; 4]> = SmallVec::new();
for (idx, maybe_callee) in info.arg_callees.iter().enumerate() {
if Some(idx) == source_arg_pos {
continue;
}
let Some(wrap_callee) = maybe_callee else {
continue;
};
if Some(wrap_callee.as_str()) == info.call.outer_callee.as_deref() {
continue;
}
let mut caps_here = Cap::empty();
if let Some(resolved) = resolve_callee_hinted(
transfer,
wrap_callee,
caller_func_for_chain,
info.call.call_ordinal,
None,
) {
caps_here |= resolved.sanitizer_caps;
} else {
let labels = crate::labels::classify_all(
transfer.lang.as_str(),
wrap_callee,
transfer.extra_labels,
);
for lbl in &labels {
if let DataLabel::Sanitizer(bits) = lbl {
caps_here |= *bits;
}
}
}
if !caps_here.is_empty() {
per_arg_sanitizer_caps.push(caps_here);
}
}
if source_arg_pos.is_none() && per_arg_sanitizer_caps.len() == 1 {
chain_wrapper_sanitizer_caps = per_arg_sanitizer_caps[0];
}
}
// Container element-write hook. Runs before other Call-arm
// processing so `try_container_propagation`'s early-return
// can't bypass us. Writes only into `(loc, ELEM)` cells on
@ -3033,8 +3169,48 @@ pub(super) fn transfer_inst(
// through: cell `must = AND` over args (every writer must be
// must-validated), `may = OR` over args. Anonymous SSA temps
// contribute `false/false` and break the `must` invariant.
if let (Some(pf), Some(rcv)) = (transfer.pointer_facts, *receiver) {
if crate::pointer::is_container_write_callee(callee) {
//
// Two callee shapes:
// * Method-style write (`receiver.push(val)`) — `receiver`
// channel resolves the container, value args start at
// position 0.
// * Go `append` builtin (or chain shape with
// `outer_callee == "append"`) — no receiver channel,
// `args[0]` is the slice itself, value args start at
// position 1.
if let Some(pf) = transfer.pointer_facts {
let go_append_chain = transfer.lang == Lang::Go
&& receiver.is_none()
&& (callee == "append" || info.call.outer_callee.as_deref() == Some("append"));
// For Go append, args[0] is the input slice whose
// points-to set may be empty when the slice was just
// initialised with a composite literal (`cmds :=
// []string{}`). The call result (inst.value) carries
// the fresh allocation site that pointer analysis
// attaches to every Call op, and downstream uses of
// the slice flow through that result, so it is the
// authoritative container identity. Fall back to
// args[0] when the result has no pt set yet.
let resolved_recv: Option<SsaValue> = if let Some(rcv) = *receiver {
Some(rcv)
} else if go_append_chain {
let result_v = inst.value;
let result_pt = pf.pt(result_v);
if !result_pt.is_empty() && !result_pt.is_top() {
Some(result_v)
} else {
args.first().and_then(|a| a.first().copied())
}
} else {
None
};
let value_arg_start = if go_append_chain { 1 } else { 0 };
let write_callee_match = if go_append_chain {
true
} else {
crate::pointer::is_container_write_callee(callee)
};
if let (Some(rcv), true) = (resolved_recv, write_callee_match) {
let pt = pf.pt(rcv);
if !pt.is_empty() && !pt.is_top() {
let mut elem_caps = Cap::empty();
@ -3043,7 +3219,7 @@ pub(super) fn transfer_inst(
let mut elem_must_all = true; // AND over args (vacuously true for empty args)
let mut elem_may_any = false; // OR over args
let mut saw_any_arg = false;
for arg_group in args {
for arg_group in args.iter().skip(value_arg_start) {
for &arg_v in arg_group {
saw_any_arg = true;
if let Some(t) = state.get(arg_v) {
@ -3059,6 +3235,35 @@ pub(super) fn transfer_inst(
elem_may_any |= av;
}
}
// Chain-shape Go append: the inner Source label
// fires on this same call instruction, so its
// caps are not yet on any positional arg's SSA
// value at this point. Pull them in directly
// from the source labels so the W4 cell sees
// the real source caps; without this the cell
// is empty for the chain shape and the index-
// read taint flow appears clean for the wrong
// reason.
if go_append_chain {
for lbl in &info.taint.labels {
if let DataLabel::Source(bits) = lbl {
elem_caps |= *bits;
saw_any_arg = true;
}
}
// A chain-shape sanitising wrapper around the
// source counts as the validation that the
// ELEM cell needs. Each entry in
// `info.arg_callees` whose summary or label
// exposes non-empty `sanitizer_caps`
// contributes to validation: the cell's
// must/may bits flip on so the index-read
// counterpart sees the value as validated.
if !chain_wrapper_sanitizer_caps.is_empty() {
elem_must_all = true;
elem_may_any = true;
}
}
// Vacuous AND: a zero-arg container write supplies
// no validation source, so coerce must to false.
if !saw_any_arg {
@ -3204,6 +3409,20 @@ pub(super) fn transfer_inst(
}
}
// Call-site replace sanitizer detection. Recognises
// `s.replace*(pat, rep)` / `strings.ReplaceAll(s, pat, rep)` /
// `str_replace($pat, $rep, $s)` shapes whose pattern is a
// concrete shell/HTML/SQL escape literal and treats the call
// as a sanitizer for the corresponding caps. Mirrors the
// semantics that label-rule sanitizers already provide.
if let Some(extra) = crate::symex::strings::detect_call_site_replace_sanitizer(
callee,
transfer.lang,
&info.call.arg_string_literals,
) {
sanitizer_bits |= extra;
}
// Resolve callee summary, always attempt, even when explicit
// labels are present. Labels take precedence for source caps, but
// summary propagation and sanitizer behaviour must still apply
@ -4006,7 +4225,10 @@ pub(super) fn transfer_inst(
// produces return_bits. Check if the wrapper function blocks taint:
// if its SSA summary shows no propagation, no source_caps, and no
// container identity return, the return value is independent of its
// arguments, clear return_bits.
// arguments, clear return_bits. Additionally apply the wrapper's
// sanitizer caps (StripBits transforms) so a sanitising wrapper
// like `validate(<source>)` clears the relevant cap bits even
// when the wrapper still propagates other taint.
if !return_bits.is_empty() && has_source_label {
if let Some(ref oc) = info.call.outer_callee {
if let Some(ref oc_sum) = resolve_callee_hinted(
@ -4021,11 +4243,36 @@ pub(super) fn transfer_inst(
// no internal sources reaching return.
return_bits = Cap::empty();
return_origins.clear();
} else if !oc_sum.sanitizer_caps.is_empty() {
return_bits &= !oc_sum.sanitizer_caps;
}
}
}
}
// Chain-wrapper sanitizer suppression: when the chain shape
// `outer(... wrapper(<source>) ...)` puts a sanitising wrapper
// function between the inner Source and the outer call,
// mark the call result's symbol as validated so any
// downstream sink event over the same value fires with
// `all_validated = true`, suppressing the taint finding and
// (via [`record_path_safe_suppressed_span`]) the
// `state-unauthed-access` finding on the same span.
// `chain_wrapper_sanitizer_caps` is computed up-front above
// so the container-element-write hook can also consult it.
if has_source_label && !chain_wrapper_sanitizer_caps.is_empty() {
if let Some(name) = ssa
.value_defs
.get(inst.value.0 as usize)
.and_then(|vd| vd.var_name.as_deref())
{
if let Some(sym) = transfer.interner.get(name) {
state.validated_must.insert(sym);
state.validated_may.insert(sym);
}
}
}
// Constructor cap narrowing: a `new X(...)` call returns an object
// instance, not a string. Caps that name a string-shaped sink
// pattern (path argument, format string, URL component, JSON
@ -7654,11 +7901,12 @@ fn is_abstract_safe_for_sink(
}
/// Check every tainted leaf flowing into `inst`'s used values carries a
/// PathFact proving it is dotdot-free and non-absolute.
/// PathFact proving it cannot perform path traversal.
///
/// Core gate for the rs-safe-0** FP closure (see [`PathFact::is_path_safe`]).
/// Traces through Assign chains so `Path::new(sanitised)` still resolves
/// to the sanitised string's fact.
/// Core gate for the rs-safe-0** FP closure plus the canonicalised+rooted
/// shape (see [`PathFact::is_path_traversal_safe`]). Traces through
/// Assign chains so `Path::new(sanitised)` still resolves to the
/// sanitised string's fact.
fn is_path_safe_for_sink(
inst: &SsaInst,
state: &SsaTaintState,
@ -7670,7 +7918,9 @@ fn is_path_safe_for_sink(
if leaves.is_empty() {
return false;
}
let safe = leaves.iter().all(|v| abs.get(*v).path.is_path_safe());
let safe = leaves
.iter()
.all(|v| abs.get(*v).path.is_path_traversal_safe());
if safe {
// Publish the suppression to the file-level set so the
// state-analysis pass can suppress `state-unauthed-access` on
@ -7925,7 +8175,7 @@ fn trace_single_leaf(
// existing trace-through-args behaviour.
let proves_path_safe = state.abstract_state.as_ref().is_some_and(|abs_state| {
let f = abs_state.get(v).path;
!f.is_top() && f.is_path_safe()
!f.is_top() && f.is_path_traversal_safe()
});
if is_source || proves_path_safe {
leaves.push(v);

View file

@ -1229,6 +1229,80 @@ mod goto_succ_propagation_tests {
);
}
#[test]
fn path_fact_negated_contains_dotdot_narrows_true_branch() {
    // Guard under test: `if !path.contains("..") { return; } sink(path);`.
    // The arm that survives to the sink is the TRUE branch of the IF
    // condition, so the DotDot rejection axis has to be applied to
    // `true_state`. Narrowing `false_state` instead would hand dotdot=No to
    // the unsafe arm and let sink suppression hide the bug.
    let body = ssa_body_with_named_value("path");
    let mut on_true = initial_state_with_abstract();
    let mut on_false = initial_state_with_abstract();
    let cond_vars = ["path".to_string()];

    super::super::apply_path_fact_branch_narrowing_with_interner(
        &mut on_true,
        &mut on_false,
        "!path.contains(\"..\")",
        &cond_vars,
        &body,
        None,
        true,
    );

    // Both abstract states must exist before any axis is inspected.
    let true_abs = on_true.abstract_state.as_ref().unwrap();
    let false_abs = on_false.abstract_state.as_ref().unwrap();
    let probe = SsaValue(0);

    let dotdot_on_true = true_abs.get(probe).path.dotdot;
    assert_eq!(
        dotdot_on_true,
        crate::abstract_interp::Tri::No,
        "negated-contains: TRUE arm (sink-reaching, safe) must narrow"
    );

    let dotdot_on_false = false_abs.get(probe).path.dotdot;
    assert_eq!(
        dotdot_on_false,
        crate::abstract_interp::Tri::Maybe,
        "negated-contains: FALSE arm (rejection arm) must NOT narrow"
    );
}
#[test]
fn path_fact_negated_filepath_islocal_narrows_false_branch() {
    // Go idiom under test: `if !filepath.IsLocal(p) { return; } sink(p);`.
    // The leading `!` is consumed by the classifier's pre-negated handler,
    // so even with `condition_negated == true` at the AST level the safe
    // arm is still the FALSE branch of the whole condition.
    let body = ssa_body_with_named_value("p");
    let mut on_true = initial_state_with_abstract();
    let mut on_false = initial_state_with_abstract();
    let cond_vars = ["p".to_string()];

    super::super::apply_path_fact_branch_narrowing_with_interner(
        &mut on_true,
        &mut on_false,
        "!filepath.IsLocal(p)",
        &cond_vars,
        &body,
        None,
        true,
    );

    // Both abstract states must exist before any axis is inspected.
    let true_abs = on_true.abstract_state.as_ref().unwrap();
    let false_abs = on_false.abstract_state.as_ref().unwrap();
    let probe = SsaValue(0);

    // FALSE arm: both the dotdot and the absolute axes are expected to narrow.
    let false_path = false_abs.get(probe).path;
    assert_eq!(
        false_path.dotdot,
        crate::abstract_interp::Tri::No,
        "!filepath.IsLocal: FALSE arm (sink-reaching, IsLocal=true) must narrow"
    );
    assert_eq!(
        false_path.absolute,
        crate::abstract_interp::Tri::No,
        "!filepath.IsLocal: FALSE arm absolute axis must narrow"
    );

    // TRUE arm: the early-return side must be left un-narrowed.
    let dotdot_on_true = true_abs.get(probe).path.dotdot;
    assert_eq!(
        dotdot_on_true,
        crate::abstract_interp::Tri::Maybe,
        "!filepath.IsLocal: TRUE arm (return) must NOT narrow"
    );
}
#[test]
fn path_fact_no_match_leaves_state_untouched() {
let ssa = ssa_body_with_named_value("x");