Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
This commit is contained in:
Eli Peter 2026-05-01 10:59:52 -04:00 committed by GitHub
parent a438886217
commit 58f1794a4e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
189 changed files with 8421 additions and 383 deletions

View file

@ -1118,6 +1118,7 @@ fn clone_preserves_all_sub_structs() {
arg_string_literals: vec![Some("lit".into())],
destination_uses: None,
gate_filters: Vec::new(),
is_constructor: false,
},
taint: TaintMeta {
labels: {

View file

@ -373,11 +373,26 @@ pub(crate) fn first_member_label(
if let Some(full) = member_expr_text(n, code) {
// Try the full text first, then progressively strip the last segment
// to match rules like "process.env" from "process.env.CMD".
//
// The strip-and-retry only ever yields a sound label for Sources:
// `process.env.CMD` → strip → `process.env` makes sense because
// the receiver itself IS the source. Sinks and Sanitizers, by
// contrast, name the *operation* — `connection.query`, `eval`,
// `exec` — and stripping a trailing segment to match them is
// not semantically valid (e.g. `exec.start` should never be
// treated as a SHELL_ESCAPE sink because of bare `exec`). We
// accept any label on a full-text match (the behaviour callers
// already depend on for Source/Sink labels alike), but only
// accept Source labels after segment stripping.
let mut candidate = full.as_str();
let mut first = true;
loop {
if let Some(lbl) = classify(lang, candidate, extra_labels) {
return Some(lbl);
if first || matches!(lbl, DataLabel::Source(_)) {
return Some(lbl);
}
}
first = false;
match candidate.rsplit_once('.') {
Some((prefix, _)) => candidate = prefix,
None => break,

View file

@ -38,25 +38,27 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
}
}
/// Extract identifiers from specified fields of an object-literal argument.
/// Extract `(field_name, ident_name)` pairs from specified fields of an
/// object-literal argument.
///
/// Returns:
/// * `Some(names)` if the positional argument at `index` IS an object literal
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
/// identifiers lifted from pair values whose key matches any entry in
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
/// pairs are present, returns `Some(vec![])`, the sink is effectively
/// silenced because no destination identifier exists.
/// * `Some(pairs)` if the positional argument at `index` IS an object literal
/// (JS `object`, TS `object`, Python `dictionary`). Each pair is
/// `(field_name, ident_name)` where `field_name` is the matched key from
/// `fields` and `ident_name` is an identifier lifted from that pair's
/// value expression. When no destination-field pairs are present, returns
/// `Some(vec![])`, the sink is effectively silenced because no destination
/// identifier exists.
/// * `None` if the arg is absent, is not an object literal (plain string
/// / ident / expression), or has splat/spread children that break static
/// per-field reasoning. Callers fall back to the whole-arg positional
/// filter in this case.
pub(super) fn extract_destination_field_idents(
pub(super) fn extract_destination_field_pairs(
call_node: Node,
arg_index: usize,
fields: &[&str],
code: &[u8],
) -> Option<Vec<String>> {
) -> Option<Vec<(String, String)>> {
if fields.is_empty() {
return None;
}
@ -71,7 +73,7 @@ pub(super) fn extract_destination_field_idents(
return None;
}
let mut out: Vec<String> = Vec::new();
let mut out: Vec<(String, String)> = Vec::new();
let mut c = arg.walk();
for child in arg.named_children(&mut c) {
match child.kind() {
@ -88,8 +90,8 @@ pub(super) fn extract_destination_field_idents(
let Some(name) = text_of(child, code) else {
continue;
};
if fields.iter().any(|&f| f == name) && !out.contains(&name) {
out.push(name);
if fields.iter().any(|&f| f == name) && !out.iter().any(|(_, v)| v == &name) {
out.push((name.clone(), name));
}
}
"pair" => {
@ -124,8 +126,8 @@ pub(super) fn extract_destination_field_idents(
let mut paths: Vec<String> = Vec::new();
collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
for name in paths.into_iter().chain(idents) {
if !out.contains(&name) {
out.push(name);
if !out.iter().any(|(_, v)| v == &name) {
out.push((key.clone(), name));
}
}
}
@ -135,6 +137,62 @@ pub(super) fn extract_destination_field_idents(
Some(out)
}
/// Collect `(field_name, ident_name)` pairs from keyword-style arguments of
/// a call.
///
/// Only `keyword_argument` / `named_argument` children of the call's
/// `arguments` node are inspected, and only those whose keyword name is
/// listed in `fields`. This covers call shapes where destination-bearing
/// values are passed as direct kwargs instead of inside a dict literal, e.g.
/// Python `requests.post(url, data=tainted, json=safe)`.
///
/// For every matching kwarg, the identifiers (dotted paths first, then bare
/// identifiers) found in its value expression are emitted, each tagged with
/// the kwarg name. An identifier that has already been recorded is not
/// recorded again, so it stays attributed to the first kwarg it appeared in.
///
/// Returns an empty vector when `fields` is empty, when the call has no
/// `arguments` field, or when no kwarg matches.
pub(super) fn extract_destination_kwarg_pairs(
    call_node: Node,
    fields: &[&str],
    code: &[u8],
) -> Vec<(String, String)> {
    let mut pairs: Vec<(String, String)> = Vec::new();
    if fields.is_empty() {
        return pairs;
    }
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return pairs;
    };

    let mut walker = arg_list.walk();
    for kwarg in arg_list.named_children(&mut walker) {
        if !matches!(kwarg.kind(), "keyword_argument" | "named_argument") {
            continue;
        }

        // Prefer the grammar's `name` / `value` fields; when a grammar does
        // not expose them, fall back to the first / last named child.
        let last_index = kwarg.named_child_count().saturating_sub(1) as u32;
        let Some(key_node) = kwarg
            .child_by_field_name("name")
            .or_else(|| kwarg.named_child(0))
        else {
            continue;
        };
        let Some(value_node) = kwarg
            .child_by_field_name("value")
            .or_else(|| kwarg.named_child(last_index))
        else {
            continue;
        };

        let Some(key) = text_of(key_node, code) else {
            continue;
        };
        if !fields.iter().any(|&f| f == key) {
            continue;
        }

        let mut bare = Vec::new();
        let mut dotted = Vec::new();
        collect_idents_with_paths(value_node, code, &mut bare, &mut dotted);
        for ident in dotted.into_iter().chain(bare) {
            let already_recorded = pairs.iter().any(|(_, v)| v == &ident);
            if !already_recorded {
                pairs.push((key.clone(), ident));
            }
        }
    }
    pairs
}
/// Extract the string-literal content at argument position `index` (0-based).
/// Returns `None` if the argument is not a string literal or the index is out of range.
pub(super) fn extract_const_string_arg(
@ -144,7 +202,14 @@ pub(super) fn extract_const_string_arg(
) -> Option<String> {
let args = call_node.child_by_field_name("arguments")?;
let mut cursor = args.walk();
let arg = args.named_children(&mut cursor).nth(index)?;
let mut arg = args.named_children(&mut cursor).nth(index)?;
// PHP / Go wrap each positional argument in an `argument` node; unwrap so
// the kind-match below sees the inner literal.
if arg.kind() == "argument" && arg.named_child_count() == 1 {
if let Some(inner) = arg.named_child(0) {
arg = inner;
}
}
match arg.kind() {
// `string` / `string_literal` cover JS/TS, Python, Java, PHP, C/C++, Ruby, Rust;
// `interpreted_string_literal` / `raw_string_literal` cover Go's
@ -177,6 +242,39 @@ pub(super) fn extract_const_string_arg(
}
}
/// Return the identifier text of the positional argument at `index`
/// (0-based) when that argument is a bare constant name.
///
/// This is the counterpart of the string-literal extractor for languages
/// where activation values are symbolic constants rather than strings, such
/// as C / C++ preprocessor macros and PHP `define`d constants like
/// `CURLOPT_POSTFIELDS`, which parse as `identifier` / `name` nodes.
/// Qualified forms (`qualified_name`, `scoped_identifier`) are accepted too.
///
/// A single-child `argument` wrapper node around the value is unwrapped
/// before the kind check.
///
/// Returns `None` when the call has no `arguments` field, `index` is out of
/// range, or the argument is any other shape (variable expression, call,
/// literal, ...), so callers keep their dynamic-activation behaviour for
/// runtime values.
pub(super) fn extract_const_macro_arg(
    call_node: Node,
    index: usize,
    code: &[u8],
) -> Option<String> {
    let arg_list = call_node.child_by_field_name("arguments")?;
    let mut walker = arg_list.walk();
    let positional = arg_list.named_children(&mut walker).nth(index)?;

    // Some grammars wrap each positional value in an `argument` node; look
    // through it when it holds exactly one named child.
    let target = match positional.kind() {
        "argument" if positional.named_child_count() == 1 => {
            positional.named_child(0).unwrap_or(positional)
        }
        _ => positional,
    };

    let is_constant_name = matches!(
        target.kind(),
        "identifier" | "name" | "qualified_name" | "scoped_identifier"
    );
    if !is_constant_name {
        return None;
    }
    text_of(target, code).map(|text| text.to_string())
}
/// Extract the value of a keyword argument from a call node (e.g. Python `shell=True`).
/// Walks argument children looking for `keyword_argument` nodes, matches the keyword
/// name, and extracts the value node text for literals.
@ -1546,6 +1644,59 @@ pub(super) fn def_use(
(None, uses, vec![])
}
// for-in / for-of / Python `for x in iter:` ─────────────────────────
//
// Tree-sitter classifies these as `Kind::For` with a `left`/`right`
// field pair (binding pattern + iterable). Without an explicit
// arm here, the default branch collects every ident as a `use` and
// never registers the iteration binding as a `define`, so taint
// entering the iterable does not propagate into the body's
// references to the binding (`for (const [a, b] of obj) { sink(a) }`
// would lose the flow at `a`).
//
// C-style `for_statement` has no `left`/`right` fields (it uses
// `initializer`/`condition`/`increment`), so this path falls through
// to the default-collecting behaviour for those, preserving today's
// semantics.
Kind::For => {
let left = ast.child_by_field_name("left");
let right = ast.child_by_field_name("right");
if left.is_none() && right.is_none() {
// C-style for, defer to default ident collection.
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
let mut uses = paths;
uses.extend(idents);
return (None, uses, vec![]);
}
let mut defs: Option<String> = None;
let mut extra_defs: Vec<String> = Vec::new();
let mut uses: Vec<String> = Vec::new();
if let Some(pat) = left {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(pat, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
}
}
defs = first;
}
if let Some(val) = right {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(val, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
}
(defs, uses, extra_defs)
}
// everything else no definition, but may read vars
_ => {
let mut idents = Vec::new();
@ -1557,3 +1708,225 @@ pub(super) fn def_use(
}
}
}
/// One shell-array literal reported by [`extract_shell_array_payload_idents`].
///
/// A match describes an array of the shape `[<shell>, <exec-flag>, <payload>...]`
/// found either directly as a call argument or as a field value inside an
/// object-literal argument.
#[derive(Debug, Clone)]
pub(super) struct ShellArrayMatch {
/// Index of the argument that carries the shell-array, counted over the
/// named children of the call's `arguments` node. For an array nested in
/// an object-literal argument this is the index of that enclosing object.
pub arg_position: usize,
/// Identifiers and dotted paths lifted from the payload elements
/// (array positions 2 and later), de-duplicated in first-seen order.
/// As produced by [`extract_shell_array_payload_idents`] this is never
/// empty: arrays whose payload references no identifier (a fully constant
/// command) are not reported at all.
pub payload_idents: Vec<String>,
}
/// Find inline shell-execution array literals among the arguments of a call.
///
/// An array qualifies when its first element is a constant string naming a
/// known shell, its second element is a constant string that is the
/// "run this command" flag for that shell, and at least one later element
/// references an identifier (see `shell_array_payload_idents_of`). Arrays
/// such as `["ls", "-la"]` or `["git", "rev-parse", "HEAD"]` therefore never
/// match, and neither does a fully constant `["bash", "-c", "ls -la"]`.
///
/// Two placements are recognised:
/// * the array is itself a positional argument of `call_node`;
/// * the array is the value of any `pair` inside an `object` / `dictionary`
///   argument, e.g. `container.exec({Cmd: ["bash", "-c", x]})`. The field
///   name is not inspected; the array shape alone decides.
///
/// Returns one [`ShellArrayMatch`] per qualifying array, in argument order.
/// The result is empty when the call has no `arguments` field, when nothing
/// qualifies, or when any argument is a spread / splat: in that case the
/// whole call is abandoned, including matches found before the splat,
/// because positional indices can no longer be trusted.
pub(super) fn extract_shell_array_payload_idents(
    call_node: Node,
    code: &[u8],
) -> Vec<ShellArrayMatch> {
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };

    let mut found = Vec::new();
    let mut walker = arg_list.walk();
    for (arg_position, arg) in arg_list.named_children(&mut walker).enumerate() {
        match arg.kind() {
            // Splats break positional indexing; give up on the whole call.
            "spread_element"
            | "dictionary_splat"
            | "list_splat"
            | "splat_argument"
            | "hash_splat_argument" => return Vec::new(),
            // Keyword-style arguments are not scanned for shell arrays.
            "keyword_argument" | "named_argument" => continue,
            _ => {}
        }

        // Case 1: the argument itself is the shell-array literal.
        if let Some(payload_idents) = shell_array_payload_idents_of(arg, code) {
            found.push(ShellArrayMatch {
                arg_position,
                payload_idents,
            });
            continue;
        }

        // Case 2: an object literal with shell-array literals as field values.
        if !matches!(arg.kind(), "object" | "dictionary") {
            continue;
        }
        let mut pair_walker = arg.walk();
        let nested = arg
            .named_children(&mut pair_walker)
            .filter(|entry| entry.kind() == "pair")
            .filter_map(|entry| entry.child_by_field_name("value"))
            .filter_map(|value| shell_array_payload_idents_of(unwrap_parens(value), code))
            .map(|payload_idents| ShellArrayMatch {
                arg_position,
                payload_idents,
            });
        found.extend(nested);
    }
    found
}
/// Lift the payload identifiers out of a shell-exec array literal.
///
/// After unwrapping parentheses, `node` must be an `array` with at least
/// three named children where element 0 is a constant string accepted by
/// `is_known_shell` and element 1 is a constant string accepted by
/// `is_shell_command_flag` for that shell. Everything from element 2 onwards
/// is the payload.
///
/// Returns `Some(names)` with the dotted paths followed by the bare
/// identifiers referenced anywhere in the payload, de-duplicated in
/// first-seen order. Returns `None` when the shape does not match, and also
/// when the payload references no identifier at all: a fully constant
/// command cannot carry taint, so no match is reported for it. The function
/// never returns `Some` with an empty vector.
fn shell_array_payload_idents_of(node: Node, code: &[u8]) -> Option<Vec<String>> {
    let array = unwrap_parens(node);
    if array.kind() != "array" {
        return None;
    }

    // Named children only, so punctuation between elements is skipped.
    let mut walker = array.walk();
    let elements: Vec<Node> = array.named_children(&mut walker).collect();
    let [shell_node, flag_node, payload @ ..] = elements.as_slice() else {
        return None;
    };
    if payload.is_empty() {
        return None;
    }

    let shell = const_string_value(*shell_node, code)?;
    if !is_known_shell(&shell) {
        return None;
    }
    let flag = const_string_value(*flag_node, code)?;
    if !is_shell_command_flag(&shell, &flag) {
        return None;
    }

    // Constant payload elements contribute no names.
    let mut bare: Vec<String> = Vec::new();
    let mut dotted: Vec<String> = Vec::new();
    for element in payload {
        collect_idents_with_paths(*element, code, &mut bare, &mut dotted);
    }

    // Paths first, then bare identifiers; keep only the first occurrence.
    let mut seen = std::collections::HashSet::new();
    let unique: Vec<String> = dotted
        .into_iter()
        .chain(bare)
        .filter(|name| seen.insert(name.clone()))
        .collect();

    (!unique.is_empty()).then_some(unique)
}
/// Extract the constant text of a string-literal node, without its
/// delimiters.
///
/// Parentheses around `node` are unwrapped first. Accepted node kinds are
/// `string`, `string_literal`, `interpreted_string_literal`,
/// `raw_string_literal`, and `template_string` provided the template has no
/// `template_substitution` child (i.e. no interpolation).
///
/// The value is the node's source text with the first and last byte
/// removed; escape sequences are NOT decoded.
///
/// Returns `None` for every other node kind (identifiers, expressions,
/// interpolated templates), when the node text is unavailable or shorter
/// than two bytes, and when removing the outer bytes would cut through a
/// multi-byte character.
fn const_string_value(node: Node, code: &[u8]) -> Option<String> {
    let node = unwrap_parens(node);
    match node.kind() {
        "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {}
        "template_string" => {
            // An interpolated template is a runtime value, not a constant.
            let mut c = node.walk();
            if node
                .named_children(&mut c)
                .any(|ch| ch.kind() == "template_substitution")
            {
                return None;
            }
        }
        _ => return None,
    }

    let raw = text_of(node, code)?;
    // Checked slicing: a plain `raw[1..len - 1]` panics when either index is
    // not on a character boundary (for example a malformed literal whose
    // text ends in a multi-byte character). `get` yields `None` instead, and
    // also covers texts shorter than two bytes.
    let end = raw.len().checked_sub(1)?;
    raw.get(1..end).map(|inner| inner.to_string())
}
/// Final path component of a shell executable string.
///
/// Both `/` and `\` are treated as separators so that POSIX paths
/// (`/bin/bash`) and Windows paths (`C:\Windows\System32\cmd.exe`, including
/// the escaped `C:\\Windows\\...` spelling found in raw source text) reduce
/// to the bare executable name. A string without separators is returned
/// unchanged.
fn shell_leaf(shell: &str) -> &str {
    shell
        .rsplit(|c: char| matches!(c, '/' | '\\'))
        .next()
        .unwrap_or(shell)
}

/// Known shell executable names that activate the shell-array detector.
///
/// Scoped narrowly to POSIX shells and Windows command interpreters, listing
/// only canonical names so benign arrays like `["ls", ...]`, `["git", ...]`,
/// or `["python", ...]` do not match. Any directory prefix is ignored, with
/// either `/` or `\` as the separator: `/bin/bash` matches as `bash`, and
/// `C:\Windows\System32\cmd.exe` matches as `cmd.exe`.
fn is_known_shell(name: &str) -> bool {
    matches!(
        shell_leaf(name),
        "bash"
            | "sh"
            | "zsh"
            | "dash"
            | "ksh"
            | "fish"
            | "ash"
            | "tcsh"
            | "csh"
            | "cmd"
            | "cmd.exe"
            | "powershell"
            | "powershell.exe"
            | "pwsh"
            | "pwsh.exe"
    )
}

/// True when `flag` is the "execute the following string as a shell command"
/// switch for the given `shell`.
///
/// * `cmd` / `cmd.exe`: `/c`, `/C`, `/k`, `/K`.
/// * `powershell` / `pwsh` (with or without `.exe`): `-c`, `-Command`,
///   `-EncodedCommand`, compared ASCII case-insensitively because PowerShell
///   options are case-insensitive.
/// * Anything else is treated as a POSIX shell, which uses exactly `-c`.
///
/// `shell` may carry a directory prefix; it is reduced to its final path
/// component the same way as in [`is_known_shell`].
fn is_shell_command_flag(shell: &str, flag: &str) -> bool {
    match shell_leaf(shell) {
        "cmd" | "cmd.exe" => matches!(flag, "/c" | "/C" | "/k" | "/K"),
        "powershell" | "powershell.exe" | "pwsh" | "pwsh.exe" => {
            ["-c", "-command", "-encodedcommand"]
                .iter()
                .any(|known| flag.eq_ignore_ascii_case(known))
        }
        // POSIX shells.
        _ => flag == "-c",
    }
}

View file

@ -52,10 +52,11 @@ use literals::has_sql_placeholders;
use literals::{
arg0_kind_and_interpolation, call_ident_of, def_use, detect_go_replace_call_sanitizer,
detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals,
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg,
extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs,
extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep,
find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call,
java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
use params::{
@ -312,6 +313,15 @@ pub struct CallMeta {
/// [`Self::destination_uses`]).
#[serde(default)]
pub gate_filters: Vec<GateFilter>,
/// True when this call expression is a constructor invocation
/// (e.g. JS/TS `new Stripe(key)`, PHP `new PDO(...)`). The SSA Call
/// transfer uses this to narrow the constructed value's caps: a wrapper
/// object instance is structurally not a path string, format string,
/// URL component, or JSON input, so out-of-process side-effect bits
/// (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on the arguments
/// must not survive into the constructed object.
#[serde(default)]
pub is_constructor: bool,
}
/// One gate's contribution at a call site whose callee matches multiple
@ -329,6 +339,15 @@ pub struct GateFilter {
/// considers SSA values whose `var_name` matches one of `names` (object-
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
pub destination_uses: Option<Vec<String>>,
/// Parallel to [`Self::destination_uses`]: for each entry, the
/// destination object-literal field name (e.g. `"body"`, `"headers"`,
/// `"json"`) where the corresponding ident was bound. Empty when
/// `destination_uses` is `None` or the gate had no
/// `object_destination_fields` configured. Consumed by diag rendering
/// to embed the destination field in `DATA_EXFIL` messages and SARIF
/// `properties.data_exfil_field`.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub destination_fields: Vec<String>,
}
/// Taint-classification and variable-flow metadata.
@ -450,6 +469,13 @@ pub struct NodeInfo {
/// up the field's declared `TypeKind`. Strictly additive, when
/// `None`, the legacy copy-prop semantics apply.
pub member_field: Option<String>,
/// True when this assignment / declaration's RHS is a function or
/// lambda literal (`obj.handler = (e) => {...}`, `let f = function(){}`).
/// State analysis uses this to suppress resource-ownership transfer:
/// storing a function reference into a property does not move the
/// resources captured by the closure body, so the lifecycle of those
/// captures must remain unchanged on the assignment node.
pub rhs_is_function_literal: bool,
}
impl NodeInfo {
@ -1564,6 +1590,92 @@ pub(super) fn push_node<'a>(
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
let mut labels = classify_all(lang, &text, extra);
// Rust chain-text classification. The default `text` for a Rust
// CallMethod is `{root_receiver}.{method}`, where `root_receiver`
// is the leftmost identifier after walking through every nested
// call/method receiver. That convention loses the intermediate
// chain methods, so a body-binding chain like
// `Client::post(url).body(payload).send()` reduces to
// `Client::post.send` and rules keyed on `body.send` /
// `RequestBuilder.body` cannot fire.
//
// Reclassify against the call-AST's source text (with paren groups
// stripped) so suffix matchers covering chain shapes
// (`body.send`, `body_string`, `Request::builder.body`, ...) attach.
// Strictly additive: we union new labels with the existing ones,
// never override. Limited to Rust to avoid disturbing the other
// languages' chain conventions.
if lang == "rust" {
if let Some(cn) = find_call_node(ast, lang) {
if let Some(chain_raw) = text_of(cn, code) {
// Multi-line Rust chains (`Client::new()\n .post(url)\n
// .body(p)\n .send()`) preserve interior whitespace in
// the source slice, which would prevent suffix matchers
// like `body.send` from firing. Strip whitespace before
// normalizing paren groups, mirroring the same trick
// used by `find_chained_inner_call` for JS/TS chains.
let chain_compact: String =
chain_raw.chars().filter(|c| !c.is_whitespace()).collect();
let chain_text = crate::labels::normalize_chained_call_for_classify(&chain_compact);
if chain_text != text {
let chain_labels = classify_all(lang, &chain_text, extra);
for l in chain_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Also try classification against the chain with
// trailing identity methods peeled. Rust chains often
// end in `.unwrap()` / `.expect("...")` / `.await` /
// `.clone()` etc., which obscure the body-bind verb
// for suffix matchers. E.g. hyper's
// `Request::builder().method(..).uri(..).body(p).unwrap()`
// peels to `...body`, allowing a simpler `body` /
// `Request::builder.body` matcher to fire.
let peeled = crate::ssa::type_facts::peel_identity_suffix(&chain_text);
if peeled != chain_text && peeled != text {
let peeled_labels = classify_all(lang, &peeled, extra);
for l in peeled_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Pattern synthesis: the hyper request-builder chain
// (`hyper::Request::builder().method(..).uri(..).body(p)`)
// can interleave `.method`, `.uri`, `.header`, `.version`
// etc. between `Request::builder` and the body-bind step.
// Suffix matchers can't span those, so synthesise a
// DATA_EXFIL sink whenever the chain begins with
// `Request::builder` and ends in a body-binding verb.
// Strictly additive: no labels are removed, only added,
// and the synthesis only fires when an explicit Sink
// hasn't already attached.
let chain_for_synth = if peeled != chain_text {
&peeled
} else {
&chain_text
};
if !labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::DATA_EXFIL)))
&& (chain_for_synth.contains("Request::builder.")
|| chain_for_synth.contains("hyper::Request::builder."))
{
let last_seg =
chain_for_synth.rsplit('.').next().unwrap_or(chain_for_synth);
if matches!(
last_seg,
"body" | "body_mut" | "body_string" | "body_json" | "body_bytes"
) {
labels.push(DataLabel::Sink(crate::labels::Cap::DATA_EXFIL));
}
}
}
}
}
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is.
// When the callee is overridden, save the original for container ops
@ -1727,7 +1839,23 @@ pub(super) fn push_node<'a>(
let mut sink_payload_args: Option<Vec<usize>> = None;
let mut destination_uses: Option<Vec<String>> = None;
let mut gate_filters: Vec<GateFilter> = Vec::new();
if labels.is_empty() {
// Gates run when no flat `Sink` label is already present, OR when a
// matching gate restricts the payload-arg set on top of an existing flat
// sink. Source / Sanitizer labels are orthogonal — a callee like
// Python's `requests.post` is a `Source` for its response object AND a
// gated `Sink` for its URL/body argument positions; both should attach.
//
// Payload-arg refinement: when a flat sink matches a callee that ALSO
// has a gate entry restricting `payload_args`, the gate's `payload_args`
// are propagated to `sink_payload_args` so only those positions are
// taint-checked. Example: `execSync(cmd, { env: process.env })` matches
// the bare `execSync` flat `Sink(SHELL_ESCAPE)` AND the gate `=execSync`
// with `payload_args: &[0]`; without the refinement, the flat rule's
// implicit "all args" would flag `process.env` flowing into the options
// object's `env` field. The gate's labels themselves are deduped so a
// single capability never double-attributes.
let has_sink_label = labels.iter().any(|l| matches!(l, DataLabel::Sink(_)));
{
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(cn) = gate_call {
let gate_callee_text = if call_ast.is_some() {
@ -1746,7 +1874,22 @@ pub(super) fn push_node<'a>(
let matches = classify_gated_sink(
lang,
&gate_callee_text,
|idx| extract_const_string_arg(cn, idx, code),
|idx| {
extract_const_string_arg(cn, idx, code).or_else(|| {
// C/C++ preprocessor macros and PHP `define`d constants
// surface as identifier nodes, not string literals.
// Falling back to the macro-arg extractor for those
// languages lets gates like `curl_easy_setopt` /
// `curl_setopt` activate on a `CURLOPT_POSTFIELDS`
// ident match instead of firing conservatively on
// every positional arg.
if matches!(lang, "c" | "cpp" | "c++" | "php") {
extract_const_macro_arg(cn, idx, code)
} else {
None
}
})
},
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
);
@ -1758,11 +1901,23 @@ pub(super) fn push_node<'a>(
// * a `GateFilter` carrying that gate's specific
// `(label_caps, payload_args, destination_uses)` so
// the SSA sink scan can attribute taint per-cap.
//
// When a flat sink already matches, gate labels are deduped
// so the same capability isn't attributed twice (once flat,
// once gated). Their `payload_args` still flow into
// `sink_payload_args` so the gate's arg-position restriction
// applies on top of the flat sink.
let mut union_payload: Vec<usize> = Vec::new();
for gm in &matches {
labels.push(gm.label);
if has_sink_label {
if !labels.contains(&gm.label) {
labels.push(gm.label);
}
} else {
labels.push(gm.label);
}
let payload_vec: Vec<usize> =
let mut payload_vec: Vec<usize> =
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual
@ -1780,19 +1935,57 @@ pub(super) fn push_node<'a>(
// checks to identifiers under those fields. Non-object
// arg forms return `None` from the extractor and the gate
// falls back to whole-arg positional filtering.
//
// The pair form preserves which object-literal field each
// ident was bound to (e.g. `body` vs `headers` vs `json`)
// so diag rendering can attribute `DATA_EXFIL` findings to
// a specific destination field.
let mut dest_uses: Option<Vec<String>> = None;
let mut dest_fields: Vec<String> = Vec::new();
if !gm.object_destination_fields.is_empty() {
let mut all_pairs: Vec<(String, String)> = Vec::new();
let mut had_object_match = false;
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
if let Some(pairs) = extract_destination_field_pairs(
cn,
pos,
gm.object_destination_fields,
code,
) {
dest_uses = Some(names);
all_pairs.extend(pairs);
had_object_match = true;
break;
}
}
// Direct kwargs: languages where destination-bearing
// fields are passed as `keyword_argument` siblings of
// the positional args (Python `data=`, Ruby kwargs).
// SSA lowering folds kwarg idents into the implicit
// args group at index `arity`, so we expand
// `payload_vec` to include that position; the
// `destination_filter` then narrows to the kwarg
// ident's `var_name`.
let kwarg_pairs =
extract_destination_kwarg_pairs(cn, gm.object_destination_fields, code);
if !kwarg_pairs.is_empty() {
let arity = extract_arg_uses(cn, code).len();
if !payload_vec.contains(&arity) {
payload_vec.push(arity);
}
for pair in kwarg_pairs {
if !all_pairs.iter().any(|(_, v)| v == &pair.1) {
all_pairs.push(pair);
}
}
}
if had_object_match || !all_pairs.is_empty() {
let (fields, vars): (Vec<String>, Vec<String>) =
all_pairs.into_iter().unzip();
dest_uses = Some(vars);
dest_fields = fields;
}
}
let label_caps = match gm.label {
@ -1809,6 +2002,7 @@ pub(super) fn push_node<'a>(
label_caps,
payload_args: payload_vec,
destination_uses: dest_uses,
destination_fields: dest_fields,
});
}
if !union_payload.is_empty() {
@ -1826,6 +2020,65 @@ pub(super) fn push_node<'a>(
}
}
// ── Inline shell-array sink synthesis ────────────────────────────────
//
// Recognise `[<shell>, "-c", <payload>]` (and `cmd /c <payload>`)
// appearing as an argument to *any* call. The shell-array shape itself
// is the gate, regardless of callee, so this fires through user-defined
// wrappers like `execInContainer(id, ["bash", "-c", `echo ${tainted}`])`
// without needing per-wrapper summary annotations. Only fires for JS/TS
// because the array-literal grammar (`array` node) and shell-form usage
// are JS/TS conventions; other languages use different shapes for
// shell-exec wrappers.
//
// Shell arrays nested inside an object-literal argument are covered
// too, e.g. Dockerode's `container.exec({Cmd: [shell, "-c", payload]})`:
// the helper looks inside object-literal args for shell-array values
// under any field.
//
// Existing FP carve-outs are preserved. `["ls", "-la"]` doesn't match
// (element 0 is not a known shell). `untaintedArrayVariable` doesn't
// match (variable, not literal). `execSync(cmd, { env: process.env })`
// doesn't match (string + object args, no shell-array literal). When
// the payload elements are constant strings the helper returns no
// match, so a literal `["bash", "-c", "ls -la"]` doesn't fire either.
if matches!(lang, "javascript" | "js" | "typescript" | "ts") {
if let Some(cn) = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)) {
let shell_matches = extract_shell_array_payload_idents(cn, code);
if !shell_matches.is_empty() {
let shell_label = DataLabel::Sink(Cap::SHELL_ESCAPE);
let already_has_shell_sink = labels.iter().any(|l| match l {
DataLabel::Sink(c) => c.contains(Cap::SHELL_ESCAPE),
_ => false,
});
if !already_has_shell_sink {
labels.push(shell_label);
}
let mut union_payload: Vec<usize> = sink_payload_args.clone().unwrap_or_default();
for sm in shell_matches {
if !union_payload.contains(&sm.arg_position) {
union_payload.push(sm.arg_position);
}
gate_filters.push(GateFilter {
label_caps: Cap::SHELL_ESCAPE,
payload_args: vec![sm.arg_position],
destination_uses: Some(sm.payload_idents),
destination_fields: Vec::new(),
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path: when this is the only gate filter,
// populate the top-level destination_uses too so the SSA
// fast-path stays consistent with the multi-gate behaviour.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
}
// Pattern-based sanitizer synthesis: recognise a Rust
// `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain that provably strips
// path-traversal or HTML metacharacters. The CFG collapses the whole
@ -2296,6 +2549,20 @@ pub(super) fn push_node<'a>(
// just bloat every labeled Call node.
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
// Constructor detection: a `new X(...)` call carries different cap
// semantics than a plain function call. The SSA Call transfer uses
// this flag to narrow the constructed value's caps so out-of-process
// side-effect bits (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on
// the arguments don't survive into a wrapper-object instance.
// Recognised forms:
// * JS/TS and C++ `new_expression`
// * Java `object_creation_expression`
// * PHP `object_creation_expression`
let is_constructor = ast.kind() == "new_expression"
|| ast.kind() == "object_creation_expression"
|| call_ast
.is_some_and(|cn| matches!(cn.kind(), "new_expression" | "object_creation_expression"));
let idx = g.add_node(NodeInfo {
kind,
call: CallMeta {
@ -2311,6 +2578,7 @@ pub(super) fn push_node<'a>(
arg_string_literals,
destination_uses,
gate_filters,
is_constructor,
},
taint: TaintMeta {
labels,
@ -2339,6 +2607,7 @@ pub(super) fn push_node<'a>(
is_eq_with_const: detect_eq_with_const(ast, lang),
is_numeric_length_access: detect_numeric_length_access(ast, lang, code),
member_field: detect_member_field_assignment(ast, code),
rhs_is_function_literal: rhs_is_function_literal(ast, lang),
});
debug!(
@ -2404,7 +2673,10 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
if candidate.is_none() {
// Walk one level into declarations whose direct child is the
// declarator (variable_declaration → variable_declarator →
// value), or expression-statement wrappers whose direct child is
// an assignment_expression / assignment with a `right` field
// (JS `expression_statement > assignment_expression`, Python
// `expression_statement > assignment`).
let mut cursor = ast.walk();
for c in ast.children(&mut cursor) {
if matches!(
@ -2417,6 +2689,11 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
if candidate.is_some() {
break;
}
} else if matches!(lookup(lang, c.kind()), Kind::Assignment) {
candidate = c.child_by_field_name("right");
if candidate.is_some() {
break;
}
}
}
}
@ -4417,7 +4694,23 @@ fn apply_promisify_labels(
let Some(alias) = aliases.get(&callee) else {
continue;
};
let wrapped_labels = classify_all(lang, &alias.wrapped, extra);
// Inherit both flat and gated labels from the wrapped callee.
// Gated sinks (e.g. `child_process.exec`) carry the same
// capability semantics as flat sinks, just with arg-position
// filtering at the call site; the promisify alias should
// surface the wrapped function's sink class regardless of
// which arm originally classified it.
let mut wrapped_labels: Vec<crate::labels::DataLabel> =
classify_all(lang, &alias.wrapped, extra)
.into_iter()
.collect();
for gm in
classify_gated_sink(lang, &alias.wrapped, |_| None, |_| None, |_| false).iter()
{
if !wrapped_labels.contains(&gm.label) {
wrapped_labels.push(gm.label);
}
}
if wrapped_labels.is_empty() {
continue;
}