use super::conditions::unwrap_parens; use super::helpers::{collect_array_pattern_bindings_indexed, collect_rhs_array_literal_elements}; use super::{ anon_fn_name, collect_idents, collect_idents_with_paths, find_constructor_type_child, first_call_ident, root_receiver_text, text_of, }; use crate::labels::{Cap, Kind, lookup}; use smallvec::SmallVec; use tree_sitter::Node; /// Find the inner CallFn/CallMethod/CallMacro node within an AST node. /// For direct call nodes, returns the node itself. For wrappers, searches /// up to two levels of children, transparently descending through /// `await_expression` / `yield_expression` (`Kind::AwaitForward`) wrappers /// so `const x = await foo(y)` reaches the inner `call_expression` at /// effective depth 3 (`lexical_declaration > variable_declarator > /// await_expression > call_expression`). pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option> { match lookup(lang, n.kind()) { Kind::CallFn | Kind::CallMethod | Kind::CallMacro => Some(n), Kind::AwaitForward => { // Transparent wrapper: descend into the awaited expression. let mut cursor = n.walk(); for c in n.children(&mut cursor) { if let Some(found) = find_call_node(c, lang) { return Some(found); } } None } _ => { let mut cursor = n.walk(); for c in n.children(&mut cursor) { match lookup(lang, c.kind()) { Kind::CallFn | Kind::CallMethod | Kind::CallMacro => return Some(c), // Skip past await/yield wrappers without consuming a // recursion level — the wrapper itself is transparent. 
Kind::AwaitForward => { if let Some(found) = find_call_node(c, lang) { return Some(found); } } _ => {} } } // Recurse one more level (handles `expression_statement > variable_declarator > call`) let mut cursor2 = n.walk(); for c in n.children(&mut cursor2) { let mut cursor3 = c.walk(); for gc in c.children(&mut cursor3) { match lookup(lang, gc.kind()) { Kind::CallFn | Kind::CallMethod | Kind::CallMacro => return Some(gc), Kind::AwaitForward => { if let Some(found) = find_call_node(gc, lang) { return Some(found); } } _ => {} } } } None } } } /// Extract `(field_name, ident_name)` pairs from specified fields of an /// object-literal argument. /// /// Returns: /// * `Some(pairs)` if the positional argument at `index` IS an object literal /// (JS `object`, TS `object`, Python `dictionary`). Each pair is /// `(field_name, ident_name)` where `field_name` is the matched key from /// `fields` and `ident_name` is an identifier lifted from that pair's /// value expression. When no destination-field pairs are present, returns /// `Some(vec![])`, the sink is effectively silenced because no destination /// identifier exists. /// * `None` if the arg is absent, is not an object literal (plain string /// / ident / expression), or has splat/spread children that break static /// per-field reasoning. Callers fall back to the whole-arg positional /// filter in this case. pub(super) fn extract_destination_field_pairs( call_node: Node, arg_index: usize, fields: &[&str], code: &[u8], ) -> Option> { if fields.is_empty() { return None; } let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); let arg = args.named_children(&mut cursor).nth(arg_index)?; // Only object / dict literal forms carry per-field destination semantics. // For anything else (identifier, member expression, string, call), return // None so the caller treats the whole arg as destination. 
if !matches!(arg.kind(), "object" | "dictionary") { return None; } let mut out: Vec<(String, String)> = Vec::new(); let mut c = arg.walk(); for child in arg.named_children(&mut c) { match child.kind() { // `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't // statically attribute spread contents to specific fields, so // bail out, caller falls back to the whole-arg filter, matching // the conservative posture used by arg_uses for splats. "spread_element" | "dictionary_splat" => { return None; } // Shorthand property `{ url }` binds the `url` field to a binding // also named `url`. Treat as destination iff the name matches. "shorthand_property_identifier" | "shorthand_property_identifier_pattern" => { let Some(name) = text_of(child, code) else { continue; }; if fields.iter().any(|&f| f == name) && !out.iter().any(|(_, v)| v == &name) { out.push((name.clone(), name)); } } "pair" => { let Some(key_node) = child.child_by_field_name("key") else { continue; }; let key_text = match key_node.kind() { // Strip quotes from string-literal keys so `"url"` and `url` // both match the configured field list. "string" | "string_literal" => text_of(key_node, code).map(|raw| { if raw.len() >= 2 { raw[1..raw.len() - 1].to_string() } else { raw } }), // Computed keys: resolve only when the inner expression // is a pure string literal (`['url']`). Dynamic forms // (`[someVar]`, `[`url-${i}`]`, ``[`url`]`` with // interpolation) stay conservative-skip. "computed_property_name" => { let mut inner_cursor = key_node.walk(); let inner = key_node.named_children(&mut inner_cursor).find(|c| { !matches!(c.kind(), "comment" | "block_comment" | "line_comment") }); match inner.map(|n| (n.kind(), n)) { Some(("string" | "string_literal", n)) => text_of(n, code).map(|raw| { if raw.len() >= 2 { raw[1..raw.len() - 1].to_string() } else { raw } }), // Template strings only when no interpolation // (no `template_substitution` children). 
Some(("template_string", n)) if { let mut tc = n.walk(); !n.named_children(&mut tc) .any(|c| c.kind() == "template_substitution") } => { text_of(n, code).map(|raw| { if raw.len() >= 2 { raw[1..raw.len() - 1].to_string() } else { raw } }) } _ => continue, } } _ => text_of(key_node, code), }; let Some(key) = key_text else { continue; }; if !fields.iter().any(|&f| f == key) { continue; } let Some(val_node) = child.child_by_field_name("value") else { continue; }; let mut idents: Vec = Vec::new(); let mut paths: Vec = Vec::new(); collect_idents_with_paths(val_node, code, &mut idents, &mut paths); for name in paths.into_iter().chain(idents) { if !out.iter().any(|(_, v)| v == &name) { out.push((key.clone(), name)); } } } _ => {} } } Some(out) } /// Extract `(field_name, ident_name)` pairs from `keyword_argument` / /// `named_argument` children of a call whose keyword name matches one of /// `fields`. Used for languages where destination-bearing fields are passed /// as direct kwargs rather than wrapped in a dict literal, e.g. Python /// `requests.post(url, data=tainted, json=safe)` where `data` and `json` are /// `keyword_argument` siblings of the positional URL. /// /// Also covers Ruby, where tree-sitter-ruby emits `pair` nodes (with /// `key`/`value` fields) directly under `argument_list` for the /// `Faraday.new(url: x)` / `Net::HTTP.start(host, port, proxy_addr: prx)` /// kwarg shape. The `key` is typically a `hash_key_symbol` whose text is the /// bare identifier (`url`); `simple_symbol` (`:url`) and string keys are /// normalised by stripping a leading `:` or wrapping quotes. /// /// Returns the union of matching kwargs, preserving the kwarg name in the /// `field` slot so callers can still attribute findings per-field. Empty /// when no matching kwargs exist or the call has no `arguments` field. 
pub(super) fn extract_destination_kwarg_pairs( call_node: Node, fields: &[&str], code: &[u8], ) -> Vec<(String, String)> { if fields.is_empty() { return Vec::new(); } let Some(args_node) = call_node.child_by_field_name("arguments") else { return Vec::new(); }; let mut out: Vec<(String, String)> = Vec::new(); let mut cursor = args_node.walk(); for child in args_node.named_children(&mut cursor) { let kind = child.kind(); let (name_node, value_node) = if kind == "keyword_argument" || kind == "named_argument" { let named_count = child.named_child_count(); ( child .child_by_field_name("name") .or_else(|| child.named_child(0)), child .child_by_field_name("value") .or_else(|| child.named_child(named_count.saturating_sub(1) as u32)), ) } else if kind == "pair" { // Ruby `pair` node sits directly under `argument_list` for // kwarg-style call args (`f(url: x)`). `key`/`value` fields // are populated; key text is `hash_key_symbol` ("url"), // `simple_symbol` (":url"), or a string literal. ( child.child_by_field_name("key"), child.child_by_field_name("value"), ) } else { continue; }; let (Some(nn), Some(vn)) = (name_node, value_node) else { continue; }; let Some(name_raw) = text_of(nn, code) else { continue; }; let name = name_raw .trim_start_matches(':') .trim_matches(['"', '\'']) .to_string(); if !fields.iter().any(|&f| f == name) { continue; } let mut idents = Vec::new(); let mut paths = Vec::new(); collect_idents_with_paths(vn, code, &mut idents, &mut paths); for ident in paths.into_iter().chain(idents) { if !out.iter().any(|(_, v)| v == &ident) { out.push((name.clone(), ident)); } } } out } /// Extract the string-literal content at argument position `index` (0-based). /// Returns `None` if the argument is not a string literal or the index is out of range. /// True when `call_node` is `Object.create(null)` (or its parenthesised / /// awaited / type-cast wrappers). Strict literal-`null` first-arg match, /// no aliasing through intermediate variables. 
Caller restricts to JS/TS. pub(super) fn is_object_create_null_call(call_node: Node, code: &[u8]) -> bool { if !matches!(call_node.kind(), "call_expression") { return false; } let callee = call_node .child_by_field_name("function") .and_then(|f| text_of(f, code)) .unwrap_or_default(); if callee != "Object.create" { return false; } let Some(args) = call_node.child_by_field_name("arguments") else { return false; }; let mut cursor = args.walk(); let named: Vec = args.named_children(&mut cursor).collect(); if named.len() != 1 { return false; } let mut arg = named[0]; // Unwrap parens / await / TS type-assertions. for _ in 0..4 { match arg.kind() { "parenthesized_expression" => { if let Some(inner) = arg.named_child(0) { arg = inner; continue; } } "await_expression" => { if let Some(inner) = arg.child_by_field_name("argument") { arg = inner; continue; } } "as_expression" | "type_assertion" => { if let Some(inner) = arg.named_child(0) { arg = inner; continue; } } _ => break, } } arg.kind() == "null" || text_of(arg, code).as_deref() == Some("null") } pub(super) fn extract_const_string_arg( call_node: Node, index: usize, code: &[u8], ) -> Option { let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); let mut arg = args.named_children(&mut cursor).nth(index)?; // PHP / Go wrap each positional argument in an `argument` node; unwrap so // the kind-match below sees the inner literal. if arg.kind() == "argument" && arg.named_child_count() == 1 { if let Some(inner) = arg.named_child(0) { arg = inner; } } match arg.kind() { // `string` / `string_literal` cover JS/TS, Python, Java, PHP, C/C++, Ruby, Rust; // `interpreted_string_literal` / `raw_string_literal` cover Go's // tree-sitter grammar (double-quoted vs. backtick-quoted forms). 
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { let raw = text_of(arg, code)?; if raw.len() >= 2 { Some(raw[1..raw.len() - 1].to_string()) } else { None } } // Boolean literals — JS/TS `true`/`false` are their own node kinds; some // grammars wrap them as identifiers carrying the keyword text. Returned // verbatim so `dangerous_values` matching can detect deep-flag forms // like `extend(true, target, src)`. "true" | "false" => Some(arg.kind().to_string()), // PHP double-quoted strings parse as `encapsed_string` whose body is // a sequence of `string_content` / `escape_sequence` / interpolation // nodes. Treat the string as constant only when every child is a // pure-literal segment (no `variable_name` / `subscript_expression` // interpolations); the returned value is the concatenation of the // literal segments verbatim. "encapsed_string" => { let mut c = arg.walk(); let mut buf = String::new(); for ch in arg.named_children(&mut c) { match ch.kind() { "string_content" => { if let Some(s) = text_of(ch, code) { buf.push_str(&s); } } "escape_sequence" => { if let Some(s) = text_of(ch, code) { buf.push_str(&s); } } _ => return None, } } Some(buf) } "template_string" => { // Only treat as constant if no interpolation (no template_substitution children) let mut c = arg.walk(); if arg .named_children(&mut c) .any(|ch| ch.kind() == "template_substitution") { return None; // dynamic } let raw = text_of(arg, code)?; if raw.len() >= 2 { Some(raw[1..raw.len() - 1].to_string()) } else { None } } // Concat-style binary expression with a leading string literal, e.g. // PHP `"Location: " . $url`, JS/TS `"Location: " + url`. Returns the // left-most literal so prefix-driven gates (`dangerous_prefixes`) can // activate on partially-dynamic concatenations; falls through to // `None` when the leading segment is not a string literal so // exact-`dangerous_values` matching keeps its strict semantics. 
"binary_expression" => { let left = arg.child_by_field_name("left")?; match left.kind() { "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { let raw = text_of(left, code)?; if raw.len() >= 2 { Some(raw[1..raw.len() - 1].to_string()) } else { None } } "encapsed_string" => { let mut c = left.walk(); let mut buf = String::new(); for ch in left.named_children(&mut c) { match ch.kind() { "string_content" | "escape_sequence" => { if let Some(s) = text_of(ch, code) { buf.push_str(&s); } } _ => return None, } } Some(buf) } _ => None, } } _ => None, } } /// Extract a macro-constant or `define`d identifier name at argument position /// `index` (0-based). Used for languages where activation values are /// preprocessor symbols rather than string literals — currently C, C++, and /// PHP define-constants like `CURLOPT_POSTFIELDS` whose syntactic form is an /// `identifier` / `name` node, not a `string`. /// /// Returns `None` for any non-identifier shape so dynamic-activation /// semantics still apply when the activation arg is a runtime value /// (variable, expression, function call). pub(super) fn extract_const_macro_arg( call_node: Node, index: usize, code: &[u8], ) -> Option { let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); let mut arg = args.named_children(&mut cursor).nth(index)?; if arg.kind() == "argument" && arg.named_child_count() == 1 { if let Some(inner) = arg.named_child(0) { arg = inner; } } match arg.kind() { // C/C++ identifier / PHP `name` node for define-style constants. // Scoped C++ identifiers (`Curl::OPT_POSTFIELDS`) and PHP namespaced // names also surface here so the dangerous_values match catches them. "identifier" | "name" | "qualified_name" | "scoped_identifier" => text_of(arg, code), // Ruby bare constant (`NOENT`) — leaf form. "constant" => text_of(arg, code), // Ruby scope-qualified constant (`Nokogiri::XML::ParseOptions::NOENT`). 
// Return only the rightmost `name` segment so the gate's // `dangerous_values` list can stay identifier-bare instead of // enumerating every possible namespacing. Falls back to the full // text if the `name` field is missing for any reason. "scope_resolution" => arg .child_by_field_name("name") .and_then(|n| text_of(n, code)) .or_else(|| text_of(arg, code)), // Integer literals at the activation arg position. PHP / C / C++ // commonly use plain `0` to opt into the safe-default option set // (e.g. `simplexml_load_string($xml, "SimpleXMLElement", 0)`). The // gate's `dangerous_values` list is identifier-only, so returning // the literal text lets the comparison fail against `LIBXML_NOENT` // and suppresses the conservative-fire branch. "integer" | "integer_literal" | "number_literal" | "decimal_integer_literal" => { text_of(arg, code) } _ => None, } } /// Extract the value of a keyword argument from a call node (e.g. Python `shell=True`). /// Walks argument children looking for `keyword_argument` nodes, matches the keyword /// name, and extracts the value node text for literals. pub(super) fn extract_const_keyword_arg( call_node: Node, keyword_name: &str, code: &[u8], ) -> Option { let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); for child in args.named_children(&mut cursor) { if child.kind() == "keyword_argument" || child.kind() == "named_argument" { // keyword_argument has a "name" field and a "value" field in Python tree-sitter let Some(name_node) = child.child_by_field_name("name") else { continue; }; let Some(name_text) = text_of(name_node, code) else { continue; }; if name_text != keyword_name { continue; } let value_node = child.child_by_field_name("value")?; // Only return a literal, identifiers / calls / complex exprs are // "dynamic" and must be reported as `None` so the gate can // distinguish literal-safe from dynamic. 
return match value_node.kind() { "true" | "false" | "none" | "integer" | "float" | "string" | "string_literal" | "identifier" => text_of(value_node, code), _ => None, } .filter(|_| { // identifiers are only "literal" when they're the Python // booleans True/False/None (tree-sitter-python classifies // these as identifiers in older grammar versions). match value_node.kind() { "identifier" => text_of(value_node, code) .as_deref() .is_some_and(|s| matches!(s, "True" | "False" | "None")), _ => true, } }); } } None } /// Return `true` if the call node has a keyword/named argument whose name /// matches `keyword_name` (regardless of whether the value is a literal). /// Used by gated-sink classification to distinguish an absent kwarg (language /// default) from a present-but-dynamic kwarg (conservative). pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8]) -> bool { let Some(args) = call_node.child_by_field_name("arguments") else { return false; }; let mut cursor = args.walk(); for child in args.named_children(&mut cursor) { if child.kind() != "keyword_argument" && child.kind() != "named_argument" { continue; } let Some(name_node) = child.child_by_field_name("name") else { continue; }; if text_of(name_node, code).as_deref() == Some(keyword_name) { return true; } } false } /// Extract the literal value of a property `prop_name` from the object /// literal at positional argument `arg_index`. Returns `None` if the /// arg is absent, is not an object literal, the prop key isn't found, /// or the prop value isn't a literal (so callers can distinguish /// "present but dynamic" from "absent" only via [`has_object_arg_property`]). /// /// Used by JS/TS-style "options object as kwargs" gates — e.g. /// `_.template(tpl, { evaluate: false })` — where the safe-flag lives /// in an inline object literal rather than as a dedicated kwarg node /// (which JS does not have). 
Strict-additive: returns `None` for any /// non-JS-object shape, including bare identifiers passed as the /// options arg, so the gate falls back to the conservative dynamic /// branch. pub(super) fn extract_object_arg_property( call_node: Node, arg_index: usize, prop_name: &str, code: &[u8], ) -> Option { let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); let arg = args.named_children(&mut cursor).nth(arg_index)?; let arg = unwrap_parens(arg); if !matches!(arg.kind(), "object" | "dictionary") { return None; } let mut c = arg.walk(); for child in arg.named_children(&mut c) { if child.kind() != "pair" { continue; } let Some(key_node) = child.child_by_field_name("key") else { continue; }; let key_text = match key_node.kind() { "string" | "string_literal" => text_of(key_node, code).map(|raw| { if raw.len() >= 2 { raw[1..raw.len() - 1].to_string() } else { raw } }), "computed_property_name" => continue, _ => text_of(key_node, code), }; if key_text.as_deref() != Some(prop_name) { continue; } let val_node = child.child_by_field_name("value")?; let val_node = unwrap_parens(val_node); return match val_node.kind() { "true" | "false" | "null" | "undefined" | "number" | "string" | "string_literal" => { text_of(val_node, code) } // JS booleans true/false are their own node kinds (above), but // some grammar versions wrap them as identifier literals; surface // `undefined` similarly. "identifier" => text_of(val_node, code) .filter(|s| matches!(s.as_str(), "true" | "false" | "null" | "undefined")), _ => None, }; } None } /// Return `true` if the call node's positional arg at `arg_index` is an /// object literal containing a property named `prop_name` (whether the /// value is a literal or a dynamic expression). Used alongside /// [`extract_object_arg_property`] so gated-sink classification can /// distinguish "options key absent" (language default) from "options /// key present with dynamic value" (conservative dangerous). 
pub(super) fn has_object_arg_property(
    call_node: Node,
    arg_index: usize,
    prop_name: &str,
    code: &[u8],
) -> bool {
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    let mut arg_cursor = arg_list.walk();
    let Some(candidate) = arg_list.named_children(&mut arg_cursor).nth(arg_index) else {
        return false;
    };
    let literal = unwrap_parens(candidate);
    if !matches!(literal.kind(), "object" | "dictionary") {
        return false;
    }

    let mut member_cursor = literal.walk();
    for member in literal.named_children(&mut member_cursor) {
        // Resolve the member's statically known key name, if it has one.
        let key_name = match member.kind() {
            // `{ prop }` shorthand: the member text is the key itself.
            "shorthand_property_identifier" | "shorthand_property_identifier_pattern" => {
                text_of(member, code)
            }
            "pair" => match member.child_by_field_name("key") {
                Some(key) => match key.kind() {
                    // Quoted key: compare without the surrounding quotes.
                    "string" | "string_literal" => text_of(key, code).map(|raw| {
                        if raw.len() >= 2 {
                            raw[1..raw.len() - 1].to_string()
                        } else {
                            raw
                        }
                    }),
                    // Computed keys never count as a match.
                    "computed_property_name" => None,
                    _ => text_of(key, code),
                },
                None => None,
            },
            _ => None,
        };
        if key_name.as_deref() == Some(prop_name) {
            return true;
        }
    }
    false
}

/// Inspect the first positional argument of a call node and return its
/// tree-sitter `kind()` plus a flag indicating whether any descendant is an
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
/// `arg0`). Returns `None` when the call has no arguments.
///
/// Used by per-language shape-aware sink suppression, for example, Ruby
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
/// regardless of taint reaching that argument.
pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bool)> {
    let args = call_node.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    // First named argument, with any wrapping parentheses removed.
    let arg0 = unwrap_parens(args.named_children(&mut cursor).next()?);
    Some((arg0.kind().to_string(), subtree_has_interpolation(arg0)))
}

/// Walk a Java method-chain receiver looking for an inner `method_invocation`
/// whose method name matches one of `target_methods` (e.g. `createQuery`,
/// `prepareStatement`). Returns the kind of that inner call's arg 0, used
/// to verify the SQL-bearing call up-chain was given a string literal rather
/// than a concatenation / method call.
///
/// Conservative: returns `None` when no matching call is found in the chain,
/// or when the matching call has no `arguments` field or no first argument.
/// Never drills into the args of an unrelated call, so the chain walk is
/// strictly down the receiver spine.
pub(super) fn java_chain_arg0_kind_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<String> {
    let n = unwrap_parens(expr);
    // Anything that is not a method call ends the receiver spine.
    if n.kind() != "method_invocation" {
        return None;
    }
    if let Some(name_node) = n.child_by_field_name("name")
        && let Some(name) = text_of(name_node, code)
        && target_methods.iter().any(|m| *m == name)
    {
        let args = n.child_by_field_name("arguments")?;
        let mut cursor = args.walk();
        let arg0 = unwrap_parens(args.named_children(&mut cursor).next()?);
        return Some(arg0.kind().to_string());
    }
    // Drill down the receiver spine. Java grammar uses `object` for the
    // receiver of a `method_invocation`.
    let recv = n.child_by_field_name("object")?;
    java_chain_arg0_kind_for_method(recv, target_methods, code)
}

/// Walk a Ruby method-chain receiver-side looking for the inner call whose
/// method identifier matches one of `target_methods`, then return that
/// inner call's [`arg0_kind_and_interpolation`].
Used when the CFG node /// represents a chained expression like `Model.where(...).preload(...).to_a` ///, the outermost call (`to_a`) has no arguments, so the shape suppressor /// must reach down the chain to inspect `where`'s arg 0. /// /// Conservative: returns `None` if the chain doesn't contain a matching /// method, so callers fall through to the no-suppression path. pub(super) fn ruby_chain_arg0_for_method( expr: Node, target_methods: &[&str], code: &[u8], ) -> Option<(String, bool)> { let n = unwrap_parens(expr); if n.kind() == "call" && let Some(method) = n.child_by_field_name("method") && let Some(name) = text_of(method, code) && target_methods.iter().any(|m| *m == name) { return arg0_kind_and_interpolation(n); } // Recurse into the receiver chain (`call.receiver` → next call up). if n.kind() == "call" && let Some(recv) = n .child_by_field_name("receiver") .or_else(|| n.child_by_field_name("object")) && let Some(found) = ruby_chain_arg0_for_method(recv, target_methods, code) { return Some(found); } // Also descend into named children to handle wrapping (assignment RHS, // begin-end blocks, parenthesised expressions, etc.). let mut cursor = n.walk(); for c in n.named_children(&mut cursor) { if let Some(found) = ruby_chain_arg0_for_method(c, target_methods, code) { return Some(found); } } None } fn subtree_has_interpolation(n: Node) -> bool { if n.kind() == "interpolation" || n.kind() == "string_interpolation" { return true; } let mut cursor = n.walk(); n.named_children(&mut cursor).any(subtree_has_interpolation) } /// Walk a JS/TS method-chain receiver-side to find an inner `call_expression` /// whose member-property name matches one of `target_methods` (e.g. `query`, /// `execute`). Returns the `(kind, has_interp)` of that inner call's arg 0. /// /// Used to recognise ORM-accessor chains where a labelled SQL sink sits on /// the receiver side of a parameterised execute method: /// `strapi.db.query('admin::api-token').findOne({...})`. 
The outer call /// (`findOne`) is the CFG node; the inner labelled `db.query` call carries /// the literal model UID that proves the chain is parameterised. /// /// Conservative: returns `None` when no matching inner call is found, so /// callers fall through to the no-suppression path. pub(super) fn js_chain_arg0_kind_for_method( expr: Node, target_methods: &[&str], code: &[u8], ) -> Option<(String, bool)> { let n = unwrap_parens(expr); // tree-sitter-typescript / -javascript: call_expression with fields // `function` (member_expression / identifier) and `arguments`. if n.kind() == "call_expression" { // Check this call's callee: if its property name (or full text) ends // with one of `target_methods`, this is the inner labelled call. if let Some(function) = n.child_by_field_name("function") { // Property of a member_expression; falls back to the function // text itself for bare-identifier calls. let prop_text = function .child_by_field_name("property") .and_then(|p| text_of(p, code)); let full_text = text_of(function, code); let leaf_text = full_text .as_ref() .map(|s| s.rsplit('.').next().unwrap_or(s).to_string()); let matched = target_methods.iter().any(|m| { prop_text.as_deref() == Some(*m) || leaf_text.as_deref() == Some(*m) || full_text.as_deref() == Some(*m) || full_text .as_deref() .is_some_and(|s| s.ends_with(&format!(".{m}"))) }); if matched { return arg0_kind_and_interpolation(n); } // Drill down the receiver spine: function.object is the prior // call in the chain. if let Some(object) = function.child_by_field_name("object") && let Some(found) = js_chain_arg0_kind_for_method(object, target_methods, code) { return Some(found); } } } None } /// Walk the receiver chain of a JS/TS call to count *non-execute* method /// calls between the outer call and an inner labelled call to /// `target_inner` (e.g. `query`, `execute`). Returns the immediate outer /// chain method name (e.g. 
`findOne`) when an inner-call to `target_inner` /// exists somewhere on the receiver spine, otherwise `None`. /// /// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain /// shape `.query(LITERAL).(...)`, bare /// `connection.query("SELECT ...")` returns `None` because there is no /// outer chain method. pub(super) fn js_chain_outer_method_for_inner<'a>( outer: Node<'a>, target_inner: &[&str], code: &'a [u8], ) -> Option { let n = unwrap_parens(outer); if n.kind() != "call_expression" { return None; } let function = n.child_by_field_name("function")?; let object = function.child_by_field_name("object")?; // If `object` itself is a call_expression whose property matches // `target_inner`, the immediate outer is `function.property`. if object.kind() == "call_expression" { let inner_function = object.child_by_field_name("function"); if let Some(inner_function) = inner_function { let prop_text = inner_function .child_by_field_name("property") .and_then(|p| text_of(p, code)); let full_text = text_of(inner_function, code); let leaf_text = full_text .as_ref() .map(|s| s.rsplit('.').next().unwrap_or(s).to_string()); let inner_matched = target_inner.iter().any(|m| { prop_text.as_deref() == Some(*m) || leaf_text.as_deref() == Some(*m) || full_text.as_deref() == Some(*m) || full_text .as_deref() .is_some_and(|s| s.ends_with(&format!(".{m}"))) }); if inner_matched { return function .child_by_field_name("property") .and_then(|p| text_of(p, code)); } } // Recurse: outer chain may have more depth (`a.b().c().d()` , // d is outermost, c is next, target may be at b or further in). return js_chain_outer_method_for_inner(object, target_inner, code); } None } /// For a chained method call (`a.b().c().d()`), walk down the receiver /// chain (`function.object`) and return the innermost call_expression /// alongside its callee text (e.g. `"http.get"`). 
/// /// Returns `None` when: /// * `outer` is not itself a CallFn / CallMethod node, or /// * its `function`/`method` field is not a member-style expression whose /// `object` field is itself a call (i.e. there is no chained receiver). /// /// Motivated by CVE-2025-64430 (Parse Server SSRF via /// `http.get(uri, cb).on('error', e => ...)`). Without this, the outer /// `.on(...)` call swallows classification of the inner gated sink. pub(super) fn find_chained_inner_call<'a>( outer: Node<'a>, lang: &str, code: &[u8], ) -> Option<(Node<'a>, String)> { if !matches!(lookup(lang, outer.kind()), Kind::CallFn | Kind::CallMethod) { return None; } let function = outer .child_by_field_name("function") .or_else(|| outer.child_by_field_name("method"))?; // Direct double-call form (`f()(x)`): the outer call's `function` // field IS itself a call_expression, with no intermediate // member-chain. Treat the inner call as the chain's innermost. // Without this, lodash-style template-render chains like // `_.template(t)(data)` evade the chained-inner rebinding because // the outer's function field is a `call_expression`, not the // `member_expression` shape the original branch below expects. if matches!( lookup(lang, function.kind()), Kind::CallFn | Kind::CallMethod ) { // Recurse: the inner call may itself be chained. if let Some(inner) = find_chained_inner_call(function, lang, code) { return Some(inner); } let inner_func = function .child_by_field_name("function") .or_else(|| function.child_by_field_name("method")) .or_else(|| function.child_by_field_name("name"))?; let raw = text_of(inner_func, code)?; let inner_text: String = raw.chars().filter(|c| !c.is_whitespace()).collect(); return Some((function, inner_text)); } // The function/method field for a chained call is a member_expression // (JS/TS), attribute (Python), or field_expression (Rust); its // receiver is the `object` field (JS/TS/Python) or `value` field // (Rust). Only proceed when that receiver is itself a call. 
let object = function .child_by_field_name("object") .or_else(|| function.child_by_field_name("value"))?; if !matches!(lookup(lang, object.kind()), Kind::CallFn | Kind::CallMethod) { return None; } // Decide whether `object` is itself a chained method call (its // function/method field is a member-style expression). When yes, // recurse one more level so deeper chains resolve to their innermost // method (e.g. `axios.get(u).then(h).catch(h)` → `axios.get`). // When no — the receiver is a plain function/constructor call like // Rust's `HttpResponse::Found()` — descending one more level would // strand us on the non-method leaf whose text would not match any // gate matcher. Stop here and return the current `outer` level, // which IS the innermost method call. let object_function = object .child_by_field_name("function") .or_else(|| object.child_by_field_name("method")); let object_is_chained_method = object_function .map(|f| { matches!( f.kind(), "member_expression" | "attribute" | "field_expression" | "scoped_identifier" | "scope_resolution" ) && f .child_by_field_name("object") .or_else(|| f.child_by_field_name("value")) .is_some() }) .unwrap_or(false); if object_is_chained_method { // Recurse: the inner call may itself be chained. if let Some(inner) = find_chained_inner_call(object, lang, code) { return Some(inner); } // `object` is the innermost call_expression in the chain. Extract // its callee identifier the same way `first_call_ident_with_span` // does for a CallFn (member_expression text → "http.get"). let inner_func = object .child_by_field_name("function") .or_else(|| object.child_by_field_name("method")) .or_else(|| object.child_by_field_name("name"))?; // Multi-line dotted member expressions (`http\n .get`) include // formatting whitespace in the source-text slice. The labels map // keys are literal `"http.get"` etc., strip whitespace so the // chained-call inner-gate rebinding fires for both single-line and // multi-line chain styles. 
Also strips `\r` for CRLF sources. // Motivated by upstream Parse Server CVE-2025-64430 which uses the // multi-line `http\n .get(uri, ...)\n .on(...)` form. let raw = text_of(inner_func, code)?; let inner_text: String = raw.chars().filter(|c| !c.is_whitespace()).collect(); return Some((object, inner_text)); } // Receiver is a non-chained call (Rust constructor `Foo::new()` / // `HttpResponse::Found()`, JS bare `f()`). Outer level IS the // innermost method call — return its own function text so gate // matching sees the method name. let raw = text_of(function, code)?; let inner_text: String = raw.chars().filter(|c| !c.is_whitespace()).collect(); Some((outer, inner_text)) } /// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod /// node) and yield each *named argument* of every inner call along the /// way. Outer's own arguments are NOT included, the caller already /// handles those via the standard `pre_emit_arg_source_nodes` pass over /// `outer.arguments`. /// /// For `json.NewDecoder(r.Body).Decode(emoji)`: /// outer = `.Decode(emoji)` , caller iterates `emoji` /// inner = `json.NewDecoder(r.Body)` , yielded arg: `r.Body` /// /// We only pull from each inner call's `arguments` field, never from its /// `function`/`method`/receiver expressions. That distinction matters /// because chained source-receivers like `r.URL.Query()` expose a /// member-text path that classifies as a Source, but it's the OUTER /// chain text (`r.URL.Query.Get`) that already classifies, so emitting /// a synth source for the inner-call's own callee would double-count. /// /// Used by Go (where chain shapes like `json.NewDecoder(r.Body).Decode` /// hide source-labeled args inside parens between dots, leaving the /// outer callee text un-classifiable). The helper itself is /// language-neutral, but callers should gate per-language until each /// language's regression coverage catches up. 
pub(super) fn walk_chain_inner_call_args<'a>(outer: Node<'a>, lang: &str, out: &mut Vec>) { if !matches!(lookup(lang, outer.kind()), Kind::CallFn | Kind::CallMethod) { return; } let function = outer .child_by_field_name("function") .or_else(|| outer.child_by_field_name("method")); let Some(function) = function else { return }; let object = function .child_by_field_name("object") .or_else(|| function.child_by_field_name("operand")) .or_else(|| function.child_by_field_name("value")); let Some(inner) = object else { return }; if !matches!(lookup(lang, inner.kind()), Kind::CallFn | Kind::CallMethod) { return; } if let Some(args) = inner.child_by_field_name("arguments") { let mut cursor = args.walk(); for arg in args.named_children(&mut cursor) { out.push(arg); } } walk_chain_inner_call_args(inner, lang, out); } /// Recursively find a call-expression node within an AST subtree (up to /// 4 levels deep). Unlike `find_call_node` which only checks 2 levels, /// this handles `await`-wrapped calls inside declarations. pub(super) fn find_call_node_deep<'a>(n: Node<'a>, lang: &str, depth: u8) -> Option> { if depth == 0 { return None; } match lookup(lang, n.kind()) { Kind::CallFn | Kind::CallMethod | Kind::CallMacro => Some(n), _ => { let mut cursor = n.walk(); for c in n.children(&mut cursor) { if let Some(found) = find_call_node_deep(c, lang, depth - 1) { return Some(found); } } None } } } /// Detect whether a call node is a parameterized SQL query. /// /// Returns `true` when: /// 1. The first argument (arg 0) is a string literal (including template /// strings without interpolation) containing SQL placeholder patterns: /// `$1`..`$N`, `?`, `%s`, or `:identifier`. /// 2. The call has at least 2 arguments (the second being the params /// array/tuple). /// /// This is intentionally conservative: if arg 0 is dynamic (variable, /// concatenation, template with interpolation), returns `false`. 
pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool { let Some(args) = call_node.child_by_field_name("arguments") else { return false; }; let mut cursor = args.walk(); let named: Vec<_> = args.named_children(&mut cursor).collect(); // Need at least 2 arguments: query string + params if named.len() < 2 { return false; } let first_arg = named[0]; // Extract the raw text of arg 0, must be a string literal or // template string without interpolation. let query_text = match first_arg.kind() { "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { text_of(first_arg, code) } "template_string" => { // Only constant templates (no interpolation) let mut c = first_arg.walk(); if first_arg .named_children(&mut c) .any(|ch| ch.kind() == "template_substitution") { return false; // dynamic, not safe } text_of(first_arg, code) } // Python concatenated strings: "SELECT" "..." are implicit concat "concatenated_string" => { // If it's a concatenated_string, get the full text text_of(first_arg, code) } _ => return false, // not a literal }; let Some(qt) = query_text else { return false; }; has_sql_placeholders(&qt) } /// Check whether a string contains SQL parameterized-query placeholders. /// /// Recognised patterns: /// - `$1`, `$2`, …, `$N` (PostgreSQL positional) /// - `?` (MySQL / SQLite positional) /// - `%s` (Python DB-API / psycopg2) /// - `:identifier` (Oracle / named parameters), requires the colon to be /// preceded by a space or `=` (to avoid matching JS ternary / object /// literals). pub(super) fn has_sql_placeholders(s: &str) -> bool { let bytes = s.as_bytes(); let len = bytes.len(); let mut i = 0; while i < len { match bytes[i] { b'$' if i + 1 < len && bytes[i + 1].is_ascii_digit() && bytes[i + 1] != b'0' => { // $N where N is 1..9 (at minimum) return true; } b'?' 
=> return true, b'%' if i + 1 < len && bytes[i + 1] == b's' => { return true; } b':' if i > 0 && (bytes[i - 1] == b' ' || bytes[i - 1] == b'=' || bytes[i - 1] == b'(' || bytes[i - 1] == b',') && i + 1 < len && bytes[i + 1].is_ascii_alphabetic() => { // :identifier, must be preceded by whitespace/= to avoid // false positives on object literals or ternary operators. return true; } _ => {} } i += 1; } false } /// Returns true when a tree-sitter node is a syntactic literal value. /// /// Intentionally conservative: if in doubt, returns false. It is better /// to miss a suppression opportunity than to suppress a real tainted flow. /// /// NOTE: Literal-kind classification also exists in `ast.rs::is_literal_node`. /// The two must stay aligned across languages. TODO: consider extracting a /// shared literal-kind helper if a third call site appears. #[allow(clippy::only_used_in_recursion)] pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool { match node.kind() { // Scalar strings, but reject if they contain interpolation // (e.g. Ruby `"hello #{name}"`, Python f-strings). 
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" | "string_content" | "string_fragment" => !has_string_interpolation(node), // Numbers "integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => { true } // Booleans / null / nil / none "true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean" | "boolean_literal" => true, // PHP encapsed_string: safe only if no variable interpolation "encapsed_string" => !has_interpolation_cfg(node), // Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap "argument" => { node.named_child_count() == 1 && node .named_child(0) .is_some_and(|c| is_syntactic_literal(c, code)) } // Unary minus on a number literal: `-42` "unary_expression" | "unary_op" => { node.named_child_count() == 1 && node .named_child(0) .is_some_and(|c| is_syntactic_literal(c, code)) } // String concatenation of literals: `"a" + "b"` or `"a" . "b"` "binary_expression" | "concatenated_string" => { let count = node.named_child_count(); count >= 2 && (0..count).all(|i| { node.named_child(i as u32) .is_some_and(|c| is_syntactic_literal(c, code)) }) } // JS/TS template string: only if no interpolation substitution "template_string" => { let mut c = node.walk(); !node .named_children(&mut c) .any(|ch| ch.kind() == "template_substitution") } // Containers: all elements must be syntactic literals "list" | "array" | "array_expression" | "array_creation_expression" | "tuple" | "tuple_expression" => { let mut c = node.walk(); node.named_children(&mut c) .all(|ch| is_syntactic_literal(ch, code)) } // Container entries: `{"key": "value"}` style pairs "pair" => { let mut c = node.walk(); node.named_children(&mut c) .all(|ch| is_syntactic_literal(ch, code)) } _ => false, } } /// Check if a string node contains interpolation children /// (e.g. Ruby `"hello #{name}"` has `interpolation` children, /// Python f-strings may have `interpolation` children). 
pub(super) fn has_string_interpolation(node: Node) -> bool { let mut cursor = node.walk(); for child in node.children(&mut cursor) { if child.kind().contains("interpolation") { return true; } } false } /// Check if an encapsed_string node contains interpolation (PHP). pub(super) fn has_interpolation_cfg(node: Node) -> bool { for i in 0..node.child_count() as u32 { if let Some(child) = node.child(i) { let kind = child.kind(); if kind == "variable_name" || kind == "simple_variable" || kind.contains("interpolation") { return true; } } } false } /// Extract the raw literal text from the RHS of a declaration/assignment AST node. /// /// Walks the same value/right child paths as `def_use` and returns the text /// if the RHS is a syntactic literal. Used to populate `NodeInfo::const_text`. pub(super) fn extract_literal_rhs(ast: Node, lang: &str, code: &[u8]) -> Option { use crate::labels::lookup; // Direct value/right field (Rust let, Go short_var, etc.) let val_node = ast .child_by_field_name("value") .or_else(|| ast.child_by_field_name("right")); if let Some(val) = val_node { if is_syntactic_literal(val, code) { return text_of(val, code); } } // Nested declarator pattern (JS let/const → variable_declarator, etc.) if matches!( lookup(lang, ast.kind()), Kind::CallWrapper | Kind::Assignment ) { let mut cursor = ast.walk(); for child in ast.children(&mut cursor) { let child_val = child.child_by_field_name("value").or_else(|| { if matches!(lookup(lang, child.kind()), Kind::Assignment) { child.child_by_field_name("right") } else { None } }); if let Some(val) = child_val { if is_syntactic_literal(val, code) { return text_of(val, code); } } } } // Return statement with a literal argument (`return []`, `return {}`). // Lets SSA's const-return path ([`crate::ssa::lower`] line ~1066) emit // `SsaOp::Const(Some(text))` instead of `Const(None)` so downstream // container-literal detection (heap points-to, fresh-alloc summary) // can recognise the fresh allocation. 
if matches!(lookup(lang, ast.kind()), Kind::Return) { let mut cursor = ast.walk(); for child in ast.named_children(&mut cursor) { if is_syntactic_literal(child, code) { return text_of(child, code); } } } None } /// Returns true when every argument in the call's argument list is a /// syntactic literal (per `is_syntactic_literal`). Returns true for calls /// with zero arguments (no argument-carried taint vector). Returns false /// when the argument list cannot be found. /// /// For method chains like `a("x").b(y).c()`, the outermost call node /// represents the entire chain. This function walks nested call expressions /// to verify ALL argument lists in the chain contain only literals. pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool { let Some(args) = call_node.child_by_field_name("arguments") else { return false; }; let mut cursor = args.walk(); let mut any_arg = false; for ch in args.named_children(&mut cursor) { any_arg = true; if !is_syntactic_literal(ch, code) { return false; } } // Zero-arg calls are not "all literal", taint can still flow via a // non-literal receiver (e.g. `tainted.readObject()`), and the sink- // suppression gate (`info.all_args_literal`) must not skip these. if !any_arg { return false; } // Walk nested call expressions in the callee chain. check_inner_call_args(call_node, code) } /// Recursively check nested call expressions in a method chain for /// non-literal arguments. pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool { let mut cursor = node.walk(); for child in node.children(&mut cursor) { let kind = child.kind(); // Skip argument lists, those are checked by the caller. if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" { continue; } // If this child is itself a call expression, check its arguments. 
if child.child_by_field_name("arguments").is_some() { if !has_only_literal_args(child, code) { return false; } } else { // Recurse through non-call structural nodes (field_expression, etc.) if !check_inner_call_args(child, code) { return false; } } } true } /// Extract identifiers captured by Rust format-string named-argument syntax /// (`format!("…{name}…")`, stable since 1.58) from a `macro_invocation` /// node. Returns the identifier names referenced by `{name}` / /// `{name:fmt-spec}` patterns inside the first `string_literal` child of /// the macro's `token_tree`. /// /// Without this lifting, `let q = format!("...{x}...")` carries no `x` in /// its `uses` because `x` lives in the format string's bytes rather than /// as a separate AST argument node, so taint stops at the macro /// boundary. Mirrors the Python f-string interpolation lifting in /// `patterns/python.rs`. /// /// Conservative recognition: only fires for known format-style macros /// (`format`, `print`/`println`, `eprint`/`eprintln`, `write`/`writeln`, /// `panic`, `format_args`, `assert`/`debug_assert`, the common `log` /// crate severity macros). Empty for any non-Rust call node, any other /// macro, or a token_tree whose first string is not present. 
///
/// The macro name is matched on its last `::` path segment, so
/// `std::format!` and `log::info!` are recognised like their bare forms.
pub(super) fn extract_rust_format_macro_named_idents(call_node: Node, code: &[u8]) -> Vec<String> {
    if call_node.kind() != "macro_invocation" {
        return Vec::new();
    }
    let Some(macro_text) = call_node
        .child_by_field_name("macro")
        .and_then(|m| text_of(m, code))
    else {
        return Vec::new();
    };
    let leaf = macro_text
        .rsplit("::")
        .next()
        .unwrap_or(macro_text.as_str());
    if !is_rust_format_style_macro(leaf) {
        return Vec::new();
    }

    // Prefer the `token_tree` field; fall back to scanning the children by
    // kind when the field lookup finds nothing.
    let token_tree = call_node.child_by_field_name("token_tree").or_else(|| {
        let mut cursor = call_node.walk();
        let found = call_node
            .children(&mut cursor)
            .find(|c| c.kind() == "token_tree");
        found
    });
    let Some(tt) = token_tree else {
        return Vec::new();
    };

    // The format string is taken to be the first string literal directly
    // inside the token tree.
    let mut cursor = tt.walk();
    let Some(fmt_lit) = tt
        .children(&mut cursor)
        .find(|c| matches!(c.kind(), "string_literal" | "raw_string_literal"))
    else {
        return Vec::new();
    };
    let Some(raw) = text_of(fmt_lit, code) else {
        return Vec::new();
    };
    // Fall back to the quoted text when the quotes cannot be stripped.
    let content = strip_literal_quotes(&raw, fmt_lit, code).unwrap_or_else(|| raw.clone());
    parse_rust_format_named_idents(&content)
}

/// Walk `n` and any descendants, accumulating named-format-arg idents from
/// every Rust `macro_invocation` reachable through structural expression
/// children (calls, fields, await, references, blocks, ...). Lets the
/// def-use collectors lift `format!("...{x}...")` named args through one
/// or two levels of expression wrapping (e.g.
/// `let q = format!("{x}").to_owned();` or RHS chained method calls).
///
/// Nodes more than six levels below `n` are not visited.
pub(super) fn extract_rust_format_macro_named_idents_in(n: Node, code: &[u8]) -> Vec<String> {
    let mut out = Vec::new();
    collect_format_macro_idents_recursive(n, code, &mut out, 0);
    out
}

/// Pre-order traversal behind [`extract_rust_format_macro_named_idents_in`]:
/// appends the named format arguments of every `macro_invocation` at or
/// below `n` to `out`. `depth` is the distance from the traversal root;
/// recursion stops once it exceeds 6.
fn collect_format_macro_idents_recursive(n: Node, code: &[u8], out: &mut Vec<String>, depth: u32) {
    if depth > 6 {
        return;
    }
    if n.kind() == "macro_invocation" {
        out.extend(extract_rust_format_macro_named_idents(n, code));
    }
    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        collect_format_macro_idents_recursive(child, code, out, depth + 1);
    }
}

/// True when `name` (a macro's last path segment, without the `!`) is one
/// of the macros treated as taking a format string.
fn is_rust_format_style_macro(name: &str) -> bool {
    matches!(
        name,
        "format"
            | "print"
            | "println"
            | "eprint"
            | "eprintln"
            | "write"
            | "writeln"
            | "panic"
            | "format_args"
            | "assert"
            | "debug_assert"
            | "todo"
            | "unimplemented"
            | "unreachable"
            | "info"
            | "warn"
            | "error"
            | "debug"
            | "trace"
    )
}

/// Scan the contents of a format string and return the identifiers used as
/// named arguments, in order of appearance (duplicates are kept).
///
/// `{name}` and `{name:spec}` contribute `name`; `{{` / `}}` escapes,
/// positional (`{}`, `{0}`) and otherwise non-identifier arguments
/// contribute nothing. An unterminated `{name` at the end of the string is
/// still reported.
fn parse_rust_format_named_idents(s: &str) -> Vec<String> {
    let bytes = s.as_bytes();
    let mut out: Vec<String> = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            // `{{` and `}}` are escaped literal braces: skip both bytes.
            b'{' | b'}' if bytes.get(i + 1) == Some(&bytes[i]) => i += 2,
            b'{' => {
                let rest = &bytes[i + 1..];
                // The placeholder runs up to the next `}` (or to the end
                // of the string when it is unterminated).
                let close = rest
                    .iter()
                    .position(|&b| b == b'}')
                    .unwrap_or(rest.len());
                // The argument name ends at the first `:` inside the
                // placeholder, if any; what follows is the format spec.
                let name_len = rest[..close]
                    .iter()
                    .position(|&b| b == b':')
                    .unwrap_or(close);
                let ident_bytes = &rest[..name_len];
                if is_valid_rust_format_ident(ident_bytes)
                    && let Ok(name) = std::str::from_utf8(ident_bytes)
                {
                    out.push(name.to_string());
                }
                i += close + 2;
            }
            _ => i += 1,
        }
    }
    out
}

/// True when `b` is a non-empty ASCII identifier: a letter or `_` followed
/// by letters, digits or `_`. Purely numeric (positional) arguments are
/// rejected by the first-byte rule.
fn is_valid_rust_format_ident(b: &[u8]) -> bool {
    let Some((&first, rest)) = b.split_first() else {
        return false;
    };
    (first.is_ascii_alphabetic() || first == b'_')
        && rest.iter().all(|c| c.is_ascii_alphanumeric() || *c == b'_')
}

/// Extract per-argument identifiers from a call node's argument list.
/// Returns one `Vec<String>` per argument (in parameter-position order).
/// Returns empty if argument list can't be found or contains spread/keyword args. pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec> { // Ruby `subshell` (backticks) has no `arguments` field, its children are // string fragments and `interpolation` nodes. Lift each interpolation's // identifiers into a positional arg so taint flows from `#{var}` into the // synthetic "subshell" sink. if call_node.kind() == "subshell" { let mut result = Vec::new(); let mut cursor = call_node.walk(); for child in call_node.named_children(&mut cursor) { if child.kind() == "interpolation" { let mut idents = Vec::new(); let mut paths = Vec::new(); collect_idents_with_paths(child, code, &mut idents, &mut paths); let mut combined = paths; combined.extend(idents); if !combined.is_empty() { result.push(combined); } } } return result; } // Rust `tokio::join!` / `futures::join!` (and their `try_*` variants). // tree-sitter-rust models macro args as a `token_tree` rather than an // `arguments` field, so a vanilla extraction returns nothing. Walk the // top-level token_tree splitting on `,` separators, lifting identifiers // out of each chunk so the existing PromiseCombinator transfer can union // arg-side taint into the resulting tuple value. if call_node.kind() == "macro_invocation" && let Some(arg_uses) = extract_rust_macro_join_arg_uses(call_node, code) { return arg_uses; } let Some(args_node) = call_node.child_by_field_name("arguments") else { return Vec::new(); }; let mut result = Vec::new(); let mut cursor = args_node.walk(); for child in args_node.named_children(&mut cursor) { let kind = child.kind(); // Named / keyword arguments are tracked separately in `CallMeta.kwargs` // and do not participate in positional indexing, skip them here so // `arg_uses` remains strictly positional. Splats (spread/dict splat) // still invalidate positional mapping; bail out in that case. 
if kind == "spread_element" || kind == "dictionary_splat" || kind == "list_splat" || kind == "splat_argument" || kind == "hash_splat_argument" { return Vec::new(); } if kind == "keyword_argument" || kind == "named_argument" { continue; } let mut idents = Vec::new(); let mut paths = Vec::new(); collect_idents_with_paths(child, code, &mut idents, &mut paths); // Dotted paths first, then individual idents as fallback let mut combined = paths; combined.extend(idents); result.push(combined); } result } /// `tokio::join!` / `futures::join!` (and their `try_*` variants) bundle /// concurrently-awaited futures into a tuple result. tree-sitter-rust /// represents the args as a `token_tree` whose children alternate between /// expressions and `,` separators (`token_tree` itself nests on every /// parenthesised group, e.g. the `(x)` inside `fetch(x)`). Walk the /// top-level token_tree, segment by `,` leaves, and lift identifiers out /// of each chunk so the SSA Call op carries one positional arg per future. /// /// Returns `Some(arg_uses)` only when the macro is one of the recognised /// join macros, so `extract_arg_uses` can fall through to its normal /// `arguments`-field path for every other macro shape (`format!`, /// `println!`, custom DSL macros) where arg lifting could disturb existing /// label / SSA flow. pub(super) fn extract_rust_macro_join_arg_uses( call_node: Node, code: &[u8], ) -> Option>> { let macro_node = call_node.child_by_field_name("macro")?; let macro_text = text_of(macro_node, code)?; if !is_rust_join_macro(¯o_text) { return None; } let tt = match call_node.child_by_field_name("token_tree") { Some(t) => t, None => { let mut cursor = call_node.walk(); call_node .children(&mut cursor) .find(|c| c.kind() == "token_tree")? } }; let mut chunks: Vec> = vec![Vec::new()]; let mut cursor = tt.walk(); for child in tt.children(&mut cursor) { // Skip the surrounding `(`/`)` punctuation. 
if !child.is_named() { let kind = child.kind(); if kind == "," { chunks.push(Vec::new()); continue; } if kind == "(" || kind == ")" { continue; } } chunks.last_mut().unwrap().push(child); } let mut result = Vec::new(); for chunk in chunks { if chunk.is_empty() { continue; } let mut idents = Vec::new(); let mut paths = Vec::new(); for n in chunk { collect_idents_with_paths(n, code, &mut idents, &mut paths); } let mut combined = paths; combined.extend(idents); result.push(combined); } Some(result) } fn is_rust_join_macro(macro_text: &str) -> bool { matches!( macro_text, "tokio::join" | "tokio::try_join" | "futures::join" | "futures::try_join" | "join" | "try_join" ) } /// Extract keyword / named argument bindings for a call node. /// /// Returns `Vec<(name, uses)>` where `uses` are the identifier references /// from the keyword's value expression, in the same shape used by /// `arg_uses` entries. Empty for calls with no named arguments, or for /// languages whose grammar does not produce `keyword_argument` / `named_argument` /// children (C, Java, Go, …). pub(super) fn extract_kwargs(call_node: Node, code: &[u8]) -> Vec<(String, Vec)> { let Some(args_node) = call_node.child_by_field_name("arguments") else { return Vec::new(); }; let mut out = Vec::new(); let mut cursor = args_node.walk(); for child in args_node.named_children(&mut cursor) { let kind = child.kind(); // JS/TS object-literal positional arg: `f(x, { a: true, b: 'str' })`. // The pairs inside the object are not tree-sitter // `keyword_argument` nodes (those are Python/Ruby), but // downstream consumers (xml_config's // `lookup_kwargs(inst.cfg_node)` JS branch checking // `processEntities`) expect these fields in the kwargs vector. // Lift each `pair` (and `shorthand_property_identifier`) into // the kwargs list using the property name as kwarg name and the // raw text of the value expression as the single value. 
// Boolean / numeric / string / identifier values all surface as // their textual form, which is what xml_config's kwarg-value // matchers (e.g. `v == "true"`) compare against. if kind == "object" { let mut oc = child.walk(); for pair in child.named_children(&mut oc) { let pk = pair.kind(); if pk == "pair" { let Some(kn) = pair.child_by_field_name("key") else { continue; }; let Some(vn) = pair.child_by_field_name("value") else { continue; }; let Some(raw_name) = text_of(kn, code) else { continue; }; let name = raw_name .trim_start_matches(['"', '\'']) .trim_end_matches(['"', '\'']) .to_string(); if let Some(val_text) = text_of(vn, code) { out.push((name, vec![val_text.to_string()])); } } else if pk == "shorthand_property_identifier" { if let Some(name) = text_of(pair, code) { out.push((name.to_string(), vec![name.to_string()])); } } } continue; } if kind != "keyword_argument" && kind != "named_argument" { continue; } // Python `keyword_argument` uses `name`/`value`; Ruby `named_argument` // uses `name`/`value` as well (with `:` syntax in source). Fall back // to the first/last named children if fields are absent. let named_count = child.named_child_count(); let name_node = child .child_by_field_name("name") .or_else(|| child.named_child(0)); let value_node = child .child_by_field_name("value") .or_else(|| child.named_child(named_count.saturating_sub(1) as u32)); let (Some(nn), Some(vn)) = (name_node, value_node) else { continue; }; let Some(name) = text_of(nn, code) else { continue; }; let mut idents = Vec::new(); let mut paths = Vec::new(); collect_idents_with_paths(vn, code, &mut idents, &mut paths); let mut combined = paths; combined.extend(idents); // Boolean / numeric literal kwarg values (Python `True`/`False`, // Ruby `true`/`false`/integer/float, JS `true`/`false`/number) // do not surface through `collect_idents_with_paths` — the value // node's kind is `true`/`false`/`integer`/`float`/`number`, not // an identifier kind. 
Capture the raw text so consumers like // `xml_config::classify_call` (which checks // `values.iter().any(|v| v == "True" || v == "true")` for the // lxml `resolve_entities=True` opt-in) can match. if combined.is_empty() { if matches!( vn.kind(), "true" | "false" | "integer" | "float" | "number" | "string" | "string_literal" | "true_constant" | "false_constant" ) { if let Some(txt) = text_of(vn, code) { combined.push(txt.trim_matches(['"', '\'']).to_string()); } } } out.push((name, combined)); } out } /// Caps that a search literal is known to strip, provided the replacement /// itself does not reintroduce any dangerous sequence. /// /// Policy is deliberately narrow and conservative: only literals that contain /// *known-dangerous* payloads earn a strip credit, so an arbitrary /// `.replace("foo", "bar")` is never promoted to a sanitizer. /// * `..`, `/`, `\\` → path-traversal → `Cap::FILE_IO` /// * `<`, `>` → HTML metachars → `Cap::HTML_ESCAPE` /// * `;`, `|`, `&`, `$`, `\`` → shell metachars → `Cap::SHELL_ESCAPE` /// * `'`, `"`, `--` → SQL metachars → `Cap::SQL_QUERY` pub(super) fn caps_stripped_by_literal_pattern(search: &str) -> Cap { let mut caps = Cap::empty(); if search.contains("..") || search.contains('/') || search.contains('\\') { caps |= Cap::FILE_IO; } if search.contains('<') || search.contains('>') { caps |= Cap::HTML_ESCAPE; } if search.contains(';') || search.contains('|') || search.contains('&') || search.contains('$') || search.contains('`') { caps |= Cap::SHELL_ESCAPE; } if search.contains('\'') || search.contains('"') || search.contains("--") { caps |= Cap::SQL_QUERY; } caps } /// Maximum number of `.replace(LIT, LIT)` hops we'll walk on a single chain. const MAX_REPLACE_CHAIN_HOPS: usize = 16; /// Recognise a Rust `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain whose /// receiver bottoms out at a plain identifier, and infer which caps the chain /// provably strips. 
///
/// In tree-sitter-rust a method call is encoded as a `call_expression` whose
/// `function` field is a `field_expression` (`receiver.method`). Chained method
/// calls therefore nest `call_expression` nodes recursively through the
/// `field_expression.value` slot. The detector walks that nest, requiring
/// every hop to be a pure literal-to-literal `replace` / `replacen` call and
/// the innermost receiver to be a bare identifier. Returns the union of caps
/// stripped across the chain when at least one literal contains a recognised
/// dangerous pattern, or `None` when the pattern doesn't apply (so the caller
/// falls back to normal unresolved-call propagation).
///
/// Only `replace` hops earn strip credit: `replacen` rewrites just the first
/// N matches, so later occurrences of the pattern survive and nothing is
/// provably stripped. A `replacen` hop is still accepted as part of the
/// chain and its replacement literal is still vetted.
///
/// Literals are decoded from their source text. A literal containing an
/// escape other than `\n`, `\r`, `\t`, `\0`, `\\`, `\'`, `\"` makes the
/// detector give up (`None`) rather than guess at its value.
pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
    fn is_rust_str_literal(k: &str) -> bool {
        matches!(k, "string_literal" | "raw_string_literal")
    }

    /// Decode the value of a Rust string literal from its source text.
    ///
    /// The whole literal is decoded rather than a single `string_content`
    /// child, so pieces around escape sequences are not lost and escapes
    /// are not mistaken for literal backslashes.
    fn extract_rust_str_content(n: Node, code: &[u8]) -> Option<String> {
        let raw = text_of(n, code)?;
        // Optional byte / C string prefix.
        let body = raw.strip_prefix(['b', 'c']).unwrap_or(raw.as_str());

        // Raw form `r"…"` / `r#"…"#`: content is verbatim, delimited by a
        // quote plus as many `#` as follow the `r`.
        if let Some(after_r) = body.strip_prefix('r') {
            let after_hashes = after_r.trim_start_matches('#');
            let hashes = after_r.len() - after_hashes.len();
            let inner = after_hashes
                .strip_prefix('"')?
                .strip_suffix("#".repeat(hashes).as_str())?
                .strip_suffix('"')?;
            return Some(inner.to_string());
        }

        // Ordinary form: strip the quotes and resolve simple escapes.
        let inner = body.strip_prefix('"')?.strip_suffix('"')?;
        let mut decoded = String::with_capacity(inner.len());
        let mut chars = inner.chars();
        while let Some(ch) = chars.next() {
            if ch != '\\' {
                decoded.push(ch);
                continue;
            }
            decoded.push(match chars.next()? {
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                '0' => '\0',
                '\\' => '\\',
                '\'' => '\'',
                '"' => '"',
                // `\x..`, `\u{..}`, line continuations: decline to reason.
                _ => return None,
            });
        }
        Some(decoded)
    }

    let mut current = call_ast;
    let mut earned = Cap::empty();
    for _ in 0..MAX_REPLACE_CHAIN_HOPS {
        if current.kind() != "call_expression" {
            // Chain base: must be a plain identifier (parameter / local) to
            // qualify. A base that's another expression (field access,
            // nested non-method call, …) breaks the sanitizer invariant.
            return (current.kind() == "identifier" && !earned.is_empty()).then_some(earned);
        }
        // Must be a method-style call: function is a field_expression whose
        // `field` names a `replace`-like method.
        let func = current.child_by_field_name("function")?;
        if func.kind() != "field_expression" {
            return None;
        }
        let method_name = text_of(func.child_by_field_name("field")?, code)?;
        let strips_every_match = match method_name.as_str() {
            "replace" => true,
            "replacen" => false,
            _ => return None,
        };

        let args_node = current.child_by_field_name("arguments")?;
        let mut cursor = args_node.walk();
        let mut positional = args_node.named_children(&mut cursor).filter(|c| {
            !matches!(
                c.kind(),
                "keyword_argument"
                    | "named_argument"
                    | "spread_element"
                    | "list_splat"
                    | "dictionary_splat"
                    | "splat_argument"
                    | "hash_splat_argument"
            )
        });
        let (Some(arg0), Some(arg1)) = (positional.next(), positional.next()) else {
            return None;
        };
        if !is_rust_str_literal(arg0.kind()) || !is_rust_str_literal(arg1.kind()) {
            return None;
        }
        let search = extract_rust_str_content(arg0, code)?;
        let replacement = extract_rust_str_content(arg1, code)?;
        // If the replacement itself contains a dangerous sequence, this hop
        // can reintroduce the pattern that a later hop tries to strip. Be
        // conservative: abandon all credit.
        if !caps_stripped_by_literal_pattern(&replacement).is_empty() {
            return None;
        }
        if strips_every_match {
            earned |= caps_stripped_by_literal_pattern(&search);
        }
        // Walk to receiver via field_expression.value.
        current = func.child_by_field_name("value")?;
    }
    // Chain longer than the hop budget: decline.
    None
}

/// Recognise a Go `strings.Replace(s, OLD, NEW, n)` /
/// `strings.ReplaceAll(s, OLD, NEW)` call that provably strips one of the
/// known-dangerous metacharacter classes from its first argument.
///
/// Returns the union of caps stripped, or `None` when the pattern doesn't
/// apply (so the caller falls back to normal unresolved-call propagation).
///
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
/// (non-method-chain) Go shape.
The caller wires the resulting cap into /// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the /// taint engine consumes via the standard sanitizer pathway, taint flows /// in on `s`, the matching cap is stripped from the result. pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option { if call_ast.kind() != "call_expression" { return None; } // The call's `function` field is a `selector_expression`, `operand` // is the package ident (`strings`), `field` is the method ident. let func = call_ast.child_by_field_name("function")?; if func.kind() != "selector_expression" { return None; } let operand = func.child_by_field_name("operand")?; if text_of(operand, code).as_deref() != Some("strings") { return None; } let field = func.child_by_field_name("field")?; let method_name = text_of(field, code)?; if method_name != "Replace" && method_name != "ReplaceAll" { return None; } // Args layout: (s, old, new[, n]). Need positional args 1 (old) and // 2 (new) to be string literals. let old_lit = extract_const_string_arg(call_ast, 1, code)?; let new_lit = extract_const_string_arg(call_ast, 2, code)?; // If the replacement itself reintroduces a dangerous sequence, don't // credit the strip, matches the Rust chain detector's policy. if !caps_stripped_by_literal_pattern(&new_lit).is_empty() { return None; } let caps = caps_stripped_by_literal_pattern(&old_lit); if caps.is_empty() { None } else { Some(caps) } } /// Like `first_call_ident`, but also checks if `n` itself is a call node. /// `first_call_ident` only searches children, so when `n` IS the call /// expression (e.g. the argument `sanitize(cmd)`), this function catches it. pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option { // C++ new/delete: normalize callee before field extraction. 
if lang == "cpp" && n.kind() == "new_expression" { return Some("new".to_string()); } if lang == "cpp" && n.kind() == "delete_expression" { return Some("delete".to_string()); } match lookup(lang, n.kind()) { Kind::Function => { // Function/closure expression passed as argument, return the same // synthetic anon name used by build_sub so callback_bindings and // source_to_callback can match it to the extracted BodyCfg. n.child_by_field_name("name") .and_then(|nm| text_of(nm, code)) .or_else(|| Some(anon_fn_name(n.start_byte()))) } Kind::CallFn => n .child_by_field_name("function") .or_else(|| n.child_by_field_name("method")) .or_else(|| n.child_by_field_name("name")) .or_else(|| n.child_by_field_name("type")) .or_else(|| find_constructor_type_child(n)) .and_then(|f| { let unwrapped = unwrap_parens(f); if lookup(lang, unwrapped.kind()) == Kind::Function { Some(anon_fn_name(unwrapped.start_byte())) } else { text_of(f, code) } }), Kind::CallMethod => { let func = n .child_by_field_name("method") .or_else(|| n.child_by_field_name("name")) .and_then(|f| text_of(f, code)); let recv_node = n .child_by_field_name("object") .or_else(|| n.child_by_field_name("receiver")) .or_else(|| n.child_by_field_name("scope")); let recv = recv_node.and_then(|f| root_receiver_text(f, lang, code)); // Preserve Java `.getClass()` segment in the chained callee text // so downstream predicates (e.g. // [`crate::ssa::type_facts::is_safe_string_producing_callee`]) // can recognise idiomatic `obj.getClass().()` chains. // Without this, `root_receiver_text` collapses the chain to // `obj.`, indistinguishable from a user-defined method. 
let recv = if lang == "java" && let Some(rn) = recv_node && lookup(lang, rn.kind()) == Kind::CallMethod && let Some(inner_method) = rn .child_by_field_name("method") .or_else(|| rn.child_by_field_name("name")) .and_then(|f| text_of(f, code)) && inner_method == "getClass" && let Some(r) = recv { Some(format!("{r}.getClass")) } else { recv }; match (recv, func) { (Some(r), Some(f)) => Some(format!("{r}.{f}")), (_, Some(f)) => Some(f), _ => None, } } Kind::CallMacro => n .child_by_field_name("macro") .and_then(|f| text_of(f, code)), _ => first_call_ident(n, lang, code), } } /// For each argument of `call_node`, return `Some(s)` when the argument is a /// syntactic string literal (unquoted contents) and `None` otherwise. The /// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`]. /// /// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces /// an empty vector, positional indices past the splat are meaningless and /// downstream passes already treat an empty vector as "no info". pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec> { let Some(args_node) = call_node.child_by_field_name("arguments") else { return Vec::new(); }; let mut result = Vec::new(); let mut cursor = args_node.walk(); for child in args_node.named_children(&mut cursor) { let kind = child.kind(); // Splat → positional indexing breaks; bail. if kind == "spread_element" || kind == "dictionary_splat" || kind == "list_splat" || kind == "splat_argument" || kind == "hash_splat_argument" { return Vec::new(); } // Named / keyword arguments are tracked separately in `kwargs` and // don't participate in positional indexing, skip them here so this // vector stays aligned with `arg_uses`. if kind == "keyword_argument" || kind == "named_argument" { continue; } // PHP wraps each call argument in an `argument` node whose first // named child is the actual expression. 
Unwrap one level so the // string-literal arm below sees the literal directly rather than // the wrapper kind, otherwise PHP `f("https://…")` records // `None` for arg 0 and downstream prefix-aware suppressions miss. let target = if kind == "argument" { child.named_child(0).unwrap_or(child) } else { child }; let target_kind = target.kind(); let literal = match target_kind { "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" // PHP's double-quoted form (single-quoted maps to `string`). // Only safe to lift when there is no `encapsed_string` / // `embedded_expression` interpolation child, checked below. | "encapsed_string" => { let raw = text_of(target, code); raw.and_then(|s| strip_literal_quotes(&s, target, code)) } // Boolean / null / numeric literal tokens — capture verbatim so // downstream pattern-aware analysis (e.g. the XXE config-fact // pass that needs to read the boolean polarity arg of // `setFeature(NAME, true)`) can recover the literal text without // re-walking the AST. Existing string-only consumers (URL // prefix matching, etc.) are unaffected: a "true" / "false" // token never satisfies their matching predicates. "true" | "false" | "null" | "null_literal" | "nil" | "nil_literal" | "none" | "boolean_literal" | "true_literal" | "false_literal" | "decimal_integer_literal" | "integer_literal" | "integer" | "number" | "number_literal" | "decimal_literal" => text_of(target, code), _ => None, }; result.push(literal); } result } /// Strip surrounding quotes from a syntactic string literal, resolving the /// `string_content` child for Rust-style two-level string nodes. Returns the /// raw inner text (no escape-sequence processing), sufficient for whitelist /// matching against shell-metachar sets. pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option { // Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child. 
// Prefer the content text so the caller doesn't have to deal with quote // pairing for raw strings (`r"..."`, `r#"..."#`, etc.). let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { if child.kind() == "string_content" { return text_of(child, code); } } if raw.len() >= 2 { let bytes = raw.as_bytes(); let first = bytes[0]; let last = bytes[raw.len() - 1]; if (first == b'"' && last == b'"') || (first == b'\'' && last == b'\'') { return Some(raw[1..raw.len() - 1].to_string()); } } None } /// For each argument of `call_node`, find the callee name if that argument /// is itself a call expression (e.g. `sanitize(x)` in `os.system(sanitize(x))`). /// Returns a `Vec>` parallel to `extract_arg_uses` output. pub(super) fn extract_arg_callees(call_node: Node, lang: &str, code: &[u8]) -> Vec> { let Some(args_node) = call_node.child_by_field_name("arguments") else { return Vec::new(); }; let mut result = Vec::new(); let mut cursor = args_node.walk(); for child in args_node.named_children(&mut cursor) { // Bail on spread/splat like extract_arg_uses does let kind = child.kind(); if kind == "spread_element" || kind == "dictionary_splat" || kind == "list_splat" || kind == "keyword_argument" || kind == "splat_argument" || kind == "hash_splat_argument" || kind == "named_argument" { return Vec::new(); } result.push(call_ident_of(child, lang, code)); } result } /// Return `(defines, uses, extra_defines, array_pattern_indices, /// rhs_array_elements)` for the AST fragment `ast`. /// /// `extra_defines` captures additional bindings from destructuring patterns /// beyond the primary define. `array_pattern_indices`, when non-empty, gives /// the source-order position of each binding in `iter::once(defines).chain( /// extra_defines)` for `array_pattern` / `tuple_pattern` LHS shapes. Empty /// for non-array destructures and for non-skip array patterns where callers /// can derive sequential 0..N indices implicitly. 
/// Return `(defines, uses, extra_defines, array_pattern_indices,
/// rhs_array_elements)` for the AST fragment `ast`.
///
/// `extra_defines` captures additional bindings from destructuring patterns
/// beyond the primary define. `array_pattern_indices`, when non-empty, gives
/// the source-order position of each binding in `iter::once(defines).chain(
/// extra_defines)` for `array_pattern` / `tuple_pattern` LHS shapes. Empty
/// for non-array destructures and for non-skip array patterns where callers
/// can derive sequential 0..N indices implicitly.
///
/// `rhs_array_elements`, when non-empty, gives source-order RHS slots for
/// destructure-from-array-literal shapes (`const [a, b] = [safe, tainted]`,
/// `let (a, b) = (safe, tainted)`, Python `a, b = safe, tainted`). Each slot
/// is `Some(ident)` for a bare-ident element or `None` for a syntactic
/// literal. Empty when RHS isn't an array-literal shape or any element is
/// too complex; callers fall back to scalar union in that case.
///
/// In every arm, `uses` lists dotted paths first and bare identifiers after
/// them, in the order the collectors produced them. Only the
/// `Kind::CallWrapper` and `Kind::Assignment` arms ever fill the two index
/// vectors; all other arms return them empty.
#[allow(clippy::type_complexity)]
pub(super) fn def_use(
    ast: Node,
    lang: &str,
    code: &[u8],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> (
    Option<String>,
    Vec<String>,
    Vec<String>,
    SmallVec<[usize; 4]>,
    SmallVec<[crate::cfg::RhsArraySlot; 4]>,
) {
    match lookup(lang, ast.kind()) {
        // Declaration wrappers (let, var, short_var_declaration, etc.)
        Kind::CallWrapper => {
            let mut defs = None;
            let mut extra_defs = Vec::new();
            let mut uses = Vec::new();
            let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
            let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();

            // Try direct field names first (Rust `let_declaration`, Go `short_var_declaration`)
            let def_node = ast
                .child_by_field_name("pattern")
                .or_else(|| ast.child_by_field_name("name"))
                .or_else(|| ast.child_by_field_name("left"))
                // Python `with_item`: value is `as_pattern` whose `alias` holds the target
                .or_else(|| {
                    ast.child_by_field_name("value")
                        .and_then(|v| v.child_by_field_name("alias"))
                });
            let val_node = ast
                .child_by_field_name("value")
                .or_else(|| ast.child_by_field_name("right"));

            if def_node.is_some() || val_node.is_some() {
                if let Some(pat) = def_node {
                    let bindings = collect_array_pattern_bindings_indexed(pat, code);
                    if !bindings.is_empty() {
                        // Recognised destructure: first binding is the
                        // primary define, the rest are extra defines; each
                        // keeps its source-order index.
                        let mut iter = bindings.into_iter();
                        if let Some((first_name, first_idx)) = iter.next() {
                            defs = Some(first_name);
                            pattern_indices.push(first_idx);
                        }
                        for (name, idx) in iter {
                            extra_defs.push(name);
                            pattern_indices.push(idx);
                        }
                    } else {
                        let mut idents = Vec::new();
                        let mut paths = Vec::new();
                        collect_idents_with_paths(pat, code, &mut idents, &mut paths);
                        // Primary define: last dotted path if any, else the
                        // first bare identifier.
                        let first = paths.pop().or_else(|| idents.first().cloned());
                        // Remaining idents are extra defines (for destructuring)
                        for ident in &idents {
                            if first.as_ref() != Some(ident) {
                                extra_defs.push(ident.clone());
                            }
                        }
                        defs = first;
                    }
                }
                if let Some(val) = val_node {
                    let mut idents = Vec::new();
                    let mut paths = Vec::new();
                    collect_idents_with_paths(val, code, &mut idents, &mut paths);
                    uses.extend(paths);
                    uses.extend(idents);
                    // Rust format-string named-arg capture: `let q =
                    // format!("...{x}...")` reads `x`, but `x` lives in
                    // the format-string bytes, not as a separate AST
                    // argument node, so collect_idents misses it.
                    uses.extend(extract_rust_format_macro_named_idents_in(val, code));
                    // When the LHS is a recognised destructure pattern AND
                    // the RHS is a bare array-literal shape (no call), record
                    // per-element idents so the SSA destructure rewrite can
                    // map each binding to its specific RHS slot.
                    if !pattern_indices.is_empty() {
                        rhs_array_elements =
                            collect_rhs_array_literal_elements(val, lang, code, extra_labels);
                    }
                }
            } else {
                // Try nested declarator pattern (JS/TS `lexical_declaration` → `variable_declarator`,
                // Java `local_variable_declaration` → `variable_declarator`,
                // C/C++ `declaration` → `init_declarator`,
                // Python/Ruby `expression_statement` → `assignment`)
                let mut cursor = ast.walk();
                for child in ast.children(&mut cursor) {
                    // Only use left/right fields for actual assignment nodes; binary
                    // expressions also have left/right but are not definitions.
                    let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
                    let child_name = child
                        .child_by_field_name("name")
                        .or_else(|| child.child_by_field_name("declarator"))
                        .or_else(|| {
                            if is_assign {
                                child.child_by_field_name("left")
                            } else {
                                None
                            }
                        });
                    let child_value = child.child_by_field_name("value").or_else(|| {
                        if is_assign {
                            child.child_by_field_name("right")
                        } else {
                            None
                        }
                    });

                    // A child only counts as a declarator when it has a value.
                    // This prevents method_invocation nodes (which have a
                    // `name` field) from being misinterpreted.
                    if child_value.is_some() {
                        // Only the first declarator with a name contributes
                        // defines; later ones still contribute uses below.
                        if let Some(name_node) = child_name
                            && defs.is_none()
                        {
                            let bindings = collect_array_pattern_bindings_indexed(name_node, code);
                            if !bindings.is_empty() {
                                let mut iter = bindings.into_iter();
                                if let Some((first_name, first_idx)) = iter.next() {
                                    defs = Some(first_name);
                                    pattern_indices.push(first_idx);
                                }
                                for (name, idx) in iter {
                                    extra_defs.push(name);
                                    pattern_indices.push(idx);
                                }
                            } else {
                                let mut idents = Vec::new();
                                let mut paths = Vec::new();
                                collect_idents_with_paths(name_node, code, &mut idents, &mut paths);
                                let first = paths.pop().or_else(|| idents.first().cloned());
                                for ident in &idents {
                                    if first.as_ref() != Some(ident) {
                                        extra_defs.push(ident.clone());
                                    }
                                }
                                defs = first;
                            }
                        }
                        if let Some(val_node) = child_value {
                            let mut idents = Vec::new();
                            let mut paths = Vec::new();
                            collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
                            uses.extend(paths);
                            uses.extend(idents);
                            uses.extend(extract_rust_format_macro_named_idents_in(val_node, code));
                            // Keep the slots of the first RHS that produced
                            // any; later declarators do not overwrite them.
                            if !pattern_indices.is_empty() && rhs_array_elements.is_empty() {
                                rhs_array_elements = collect_rhs_array_literal_elements(
                                    val_node,
                                    lang,
                                    code,
                                    extra_labels,
                                );
                            }
                        }
                    }
                }

                // Fallback: if still nothing found, collect all idents as uses.
                // This handles expression_statement wrappers.
                if defs.is_none() && uses.is_empty() {
                    let mut idents = Vec::new();
                    let mut paths = Vec::new();
                    collect_idents_with_paths(ast, code, &mut idents, &mut paths);
                    uses.extend(paths);
                    uses.extend(idents);
                    uses.extend(extract_rust_format_macro_named_idents_in(ast, code));
                }
            }
            (defs, uses, extra_defs, pattern_indices, rhs_array_elements)
        }

        // Plain assignment `x = y` or destructuring assignment such as
        // Python `a, b = await asyncio.gather(...)` whose LHS surfaces as
        // a `pattern_list` / `tuple_pattern`. When the LHS is a
        // destructure pattern that the indexed helper recognises, the
        // primary binding lands in `defs`, the rest land in `extra_defs`,
        // and `pattern_indices` carries source-order positions so the
        // SSA lowering's destructure-promise rewrite can paint each
        // binding from the matching combinator argument.
        Kind::Assignment => {
            let mut defs = None;
            let mut extra_defs = Vec::new();
            let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
            let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();
            let mut uses = Vec::new();
            if let Some(lhs) = ast.child_by_field_name("left") {
                let bindings = collect_array_pattern_bindings_indexed(lhs, code);
                if !bindings.is_empty() {
                    let mut iter = bindings.into_iter();
                    if let Some((first_name, first_idx)) = iter.next() {
                        defs = Some(first_name);
                        pattern_indices.push(first_idx);
                    }
                    for (name, idx) in iter {
                        extra_defs.push(name);
                        pattern_indices.push(idx);
                    }
                } else {
                    let mut idents = Vec::new();
                    let mut paths = Vec::new();
                    collect_idents_with_paths(lhs, code, &mut idents, &mut paths);
                    // Prefer dotted path (member expression) over last ident.
                    // NOTE(review): the declaration arms above fall back to
                    // the FIRST ident, this arm to the LAST. For an LHS such
                    // as `arr[i]` that would presumably pick `i` unless the
                    // collector reports a path; confirm this is intended.
                    defs = paths.pop().or_else(|| idents.pop());
                }
            }
            if let Some(rhs) = ast.child_by_field_name("right") {
                let mut idents = Vec::new();
                let mut paths = Vec::new();
                collect_idents_with_paths(rhs, code, &mut idents, &mut paths);
                uses.extend(paths);
                uses.extend(idents);
                uses.extend(extract_rust_format_macro_named_idents_in(rhs, code));
                // When the LHS is a recognised destructure pattern AND the
                // RHS is a bare array-literal shape, record per-element
                // idents so the SSA destructure rewrite can map each
                // binding to its specific RHS slot.
                if !pattern_indices.is_empty() {
                    rhs_array_elements =
                        collect_rhs_array_literal_elements(rhs, lang, code, extra_labels);
                }
            }
            (defs, uses, extra_defs, pattern_indices, rhs_array_elements)
        }

        // if-let / while-let: the `let_condition` binds a variable from
        // the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
        // defines `cmd` and uses the identifiers collected from
        // `env::var("CMD")`.
        Kind::If | Kind::While => {
            let cond = ast.child_by_field_name("condition");
            if let Some(c) = cond
                && c.kind() == "let_condition"
            {
                let mut defs = None;
                let mut uses = Vec::new();
                if let Some(pat) = c.child_by_field_name("pattern") {
                    let mut tmp = Vec::<String>::new();
                    collect_idents(pat, code, &mut tmp);
                    // Take the LAST identifier collected from the pattern:
                    // for `Ok(cmd)` that is the inner binding `cmd`, not a
                    // leading constructor name such as `Ok`.
                    defs = tmp.into_iter().last();
                }
                if let Some(val) = c.child_by_field_name("value") {
                    collect_idents(val, code, &mut uses);
                }
                return (defs, uses, vec![], SmallVec::new(), SmallVec::new());
            }

            // Ordinary condition: nothing is defined, everything is a use.
            let mut idents = Vec::new();
            let mut paths = Vec::new();
            collect_idents_with_paths(ast, code, &mut idents, &mut paths);
            let mut uses = paths;
            uses.extend(idents);
            (None, uses, vec![], SmallVec::new(), SmallVec::new())
        }

        // for-in / for-of / Python `for x in iter:` ─────────────────────────
        //
        // Tree-sitter classifies these as `Kind::For` with a `left`/`right`
        // field pair (binding pattern + iterable). Without an explicit
        // arm here, the default branch collects every ident as a `use` and
        // never registers the iteration binding as a `define`, so taint
        // entering the iterable does not propagate into the body's
        // references to the binding (`for (const [a, b] of obj) { sink(a) }`
        // would lose the flow at `a`).
        //
        // C-style `for_statement` has no `left`/`right` fields (it uses
        // `initializer`/`condition`/`increment`), so this path falls through
        // to the default-collecting behaviour for those, preserving today's
        // semantics.
        //
        // Go's `for ident := range iter` shape places the binding pattern
        // and iterable on a `range_clause` child of the `for_statement`
        // rather than as direct fields. Without the range_clause lookup
        // below, taint from the iterable never reaches the loop binding
        // (CVE-2026-41422 daptin: `c.QueryArray("col")` loop var `project`
        // flows into `goqu.L(project)` SQL_QUERY sink).
        Kind::For => {
            let mut left = ast.child_by_field_name("left");
            let mut right = ast.child_by_field_name("right");
            if left.is_none() && right.is_none() {
                // Only the first `range_clause` child is consulted.
                let mut cursor = ast.walk();
                for child in ast.children(&mut cursor) {
                    if child.kind() == "range_clause" {
                        left = child.child_by_field_name("left");
                        right = child.child_by_field_name("right");
                        break;
                    }
                }
            }
            if left.is_none() && right.is_none() {
                // C-style for: defer to default ident collection.
                let mut idents = Vec::new();
                let mut paths = Vec::new();
                collect_idents_with_paths(ast, code, &mut idents, &mut paths);
                let mut uses = paths;
                uses.extend(idents);
                return (None, uses, vec![], SmallVec::new(), SmallVec::new());
            }
            let mut defs: Option<String> = None;
            let mut extra_defs: Vec<String> = Vec::new();
            let mut uses: Vec<String> = Vec::new();
            if let Some(pat) = left {
                let mut idents = Vec::new();
                let mut paths = Vec::new();
                collect_idents_with_paths(pat, code, &mut idents, &mut paths);
                // Same primary/extra split as the declaration arms, but
                // without index tracking.
                let first = paths.pop().or_else(|| idents.first().cloned());
                for ident in &idents {
                    if first.as_ref() != Some(ident) {
                        extra_defs.push(ident.clone());
                    }
                }
                defs = first;
            }
            if let Some(val) = right {
                let mut idents = Vec::new();
                let mut paths = Vec::new();
                collect_idents_with_paths(val, code, &mut idents, &mut paths);
                uses.extend(paths);
                uses.extend(idents);
            }
            (defs, uses, extra_defs, SmallVec::new(), SmallVec::new())
        }

        // everything else – no definition, but may read vars
        _ => {
            let mut idents = Vec::new();
            let mut paths = Vec::new();
            collect_idents_with_paths(ast, code, &mut idents, &mut paths);
            let mut uses = paths;
            uses.extend(idents);
            (None, uses, vec![], SmallVec::new(), SmallVec::new())
        }
    }
}

/// One match from [`extract_shell_array_payload_idents`].
///
/// `arg_position` is the index of the argument among the call's named
/// argument children where the shell-array literal was found.
/// `payload_idents` is the union of identifiers (and dotted paths) lifted
/// from the array's payload elements (positions 2+ for the POSIX
/// `sh -c <payload>` form; positions 2+ for `cmd /c <payload>` likewise).
///
/// An empty `payload_idents` would mean the payload is a constant string,
/// which a consumer should treat as benign (no SHELL_ESCAPE finding
/// possible). The array matcher in this file returns no match at all for
/// such arrays, so matches produced there always carry at least one ident.
#[derive(Debug, Clone)]
pub(super) struct ShellArrayMatch {
    pub arg_position: usize,
    pub payload_idents: Vec<String>,
}
/// /// Recognises the pattern `[, "-c", ]` (POSIX shells) and /// `[, "/c"|"/C", ]` (Windows `cmd.exe`) appearing as /// either: /// * a direct positional argument of `call_node`, or /// * the value of any field within an object-literal positional argument /// (covers `container.exec({Cmd: ["bash", "-c", x]})` form). /// /// Returns one [`ShellArrayMatch`] per detected shell-array. Empty when the /// call has no shell-array literals. /// /// The shell-name list is intentionally narrow (POSIX shells + Windows /// `cmd.exe`/`powershell`) to avoid false positives on benign array literals /// like `["ls", "-la"]` or `["git", "rev-parse", "HEAD"]`, where element 0 is /// not a shell. Element 1 must be a literal `-c` (POSIX) or `/c`/`/C` (cmd); /// otherwise the array is not in shell-exec form regardless of element 0. /// /// Identifiers from elements at positions 2+ are lifted via /// [`collect_idents_with_paths`] so template-literal interpolations /// (`` `echo ${x}` ``), member-expressions (`obj.field`), and bare idents are /// all captured. Dedup is preserved across array elements so a single ident /// referenced in multiple payload positions appears once. pub(super) fn extract_shell_array_payload_idents( call_node: Node, code: &[u8], ) -> Vec { let mut out = Vec::new(); let Some(args_node) = call_node.child_by_field_name("arguments") else { return out; }; let mut cursor = args_node.walk(); for (idx, child) in args_node.named_children(&mut cursor).enumerate() { let kind = child.kind(); // Splats break positional indexing; bail conservatively on the whole call. if kind == "spread_element" || kind == "dictionary_splat" || kind == "list_splat" || kind == "splat_argument" || kind == "hash_splat_argument" { return Vec::new(); } if kind == "keyword_argument" || kind == "named_argument" { continue; } // Direct array-literal arg. 
if let Some(idents) = shell_array_payload_idents_of(child, code) { out.push(ShellArrayMatch { arg_position: idx, payload_idents: idents, }); continue; } // Object-literal arg whose field value is a shell-array literal. // Covers `container.exec({Cmd: [...]})` form. Field name is not // restricted to `Cmd` / `cmd`: the shell-shape itself is the gate, // and the payload extraction is per-array. if matches!(kind, "object" | "dictionary") { let mut cc = child.walk(); for pair in child.named_children(&mut cc) { if pair.kind() != "pair" { continue; } let Some(val_node) = pair.child_by_field_name("value") else { continue; }; let val_node = unwrap_parens(val_node); if let Some(idents) = shell_array_payload_idents_of(val_node, code) { out.push(ShellArrayMatch { arg_position: idx, payload_idents: idents, }); } } } } out } /// If `node` is an array literal of shape `[, "-c", *]` (POSIX shells) /// or `[, "/c", *]` (Windows cmd.exe), return the identifiers /// referenced in the payload elements (positions 2+). Otherwise return /// `None`. Returning `Some(vec![])` means the payload is a constant string /// — caller should still skip emitting a sink (no taint can reach a literal). fn shell_array_payload_idents_of(node: Node, code: &[u8]) -> Option> { let node = unwrap_parens(node); if node.kind() != "array" { return None; } // Walk named children to skip commas and other trivia. let mut cursor = node.walk(); let elems: Vec = node.named_children(&mut cursor).collect(); if elems.len() < 3 { return None; } let shell = const_string_value(elems[0], code)?; if !is_known_shell(&shell) { return None; } let flag = const_string_value(elems[1], code)?; if !is_shell_command_flag(&shell, &flag) { return None; } // Lift identifiers from the payload elements (positions 2+). Constants // contribute nothing. An empty result means the entire payload is // statically benign. let mut idents: Vec = Vec::new(); let mut paths: Vec = Vec::new(); for elem in &elems[2..] 
{ collect_idents_with_paths(*elem, code, &mut idents, &mut paths); } let mut combined = paths; combined.extend(idents); // Dedup (preserve first-seen order). let mut seen = std::collections::HashSet::new(); combined.retain(|s| seen.insert(s.clone())); if combined.is_empty() { // Static payload — no taint can reach it. Return None so the caller // does not emit a useless sink filter. return None; } Some(combined) } /// Extract a constant string value from `node`, handling JS/TS `string` / /// `template_string` (no interpolation) forms. Returns `None` for dynamic /// values, identifiers, or expressions. fn const_string_value(node: Node, code: &[u8]) -> Option { let node = unwrap_parens(node); match node.kind() { "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { let raw = text_of(node, code)?; if raw.len() >= 2 { Some(raw[1..raw.len() - 1].to_string()) } else { None } } "template_string" => { let mut c = node.walk(); if node .named_children(&mut c) .any(|ch| ch.kind() == "template_substitution") { return None; } let raw = text_of(node, code)?; if raw.len() >= 2 { Some(raw[1..raw.len() - 1].to_string()) } else { None } } _ => None, } } /// Known shell executable names that activate the shell-array detector. /// Scoped narrowly to POSIX shells + Windows command interpreters, listing /// only canonical names so benign arrays like `["ls", ...]`, `["git", ...]`, /// or `["python", ...]` do not match. fn is_known_shell(name: &str) -> bool { // Strip directory prefix for matching: `/bin/bash` → `bash`. let leaf = name.rsplit('/').next().unwrap_or(name); matches!( leaf, "bash" | "sh" | "zsh" | "dash" | "ksh" | "fish" | "ash" | "tcsh" | "csh" | "cmd" | "cmd.exe" | "powershell" | "powershell.exe" | "pwsh" | "pwsh.exe" ) } /// True when `flag` is the "execute the following string as a shell command" /// switch for the given `shell`. 
/// True when `flag` is the "execute the following string as a shell command"
/// switch for the given `shell`.
///
/// * POSIX shells: exactly `-c`.
/// * `cmd` / `cmd.exe`: `/c`, `/C`, `/k`, `/K`.
/// * PowerShell (`powershell`, `pwsh`, with or without `.exe`): `-c`,
///   `-Command`, `-EncodedCommand`, compared ignoring ASCII case because
///   PowerShell parameter names are case-insensitive.
///
/// Only the final path component of `shell` is inspected. Both `/` and `\`
/// count as separators and the name is compared ignoring ASCII case, so
/// `C:\Windows\System32\cmd.exe` and `PowerShell.exe` select the right rule.
/// Any name that is neither cmd nor PowerShell gets the POSIX rule.
fn is_shell_command_flag(shell: &str, flag: &str) -> bool {
    const POWERSHELL_FLAGS: [&str; 3] = ["-c", "-command", "-encodedcommand"];

    let leaf = shell
        .rsplit(['/', '\\'])
        .next()
        .unwrap_or(shell)
        .to_ascii_lowercase();
    match leaf.as_str() {
        "cmd" | "cmd.exe" => matches!(flag, "/c" | "/C" | "/k" | "/K"),
        "powershell" | "powershell.exe" | "pwsh" | "pwsh.exe" => POWERSHELL_FLAGS
            .iter()
            .any(|known| flag.eq_ignore_ascii_case(known)),
        // POSIX shells.
        _ => flag == "-c",
    }
}