nyx/src/cfg/literals.rs

2815 lines
116 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use super::conditions::unwrap_parens;
use super::helpers::{collect_array_pattern_bindings_indexed, collect_rhs_array_literal_elements};
use super::{
anon_fn_name, collect_idents, collect_idents_with_paths, find_constructor_type_child,
first_call_ident, root_receiver_text, text_of,
};
use crate::labels::{Cap, Kind, lookup};
use smallvec::SmallVec;
use tree_sitter::Node;
/// Locate the call node (`CallFn` / `CallMethod` / `CallMacro`) carried by `n`.
///
/// * A node that is itself a call is returned unchanged.
/// * An await/yield wrapper (`Kind::AwaitForward`) is transparent: its
///   children are searched recursively and the first hit wins.
/// * Any other node is treated as a wrapper: all direct children are probed
///   first, then all grandchildren. A probed `Kind::AwaitForward` node does
///   not consume a level — the search continues inside it — so
///   `const x = await foo(y)` still reaches the inner `call_expression`
///   (`lexical_declaration > variable_declarator > await_expression >
///   call_expression`).
///
/// Returns `None` when no call is found within that search window.
pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
    // A candidate is a hit when it is a call itself, or an await/yield
    // wrapper that (recursively) contains one.
    fn probe<'t>(candidate: Node<'t>, lang: &str) -> Option<Node<'t>> {
        match lookup(lang, candidate.kind()) {
            Kind::CallFn | Kind::CallMethod | Kind::CallMacro => Some(candidate),
            Kind::AwaitForward => find_call_node(candidate, lang),
            _ => None,
        }
    }

    match lookup(lang, n.kind()) {
        Kind::CallFn | Kind::CallMethod | Kind::CallMacro => return Some(n),
        Kind::AwaitForward => {
            // Transparent wrapper: descend into the awaited expression.
            let mut walker = n.walk();
            let awaited = n
                .children(&mut walker)
                .find_map(|child| find_call_node(child, lang));
            return awaited;
        }
        _ => {}
    }

    // Level 1: direct children.
    let mut walker = n.walk();
    let direct = n.children(&mut walker).find_map(|child| probe(child, lang));
    if direct.is_some() {
        return direct;
    }

    // Level 2: grandchildren (handles
    // `expression_statement > variable_declarator > call`).
    let mut outer_walker = n.walk();
    for child in n.children(&mut outer_walker) {
        let mut inner_walker = child.walk();
        let nested = child
            .children(&mut inner_walker)
            .find_map(|grandchild| probe(grandchild, lang));
        if nested.is_some() {
            return nested;
        }
    }
    None
}
/// Extract `(field_name, ident_name)` pairs from specified fields of an
/// object-literal argument.
///
/// Returns:
/// * `Some(pairs)` if the positional argument at `arg_index` IS an object
///   literal (`object` / `dictionary` node). Each pair is
///   `(field_name, ident_name)` where `field_name` is the matched key from
///   `fields` and `ident_name` is an identifier (or path) lifted from that
///   pair's value expression. An identifier is recorded at most once across
///   all fields. When no destination-field pairs are present the result is
///   `Some(vec![])`: the sink is effectively silenced because no destination
///   identifier exists.
/// * `None` if `fields` is empty, the arg is absent, is not an object literal
///   (plain string / ident / expression), or has splat/spread children that
///   break static per-field reasoning. Callers fall back to the whole-arg
///   positional filter in this case.
pub(super) fn extract_destination_field_pairs(
    call_node: Node,
    arg_index: usize,
    fields: &[&str],
    code: &[u8],
) -> Option<Vec<(String, String)>> {
    // Drop the first and last byte (the quote / backtick delimiters) of a
    // literal's source text. Uses checked slicing so malformed or truncated
    // literals whose offsets are not char boundaries fall back to the raw
    // text instead of panicking.
    fn strip_delimiters(raw: String) -> String {
        let inner = raw
            .len()
            .checked_sub(1)
            .and_then(|end| raw.get(1..end))
            .map(str::to_string);
        inner.unwrap_or(raw)
    }

    // Resolve the statically-known text of a `pair` key, or `None` when the
    // key is dynamic / unreadable (the caller then skips the pair).
    fn static_key_text(key_node: Node, code: &[u8]) -> Option<String> {
        match key_node.kind() {
            // Strip quotes from string-literal keys so `"url"` and `url`
            // both match the configured field list.
            "string" | "string_literal" => text_of(key_node, code).map(strip_delimiters),
            // Computed keys: resolve only when the inner expression is a
            // pure string literal (`['url']`) or a template string without
            // interpolation. Dynamic forms (`[someVar]`, `` [`url-${i}`] ``)
            // stay conservative-skip.
            "computed_property_name" => {
                let mut cursor = key_node.walk();
                let inner = key_node.named_children(&mut cursor).find(|c| {
                    !matches!(c.kind(), "comment" | "block_comment" | "line_comment")
                })?;
                match inner.kind() {
                    "string" | "string_literal" => text_of(inner, code).map(strip_delimiters),
                    "template_string" => {
                        let mut tc = inner.walk();
                        let interpolated = inner
                            .named_children(&mut tc)
                            .any(|c| c.kind() == "template_substitution");
                        if interpolated {
                            None
                        } else {
                            text_of(inner, code).map(strip_delimiters)
                        }
                    }
                    _ => None,
                }
            }
            // Bare identifier keys (and anything else) match on their text.
            _ => text_of(key_node, code),
        }
    }

    if fields.is_empty() {
        return None;
    }
    let args = call_node.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    let arg = args.named_children(&mut cursor).nth(arg_index)?;
    // Only object / dict literal forms carry per-field destination semantics.
    // For anything else (identifier, member expression, string, call), return
    // None so the caller treats the whole arg as destination.
    if !matches!(arg.kind(), "object" | "dictionary") {
        return None;
    }
    let mut out: Vec<(String, String)> = Vec::new();
    let mut c = arg.walk();
    for child in arg.named_children(&mut c) {
        match child.kind() {
            // `spread_element` / `dictionary_splat`: spread contents cannot
            // be attributed to specific fields statically, so bail out and
            // let the caller fall back to the whole-arg filter.
            "spread_element" | "dictionary_splat" => return None,
            // Shorthand property `{ url }` binds the `url` field to a binding
            // also named `url`. Treat as destination iff the name matches.
            "shorthand_property_identifier" | "shorthand_property_identifier_pattern" => {
                let Some(name) = text_of(child, code) else {
                    continue;
                };
                if fields.iter().any(|&f| f == name) && !out.iter().any(|(_, v)| v == &name) {
                    out.push((name.clone(), name));
                }
            }
            "pair" => {
                let Some(key) = child
                    .child_by_field_name("key")
                    .and_then(|key_node| static_key_text(key_node, code))
                else {
                    continue;
                };
                if !fields.iter().any(|&f| f == key) {
                    continue;
                }
                let Some(val_node) = child.child_by_field_name("value") else {
                    continue;
                };
                let mut idents: Vec<String> = Vec::new();
                let mut paths: Vec<String> = Vec::new();
                collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
                // Paths first, then bare identifiers; skip names already
                // attributed to an earlier field.
                for name in paths.into_iter().chain(idents) {
                    if !out.iter().any(|(_, v)| v == &name) {
                        out.push((key.clone(), name));
                    }
                }
            }
            _ => {}
        }
    }
    Some(out)
}
/// Collect `(field_name, ident_name)` pairs from keyword-style arguments of a
/// call whose keyword name is listed in `fields`.
///
/// Two argument shapes under the call's `arguments` node are recognised:
/// * `keyword_argument` / `named_argument` — e.g. Python
///   `requests.post(url, data=tainted, json=safe)`. The `name` / `value`
///   fields are preferred; when absent, the first / last named child is used.
/// * `pair` — e.g. Ruby `Faraday.new(url: x)`, where the `key` / `value`
///   fields are read directly.
///
/// The keyword text is normalised by stripping leading `:` characters and
/// surrounding `"` / `'` quotes before it is compared against `fields`.
///
/// Returns the union of identifiers (paths first, then bare identifiers)
/// found in matching keyword values, each tagged with its keyword name. An
/// identifier is recorded only once. The result is empty when `fields` is
/// empty, the call has no `arguments` field, or nothing matches.
pub(super) fn extract_destination_kwarg_pairs(
    call_node: Node,
    fields: &[&str],
    code: &[u8],
) -> Vec<(String, String)> {
    let mut pairs: Vec<(String, String)> = Vec::new();
    if fields.is_empty() {
        return pairs;
    }
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return pairs;
    };

    let mut walker = arg_list.walk();
    for entry in arg_list.named_children(&mut walker) {
        let (key_node, val_node) = match entry.kind() {
            "keyword_argument" | "named_argument" => {
                let last_index = entry.named_child_count().saturating_sub(1) as u32;
                (
                    entry
                        .child_by_field_name("name")
                        .or_else(|| entry.named_child(0)),
                    entry
                        .child_by_field_name("value")
                        .or_else(|| entry.named_child(last_index)),
                )
            }
            // `pair` directly under the argument list: kwarg-style call
            // args such as `f(url: x)`.
            "pair" => (
                entry.child_by_field_name("key"),
                entry.child_by_field_name("value"),
            ),
            _ => continue,
        };
        let (Some(key_node), Some(val_node)) = (key_node, val_node) else {
            continue;
        };
        let Some(raw_key) = text_of(key_node, code) else {
            continue;
        };

        // `:url`, `"url"` and `'url'` all normalise to `url`.
        let field = raw_key
            .trim_start_matches(':')
            .trim_matches(['"', '\'']);
        if !fields.iter().any(|wanted| *wanted == field) {
            continue;
        }

        let mut idents = Vec::new();
        let mut paths = Vec::new();
        collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
        for candidate in paths.into_iter().chain(idents) {
            if pairs.iter().all(|(_, seen)| seen != &candidate) {
                pairs.push((field.to_string(), candidate));
            }
        }
    }
    pairs
}
/// True when `call_node` is `Object.create(null)`, where the single argument
/// may additionally be wrapped in parentheses, `await`, or a TS type cast
/// (`null as T`, `<T>null`).
///
/// The callee text must be exactly `Object.create` and the call must have
/// exactly one named argument. Strict literal-`null` match: no aliasing
/// through intermediate variables. Caller restricts to JS/TS.
pub(super) fn is_object_create_null_call(call_node: Node, code: &[u8]) -> bool {
    if call_node.kind() != "call_expression" {
        return false;
    }
    let callee = call_node
        .child_by_field_name("function")
        .and_then(|f| text_of(f, code))
        .unwrap_or_default();
    if callee != "Object.create" {
        return false;
    }
    let Some(args) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    let mut cursor = args.walk();
    let named: Vec<Node> = args.named_children(&mut cursor).collect();
    if named.len() != 1 {
        return false;
    }
    let mut arg = named[0];
    // Peel at most four wrapper layers (parens / await / type casts).
    for _ in 0..4 {
        let inner = match arg.kind() {
            // `(expr)` and `expr as T`: the wrapped expression comes first.
            "parenthesized_expression" | "as_expression" => arg.named_child(0),
            // Prefer an `argument` field; fall back to the first named child
            // for grammars whose `await_expression` exposes no such field.
            "await_expression" => arg
                .child_by_field_name("argument")
                .or_else(|| arg.named_child(0)),
            // `<T>expr`: the asserted expression follows the `type_arguments`
            // node, so it is the LAST named child, not the first.
            "type_assertion" => {
                let mut tc = arg.walk();
                let last = arg.named_children(&mut tc).last();
                last
            }
            _ => None,
        };
        match inner {
            Some(next) => arg = next,
            None => break,
        }
    }
    arg.kind() == "null" || text_of(arg, code).as_deref() == Some("null")
}
/// Extract the constant literal content at argument position `index`
/// (0-based).
///
/// Recognised shapes:
/// * plain string literals (`string`, `string_literal`,
///   `interpreted_string_literal`, `raw_string_literal`) — returned with the
///   first and last byte (the delimiters) removed;
/// * `true` / `false` nodes — returned as the keyword text;
/// * `encapsed_string` — returned as the concatenation of its
///   `string_content` / `escape_sequence` segments, only when no other
///   (interpolating) child is present;
/// * `template_string` without `template_substitution` children;
/// * `binary_expression` whose `left` operand is one of the string shapes
///   above — the left-most literal is returned so prefix-driven gates can
///   activate on partially-dynamic concatenations.
///
/// An `argument` wrapper node with exactly one named child is unwrapped
/// first. Returns `None` if the call has no `arguments` field, the index is
/// out of range, or the argument is not one of the shapes above.
pub(super) fn extract_const_string_arg(
    call_node: Node,
    index: usize,
    code: &[u8],
) -> Option<String> {
    // Literal text minus its first and last byte (the delimiters). Checked
    // slicing: returns `None` for texts shorter than two bytes, and also for
    // malformed literals whose offsets are not char boundaries, where plain
    // indexing would panic.
    fn literal_body(raw: &str) -> Option<String> {
        let end = raw.len().checked_sub(1)?;
        raw.get(1..end).map(str::to_string)
    }

    // Concatenate the literal segments of an `encapsed_string`; any other
    // child kind (an interpolation) makes the string dynamic.
    fn encapsed_literal(node: Node, code: &[u8]) -> Option<String> {
        let mut cursor = node.walk();
        let mut buf = String::new();
        for segment in node.named_children(&mut cursor) {
            match segment.kind() {
                "string_content" | "escape_sequence" => {
                    if let Some(text) = text_of(segment, code) {
                        buf.push_str(&text);
                    }
                }
                _ => return None,
            }
        }
        Some(buf)
    }

    fn is_plain_string_kind(kind: &str) -> bool {
        matches!(
            kind,
            "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal"
        )
    }

    let args = call_node.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    let mut arg = args.named_children(&mut cursor).nth(index)?;
    // Some grammars wrap each positional argument in an `argument` node;
    // unwrap so the kind-match below sees the inner literal.
    if arg.kind() == "argument" && arg.named_child_count() == 1 {
        if let Some(inner) = arg.named_child(0) {
            arg = inner;
        }
    }
    match arg.kind() {
        kind if is_plain_string_kind(kind) => literal_body(&text_of(arg, code)?),
        // Boolean literals are returned verbatim so `dangerous_values`
        // matching can detect deep-flag forms like `extend(true, target, src)`.
        "true" | "false" => Some(arg.kind().to_string()),
        "encapsed_string" => encapsed_literal(arg, code),
        "template_string" => {
            // Constant only when there is no interpolation.
            let mut c = arg.walk();
            let interpolated = arg
                .named_children(&mut c)
                .any(|ch| ch.kind() == "template_substitution");
            if interpolated {
                return None;
            }
            literal_body(&text_of(arg, code)?)
        }
        // Concat-style binary expression with a leading string literal, e.g.
        // `"Location: " . $url` or `"Location: " + url`. Falls through to
        // `None` when the leading segment is not a string literal so exact
        // `dangerous_values` matching keeps its strict semantics.
        "binary_expression" => {
            let left = arg.child_by_field_name("left")?;
            match left.kind() {
                kind if is_plain_string_kind(kind) => literal_body(&text_of(left, code)?),
                "encapsed_string" => encapsed_literal(left, code),
                _ => None,
            }
        }
        _ => None,
    }
}
/// Read the symbolic constant (or integer literal) found at argument
/// position `index` (0-based) of `call_node`.
///
/// Intended for activation values that are preprocessor / `define`-style
/// symbols such as `CURLOPT_POSTFIELDS`, whose syntactic form is an
/// identifier-like node rather than a string.
///
/// Accepted argument shapes (after unwrapping an `argument` node that has
/// exactly one named child):
/// * `identifier`, `name`, `qualified_name`, `scoped_identifier`, `constant`
///   — the node text is returned verbatim;
/// * `scope_resolution` — only the `name` field (rightmost segment) is
///   returned so `dangerous_values` lists can stay identifier-bare; the full
///   text is used if that field is missing or unreadable;
/// * `integer`, `integer_literal`, `number_literal`,
///   `decimal_integer_literal` — the literal text is returned, so a plain `0`
///   compares unequal to identifier-only `dangerous_values` entries.
///
/// Every other shape yields `None`, which keeps dynamic-activation semantics
/// for runtime values (variables, expressions, calls).
pub(super) fn extract_const_macro_arg(
    call_node: Node,
    index: usize,
    code: &[u8],
) -> Option<String> {
    let arg_list = call_node.child_by_field_name("arguments")?;
    let mut walker = arg_list.walk();
    let positional = arg_list.named_children(&mut walker).nth(index)?;

    // Look through a single-child `argument` wrapper.
    let target = match positional.named_child(0) {
        Some(inner)
            if positional.kind() == "argument" && positional.named_child_count() == 1 =>
        {
            inner
        }
        _ => positional,
    };

    match target.kind() {
        "scope_resolution" => target
            .child_by_field_name("name")
            .and_then(|segment| text_of(segment, code))
            .or_else(|| text_of(target, code)),
        "identifier"
        | "name"
        | "qualified_name"
        | "scoped_identifier"
        | "constant"
        | "integer"
        | "integer_literal"
        | "number_literal"
        | "decimal_integer_literal" => text_of(target, code),
        _ => None,
    }
}
/// Return the literal text of the keyword argument named `keyword_name`
/// (e.g. Python `shell=True`).
///
/// Only the first `keyword_argument` / `named_argument` child whose `name`
/// field text equals `keyword_name` is considered. Its `value` must be a
/// literal node (`true`, `false`, `none`, `integer`, `float`, `string`,
/// `string_literal`) or an `identifier` spelling `True` / `False` / `None`.
///
/// Returns `None` when the call has no `arguments` field, no such keyword
/// exists, the keyword has no `value` field, or the value is dynamic
/// (identifier, call, complex expression) — so a gate can tell literal-safe
/// from dynamic.
pub(super) fn extract_const_keyword_arg(
    call_node: Node,
    keyword_name: &str,
    code: &[u8],
) -> Option<String> {
    let arg_list = call_node.child_by_field_name("arguments")?;
    let mut walker = arg_list.walk();
    let keyword = arg_list.named_children(&mut walker).find(|candidate| {
        matches!(candidate.kind(), "keyword_argument" | "named_argument")
            && candidate
                .child_by_field_name("name")
                .and_then(|name| text_of(name, code))
                .is_some_and(|name| name == keyword_name)
    })?;

    let value = keyword.child_by_field_name("value")?;
    match value.kind() {
        "true" | "false" | "none" | "integer" | "float" | "string" | "string_literal" => {
            text_of(value, code)
        }
        // An identifier only counts as a literal when it spells one of the
        // Python constants.
        "identifier" => {
            text_of(value, code).filter(|text| matches!(text.as_str(), "True" | "False" | "None"))
        }
        _ => None,
    }
}
/// Report whether the call carries a `keyword_argument` / `named_argument`
/// whose `name` field text equals `keyword_name`, regardless of what the
/// value is.
///
/// Lets gated-sink classification tell an absent kwarg (language default)
/// from a present-but-dynamic one (conservative). Returns `false` when the
/// call has no `arguments` field.
pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8]) -> bool {
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    let mut walker = arg_list.walk();
    let present = arg_list
        .named_children(&mut walker)
        .filter(|entry| matches!(entry.kind(), "keyword_argument" | "named_argument"))
        .filter_map(|entry| entry.child_by_field_name("name"))
        .any(|name| text_of(name, code).as_deref() == Some(keyword_name));
    present
}
/// Extract the literal value of a property `prop_name` from the object
/// literal at positional argument `arg_index`. Returns `None` if the
/// arg is absent, is not an object literal, the prop key isn't found,
/// or the prop value isn't a literal (so callers can distinguish
/// "present but dynamic" from "absent" only via [`has_object_arg_property`]).
///
/// Used by "options object as kwargs" gates — e.g.
/// `_.template(tpl, { evaluate: false })` — where the safe-flag lives
/// in an inline object literal rather than as a dedicated kwarg node.
/// Strict-additive: returns `None` for any non-object shape, including bare
/// identifiers passed as the options arg, so the gate falls back to the
/// conservative dynamic branch.
///
/// Only `pair` children are inspected; computed keys are skipped. The first
/// pair whose key matches decides the result. Parentheses around the
/// argument and around the value are looked through.
pub(super) fn extract_object_arg_property(
    call_node: Node,
    arg_index: usize,
    prop_name: &str,
    code: &[u8],
) -> Option<String> {
    // Drop the surrounding quote bytes of a string-literal key. Checked
    // slicing: malformed literals whose offsets are not char boundaries fall
    // back to the raw text instead of panicking.
    fn strip_delimiters(raw: String) -> String {
        let inner = raw
            .len()
            .checked_sub(1)
            .and_then(|end| raw.get(1..end))
            .map(str::to_string);
        inner.unwrap_or(raw)
    }

    let args = call_node.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    let arg = unwrap_parens(args.named_children(&mut cursor).nth(arg_index)?);
    if !matches!(arg.kind(), "object" | "dictionary") {
        return None;
    }
    let mut c = arg.walk();
    for child in arg.named_children(&mut c) {
        if child.kind() != "pair" {
            continue;
        }
        let Some(key_node) = child.child_by_field_name("key") else {
            continue;
        };
        let key_text = match key_node.kind() {
            // `"evaluate"` and `evaluate` must both match `prop_name`.
            "string" | "string_literal" => text_of(key_node, code).map(strip_delimiters),
            "computed_property_name" => continue,
            _ => text_of(key_node, code),
        };
        if key_text.as_deref() != Some(prop_name) {
            continue;
        }
        let val_node = unwrap_parens(child.child_by_field_name("value")?);
        return match val_node.kind() {
            "true" | "false" | "null" | "undefined" | "number" | "string" | "string_literal" => {
                text_of(val_node, code)
            }
            // Accept keyword-like literals that surface as identifiers;
            // any other identifier is a dynamic value.
            "identifier" => text_of(val_node, code)
                .filter(|s| matches!(s.as_str(), "true" | "false" | "null" | "undefined")),
            _ => None,
        };
    }
    None
}
/// Return `true` if the call node's positional arg at `arg_index` is an
/// object literal containing a property named `prop_name` (whether the
/// value is a literal or a dynamic expression). Used alongside
/// [`extract_object_arg_property`] so gated-sink classification can
/// distinguish "options key absent" (language default) from "options
/// key present with dynamic value" (conservative dangerous).
///
/// Both shorthand properties (`{ prop }`) and `pair` entries count; computed
/// keys are skipped. Parentheses around the argument are looked through.
/// Returns `false` when the call has no `arguments` field, the index is out
/// of range, or the argument is not an `object` / `dictionary` node.
pub(super) fn has_object_arg_property(
    call_node: Node,
    arg_index: usize,
    prop_name: &str,
    code: &[u8],
) -> bool {
    // Drop the surrounding quote bytes of a string-literal key. Checked
    // slicing: malformed literals whose offsets are not char boundaries fall
    // back to the raw text instead of panicking.
    fn strip_delimiters(raw: String) -> String {
        let inner = raw
            .len()
            .checked_sub(1)
            .and_then(|end| raw.get(1..end))
            .map(str::to_string);
        inner.unwrap_or(raw)
    }

    let Some(args) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    let mut cursor = args.walk();
    let Some(arg) = args.named_children(&mut cursor).nth(arg_index) else {
        return false;
    };
    let arg = unwrap_parens(arg);
    if !matches!(arg.kind(), "object" | "dictionary") {
        return false;
    }
    let mut c = arg.walk();
    for child in arg.named_children(&mut c) {
        match child.kind() {
            "shorthand_property_identifier" | "shorthand_property_identifier_pattern"
                if text_of(child, code).as_deref() == Some(prop_name) =>
            {
                return true;
            }
            "pair" => {
                let Some(key_node) = child.child_by_field_name("key") else {
                    continue;
                };
                let key_text = match key_node.kind() {
                    // `"evaluate"` and `evaluate` must both match `prop_name`.
                    "string" | "string_literal" => {
                        text_of(key_node, code).map(strip_delimiters)
                    }
                    "computed_property_name" => continue,
                    _ => text_of(key_node, code),
                };
                if key_text.as_deref() == Some(prop_name) {
                    return true;
                }
            }
            _ => {}
        }
    }
    false
}
/// Describe the first named argument of `call_node`: its tree-sitter `kind()`
/// and whether its subtree contains an interpolation node (as decided by
/// `subtree_has_interpolation`).
///
/// Parentheses around the argument are looked through, so `(arg0)` is
/// reported as `arg0`. Returns `None` when the call has no `arguments` field
/// or the argument list has no named children.
///
/// Used by per-language shape-aware sink suppression — for example, Ruby
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
/// regardless of taint reaching that argument.
pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bool)> {
    let arg_list = call_node.child_by_field_name("arguments")?;
    let mut walker = arg_list.walk();
    let first = unwrap_parens(arg_list.named_children(&mut walker).next()?);
    let shape = first.kind().to_string();
    let interpolated = subtree_has_interpolation(first);
    Some((shape, interpolated))
}
/// Search a Java method chain, from the outermost call inwards along the
/// receiver (`object` field), for a `method_invocation` whose `name` is one
/// of `target_methods` (e.g. `createQuery`, `prepareStatement`), and return
/// the node kind of that call's first argument (parentheses stripped).
///
/// This lets a caller verify that the SQL-bearing call up-chain was given a
/// string literal rather than a concatenation / method call.
///
/// Conservative: the walk follows only the receiver spine and never enters
/// the arguments of other calls. `None` is returned when the chain holds no
/// matching call, or when the first matching call has no `arguments` field
/// or no named argument.
pub(super) fn java_chain_arg0_kind_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<String> {
    let call = unwrap_parens(expr);
    if call.kind() != "method_invocation" {
        return None;
    }

    let is_target = call
        .child_by_field_name("name")
        .and_then(|name_node| text_of(name_node, code))
        .is_some_and(|name| target_methods.iter().any(|m| *m == name));
    if is_target {
        let arg_list = call.child_by_field_name("arguments")?;
        let mut walker = arg_list.walk();
        let first = arg_list.named_children(&mut walker).next()?;
        return Some(unwrap_parens(first).kind().to_string());
    }

    // Not the call we want: continue with its receiver.
    let receiver = call.child_by_field_name("object")?;
    java_chain_arg0_kind_for_method(receiver, target_methods, code)
}
/// Walk a Ruby method chain receiver-side looking for the inner call whose
/// method identifier matches one of `target_methods`, then return that
/// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node
/// represents a chained expression like `Model.where(...).preload(...).to_a`:
/// the outermost call (`to_a`) has no arguments, so the shape suppressor
/// must reach down the chain to inspect `where`'s arg 0.
///
/// Search order for a `call` node: the node itself, then its receiver
/// (`receiver` field, falling back to `object`), then its remaining named
/// children (arguments, blocks, …). Non-`call` nodes are searched through
/// their named children so wrappers such as assignments are looked through.
///
/// Conservative: returns `None` if the chain doesn't contain a matching
/// method, so callers fall through to the no-suppression path. A matching
/// call without a first argument also yields `None`.
pub(super) fn ruby_chain_arg0_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<(String, bool)> {
    let n = unwrap_parens(expr);

    // Id of the receiver subtree that has already been searched (if any).
    let mut searched_receiver: Option<usize> = None;
    if n.kind() == "call" {
        let is_target = n
            .child_by_field_name("method")
            .and_then(|method| text_of(method, code))
            .is_some_and(|name| target_methods.iter().any(|m| *m == name));
        if is_target {
            return arg0_kind_and_interpolation(n);
        }
        // Recurse into the receiver chain (`call.receiver` → next call up).
        let receiver = n
            .child_by_field_name("receiver")
            .or_else(|| n.child_by_field_name("object"));
        if let Some(recv) = receiver {
            if let Some(found) = ruby_chain_arg0_for_method(recv, target_methods, code) {
                return Some(found);
            }
            searched_receiver = Some(recv.id());
        }
    }

    // Also descend into named children to handle wrapping (assignment RHS,
    // begin-end blocks, etc.). The receiver was searched exhaustively above
    // and found nothing, so it is skipped here: walking it a second time
    // cannot change the result but doubles the work at every chain level.
    let mut cursor = n.walk();
    for c in n.named_children(&mut cursor) {
        if searched_receiver == Some(c.id()) {
            continue;
        }
        if let Some(found) = ruby_chain_arg0_for_method(c, target_methods, code) {
            return Some(found);
        }
    }
    None
}
/// Depth-first check over named children: `true` when `n` or any named
/// descendant is an `interpolation` / `string_interpolation` node.
fn subtree_has_interpolation(n: Node) -> bool {
    if matches!(n.kind(), "interpolation" | "string_interpolation") {
        return true;
    }
    let mut walker = n.walk();
    for child in n.named_children(&mut walker) {
        if subtree_has_interpolation(child) {
            return true;
        }
    }
    false
}
/// Follow a JS/TS call chain towards its receiver and find the first
/// `call_expression` whose callee names one of `target_methods` (e.g.
/// `query`, `execute`). Returns the `(kind, has_interp)` of that call's
/// arg 0 via [`arg0_kind_and_interpolation`].
///
/// A callee matches a target `m` when its `property` field text is `m`, when
/// the text after its last `.` is `m`, when its full text is `m`, or when
/// its full text ends with `.m`.
///
/// This recognises ORM-accessor chains where a labelled SQL sink sits on the
/// receiver side of a parameterised execute method, e.g.
/// `strapi.db.query('admin::api-token').findOne({...})`: the outer call
/// (`findOne`) is the CFG node, while the inner `db.query` call carries the
/// literal model UID.
///
/// Conservative: returns `None` when `expr` (after stripping parentheses) is
/// not a `call_expression`, or when no matching call is found on the
/// receiver spine, so callers fall through to the no-suppression path.
pub(super) fn js_chain_arg0_kind_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<(String, bool)> {
    let call = unwrap_parens(expr);
    if call.kind() != "call_expression" {
        return None;
    }
    let callee = call.child_by_field_name("function")?;

    // Candidate spellings of the callee name.
    let property = callee
        .child_by_field_name("property")
        .and_then(|p| text_of(p, code));
    let whole = text_of(callee, code);
    let leaf = whole
        .as_deref()
        .map(|s| s.rsplit('.').next().unwrap_or(s));

    let hit = target_methods.iter().any(|&m| {
        property.as_deref() == Some(m)
            || leaf == Some(m)
            || whole.as_deref() == Some(m)
            || whole
                .as_deref()
                .is_some_and(|s| s.ends_with(&format!(".{m}")))
    });
    if hit {
        return arg0_kind_and_interpolation(call);
    }

    // Not this call: its callee's `object` is the previous link in the chain.
    let receiver = callee.child_by_field_name("object")?;
    js_chain_arg0_kind_for_method(receiver, target_methods, code)
}
/// Find the method that is invoked directly on the result of an inner call
/// to one of `target_inner` (e.g. `query`, `execute`) within a JS/TS call
/// chain.
///
/// Starting at `outer`, the receiver spine (`function.object`) is followed
/// for as long as the receiver is itself a `call_expression`. At the first
/// link whose receiver call names a `target_inner` method, the `property`
/// text of the current callee is returned — e.g. `findOne` for
/// `db.query(LITERAL).findOne(...)`.
///
/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain
/// shape `<inner>.query(LITERAL).<orm_method>(...)`. A bare
/// `connection.query("SELECT ...")` returns `None` because there is no
/// outer chain method. `None` is also returned when `outer` is not a
/// `call_expression`, its callee has no `object` field, or the spine ends
/// before a target call is found.
pub(super) fn js_chain_outer_method_for_inner<'a>(
    outer: Node<'a>,
    target_inner: &[&str],
    code: &'a [u8],
) -> Option<String> {
    let call = unwrap_parens(outer);
    if call.kind() != "call_expression" {
        return None;
    }
    let callee = call.child_by_field_name("function")?;
    let receiver = callee.child_by_field_name("object")?;
    if receiver.kind() != "call_expression" {
        return None;
    }

    if let Some(inner_callee) = receiver.child_by_field_name("function") {
        let property = inner_callee
            .child_by_field_name("property")
            .and_then(|p| text_of(p, code));
        let whole = text_of(inner_callee, code);
        let leaf = whole
            .as_deref()
            .map(|s| s.rsplit('.').next().unwrap_or(s));

        let receiver_is_target = target_inner.iter().any(|&m| {
            property.as_deref() == Some(m)
                || leaf == Some(m)
                || whole.as_deref() == Some(m)
                || whole
                    .as_deref()
                    .is_some_and(|s| s.ends_with(&format!(".{m}")))
        });
        if receiver_is_target {
            // The current callee's property is the method chained onto the
            // target call.
            return callee
                .child_by_field_name("property")
                .and_then(|p| text_of(p, code));
        }
    }

    // The chain may be deeper (`a.b().c().d()`): keep walking inwards.
    js_chain_outer_method_for_inner(receiver, target_inner, code)
}
/// Resolve the innermost call of a chained call expression, together with
/// that call's callee text (e.g. `"http.get"`).
///
/// For `a.b().c().d()` the receiver chain (`function.object`) is followed
/// downwards. Three shapes are handled:
///
/// * **Direct double call** (`f()(x)`, e.g. lodash-style
///   `_.template(t)(data)`): the outer call's `function` field is itself a
///   call, so that call, or whatever it chains down to, is the result.
/// * **Receiver is a chained method call**
///   (`axios.get(u).then(h).catch(h)`): resolution continues into the
///   receiver, yielding `axios.get`.
/// * **Receiver is a plain call** (Rust `HttpResponse::Found()`, JS bare
///   `f()`): descending further would land on a non-method leaf, so `outer`
///   itself is returned with its own callee text.
///
/// Returns `None` when:
/// * `outer` is not a CallFn / CallMethod node,
/// * `outer` has no `function` / `method` field,
/// * the callee has no `object` / `value` receiver, or that receiver is not
///   itself a call (i.e. there is no chain), or
/// * the text of the selected callee cannot be read.
///
/// All whitespace (including `\r` in CRLF sources) is removed from the
/// returned text so multi-line chains such as `http\n  .get(uri, ...)`
/// produce the same `"http.get"` key as the single-line form.
///
/// Motivated by CVE-2025-64430 (Parse Server SSRF via
/// `http.get(uri, cb).on('error', e => ...)`). Without this, the outer
/// `.on(...)` call swallows classification of the inner gated sink.
pub(super) fn find_chained_inner_call<'a>(
    outer: Node<'a>,
    lang: &str,
    code: &[u8],
) -> Option<(Node<'a>, String)> {
    /// True when `node` is classified as a function or method call.
    fn is_call(node: Node<'_>, lang: &str) -> bool {
        matches!(lookup(lang, node.kind()), Kind::CallFn | Kind::CallMethod)
    }

    /// Source text of `node` with every whitespace character dropped.
    fn compact_text(node: Node<'_>, code: &[u8]) -> Option<String> {
        let raw = text_of(node, code)?;
        Some(raw.split_whitespace().collect())
    }

    /// Resolve the chain below `call`; when nothing deeper exists, `call`
    /// is the innermost call and is paired with its own callee text.
    fn innermost_from<'t>(call: Node<'t>, lang: &str, code: &[u8]) -> Option<(Node<'t>, String)> {
        if let Some(deeper) = find_chained_inner_call(call, lang, code) {
            return Some(deeper);
        }
        let callee = call
            .child_by_field_name("function")
            .or_else(|| call.child_by_field_name("method"))
            .or_else(|| call.child_by_field_name("name"))?;
        Some((call, compact_text(callee, code)?))
    }

    if !is_call(outer, lang) {
        return None;
    }
    let callee = outer
        .child_by_field_name("function")
        .or_else(|| outer.child_by_field_name("method"))?;

    // `f()(x)`: the callee is a call_expression rather than a member
    // expression, so there is no receiver field to follow.
    if is_call(callee, lang) {
        return innermost_from(callee, lang, code);
    }

    // Member-style callee: member_expression (JS/TS), attribute (Python)
    // or field_expression (Rust). The receiver lives in `object`
    // (JS/TS/Python) or `value` (Rust) and must itself be a call.
    let receiver = callee
        .child_by_field_name("object")
        .or_else(|| callee.child_by_field_name("value"))?;
    if !is_call(receiver, lang) {
        return None;
    }

    // The receiver counts as a chained *method* call only when its own
    // callee is member-shaped and actually carries a receiver field.
    let receiver_is_chained_method = receiver
        .child_by_field_name("function")
        .or_else(|| receiver.child_by_field_name("method"))
        .is_some_and(|inner_callee| {
            let member_shaped = matches!(
                inner_callee.kind(),
                "member_expression"
                    | "attribute"
                    | "field_expression"
                    | "scoped_identifier"
                    | "scope_resolution"
            );
            member_shaped
                && inner_callee
                    .child_by_field_name("object")
                    .or_else(|| inner_callee.child_by_field_name("value"))
                    .is_some()
        });

    if receiver_is_chained_method {
        innermost_from(receiver, lang, code)
    } else {
        // Plain function / constructor receiver: `outer` is the innermost
        // method call, so gate matching should see its method text.
        Some((outer, compact_text(callee, code)?))
    }
}
/// Collect the named arguments of every call found along the receiver
/// chain of `outer` (a CallFn / CallMethod node), appending them to `out`.
///
/// `outer`'s own arguments are deliberately left out: the caller already
/// processes those through its regular pass over `outer.arguments`.
///
/// For `json.NewDecoder(r.Body).Decode(emoji)`:
///   outer = `.Decode(emoji)`          , caller iterates `emoji`
///   inner = `json.NewDecoder(r.Body)` , appended arg: `r.Body`
///
/// Only each inner call's `arguments` field is read, never its
/// `function` / `method` / receiver expressions. Chained source-receivers
/// such as `r.URL.Query()` expose a member-text path that classifies as a
/// Source, but the OUTER chain text (`r.URL.Query.Get`) already
/// classifies, so emitting a synthetic source for the inner callee would
/// double-count.
///
/// The receiver is looked up through the callee's `object`, `operand` or
/// `value` field, in that order. The walk stops at the first level that
/// is not a call, has no callee field, or has no receiver.
///
/// Used by Go, where shapes like `json.NewDecoder(r.Body).Decode` hide
/// source-labelled args between the dots. The helper itself is
/// language-neutral, but callers should gate per-language until each
/// language's regression coverage catches up.
pub(super) fn walk_chain_inner_call_args<'a>(outer: Node<'a>, lang: &str, out: &mut Vec<Node<'a>>) {
    let is_call =
        |node: Node<'a>| matches!(lookup(lang, node.kind()), Kind::CallFn | Kind::CallMethod);

    let mut current = outer;
    while is_call(current) {
        let Some(callee) = current
            .child_by_field_name("function")
            .or_else(|| current.child_by_field_name("method"))
        else {
            return;
        };
        let Some(receiver) = callee
            .child_by_field_name("object")
            .or_else(|| callee.child_by_field_name("operand"))
            .or_else(|| callee.child_by_field_name("value"))
        else {
            return;
        };
        if !is_call(receiver) {
            return;
        }
        if let Some(arg_list) = receiver.child_by_field_name("arguments") {
            let mut cursor = arg_list.walk();
            out.extend(arg_list.named_children(&mut cursor));
        }
        // Continue one level further down the receiver chain.
        current = receiver;
    }
}
/// Depth-limited, pre-order search for the first call node (CallFn,
/// CallMethod or CallMacro) in the subtree rooted at `n`.
///
/// `depth` is the number of levels that may still be inspected, counting
/// `n` itself: with `depth == 0` nothing is examined and `None` is
/// returned, with `depth == 1` only `n` is tested, and so on. Unlike
/// `find_call_node`, the budget is caller-chosen, which lets callers reach
/// `await`-wrapped calls nested inside declarations.
pub(super) fn find_call_node_deep<'a>(n: Node<'a>, lang: &str, depth: u8) -> Option<Node<'a>> {
    // Budget exhausted: bail out before looking at `n`.
    let remaining = depth.checked_sub(1)?;
    if matches!(
        lookup(lang, n.kind()),
        Kind::CallFn | Kind::CallMethod | Kind::CallMacro
    ) {
        return Some(n);
    }
    let mut cursor = n.walk();
    n.children(&mut cursor)
        .find_map(|child| find_call_node_deep(child, lang, remaining))
}
/// Detect whether a call node is a parameterized SQL query.
///
/// Returns `true` when:
/// 1. The first argument (arg 0) is a string literal (including template
/// strings without interpolation) containing SQL placeholder patterns:
/// `$1`..`$N`, `?`, `%s`, or `:identifier`.
/// 2. The call has at least 2 arguments (the second being the params
/// array/tuple).
///
/// This is intentionally conservative: if arg 0 is dynamic (variable,
/// concatenation, template with interpolation), returns `false`.
pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool {
let Some(args) = call_node.child_by_field_name("arguments") else {
return false;
};
let mut cursor = args.walk();
let named: Vec<_> = args.named_children(&mut cursor).collect();
// Need at least 2 arguments: query string + params
if named.len() < 2 {
return false;
}
let first_arg = named[0];
// Extract the raw text of arg 0, must be a string literal or
// template string without interpolation.
let query_text = match first_arg.kind() {
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
text_of(first_arg, code)
}
"template_string" => {
// Only constant templates (no interpolation)
let mut c = first_arg.walk();
if first_arg
.named_children(&mut c)
.any(|ch| ch.kind() == "template_substitution")
{
return false; // dynamic, not safe
}
text_of(first_arg, code)
}
// Python concatenated strings: "SELECT" "..." are implicit concat
"concatenated_string" => {
// If it's a concatenated_string, get the full text
text_of(first_arg, code)
}
_ => return false, // not a literal
};
let Some(qt) = query_text else {
return false;
};
has_sql_placeholders(&qt)
}
/// Report whether `s` contains at least one SQL bound-parameter
/// placeholder.
///
/// Recognised patterns:
/// - `$1`, `$2`, …, `$N`: a `$` immediately followed by a digit other than
///   `0` (PostgreSQL positional)
/// - `?` anywhere in the text (MySQL / SQLite positional)
/// - `%s` (Python DB-API / psycopg2)
/// - `:identifier` (Oracle / named parameters): a `:` that is not the
///   first byte, is immediately preceded by a space, `=`, `(` or `,`, and
///   is immediately followed by an ASCII letter. The left-context
///   requirement avoids matching JS ternaries / object literals.
///
/// The scan is byte-based and stops at the first match.
pub(super) fn has_sql_placeholders(s: &str) -> bool {
    let bytes = s.as_bytes();
    bytes.iter().enumerate().any(|(idx, &byte)| {
        let next = bytes.get(idx + 1).copied();
        match byte {
            b'?' => true,
            // `$0` is not a valid positional parameter.
            b'$' => next.is_some_and(|digit| digit.is_ascii_digit() && digit != b'0'),
            b'%' => next == Some(b's'),
            b':' => {
                let prev = idx.checked_sub(1).map(|before| bytes[before]);
                matches!(prev, Some(b' ' | b'=' | b'(' | b','))
                    && next.is_some_and(|letter| letter.is_ascii_alphabetic())
            }
            _ => false,
        }
    })
}
/// Decide, purely from node kinds, whether `node` is a literal value.
///
/// Deliberately conservative: anything not explicitly recognised yields
/// `false`, because missing a suppression opportunity is preferable to
/// suppressing a real tainted flow.
///
/// Recognised shapes:
/// * string kinds without interpolation children,
/// * numeric, boolean and null-like kinds,
/// * PHP `encapsed_string` without variable interpolation,
/// * JS/TS `template_string` without `template_substitution`,
/// * `argument` wrappers and unary operators around exactly one literal,
/// * `binary_expression` / `concatenated_string` with at least two named
///   children, all literal,
/// * list / array / tuple containers and `pair` entries whose named
///   children are all literal (an empty container qualifies).
///
/// NOTE: Literal-kind classification also exists in `ast.rs::is_literal_node`.
/// The two must stay aligned across languages. TODO: consider extracting a
/// shared literal-kind helper if a third call site appears.
#[allow(clippy::only_used_in_recursion)]
pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
    match node.kind() {
        // Strings: rejected when they interpolate (Ruby `"hello #{name}"`,
        // Python f-strings).
        "string"
        | "string_literal"
        | "interpreted_string_literal"
        | "raw_string_literal"
        | "string_content"
        | "string_fragment" => !has_string_interpolation(node),

        // Numbers, booleans, null / nil / none.
        "integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number"
        | "true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean"
        | "boolean_literal" => true,

        // PHP double-quoted string: literal only without variables.
        "encapsed_string" => !has_interpolation_cfg(node),

        // JS/TS template string: literal only without `${...}`.
        "template_string" => {
            let mut cursor = node.walk();
            !node
                .named_children(&mut cursor)
                .any(|part| part.kind() == "template_substitution")
        }

        // Single-child wrappers: the PHP/Go `argument` node and unary
        // operators such as the minus in `-42`.
        "argument" | "unary_expression" | "unary_op" => {
            node.named_child_count() == 1
                && node
                    .named_child(0)
                    .is_some_and(|inner| is_syntactic_literal(inner, code))
        }

        // Concatenation of literals: `"a" + "b"` or `"a" . "b"`.
        "binary_expression" | "concatenated_string" => {
            if node.named_child_count() < 2 {
                return false;
            }
            let mut cursor = node.walk();
            node.named_children(&mut cursor)
                .all(|operand| is_syntactic_literal(operand, code))
        }

        // Containers and `{"key": "value"}` style entries: every named
        // child must be a literal.
        "list"
        | "array"
        | "array_expression"
        | "array_creation_expression"
        | "tuple"
        | "tuple_expression"
        | "pair" => {
            let mut cursor = node.walk();
            node.named_children(&mut cursor)
                .all(|element| is_syntactic_literal(element, code))
        }

        _ => false,
    }
}
/// Report whether any direct child of `node` has a kind containing the
/// substring `"interpolation"`.
///
/// This is how interpolated strings are told apart from plain ones, e.g.
/// Ruby `"hello #{name}"` and Python f-strings, which carry
/// `interpolation` children. Only direct children are inspected.
pub(super) fn has_string_interpolation(node: Node) -> bool {
    let mut cursor = node.walk();
    node.children(&mut cursor)
        .any(|part| part.kind().contains("interpolation"))
}
/// Report whether a PHP `encapsed_string` node interpolates a value.
///
/// A direct child counts as interpolation when its kind is
/// `variable_name` or `simple_variable`, or contains the substring
/// `"interpolation"`. Only direct children are inspected.
pub(super) fn has_interpolation_cfg(node: Node) -> bool {
    let mut cursor = node.walk();
    node.children(&mut cursor).any(|part| {
        let kind = part.kind();
        matches!(kind, "variable_name" | "simple_variable") || kind.contains("interpolation")
    })
}
/// Extract the raw literal text from the RHS of a declaration/assignment AST node.
///
/// Walks the same value/right child paths as `def_use` and returns the text
/// if the RHS is a syntactic literal. Used to populate `NodeInfo::const_text`.
pub(super) fn extract_literal_rhs(ast: Node, lang: &str, code: &[u8]) -> Option<String> {
use crate::labels::lookup;
// Direct value/right field (Rust let, Go short_var, etc.)
let val_node = ast
.child_by_field_name("value")
.or_else(|| ast.child_by_field_name("right"));
if let Some(val) = val_node {
if is_syntactic_literal(val, code) {
return text_of(val, code);
}
}
// Nested declarator pattern (JS let/const → variable_declarator, etc.)
if matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment
) {
let mut cursor = ast.walk();
for child in ast.children(&mut cursor) {
let child_val = child.child_by_field_name("value").or_else(|| {
if matches!(lookup(lang, child.kind()), Kind::Assignment) {
child.child_by_field_name("right")
} else {
None
}
});
if let Some(val) = child_val {
if is_syntactic_literal(val, code) {
return text_of(val, code);
}
}
}
}
// Return statement with a literal argument (`return []`, `return {}`).
// Lets SSA's const-return path ([`crate::ssa::lower`] line ~1066) emit
// `SsaOp::Const(Some(text))` instead of `Const(None)` so downstream
// container-literal detection (heap points-to, fresh-alloc summary)
// can recognise the fresh allocation.
if matches!(lookup(lang, ast.kind()), Kind::Return) {
let mut cursor = ast.walk();
for child in ast.named_children(&mut cursor) {
if is_syntactic_literal(child, code) {
return text_of(child, code);
}
}
}
None
}
/// Report whether a call passes nothing but syntactic literals (per
/// [`is_syntactic_literal`]), including in the nested calls of its callee
/// chain.
///
/// Returns `false` when:
/// * the node has no `arguments` field,
/// * the argument list has no named children: zero-arg calls are *not*
///   "all literal", because taint can still flow through a non-literal
///   receiver (e.g. `tainted.readObject()`) and the sink-suppression gate
///   (`info.all_args_literal`) must not skip those,
/// * any argument is not a syntactic literal, or
/// * [`check_inner_call_args`] rejects a nested call in the chain.
///
/// For method chains like `a("x").b(y).c()`, the outermost call node
/// represents the entire chain, so the nested call expressions are
/// checked as well.
pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool {
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    if arg_list.named_child_count() == 0 {
        return false;
    }
    let mut cursor = arg_list.walk();
    let own_args_literal = arg_list
        .named_children(&mut cursor)
        .all(|arg| is_syntactic_literal(arg, code));
    // Only descend into the callee chain once our own arguments pass.
    own_args_literal && check_inner_call_args(call_node, code)
}
/// Verify that every call nested below `node` (outside of argument lists)
/// passes [`has_only_literal_args`].
///
/// Each child of `node` is handled as follows:
/// * argument-list kinds (`arguments`, `argument_list`,
///   `actual_parameters`) are skipped, since the caller has already
///   checked them;
/// * a child that has an `arguments` field is treated as a call and must
///   satisfy [`has_only_literal_args`];
/// * any other child (field_expression, etc.) is searched recursively.
///
/// Returns `true` when no child fails, which includes nodes without
/// children.
pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
    let mut cursor = node.walk();
    node.children(&mut cursor).all(|child| {
        if matches!(
            child.kind(),
            "arguments" | "argument_list" | "actual_parameters"
        ) {
            return true;
        }
        if child.child_by_field_name("arguments").is_some() {
            has_only_literal_args(child, code)
        } else {
            check_inner_call_args(child, code)
        }
    })
}
/// Lift the identifiers captured inline by a Rust format string
/// (`format!("…{name}…")`, stable since 1.58) out of a `macro_invocation`
/// node.
///
/// The names come from `{name}` / `{name:fmt-spec}` patterns inside the
/// first `string_literal` / `raw_string_literal` child of the macro's
/// `token_tree`.
///
/// Without this lifting, `let q = format!("...{x}...")` carries no `x` in
/// its `uses`, because `x` lives in the format string's bytes rather than
/// in a separate AST argument node, and taint would stop at the macro
/// boundary. Mirrors the Python f-string interpolation lifting in
/// `patterns/python.rs`.
///
/// Recognition is conservative: only the last `::` segment of the macro
/// path is considered, and it must be accepted by
/// `is_rust_format_style_macro`. The result is empty when `call_node` is
/// not a `macro_invocation`, the macro is not format-style, there is no
/// `token_tree`, the token tree holds no string literal, or any required
/// text cannot be read.
pub(super) fn extract_rust_format_macro_named_idents(call_node: Node, code: &[u8]) -> Vec<String> {
    /// `None` stands for "nothing to lift"; the caller maps it to an
    /// empty vector.
    fn named_idents(call_node: Node, code: &[u8]) -> Option<Vec<String>> {
        if call_node.kind() != "macro_invocation" {
            return None;
        }
        let macro_path = text_of(call_node.child_by_field_name("macro")?, code)?;
        let leaf = macro_path
            .rsplit("::")
            .next()
            .unwrap_or(macro_path.as_str());
        if !is_rust_format_style_macro(leaf) {
            return None;
        }

        // Prefer the field lookup; otherwise scan the children by kind.
        let token_tree = call_node.child_by_field_name("token_tree").or_else(|| {
            let mut cursor = call_node.walk();
            call_node
                .children(&mut cursor)
                .find(|child| child.kind() == "token_tree")
        })?;

        let mut cursor = token_tree.walk();
        let format_literal = token_tree
            .children(&mut cursor)
            .find(|child| matches!(child.kind(), "string_literal" | "raw_string_literal"))?;

        let raw = text_of(format_literal, code)?;
        // Parse the unquoted content when the quotes can be stripped,
        // otherwise the literal's raw text.
        let content = match strip_literal_quotes(&raw, format_literal, code) {
            Some(unquoted) => unquoted,
            None => raw.clone(),
        };
        Some(parse_rust_format_named_idents(&content))
    }

    named_idents(call_node, code).unwrap_or_default()
}
/// Gather the named format-argument identifiers of every Rust
/// `macro_invocation` found at `n` or among its descendants.
///
/// The traversal is delegated to `collect_format_macro_idents_recursive`,
/// starting at depth 0. It descends through all children (calls, fields,
/// await, references, blocks, ...), which lets the def-use collectors lift
/// `format!("...{x}...")` named args through expression wrapping such as
/// `let q = format!("{x}").to_owned();` or chained method calls on the RHS.
pub(super) fn extract_rust_format_macro_named_idents_in(n: Node, code: &[u8]) -> Vec<String> {
    let mut named_idents = Vec::new();
    collect_format_macro_idents_recursive(n, code, &mut named_idents, 0);
    named_idents
}
/// Pre-order walk that appends to `out` the named format-argument
/// identifiers of each `macro_invocation` encountered.
///
/// `depth` is the distance from the node the walk started at. Nodes more
/// than `MAX_DEPTH` levels below the start are not visited, which bounds
/// the cost on large subtrees. The children of a macro invocation are
/// still walked after the invocation itself has been processed.
fn collect_format_macro_idents_recursive(n: Node, code: &[u8], out: &mut Vec<String>, depth: u32) {
    const MAX_DEPTH: u32 = 6;
    if depth > MAX_DEPTH {
        return;
    }
    if n.kind() == "macro_invocation" {
        out.extend(extract_rust_format_macro_named_idents(n, code));
    }
    let next_depth = depth + 1;
    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        collect_format_macro_idents_recursive(child, code, out, next_depth);
    }
}
/// Report whether `name` (a macro's last path segment, without the `!`)
/// is one of the macros treated as taking a format string.
///
/// The comparison is exact and case-sensitive. Covered groups: the
/// formatting / printing / writing macros, the panicking and assertion
/// macros, and the `log`-style severity names.
fn is_rust_format_style_macro(name: &str) -> bool {
    const FORMAT_STYLE_MACROS: &[&str] = &[
        // Formatting and output.
        "format",
        "print",
        "println",
        "eprint",
        "eprintln",
        "write",
        "writeln",
        "format_args",
        // Panicking and assertions.
        "panic",
        "assert",
        "debug_assert",
        "todo",
        "unimplemented",
        "unreachable",
        // Logging severities.
        "info",
        "warn",
        "error",
        "debug",
        "trace",
    ];
    FORMAT_STYLE_MACROS.contains(&name)
}
/// Scan the content of a Rust format string and return, in order of
/// appearance, the identifiers used as named arguments (`{name}` or
/// `{name:spec}`).
///
/// * `{{` and `}}` are escapes and are skipped.
/// * For every other `{`, the bytes up to the next `}` or `:` form the
///   candidate name. It is kept only when `is_valid_rust_format_ident`
///   accepts it, so `{}`, `{0}` and `{:?}` contribute nothing.
/// * Scanning resumes after the closing `}` of the placeholder. A
///   placeholder without a closing brace ends the scan.
///
/// Repeated names are reported once per occurrence.
fn parse_rust_format_named_idents(s: &str) -> Vec<String> {
    let bytes = s.as_bytes();
    let mut names = Vec::new();
    let mut pos = 0;
    while let Some(&byte) = bytes.get(pos) {
        let next = bytes.get(pos + 1).copied();
        match (byte, next) {
            // Escaped braces.
            (b'{', Some(b'{')) | (b'}', Some(b'}')) => pos += 2,
            (b'{', _) => {
                // Everything after the opening brace.
                let body = &bytes[pos + 1..];
                let name_len = body
                    .iter()
                    .position(|&c| c == b'}' || c == b':')
                    .unwrap_or(body.len());
                let candidate = &body[..name_len];
                if is_valid_rust_format_ident(candidate) {
                    if let Ok(name) = std::str::from_utf8(candidate) {
                        names.push(name.to_string());
                    }
                }
                // Offset (within `body`) of the closing brace, skipping
                // any format spec; `body.len()` when it is missing.
                let close = body[name_len..]
                    .iter()
                    .position(|&c| c == b'}')
                    .map_or(body.len(), |offset| name_len + offset);
                // Step over `{`, the placeholder body, and `}`.
                pos += close + 2;
            }
            _ => pos += 1,
        }
    }
    names
}
/// Report whether `b` can be the name of a named format argument.
///
/// Accepted: a non-empty ASCII identifier. The first byte must be an
/// ASCII letter or `_`, and every following byte an ASCII letter, digit
/// or `_`. Positional indices such as `0` are therefore rejected by the
/// first-byte rule.
///
/// A lone `_` is rejected as well: rustc refuses `{_}` as an argument
/// name, so it can never refer to a captured variable.
///
/// Non-ASCII identifiers are not recognised (conservative).
fn is_valid_rust_format_ident(b: &[u8]) -> bool {
    match b {
        [] | [b'_'] => false,
        [first, rest @ ..] => {
            (first.is_ascii_alphabetic() || *first == b'_')
                && rest.iter().all(|c| c.is_ascii_alphanumeric() || *c == b'_')
        }
    }
}
/// Extract, per positional argument, the identifiers referenced by a call.
///
/// Returns one `Vec<String>` per argument in parameter-position order;
/// within an entry the dotted paths come first, followed by the individual
/// identifiers.
///
/// Special cases, checked in this order:
/// * Ruby `subshell` (backticks) has no `arguments` field; its children
///   are string fragments and `interpolation` nodes. Each interpolation
///   that references at least one identifier becomes one positional entry,
///   so taint flows from `#{var}` into the synthetic "subshell" sink.
/// * Rust join macros (`tokio::join!` and friends) keep their args in a
///   `token_tree`; when `extract_rust_macro_join_arg_uses` recognises the
///   macro, its result is returned as is.
///
/// For ordinary calls:
/// * a missing `arguments` field yields an empty result;
/// * any splat / spread argument invalidates positional mapping, so the
///   whole result is empty;
/// * keyword / named arguments are skipped (they are tracked separately in
///   `CallMeta.kwargs`), keeping the result strictly positional.
pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>> {
    /// Dotted paths followed by plain identifiers found under `node`.
    fn uses_of(node: Node, code: &[u8]) -> Vec<String> {
        let mut idents = Vec::new();
        let mut uses = Vec::new();
        collect_idents_with_paths(node, code, &mut idents, &mut uses);
        uses.extend(idents);
        uses
    }

    if call_node.kind() == "subshell" {
        let mut cursor = call_node.walk();
        return call_node
            .named_children(&mut cursor)
            .filter(|part| part.kind() == "interpolation")
            .map(|part| uses_of(part, code))
            .filter(|uses| !uses.is_empty())
            .collect();
    }

    // tree-sitter-rust models macro args as a `token_tree` rather than an
    // `arguments` field, so the generic path below would find nothing.
    if call_node.kind() == "macro_invocation"
        && let Some(arg_uses) = extract_rust_macro_join_arg_uses(call_node, code)
    {
        return arg_uses;
    }

    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };
    let mut per_arg = Vec::new();
    let mut cursor = arg_list.walk();
    for arg in arg_list.named_children(&mut cursor) {
        match arg.kind() {
            // Splats make positions unknowable: give up entirely.
            "spread_element"
            | "dictionary_splat"
            | "list_splat"
            | "splat_argument"
            | "hash_splat_argument" => return Vec::new(),
            // Named arguments do not take part in positional indexing.
            "keyword_argument" | "named_argument" => {}
            _ => per_arg.push(uses_of(arg, code)),
        }
    }
    per_arg
}
/// Positional identifier extraction for Rust join macros.
///
/// `tokio::join!` / `futures::join!` (and their `try_*` variants) bundle
/// concurrently-awaited futures into a tuple result. tree-sitter-rust
/// represents the args as a `token_tree` whose children alternate between
/// expressions and `,` separators (`token_tree` itself nests on every
/// parenthesised group, e.g. the `(x)` inside `fetch(x)`). The top-level
/// token tree is split at its unnamed `,` tokens, its unnamed `(` / `)`
/// delimiters are dropped, and the identifiers of each non-empty segment
/// become one positional entry (dotted paths first, then plain
/// identifiers), so the SSA Call op carries one positional arg per future.
///
/// Returns `Some(arg_uses)` only when the macro is one of the recognised
/// join macros. `None` (unrecognised macro, missing `macro` field,
/// unreadable macro text, or no `token_tree`) lets `extract_arg_uses` fall
/// through to its normal `arguments`-field path for every other macro
/// shape (`format!`, `println!`, custom DSL macros) where arg lifting
/// could disturb existing label / SSA flow.
pub(super) fn extract_rust_macro_join_arg_uses(
    call_node: Node,
    code: &[u8],
) -> Option<Vec<Vec<String>>> {
    let macro_path = text_of(call_node.child_by_field_name("macro")?, code)?;
    if !is_rust_join_macro(&macro_path) {
        return None;
    }

    // Prefer the field lookup; otherwise scan the children by kind.
    let token_tree = call_node.child_by_field_name("token_tree").or_else(|| {
        let mut cursor = call_node.walk();
        call_node
            .children(&mut cursor)
            .find(|child| child.kind() == "token_tree")
    })?;

    // One segment per comma-separated macro argument.
    let mut segments: Vec<Vec<Node>> = vec![Vec::new()];
    let mut cursor = token_tree.walk();
    for token in token_tree.children(&mut cursor) {
        match token.kind() {
            "," if !token.is_named() => segments.push(Vec::new()),
            // Surrounding delimiters carry no identifiers.
            "(" | ")" if !token.is_named() => {}
            _ => {
                if let Some(open_segment) = segments.last_mut() {
                    open_segment.push(token);
                }
            }
        }
    }

    let arg_uses = segments
        .into_iter()
        .filter(|segment| !segment.is_empty())
        .map(|segment| {
            let mut idents = Vec::new();
            let mut uses = Vec::new();
            for node in segment {
                collect_idents_with_paths(node, code, &mut idents, &mut uses);
            }
            uses.extend(idents);
            uses
        })
        .collect();
    Some(arg_uses)
}
/// Report whether `macro_text` (the macro path without the `!`) names a
/// join macro: `join` or `try_join`, either bare or qualified with exactly
/// one `tokio::` or `futures::` prefix.
fn is_rust_join_macro(macro_text: &str) -> bool {
    let leaf = macro_text
        .strip_prefix("tokio::")
        .or_else(|| macro_text.strip_prefix("futures::"))
        .unwrap_or(macro_text);
    leaf == "join" || leaf == "try_join"
}
/// Extract keyword / named argument bindings for a call node.
///
/// Returns `Vec<(name, uses)>` in source order. The result is empty when
/// the call has no `arguments` field or no qualifying argument, which
/// includes languages whose grammar produces neither `keyword_argument` /
/// `named_argument` nor `object` argument nodes (C, Java, Go, …).
///
/// Two argument shapes contribute entries:
///
/// * **`keyword_argument` / `named_argument`** (Python / Ruby). The name
///   and value come from the `name` / `value` fields, falling back to the
///   first / last named child. `uses` holds the identifier references of
///   the value expression in the same shape as `arg_uses` entries (dotted
///   paths first). When the value references no identifier and is a
///   boolean, numeric or string literal, its text with surrounding quotes
///   trimmed is recorded instead, so consumers such as
///   `xml_config::classify_call` can match `resolve_entities=True`.
/// * **JS/TS object-literal argument** (`f(x, { a: true, b: 'str' })`).
///   Each `pair` is lifted with its key (surrounding quotes trimmed) as the
///   name and the raw text of its value expression as the single value;
///   each `shorthand_property_identifier` is lifted with its own text as
///   both. Downstream consumers (e.g. xml_config's `processEntities`
///   check) compare against that textual form.
///
/// All other arguments are ignored.
pub(super) fn extract_kwargs(call_node: Node, code: &[u8]) -> Vec<(String, Vec<String>)> {
    /// Quote characters trimmed from keys and literal values.
    const QUOTES: [char; 2] = ['"', '\''];
    /// Value kinds whose text stands in for the value when it references
    /// no identifier.
    const LITERAL_VALUE_KINDS: &[&str] = &[
        "true",
        "false",
        "integer",
        "float",
        "number",
        "string",
        "string_literal",
        "true_constant",
        "false_constant",
    ];

    let mut kwargs = Vec::new();
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return kwargs;
    };

    let mut cursor = arg_list.walk();
    for arg in arg_list.named_children(&mut cursor) {
        match arg.kind() {
            // These pairs are not tree-sitter `keyword_argument` nodes,
            // but consumers expect them in the kwargs vector.
            "object" => {
                let mut members = arg.walk();
                for member in arg.named_children(&mut members) {
                    match member.kind() {
                        "pair" => {
                            let (Some(key), Some(value)) = (
                                member.child_by_field_name("key"),
                                member.child_by_field_name("value"),
                            ) else {
                                continue;
                            };
                            let Some(key_text) = text_of(key, code) else {
                                continue;
                            };
                            let Some(value_text) = text_of(value, code) else {
                                continue;
                            };
                            let name = key_text.trim_matches(QUOTES).to_string();
                            kwargs.push((name, vec![value_text]));
                        }
                        "shorthand_property_identifier" => {
                            if let Some(ident) = text_of(member, code) {
                                kwargs.push((ident.clone(), vec![ident]));
                            }
                        }
                        _ => {}
                    }
                }
            }
            "keyword_argument" | "named_argument" => {
                // Fall back to the first / last named child when the
                // grammar does not expose `name` / `value` fields.
                let last_named = arg.named_child_count().saturating_sub(1) as u32;
                let name_node = arg
                    .child_by_field_name("name")
                    .or_else(|| arg.named_child(0));
                let value_node = arg
                    .child_by_field_name("value")
                    .or_else(|| arg.named_child(last_named));
                let (Some(name_node), Some(value_node)) = (name_node, value_node) else {
                    continue;
                };
                let Some(name) = text_of(name_node, code) else {
                    continue;
                };

                let mut idents = Vec::new();
                let mut uses = Vec::new();
                collect_idents_with_paths(value_node, code, &mut idents, &mut uses);
                uses.extend(idents);

                // Literal values have no identifier kind, so they would
                // otherwise leave `uses` empty.
                if uses.is_empty() && LITERAL_VALUE_KINDS.contains(&value_node.kind()) {
                    uses.extend(
                        text_of(value_node, code).map(|text| text.trim_matches(QUOTES).to_string()),
                    );
                }
                kwargs.push((name, uses));
            }
            _ => {}
        }
    }
    kwargs
}
/// Map a `.replace` search literal to the caps it is known to strip,
/// provided the replacement itself does not reintroduce any dangerous
/// sequence (the caller is responsible for checking that).
///
/// Policy is deliberately narrow and conservative: only literals that
/// contain *known-dangerous* payloads earn a strip credit, so an arbitrary
/// `.replace("foo", "bar")` is never promoted to a sanitizer. A literal
/// may earn several caps at once; one that matches nothing yields
/// `Cap::empty()`.
///   * `..`, `/`, `\\`            → path-traversal  → `Cap::FILE_IO`
///   * `<`, `>`                   → HTML metachars  → `Cap::HTML_ESCAPE`
///   * `;`, `|`, `&`, `$`, `\``   → shell metachars → `Cap::SHELL_ESCAPE`
///   * `'`, `"`, `--`             → SQL metachars   → `Cap::SQL_QUERY`
pub(super) fn caps_stripped_by_literal_pattern(search: &str) -> Cap {
    let policy: [(Cap, &[&str]); 4] = [
        (Cap::FILE_IO, &["..", "/", "\\"]),
        (Cap::HTML_ESCAPE, &["<", ">"]),
        (Cap::SHELL_ESCAPE, &[";", "|", "&", "$", "`"]),
        (Cap::SQL_QUERY, &["'", "\"", "--"]),
    ];

    let mut stripped = Cap::empty();
    for (cap, payloads) in policy {
        if payloads.iter().any(|payload| search.contains(payload)) {
            stripped |= cap;
        }
    }
    stripped
}
/// Maximum number of `.replace(LIT, LIT)` hops we'll walk on a single chain.
///
/// Bounds the loop in `detect_rust_replace_chain_sanitizer`. That loop
/// returns `None` (no sanitizer credit) if it runs out of iterations
/// before reaching the chain's base.
const MAX_REPLACE_CHAIN_HOPS: usize = 16;
/// Recognise a Rust `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain whose
/// receiver bottoms out at a plain identifier, and infer which caps the chain
/// provably strips.
///
/// In tree-sitter-rust a method call is encoded as a `call_expression` whose
/// `function` field is a `field_expression` (`receiver.method`). Chained method
/// calls therefore nest `call_expression` nodes recursively through the
/// `field_expression.value` slot. The detector walks that nest from the
/// outermost call inwards and requires:
///
/// * every hop to be a `replace` / `replacen` call whose first two
///   arguments are string literals,
/// * every literal to be decodable (see below),
/// * no replacement literal to contain a dangerous sequence itself, since
///   it could reintroduce what another hop strips,
/// * the innermost receiver to be a bare identifier.
///
/// Returns the union of caps stripped across the chain (per
/// [`caps_stripped_by_literal_pattern`]) when at least one search literal
/// contains a recognised dangerous pattern. Returns `None` when the
/// pattern doesn't apply, so the caller falls back to normal
/// unresolved-call propagation. Chains longer than
/// `MAX_REPLACE_CHAIN_HOPS` also yield `None`.
///
/// Literals are classified by the *value* they denote, decoded from the
/// literal's full source text. Simple escapes (`\\`, `\"`, `\'`, `\n`,
/// `\r`, `\t`, `\0`) are decoded; any other escape (`\x..`, `\u{..}`, line
/// continuation) makes the literal undecodable and the whole chain is
/// rejected. This prevents a replacement such as `"x\t.."` from being
/// judged on a fragment of its content, and prevents an escape like `"\n"`
/// from earning path-traversal credit through its backslash.
pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
    fn is_rust_str_literal(kind: &str) -> bool {
        matches!(kind, "string_literal" | "raw_string_literal")
    }

    /// Decode the source text of a string literal into the string it
    /// denotes; `None` when the text is malformed or uses an escape that
    /// is not decoded here.
    fn decode_rust_str_literal(raw: &str, is_raw: bool) -> Option<String> {
        // Byte (`b`) and C (`c`) string prefixes do not affect content.
        let body = raw.strip_prefix(['b', 'c']).unwrap_or(raw);

        if is_raw {
            // r"..." / r#"..."#: content is verbatim, delimited by the
            // same number of `#` on both sides.
            let after_r = body.strip_prefix('r')?;
            let opened = after_r.trim_start_matches('#');
            let hashes = after_r.len() - opened.len();
            let closing = format!("\"{}", "#".repeat(hashes));
            return opened
                .strip_prefix('"')?
                .strip_suffix(closing.as_str())
                .map(str::to_owned);
        }

        let inner = body.strip_prefix('"')?.strip_suffix('"')?;
        let mut decoded = String::with_capacity(inner.len());
        let mut chars = inner.chars();
        while let Some(ch) = chars.next() {
            if ch != '\\' {
                decoded.push(ch);
                continue;
            }
            decoded.push(match chars.next()? {
                '\\' => '\\',
                '"' => '"',
                '\'' => '\'',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                '0' => '\0',
                // Hex / unicode escapes and line continuations could hide
                // a metacharacter; refuse rather than guess.
                _ => return None,
            });
        }
        Some(decoded)
    }

    /// Value denoted by the literal node `n`.
    fn extract_rust_str_content(n: Node<'_>, code: &[u8]) -> Option<String> {
        let raw = text_of(n, code)?;
        decode_rust_str_literal(&raw, n.kind() == "raw_string_literal")
    }

    let mut current = call_ast;
    let mut earned = Cap::empty();
    for _ in 0..MAX_REPLACE_CHAIN_HOPS {
        if current.kind() != "call_expression" {
            // Chain base: must be a plain identifier (parameter / local) to
            // qualify. A base that's another expression (field access,
            // nested non-method call, …) breaks the sanitizer invariant.
            if current.kind() == "identifier" && !earned.is_empty() {
                return Some(earned);
            }
            return None;
        }

        // Must be a method-style call: function is a field_expression whose
        // `field` names a `replace`-like method.
        // NOTE(review): `replacen` only replaces the first `n` matches, so
        // it may leave occurrences behind. Confirm that crediting it as a
        // full strip is intended.
        let func = current.child_by_field_name("function")?;
        if func.kind() != "field_expression" {
            return None;
        }
        let method_name = text_of(func.child_by_field_name("field")?, code)?;
        if method_name != "replace" && method_name != "replacen" {
            return None;
        }

        let args_node = current.child_by_field_name("arguments")?;
        let mut cursor = args_node.walk();
        let positional: Vec<Node<'_>> = args_node
            .named_children(&mut cursor)
            .filter(|arg| {
                !matches!(
                    arg.kind(),
                    "keyword_argument"
                        | "named_argument"
                        | "spread_element"
                        | "list_splat"
                        | "dictionary_splat"
                        | "splat_argument"
                        | "hash_splat_argument"
                )
            })
            .collect();
        let (search_node, replacement_node) = match positional.as_slice() {
            [first, second, ..] => (*first, *second),
            _ => return None,
        };
        if !is_rust_str_literal(search_node.kind()) || !is_rust_str_literal(replacement_node.kind())
        {
            return None;
        }
        let search = extract_rust_str_content(search_node, code)?;
        let replacement = extract_rust_str_content(replacement_node, code)?;

        // If the replacement itself contains a dangerous sequence, this hop
        // can reintroduce the pattern that a later hop tries to strip. Be
        // conservative: abandon all credit.
        if !caps_stripped_by_literal_pattern(&replacement).is_empty() {
            return None;
        }
        earned |= caps_stripped_by_literal_pattern(&search);

        // Walk to receiver via field_expression.value.
        current = func.child_by_field_name("value")?;
    }
    None
}
/// Recognise a Go `strings.Replace(s, OLD, NEW, n)` /
/// `strings.ReplaceAll(s, OLD, NEW)` call that provably strips one of the
/// known-dangerous metacharacter classes from its first argument.
///
/// Returns the union of caps stripped, or `None` when the pattern doesn't
/// apply (so the caller falls back to normal unresolved-call propagation).
///
/// `strings.Replace` only replaces every occurrence when `n < 0`; with
/// `n >= 0` at most `n` occurrences are rewritten (`n == 0` rewrites none),
/// so the dangerous sequence can survive. The `Replace` form is therefore
/// credited only when its fourth argument is a negative integer literal
/// such as `-1`; any other count (positive, zero, or non-literal) yields
/// `None`.
///
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
/// (non-method-chain) Go shape. The caller wires the resulting cap into
/// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the
/// taint engine consumes via the standard sanitizer pathway: taint flows
/// in on `s`, the matching cap is stripped from the result.
pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
    if call_ast.kind() != "call_expression" {
        return None;
    }
    // The call's `function` field is a `selector_expression`: `operand`
    // is the package ident (`strings`), `field` is the method ident.
    let func = call_ast.child_by_field_name("function")?;
    if func.kind() != "selector_expression" {
        return None;
    }
    let operand = func.child_by_field_name("operand")?;
    if text_of(operand, code).as_deref() != Some("strings") {
        return None;
    }
    let field = func.child_by_field_name("field")?;
    let method_name = text_of(field, code)?;
    let is_counted_replace = method_name == "Replace";
    if !is_counted_replace && method_name != "ReplaceAll" {
        return None;
    }
    if is_counted_replace {
        // Fourth positional argument is the replacement count. Only a
        // negative literal means "replace all".
        let args_node = call_ast.child_by_field_name("arguments")?;
        let mut cursor = args_node.walk();
        let count_node = args_node
            .named_children(&mut cursor)
            .filter(|c| c.kind() != "comment")
            .nth(3)?;
        let count_text = text_of(count_node, code)?;
        // Tolerate `- 1` by dropping interior whitespace before parsing.
        let compact: String = count_text.chars().filter(|c| !c.is_whitespace()).collect();
        match compact.parse::<i64>() {
            Ok(n) if n < 0 => {}
            _ => return None,
        }
    }
    // Args layout: (s, old, new[, n]). Need positional args 1 (old) and
    // 2 (new) to be string literals.
    let old_lit = extract_const_string_arg(call_ast, 1, code)?;
    let new_lit = extract_const_string_arg(call_ast, 2, code)?;
    // If the replacement itself reintroduces a dangerous sequence, don't
    // credit the strip; matches the Rust chain detector's policy.
    if !caps_stripped_by_literal_pattern(&new_lit).is_empty() {
        return None;
    }
    let caps = caps_stripped_by_literal_pattern(&old_lit);
    if caps.is_empty() { None } else { Some(caps) }
}
/// Resolve the callee name for `n`, treating `n` itself as a candidate call
/// node before falling back to [`first_call_ident`].
///
/// `first_call_ident` inspects children only, so it would miss the case
/// where `n` already IS the call expression (e.g. the argument
/// `sanitize(cmd)`); this wrapper covers that case.
pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option<String> {
    // C++ new/delete: normalize the callee before any field extraction.
    if lang == "cpp" {
        match n.kind() {
            "new_expression" => return Some("new".to_string()),
            "delete_expression" => return Some("delete".to_string()),
            _ => {}
        }
    }
    match lookup(lang, n.kind()) {
        Kind::Function => {
            // A function/closure expression passed as an argument: use its
            // declared name if it has one, otherwise the same synthetic
            // anonymous name used by build_sub, so callback_bindings and
            // source_to_callback can match it to the extracted BodyCfg.
            let declared = n
                .child_by_field_name("name")
                .and_then(|nm| text_of(nm, code));
            Some(declared.unwrap_or_else(|| anon_fn_name(n.start_byte())))
        }
        Kind::CallFn => {
            // First field that can name the callee, in priority order.
            let callee = n
                .child_by_field_name("function")
                .or_else(|| n.child_by_field_name("method"))
                .or_else(|| n.child_by_field_name("name"))
                .or_else(|| n.child_by_field_name("type"))
                .or_else(|| find_constructor_type_child(n))?;
            // A (possibly parenthesised) function expression in callee
            // position gets the synthetic anonymous name.
            let bare = unwrap_parens(callee);
            if lookup(lang, bare.kind()) == Kind::Function {
                Some(anon_fn_name(bare.start_byte()))
            } else {
                text_of(callee, code)
            }
        }
        Kind::CallMethod => {
            let method_name_of = |call: Node<'a>| {
                call.child_by_field_name("method")
                    .or_else(|| call.child_by_field_name("name"))
                    .and_then(|f| text_of(f, code))
            };
            let method = method_name_of(n);
            let recv_node = n
                .child_by_field_name("object")
                .or_else(|| n.child_by_field_name("receiver"))
                .or_else(|| n.child_by_field_name("scope"));
            let root = recv_node.and_then(|f| root_receiver_text(f, lang, code));
            // Preserve the Java `.getClass()` segment in the chained callee
            // text so downstream predicates (e.g.
            // [`crate::ssa::type_facts::is_safe_string_producing_callee`])
            // can recognise idiomatic `obj.getClass().<accessor>()` chains.
            // Without this, `root_receiver_text` collapses the chain to
            // `obj.<accessor>`, indistinguishable from a user-defined method.
            let via_get_class = lang == "java"
                && recv_node.is_some_and(|rn| {
                    lookup(lang, rn.kind()) == Kind::CallMethod
                        && method_name_of(rn).is_some_and(|m| m == "getClass")
                });
            let receiver = root.map(|r| {
                if via_get_class {
                    format!("{r}.getClass")
                } else {
                    r
                }
            });
            match (receiver, method) {
                (Some(r), Some(f)) => Some(format!("{r}.{f}")),
                (None, Some(f)) => Some(f),
                _ => None,
            }
        }
        Kind::CallMacro => {
            let macro_node = n.child_by_field_name("macro")?;
            text_of(macro_node, code)
        }
        _ => first_call_ident(n, lang, code),
    }
}
/// For each argument of `call_node`, return `Some(s)` when the argument is a
/// syntactic string literal (unquoted contents) and `None` otherwise. The
/// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`].
///
/// Boolean, null and numeric literal tokens are also captured, verbatim.
///
/// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces
/// an empty vector: positional indices past the splat are meaningless and
/// downstream passes already treat an empty vector as "no info".
pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<Option<String>> {
    let Some(args_node) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };
    let mut result = Vec::new();
    let mut cursor = args_node.walk();
    for child in args_node.named_children(&mut cursor) {
        let kind = child.kind();
        // Splat → positional indexing breaks; bail.
        if matches!(
            kind,
            "spread_element"
                | "dictionary_splat"
                | "list_splat"
                | "splat_argument"
                | "hash_splat_argument"
        ) {
            return Vec::new();
        }
        // Named / keyword arguments are tracked separately in `kwargs` and
        // don't participate in positional indexing; skip them here so this
        // vector stays aligned with `arg_uses`.
        if matches!(kind, "keyword_argument" | "named_argument") {
            continue;
        }
        // PHP wraps each call argument in an `argument` node whose first
        // named child is the actual expression. Unwrap one level so the
        // string-literal arms below see the literal directly rather than
        // the wrapper kind; otherwise PHP `f("https://…")` records
        // `None` for arg 0 and downstream prefix-aware suppressions miss.
        let target = if kind == "argument" {
            child.named_child(0).unwrap_or(child)
        } else {
            child
        };
        let literal = match target.kind() {
            "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
                text_of(target, code).and_then(|s| strip_literal_quotes(&s, target, code))
            }
            // PHP's double-quoted form (single-quoted maps to `string`).
            // Only safe to lift when it carries no interpolation: any named
            // child other than plain content / escape fragments (a variable,
            // member access, `{$expr}` …) makes the value dynamic, so the
            // argument is reported as non-literal.
            "encapsed_string" => {
                let mut parts = target.walk();
                let interpolated = target.named_children(&mut parts).any(|p| {
                    !matches!(
                        p.kind(),
                        "string_content" | "string_value" | "escape_sequence"
                    )
                });
                if interpolated {
                    None
                } else {
                    text_of(target, code).and_then(|s| strip_literal_quotes(&s, target, code))
                }
            }
            // Boolean / null / numeric literal tokens — capture verbatim so
            // downstream pattern-aware analysis (e.g. the XXE config-fact
            // pass that needs to read the boolean polarity arg of
            // `setFeature(NAME, true)`) can recover the literal text without
            // re-walking the AST. Existing string-only consumers (URL
            // prefix matching, etc.) are unaffected: a "true" / "false"
            // token never satisfies their matching predicates.
            "true"
            | "false"
            | "null"
            | "null_literal"
            | "nil"
            | "nil_literal"
            | "none"
            | "boolean_literal"
            | "true_literal"
            | "false_literal"
            | "decimal_integer_literal"
            | "integer_literal"
            | "integer"
            | "number"
            | "number_literal"
            | "decimal_literal" => text_of(target, code),
            _ => None,
        };
        result.push(literal);
    }
    result
}
/// Strip surrounding quotes from a syntactic string literal, resolving the
/// `string_content` child for Rust-style two-level string nodes. Returns the
/// raw inner text (no escape-sequence processing), sufficient for whitelist
/// matching against shell-metachar sets.
///
/// When the node exposes `string_content` children, the result is the
/// concatenation of the contiguous run of `string_content` /
/// `escape_sequence` siblings around the first `string_content`, so a
/// literal such as `"a\nb"` yields `a\nb` rather than being truncated at the
/// first escape. The run stops at any other child kind (for example an
/// interpolation). Without a `string_content` child, `raw` is returned minus
/// its first and last byte, provided those are a matching `"` or `'` pair;
/// otherwise the result is `None`.
pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option<String> {
    // Rust/tree-sitter-rust: `string_literal` wraps `string_content`
    // children. Prefer the content text so the caller doesn't have to deal
    // with quote pairing for raw strings (`r"..."`, `r#"..."#`, etc.).
    let mut cursor = node.walk();
    let parts: Vec<Node> = node.named_children(&mut cursor).collect();
    if let Some(first_content) = parts.iter().position(|p| p.kind() == "string_content") {
        // Widen leftwards over escapes that directly precede the first
        // content fragment (`"\n;"` starts with an escape).
        let mut start = first_content;
        while start > 0 && parts[start - 1].kind() == "escape_sequence" {
            start -= 1;
        }
        let mut inner = String::new();
        for part in parts[start..]
            .iter()
            .take_while(|p| matches!(p.kind(), "string_content" | "escape_sequence"))
        {
            inner.push_str(&text_of(*part, code)?);
        }
        return Some(inner);
    }
    // No content child: fall back to trimming one matching quote pair.
    if raw.len() >= 2 {
        let bytes = raw.as_bytes();
        let first = bytes[0];
        let last = bytes[raw.len() - 1];
        if (first == b'"' && last == b'"') || (first == b'\'' && last == b'\'') {
            return Some(raw[1..raw.len() - 1].to_string());
        }
    }
    None
}
/// Map every argument of `call_node` to the name of the call it consists of,
/// if any (e.g. `sanitize` for the argument `sanitize(x)` in
/// `os.system(sanitize(x))`); arguments that are not calls map to `None`.
///
/// The result is a `Vec<Option<String>>` parallel to `extract_arg_uses`
/// output. A call without an `arguments` field, or one containing a
/// spread/splat or keyword/named argument, yields an empty vector.
pub(super) fn extract_arg_callees(call_node: Node, lang: &str, code: &[u8]) -> Vec<Option<String>> {
    let Some(args_node) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };
    let mut cursor = args_node.walk();
    // Collecting into `Option<Vec<_>>` stops at the first argument kind that
    // makes positional indexing meaningless and discards everything.
    let callees: Option<Vec<Option<String>>> = args_node
        .named_children(&mut cursor)
        .map(|arg| {
            let breaks_positions = matches!(
                arg.kind(),
                "spread_element"
                    | "dictionary_splat"
                    | "list_splat"
                    | "keyword_argument"
                    | "splat_argument"
                    | "hash_splat_argument"
                    | "named_argument"
            );
            (!breaks_positions).then(|| call_ident_of(arg, lang, code))
        })
        .collect();
    callees.unwrap_or_default()
}
/// Return `(defines, uses, extra_defines, array_pattern_indices,
/// rhs_array_elements)` for the AST fragment `ast`.
///
/// `extra_defines` captures additional bindings from destructuring patterns
/// beyond the primary define. `array_pattern_indices`, when non-empty, gives
/// the source-order position of each binding in `iter::once(defines).chain(
/// extra_defines)` for `array_pattern` / `tuple_pattern` LHS shapes. Empty
/// for non-array destructures and for non-skip array patterns where callers
/// can derive sequential 0..N indices implicitly.
///
/// `rhs_array_elements`, when non-empty, gives source-order RHS slots for
/// destructure-from-array-literal shapes (`const [a, b] = [safe, tainted]`,
/// `let (a, b) = (safe, tainted)`, Python `a, b = safe, tainted`). Each slot
/// is `Some(ident)` for a bare-ident element or `None` for a syntactic
/// literal. Empty when RHS isn't an array-literal shape or any element is
/// too complex; callers fall back to scalar union in that case.
#[allow(clippy::type_complexity)]
pub(super) fn def_use(
    ast: Node,
    lang: &str,
    code: &[u8],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> (
    Option<String>,
    Vec<String>,
    Vec<String>,
    SmallVec<[usize; 4]>,
    SmallVec<[crate::cfg::RhsArraySlot; 4]>,
) {
    /// Everything read under `node`: dotted paths first, then bare idents.
    fn reads_of(node: Node, code: &[u8]) -> Vec<String> {
        let mut idents = Vec::new();
        let mut paths: Vec<String> = Vec::new();
        collect_idents_with_paths(node, code, &mut idents, &mut paths);
        paths.extend(idents);
        paths
    }

    /// Try to read `pat` as an indexed array/tuple destructure. On success
    /// the first binding becomes the primary define, the rest become extra
    /// defines, and every binding's source-order index is recorded. Returns
    /// `false` (touching nothing) when `pat` is not such a pattern.
    fn bind_indexed_pattern(
        pat: Node,
        code: &[u8],
        defs: &mut Option<String>,
        extra_defs: &mut Vec<String>,
        pattern_indices: &mut SmallVec<[usize; 4]>,
    ) -> bool {
        let bindings = collect_array_pattern_bindings_indexed(pat, code);
        if bindings.is_empty() {
            return false;
        }
        for (slot, (name, idx)) in bindings.into_iter().enumerate() {
            if slot == 0 {
                *defs = Some(name);
            } else {
                extra_defs.push(name);
            }
            pattern_indices.push(idx);
        }
        true
    }

    /// Generic binding fallback: the primary define is the last dotted path
    /// under `pat` if there is one, else the first identifier; every
    /// identifier that differs from it becomes an extra define.
    fn bind_leading_ident(
        pat: Node,
        code: &[u8],
        defs: &mut Option<String>,
        extra_defs: &mut Vec<String>,
    ) {
        let mut idents = Vec::new();
        let mut paths = Vec::new();
        collect_idents_with_paths(pat, code, &mut idents, &mut paths);
        let first = paths.pop().or_else(|| idents.first().cloned());
        for ident in &idents {
            if first.as_ref() != Some(ident) {
                extra_defs.push(ident.clone());
            }
        }
        *defs = first;
    }

    match lookup(lang, ast.kind()) {
        // Declaration wrappers (let, var, short_var_declaration, etc.)
        Kind::CallWrapper => {
            let mut defs = None;
            let mut extra_defs = Vec::new();
            let mut uses = Vec::new();
            let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
            let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();

            // Try direct field names first (Rust `let_declaration`, Go `short_var_declaration`)
            let def_node = ast
                .child_by_field_name("pattern")
                .or_else(|| ast.child_by_field_name("name"))
                .or_else(|| ast.child_by_field_name("left"))
                // Python `with_item`: value is `as_pattern` whose `alias` holds the target
                .or_else(|| {
                    ast.child_by_field_name("value")
                        .and_then(|v| v.child_by_field_name("alias"))
                });
            let val_node = ast
                .child_by_field_name("value")
                .or_else(|| ast.child_by_field_name("right"));

            if def_node.is_some() || val_node.is_some() {
                if let Some(pat) = def_node
                    && !bind_indexed_pattern(
                        pat,
                        code,
                        &mut defs,
                        &mut extra_defs,
                        &mut pattern_indices,
                    )
                {
                    bind_leading_ident(pat, code, &mut defs, &mut extra_defs);
                }
                if let Some(val) = val_node {
                    uses.extend(reads_of(val, code));
                    // Rust format-string named-arg capture: `let q =
                    // format!("...{x}...")` reads `x`, but `x` lives in
                    // the format-string bytes, not as a separate AST
                    // argument node, so collect_idents misses it.
                    uses.extend(extract_rust_format_macro_named_idents_in(val, code));
                    // When the LHS is a recognised destructure pattern AND
                    // the RHS is a bare array-literal shape (no call), record
                    // per-element idents so the SSA destructure rewrite can
                    // map each binding to its specific RHS slot.
                    if !pattern_indices.is_empty() {
                        rhs_array_elements =
                            collect_rhs_array_literal_elements(val, lang, code, extra_labels);
                    }
                }
            } else {
                // Try nested declarator pattern (JS/TS `lexical_declaration` → `variable_declarator`,
                // Java `local_variable_declaration` → `variable_declarator`,
                // C/C++ `declaration` → `init_declarator`,
                // Python/Ruby `expression_statement` → `assignment`)
                let mut cursor = ast.walk();
                for child in ast.children(&mut cursor) {
                    // Only use left/right fields for actual assignment nodes; binary
                    // expressions also have left/right but are not definitions.
                    let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
                    let child_name = child
                        .child_by_field_name("name")
                        .or_else(|| child.child_by_field_name("declarator"))
                        .or_else(|| {
                            if is_assign {
                                child.child_by_field_name("left")
                            } else {
                                None
                            }
                        });
                    let child_value = child.child_by_field_name("value").or_else(|| {
                        if is_assign {
                            child.child_by_field_name("right")
                        } else {
                            None
                        }
                    });

                    // Only treat this child as a declarator when it has a value.
                    // This prevents method_invocation nodes (which have a `name`
                    // field) from being misinterpreted.
                    let Some(value) = child_value else {
                        continue;
                    };
                    // Only the first declarator that yields a binding defines.
                    if let Some(name_node) = child_name
                        && defs.is_none()
                        && !bind_indexed_pattern(
                            name_node,
                            code,
                            &mut defs,
                            &mut extra_defs,
                            &mut pattern_indices,
                        )
                    {
                        bind_leading_ident(name_node, code, &mut defs, &mut extra_defs);
                    }
                    uses.extend(reads_of(value, code));
                    uses.extend(extract_rust_format_macro_named_idents_in(value, code));
                    if !pattern_indices.is_empty() && rhs_array_elements.is_empty() {
                        rhs_array_elements =
                            collect_rhs_array_literal_elements(value, lang, code, extra_labels);
                    }
                }

                // Fallback: if still nothing found, collect all idents as uses.
                // This handles expression_statement wrappers.
                if defs.is_none() && uses.is_empty() {
                    uses.extend(reads_of(ast, code));
                    uses.extend(extract_rust_format_macro_named_idents_in(ast, code));
                }
            }
            (defs, uses, extra_defs, pattern_indices, rhs_array_elements)
        }

        // Plain assignment `x = y` or destructuring assignment such as
        // Python `a, b = await asyncio.gather(...)` whose LHS surfaces as
        // a `pattern_list` / `tuple_pattern`. When the LHS is a
        // destructure pattern that the indexed helper recognises, the
        // primary binding lands in `defs`, the rest land in `extra_defs`,
        // and `pattern_indices` carries source-order positions so the
        // SSA lowering's destructure-promise rewrite can paint each
        // binding from the matching combinator argument.
        Kind::Assignment => {
            let mut defs = None;
            let mut extra_defs = Vec::new();
            let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
            let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();
            let mut uses = Vec::new();

            if let Some(lhs) = ast.child_by_field_name("left")
                && !bind_indexed_pattern(
                    lhs,
                    code,
                    &mut defs,
                    &mut extra_defs,
                    &mut pattern_indices,
                )
            {
                let mut idents = Vec::new();
                let mut paths = Vec::new();
                collect_idents_with_paths(lhs, code, &mut idents, &mut paths);
                // Prefer dotted path (member expression) over last ident
                defs = paths.pop().or_else(|| idents.pop());
            }

            if let Some(rhs) = ast.child_by_field_name("right") {
                uses.extend(reads_of(rhs, code));
                uses.extend(extract_rust_format_macro_named_idents_in(rhs, code));
                // When the LHS is a recognised destructure pattern AND the
                // RHS is a bare array-literal shape, record per-element
                // idents so the SSA destructure rewrite can map each
                // binding to its specific RHS slot.
                if !pattern_indices.is_empty() {
                    rhs_array_elements =
                        collect_rhs_array_literal_elements(rhs, lang, code, extra_labels);
                }
            }
            (defs, uses, extra_defs, pattern_indices, rhs_array_elements)
        }

        // if-let / while-let: the `let_condition` binds a variable from
        // the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
        // defines `cmd` and uses `env`, `var`, `CMD`.
        Kind::If | Kind::While => {
            if let Some(c) = ast.child_by_field_name("condition")
                && c.kind() == "let_condition"
            {
                let mut defs = None;
                let mut uses = Vec::new();
                if let Some(pat) = c.child_by_field_name("pattern") {
                    let mut tmp = Vec::<String>::new();
                    collect_idents(pat, code, &mut tmp);
                    // The LAST identifier collected from the pattern is taken
                    // as the binding: for `Ok(cmd)` that skips the leading
                    // constructor name `Ok` and lands on the inner `cmd`.
                    defs = tmp.pop();
                }
                if let Some(val) = c.child_by_field_name("value") {
                    collect_idents(val, code, &mut uses);
                }
                return (defs, uses, vec![], SmallVec::new(), SmallVec::new());
            }
            // Ordinary condition: no definition, only reads.
            (
                None,
                reads_of(ast, code),
                vec![],
                SmallVec::new(),
                SmallVec::new(),
            )
        }

        // for-in / for-of / Python `for x in iter:` ─────────────────────────
        //
        // Tree-sitter classifies these as `Kind::For` with a `left`/`right`
        // field pair (binding pattern + iterable). Without an explicit
        // arm here, the default branch collects every ident as a `use` and
        // never registers the iteration binding as a `define`, so taint
        // entering the iterable does not propagate into the body's
        // references to the binding (`for (const [a, b] of obj) { sink(a) }`
        // would lose the flow at `a`).
        //
        // C-style `for_statement` has no `left`/`right` fields (it uses
        // `initializer`/`condition`/`increment`), so this path falls through
        // to the default-collecting behaviour for those, preserving today's
        // semantics.
        //
        // Go's `for ident := range iter` shape places the binding pattern
        // and iterable on a `range_clause` child of the `for_statement`
        // rather than as direct fields. Without the range_clause lookup
        // below, taint from the iterable never reaches the loop binding
        // (CVE-2026-41422 daptin: `c.QueryArray("col")` loop var `project`
        // flows into `goqu.L(project)` SQL_QUERY sink).
        Kind::For => {
            let mut left = ast.child_by_field_name("left");
            let mut right = ast.child_by_field_name("right");
            if left.is_none() && right.is_none() {
                let mut cursor = ast.walk();
                for child in ast.children(&mut cursor) {
                    if child.kind() == "range_clause" {
                        left = child.child_by_field_name("left");
                        right = child.child_by_field_name("right");
                        break;
                    }
                }
            }
            if left.is_none() && right.is_none() {
                // C-style for: defer to default ident collection.
                return (
                    None,
                    reads_of(ast, code),
                    vec![],
                    SmallVec::new(),
                    SmallVec::new(),
                );
            }

            let mut defs: Option<String> = None;
            let mut extra_defs: Vec<String> = Vec::new();
            let mut uses: Vec<String> = Vec::new();
            if let Some(pat) = left {
                bind_leading_ident(pat, code, &mut defs, &mut extra_defs);
            }
            if let Some(val) = right {
                uses.extend(reads_of(val, code));
            }
            (defs, uses, extra_defs, SmallVec::new(), SmallVec::new())
        }

        // everything else: no definition, but may read vars
        _ => (
            None,
            reads_of(ast, code),
            vec![],
            SmallVec::new(),
            SmallVec::new(),
        ),
    }
}
/// One match from [`extract_shell_array_payload_idents`].
///
/// `arg_position` is the index, among the named children of the call's
/// `arguments` node, of the argument in which the shell-array literal was
/// found (either the array itself or an object literal holding it).
/// `payload_idents` is the de-duplicated union of identifiers (and dotted
/// paths) lifted from the array's payload elements, i.e. positions 2+ of
/// both the POSIX `sh -c <cmd>` form and the `cmd /c <cmd>` form.
///
/// A shell array whose payload references no identifiers produces no match
/// at all, so `payload_idents` is non-empty for every match the extractor
/// returns.
#[derive(Debug, Clone)]
pub(super) struct ShellArrayMatch {
// Index of the containing argument among the call's named argument nodes.
pub arg_position: usize,
// Identifiers / dotted paths referenced by the payload elements.
pub payload_idents: Vec<String>,
}
/// Detect inline shell-execution array literals at a call site.
///
/// Recognises the pattern `[<shell>, "-c", <payload>]` (POSIX shells) and
/// `[<cmd-shell>, "/c"|"/C", <payload>]` (Windows `cmd.exe`) appearing as
/// either:
///   * a direct positional argument of `call_node`, or
///   * the value of any field within an object-literal positional argument
///     (covers `container.exec({Cmd: ["bash", "-c", x]})` form).
///
/// Returns one [`ShellArrayMatch`] per detected shell-array. Empty when the
/// call has no shell-array literals, has no `arguments` field, or contains a
/// splat argument.
///
/// The shell-name list is intentionally narrow (POSIX shells + Windows
/// `cmd.exe`/`powershell`) to avoid false positives on benign array literals
/// like `["ls", "-la"]` or `["git", "rev-parse", "HEAD"]`, where element 0 is
/// not a shell. Element 1 must be the shell's command flag; otherwise the
/// array is not in shell-exec form regardless of element 0.
///
/// Identifiers from elements at positions 2+ are lifted via
/// [`collect_idents_with_paths`] so template-literal interpolations
/// (`` `echo ${x}` ``), member-expressions (`obj.field`), and bare idents are
/// all captured. Dedup is preserved across array elements so a single ident
/// referenced in multiple payload positions appears once.
pub(super) fn extract_shell_array_payload_idents(
    call_node: Node,
    code: &[u8],
) -> Vec<ShellArrayMatch> {
    let Some(args_node) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };
    let mut found = Vec::new();
    let mut cursor = args_node.walk();
    for (arg_position, arg) in args_node.named_children(&mut cursor).enumerate() {
        match arg.kind() {
            // Splats break positional indexing; bail conservatively on the whole call.
            "spread_element"
            | "dictionary_splat"
            | "list_splat"
            | "splat_argument"
            | "hash_splat_argument" => return Vec::new(),
            "keyword_argument" | "named_argument" => continue,
            _ => {}
        }

        // Direct array-literal arg.
        if let Some(payload_idents) = shell_array_payload_idents_of(arg, code) {
            found.push(ShellArrayMatch {
                arg_position,
                payload_idents,
            });
            continue;
        }

        // Object-literal arg whose field value is a shell-array literal.
        // Covers `container.exec({Cmd: [...]})` form. Field name is not
        // restricted to `Cmd` / `cmd`: the shell-shape itself is the gate,
        // and the payload extraction is per-array.
        if !matches!(arg.kind(), "object" | "dictionary") {
            continue;
        }
        let mut fields = arg.walk();
        let nested = arg
            .named_children(&mut fields)
            .filter(|pair| pair.kind() == "pair")
            .filter_map(|pair| pair.child_by_field_name("value"))
            .filter_map(|value| shell_array_payload_idents_of(unwrap_parens(value), code))
            .map(|payload_idents| ShellArrayMatch {
                arg_position,
                payload_idents,
            });
        found.extend(nested);
    }
    found
}
/// If `node` is an array literal of shape `[<shell>, <command-flag>, *]`
/// (e.g. `["sh", "-c", x]` or `["cmd", "/c", x]`), return the identifiers and
/// dotted paths referenced by the payload elements (positions 2+),
/// de-duplicated in first-seen order with paths ahead of bare identifiers.
///
/// Returns `None` when the node is not such an array, and also when the
/// payload references no identifiers at all: a fully static payload cannot
/// carry taint, so no match is reported for it.
fn shell_array_payload_idents_of(node: Node, code: &[u8]) -> Option<Vec<String>> {
    let array = unwrap_parens(node);
    if array.kind() != "array" {
        return None;
    }
    // Named children only, which skips commas and other punctuation.
    let mut cursor = array.walk();
    let elems: Vec<Node> = array.named_children(&mut cursor).collect();
    // Need a shell, a flag, and at least one payload element.
    let [shell_node, flag_node, payload @ ..] = elems.as_slice() else {
        return None;
    };
    if payload.is_empty() {
        return None;
    }

    let shell = const_string_value(*shell_node, code)?;
    if !is_known_shell(&shell) {
        return None;
    }
    let flag = const_string_value(*flag_node, code)?;
    if !is_shell_command_flag(&shell, &flag) {
        return None;
    }

    // Lift identifiers from every payload element; constants contribute
    // nothing.
    let mut idents: Vec<String> = Vec::new();
    let mut paths: Vec<String> = Vec::new();
    for elem in payload {
        collect_idents_with_paths(*elem, code, &mut idents, &mut paths);
    }

    // Paths first, then idents, keeping only the first occurrence of each.
    let mut seen = std::collections::HashSet::new();
    let combined: Vec<String> = paths
        .into_iter()
        .chain(idents)
        .filter(|name| seen.insert(name.clone()))
        .collect();

    // Static payload — no taint can reach it, so report nothing rather than
    // an empty match.
    (!combined.is_empty()).then_some(combined)
}
/// Read `node` as a compile-time constant string and return its text with
/// the first and last byte (the delimiters) removed.
///
/// Accepted node kinds are `string`, `string_literal`,
/// `interpreted_string_literal`, `raw_string_literal`, and a
/// `template_string` that has no `template_substitution` child. Anything
/// else (identifiers, expressions, interpolated templates), or text shorter
/// than two bytes, yields `None`. Surrounding parentheses are ignored.
fn const_string_value(node: Node, code: &[u8]) -> Option<String> {
    let literal = unwrap_parens(node);
    match literal.kind() {
        "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {}
        "template_string" => {
            // A template with `${…}` parts is dynamic, not a constant.
            let mut parts = literal.walk();
            let interpolated = literal
                .named_children(&mut parts)
                .any(|part| part.kind() == "template_substitution");
            if interpolated {
                return None;
            }
        }
        _ => return None,
    }
    let raw = text_of(literal, code)?;
    // Drop one delimiter byte from each end.
    (raw.len() >= 2).then(|| raw[1..raw.len() - 1].to_string())
}
/// Known shell executable names that activate the shell-array detector.
/// Scoped narrowly to POSIX shells + Windows command interpreters, listing
/// only canonical names so benign arrays like `["ls", ...]`, `["git", ...]`,
/// or `["python", ...]` do not match.
///
/// Only the final path component of `name` is considered; both `/` and `\`
/// count as separators, so `/bin/bash` and `C:\Windows\System32\cmd.exe` are
/// recognised. POSIX shell names are matched exactly, while the Windows
/// interpreter names (`cmd`, `powershell`, `pwsh`, with or without `.exe`)
/// are matched ASCII-case-insensitively.
fn is_known_shell(name: &str) -> bool {
    const WINDOWS_INTERPRETERS: [&str; 6] = [
        "cmd",
        "cmd.exe",
        "powershell",
        "powershell.exe",
        "pwsh",
        "pwsh.exe",
    ];
    // Strip directory prefix for matching: `/bin/bash` → `bash`.
    let leaf = name
        .rsplit(|c: char| matches!(c, '/' | '\\'))
        .next()
        .unwrap_or(name);
    let is_posix_shell = matches!(
        leaf,
        "bash" | "sh" | "zsh" | "dash" | "ksh" | "fish" | "ash" | "tcsh" | "csh"
    );
    is_posix_shell
        || WINDOWS_INTERPRETERS
            .iter()
            .any(|known| leaf.eq_ignore_ascii_case(known))
}
/// True when `flag` is the "execute the following string as a shell command"
/// switch for the given `shell`.
///
/// * `cmd` / `cmd.exe` accept `/c`, `/C`, `/k` and `/K`.
/// * PowerShell (`powershell` / `pwsh`, with or without `.exe`) accepts
///   `-c`, `-Command` and `-EncodedCommand`, compared
///   ASCII-case-insensitively.
/// * Every other shell is treated as POSIX and accepts exactly `-c`.
///
/// The shell is identified by the final path component of `shell` (both `/`
/// and `\` are separators); the Windows interpreter names are compared
/// ASCII-case-insensitively.
fn is_shell_command_flag(shell: &str, flag: &str) -> bool {
    let leaf = shell
        .rsplit(|c: char| matches!(c, '/' | '\\'))
        .next()
        .unwrap_or(shell);
    let leaf_is_one_of = |names: &[&str]| names.iter().any(|n| leaf.eq_ignore_ascii_case(n));

    if leaf_is_one_of(&["cmd", "cmd.exe"]) {
        return matches!(flag, "/c" | "/C" | "/k" | "/K");
    }
    if leaf_is_one_of(&["powershell", "powershell.exe", "pwsh", "pwsh.exe"]) {
        return ["-c", "-command", "-encodedcommand"]
            .iter()
            .any(|known| flag.eq_ignore_ascii_case(known));
    }
    // POSIX shells: the flag is case-sensitive.
    flag == "-c"
}