nyx/src/cfg/helpers.rs

1093 lines
46 KiB
Rust

use super::anon_fn_name;
use super::conditions::unwrap_parens;
use crate::labels::{DataLabel, Kind, classify, lookup};
use smallvec::SmallVec;
use tree_sitter::Node;
// -------------------------------------------------------------------------
// Utility helpers
// -------------------------------------------------------------------------
/// Return the UTF-8 text that node `n` covers inside `code`.
///
/// Returns `None` when the node's byte range does not fit inside `code`
/// (for example when the node was parsed from a different buffer) or when
/// the covered bytes are not valid UTF-8.
#[inline]
pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option<String> {
    // Checked slicing: a mismatched node/buffer pair yields `None` instead of
    // panicking on an out-of-range or inverted byte range.
    let bytes = code.get(n.start_byte()..n.end_byte())?;
    std::str::from_utf8(bytes).ok().map(str::to_owned)
}
/// Resolve the root receiver text of a (possibly chained) call receiver.
///
/// When `n` is itself a call, the walk recurses into that call's `object`,
/// `receiver` or `function` child (first one present) until a non-call node
/// is reached. For `Runtime.getRuntime().exec(cmd)` the receiver of `exec`
/// is the call `Runtime.getRuntime()`, and the result is `"Runtime"`, which
/// lets labels such as `"Runtime.exec"` match.
///
/// A call node without any of those children yields its own text.
pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<String> {
    if matches!(lookup(lang, n.kind()), Kind::CallFn | Kind::CallMethod) {
        // The receiver is itself a call: drill into ITS receiver.
        let inner = ["object", "receiver", "function"]
            .into_iter()
            .find_map(|field| n.child_by_field_name(field));
        return match inner {
            Some(child) => root_receiver_text(child, lang, code),
            None => text_of(n, code),
        };
    }

    let text = text_of(n, code)?;
    // PHP `variable_name` text carries a leading `$` (`$smarty`, `$twig`).
    // Strip it so chain text built downstream (`{recv}.{method}`) presents a
    // `.`-only delimiter sequence, as required by the suffix-matcher boundary
    // rule, which only accepts `.`/`:` as chain separators. Without the
    // strip, gate matchers like `Smarty.fetch` /
    // `Environment.createTemplate` never fire on idiomatic
    // `$smarty->fetch(...)` / `$twig->createTemplate(...)` shapes.
    if lang == "php" && n.kind() == "variable_name" {
        Some(text.trim_start_matches('$').to_string())
    } else {
        Some(text)
    }
}
/// Walk a member-expression / attribute / call chain down to its root leaf.
///
/// Unlike [`root_receiver_text`], which returns the raw text of a nested
/// attribute (yielding `"request.args.get"` for the attribute node covering
/// `request.args.get`), this follows the receiver-side field of each hop
/// until it reaches a terminal node and returns just that leaf's text.
///
/// Used when JS/Python `obj.method(x)` is classified as `Kind::CallFn` with a
/// dotted function child: the leftmost segment (`request` in
/// `request.args.get("q")`) becomes the structured receiver for
/// type-qualified resolution.
///
/// Hops followed:
/// * `member_expression` / `attribute` → `object`
/// * `field_expression` (Rust `x.y`) → `value`
/// * `call_expression` → `function`
/// * `method_call_expression` → `object`, else `receiver`
///
/// Terminal kinds whose text is returned: `identifier`, `variable_name`,
/// `this`, `self`. Any other node kind, a missing field, or a chain longer
/// than 16 hops yields `None`.
pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
    // Bounded walk: tree-sitter can nest deeply, but real code only needs a
    // handful of hops.
    const MAX_HOPS: usize = 16;

    let mut node = n;
    for _ in 0..MAX_HOPS {
        let next = match node.kind() {
            "identifier" | "variable_name" | "this" | "self" => return text_of(node, code),
            "member_expression" | "attribute" => node.child_by_field_name("object"),
            "field_expression" => node.child_by_field_name("value"),
            // Drill through nested calls / method chains to find the base
            // identifier. E.g. in `Connection::open(p).unwrap().execute(...)`
            // the receiver of `.execute` is the `.unwrap()` call whose object
            // is `Connection::open(p)`; the walk keeps following the chain
            // toward its leftmost leaf (for SSA var_stacks lookup).
            "call_expression" => node.child_by_field_name("function"),
            "method_call_expression" => node
                .child_by_field_name("object")
                .or_else(|| node.child_by_field_name("receiver")),
            _ => None,
        };
        node = next?;
    }
    None
}
/// Check if a callee represents an RAII-managed factory whose resources are
/// automatically cleaned up by language semantics (Rust ownership/Drop, C++
/// smart pointers). Returns `true` to set `managed_resource` on the acquire
/// node, suppressing false `state-resource-leak` findings.
///
/// Matching is ASCII-case-insensitive and ignores everything from the first
/// `<` onward (C++ template arguments). A callee matches a known factory when
/// it equals the factory name or ends with it at a path boundary, i.e. the
/// character before the suffix is `:` or `.`. This keeps qualified forms such
/// as `std::fs::File::open` matching while rejecting unrelated names that
/// merely share a textual suffix (`Sandbox::new` vs `Box::new`,
/// `Profile::open` vs `File::open`).
///
/// Languages other than `"cpp"` and `"rust"` always yield `false`.
pub(crate) fn is_raii_factory(lang: &str, callee: &str) -> bool {
    /// True when `callee` equals one of `patterns`, or ends with one directly
    /// after a `:` / `.` separator.
    fn matches_any(callee: &str, patterns: &[&str]) -> bool {
        let lowered = callee.to_ascii_lowercase();
        // Strip C++ template arguments: make_unique<int> → make_unique
        let base = lowered.split('<').next().unwrap_or(&lowered);
        patterns.iter().any(|pattern| {
            base.strip_suffix(*pattern).is_some_and(|prefix| {
                prefix.is_empty() || prefix.ends_with(':') || prefix.ends_with('.')
            })
        })
    }

    match lang {
        "cpp" => {
            static CPP_RAII_FACTORIES: &[&str] = &[
                "make_unique",
                "make_shared",
                "std::make_unique",
                "std::make_shared",
            ];
            matches_any(callee, CPP_RAII_FACTORIES)
        }
        "rust" => {
            static RUST_RAII_CONSTRUCTORS: &[&str] = &[
                "file::open",
                "file::create",
                "box::new",
                "bufwriter::new",
                "bufreader::new",
                "tcplistener::bind",
                "tcpstream::connect",
                "udpsocket::bind",
                "mutex::new",
                "rwlock::new",
                "fs::file::open",
                "fs::file::create",
                "std::fs::file::open",
                "std::fs::file::create",
            ];
            matches_any(callee, RUST_RAII_CONSTRUCTORS)
        }
        _ => false,
    }
}
/// Fallback for constructor expressions whose grammar lacks field names.
///
/// For example, PHP `object_creation_expression` has positional children
/// `new name arguments` where `name` is a node kind (not a field).
/// Returns the first direct child whose kind is `"name"`,
/// `"type_identifier"` or `"qualified_name"`, or `None` if there is none.
pub(crate) fn find_constructor_type_child(n: Node) -> Option<Node> {
    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        if matches!(child.kind(), "name" | "type_identifier" | "qualified_name") {
            return Some(child);
        }
    }
    None
}
/// Return the callee identifier and byte span for the first call / method /
/// macro found beneath `n`, searching descendants depth-first in source order.
///
/// The span is the byte range of the call expression itself, so a caller that
/// overrides `text` with the returned identifier can also record a
/// `callee_span` pointing at the inner call (narrower than the enclosing
/// statement) for accurate source-location reporting.
///
/// Nested function / lambda nodes are not searched. The search stops at the
/// first call-kind node it meets: if no identifier can be derived for that
/// node the result is `None`, and later siblings are not examined.
pub(crate) fn first_call_ident_with_span<'a>(
    n: Node<'a>,
    lang: &str,
    code: &'a [u8],
) -> Option<(String, (usize, usize))> {
    let mut walker = n.walk();
    for child in n.children(&mut walker) {
        let child_kind = lookup(lang, child.kind());

        // Do not descend into nested function/lambda bodies: they are
        // separate scopes and should not contribute callee identifiers to
        // the parent expression.
        if child_kind == Kind::Function {
            continue;
        }

        if !matches!(
            child_kind,
            Kind::CallFn | Kind::CallMethod | Kind::CallMacro
        ) {
            // Not a call: keep searching below it (handles nested declarators).
            if let Some(hit) = first_call_ident_with_span(child, lang, code) {
                return Some(hit);
            }
            continue;
        }

        let span = (child.start_byte(), child.end_byte());

        // Call shapes that are normalised to a fixed synthetic callee:
        // C++ `new` / `delete`, and the Ruby backtick subshell (which has no
        // `function` field) so assignment-wrapped subshells classify.
        let synthetic = match (lang, child.kind()) {
            ("cpp", "new_expression") => Some("new"),
            ("cpp", "delete_expression") => Some("delete"),
            ("ruby", "subshell") => Some("subshell"),
            _ => None,
        };
        if let Some(name) = synthetic {
            return Some((name.to_string(), span));
        }

        let ident = match child_kind {
            Kind::CallFn => ["function", "method", "name", "type", "constructor"]
                .into_iter()
                .find_map(|field| child.child_by_field_name(field))
                // Fallback for constructors whose grammar lacks field names
                // (e.g. PHP `object_creation_expression` has positional
                // children).
                .or_else(|| find_constructor_type_child(child))
                .and_then(|callee| {
                    // A (possibly parenthesised) function literal in callee
                    // position is named via `anon_fn_name` from its start
                    // byte; anything else uses the callee's own text.
                    let unwrapped = unwrap_parens(callee);
                    if lookup(lang, unwrapped.kind()) == Kind::Function {
                        Some(anon_fn_name(unwrapped.start_byte()))
                    } else {
                        text_of(callee, code)
                    }
                }),
            Kind::CallMethod => {
                let method = child
                    .child_by_field_name("method")
                    .or_else(|| child.child_by_field_name("name"))
                    .and_then(|m| text_of(m, code));
                let receiver = ["object", "receiver", "scope"]
                    .into_iter()
                    .find_map(|field| child.child_by_field_name(field))
                    .and_then(|r| root_receiver_text(r, lang, code));
                // `recv.method` when both are known, bare `method` otherwise.
                method.map(|m| match receiver {
                    Some(r) => format!("{r}.{m}"),
                    None => m,
                })
            }
            Kind::CallMacro => child
                .child_by_field_name("macro")
                .and_then(|m| text_of(m, code)),
            _ => None,
        };
        return ident.map(|name| (name, span));
    }
    None
}
/// Thin wrapper over [`first_call_ident_with_span`] for callers that need
/// only the callee identifier and not its byte span (e.g. Python-side label
/// lookup that does not participate in span-narrowed location reporting).
pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option<String> {
    let (ident, _span) = first_call_ident_with_span(n, lang, code)?;
    Some(ident)
}
/// Search recursively for any nested call whose identifier classifies as a label.
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
///
/// Children of `n` are visited in source order, depth-first. For each
/// call-kind child the checks run in this order:
/// 1. classify the literal callee text (`recv.method`, function name, or
///    macro name);
/// 2. for method calls only, and only when `n` itself is a call-kind node,
///    retry with the receiver replaced by its locally-bound type prefix;
/// 3. recurse into the child.
///
/// Non-call children are simply recursed into; `Kind::Function` children are
/// skipped entirely.
///
/// Returns `(callee_text, label, span)` where `span` is the byte range of the
/// inner call node itself, used to populate `CallMeta.callee_span` so that
/// display sites can report the actual call location rather than the enclosing
/// statement's span. `callee_text` is the text that classified (the rewritten
/// `prefix.method` form when step 2 matched). Returns `None` when nothing
/// beneath `n` classifies.
pub(crate) fn find_classifiable_inner_call<'a>(
n: Node<'a>,
lang: &str,
code: &'a [u8],
extra: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Option<(String, DataLabel, (usize, usize))> {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
// Do not descend into Kind::Function nodes, they will be extracted
// as separate BodyCfg entries and should not contribute inner callees
// to the parent expression.
if lookup(lang, c.kind()) == Kind::Function {
continue;
}
match lookup(lang, c.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => {
// For CallMethod we also remember the bare receiver
// identifier so we can try a type-qualified rewrite
// when the literal classify misses.
let mut method_receiver: Option<String> = None;
let mut method_name: Option<String> = None;
// Callee text for this call node; `None` when no suitable field
// is present or its text cannot be read.
let ident = match lookup(lang, c.kind()) {
// Plain call: first present of `function` / `method` / `name` /
// `type`, taken as raw text.
Kind::CallFn => c
.child_by_field_name("function")
.or_else(|| c.child_by_field_name("method"))
.or_else(|| c.child_by_field_name("name"))
.or_else(|| c.child_by_field_name("type"))
.and_then(|f| text_of(f, code)),
// Method call: `recv.method` when a receiver resolves, bare
// `method` otherwise.
Kind::CallMethod => {
let func = c
.child_by_field_name("method")
.or_else(|| c.child_by_field_name("name"))
.and_then(|f| text_of(f, code));
let recv = c
.child_by_field_name("object")
.or_else(|| c.child_by_field_name("receiver"))
.or_else(|| c.child_by_field_name("scope"))
.and_then(|f| root_receiver_text(f, lang, code));
method_receiver = recv.clone();
method_name = func.clone();
match (recv, func) {
(Some(r), Some(f)) => Some(format!("{r}.{f}")),
(_, Some(f)) => Some(f),
_ => None,
}
}
Kind::CallMacro => c
.child_by_field_name("macro")
.and_then(|f| text_of(f, code)),
_ => None,
};
// Step 1: literal callee text.
if let Some(ref id) = ident
&& let Some(lbl) = classify(lang, id, extra)
{
return Some((id.clone(), lbl, (c.start_byte(), c.end_byte())));
}
// Receiver-type rewrite fallback: when the literal
// `recv.method` text didn't classify, AND we're inside
// a chained call (parent `n` is itself a call), look
// up `recv`'s locally-bound type and retry with the
// type prefix. E.g. for
// `sess.createNativeQuery(sql).getResultList()`, the
// inner `sess.createNativeQuery` rewrites to
// `HibernateSession.createNativeQuery` (rule fires).
//
// Gated on `n` being a Call-kind so the rewrite only
// fires on chain-hop inner calls. When `n` is an
// expression-statement / variable-declarator / etc.
// the candidate `c` IS the outermost call of the
// statement, and the SSA-time
// `resolve_type_qualified_labels` path handles it
// with multi-label semantics that single-label
// `classify` here would erase.
let parent_is_call = matches!(
lookup(lang, n.kind()),
Kind::CallFn | Kind::CallMethod | Kind::CallMacro
);
// Step 2: only reachable for method calls, since the receiver and
// method name are recorded in the `Kind::CallMethod` arm alone.
if parent_is_call
&& let (Some(recv), Some(method)) = (method_receiver, method_name)
&& let Some(prefix) = crate::cfg::local_receiver_type_prefix(c, &recv, lang)
{
let alt = format!("{prefix}.{method}");
if let Some(lbl) = classify(lang, &alt, extra) {
return Some((alt, lbl, (c.start_byte(), c.end_byte())));
}
}
// Step 3: recurse into this call's own children (receiver chain
// and arguments alike).
if let Some(found) = find_classifiable_inner_call(c, lang, code, extra) {
return Some(found);
}
}
_ => {
// Not a call: keep searching beneath it.
if let Some(found) = find_classifiable_inner_call(c, lang, code, extra) {
return Some(found);
}
}
}
}
None
}
/// Build the dot-joined text of a member_expression / attribute /
/// selector_expression, e.g. `"process.env.CMD"` for `process.env.CMD`.
///
/// To bound state size the result keeps at most 3 segments (2 dots):
/// everything from the third `.` onward is dropped.
pub(crate) fn member_expr_text(n: Node, code: &[u8]) -> Option<String> {
    let mut path = member_expr_text_inner(n, code)?;
    // Depth limit: cut just before the third dot, if there is one.
    if let Some((third_dot, _)) = path.match_indices('.').nth(2) {
        path.truncate(third_dot);
    }
    Some(path)
}
/// Recursive worker behind [`member_expr_text`]: builds the full dotted path
/// with no depth cap.
///
/// For `member_expression` / `attribute` / `selector_expression` nodes the
/// receiver side is rendered recursively and joined to the property with a
/// `.`. If only one side is available that side alone is returned; if neither
/// is, the node's raw text is used. Any other node kind yields its raw text.
pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option<String> {
    if !matches!(
        n.kind(),
        "member_expression" | "attribute" | "selector_expression"
    ) {
        return text_of(n, code);
    }

    // Tree-sitter exposes the receiver under `object` (JS/TS, Python),
    // `value` (the field name Rust's `field_expression` uses), or `operand`
    // (Go selector_expression). Without the `operand` fallback, Go member
    // access like `r.Body` collapsed to just the trailing field (`Body`), so
    // source rules keyed on the dotted form (e.g. Go's `r.Body`) would never
    // match.
    let receiver = ["object", "value", "operand"]
        .into_iter()
        .find_map(|field| n.child_by_field_name(field));
    // Prefer the structured rendering of the receiver; fall back to its raw
    // text when that yields nothing.
    let obj = receiver
        .and_then(|node| member_expr_text_inner(node, code).or_else(|| text_of(node, code)));

    let prop = ["property", "attribute", "field"]
        .into_iter()
        .find_map(|field| n.child_by_field_name(field))
        .and_then(|node| text_of(node, code));

    match (obj, prop) {
        (Some(o), Some(p)) => Some(format!("{o}.{p}")),
        (None, Some(p)) => Some(p),
        (Some(o), None) => Some(o),
        (None, None) => text_of(n, code),
    }
}
/// Recursively search `n` for a member expression whose text classifies as a
/// label, returning the first label found in depth-first source order.
///
/// * Member expressions: the full dotted text may match any label; shorter
///   prefixes obtained by stripping trailing segments may only match Source
///   labels.
/// * Subscripts: only a Source label on the receiver is accepted, and the
///   subscript node is never descended into any further.
/// * Everything else: children are searched recursively.
pub(crate) fn first_member_label(
    n: Node,
    lang: &str,
    code: &[u8],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Option<DataLabel> {
    let kind = n.kind();

    if matches!(
        kind,
        "member_expression" | "attribute" | "selector_expression"
    ) {
        if let Some(full) = member_expr_text(n, code) {
            // Try the full text first, then progressively strip the last
            // segment to match rules like "process.env" from
            // "process.env.CMD".
            //
            // The strip-and-retry only ever yields a sound label for Sources:
            // `process.env.CMD` → strip → `process.env` makes sense because
            // the receiver itself IS the source. Sinks and Sanitizers, by
            // contrast, name the *operation* (`connection.query`, `eval`,
            // `exec`) and stripping a trailing segment to match them is not
            // semantically valid (e.g. `exec.start` should never be treated
            // as a SHELL_ESCAPE sink because of bare `exec`). Any label is
            // accepted on a full-text match (the behaviour callers already
            // depend on for Source/Sink labels alike), but only Source
            // labels are accepted after segment stripping.
            if let Some(label) = classify(lang, &full, extra_labels) {
                return Some(label);
            }
            let mut remaining = full.as_str();
            while let Some((prefix, _dropped)) = remaining.rsplit_once('.') {
                if let Some(label @ DataLabel::Source(_)) = classify(lang, prefix, extra_labels) {
                    return Some(label);
                }
                remaining = prefix;
            }
        }
        // No match on this node's own text: fall through to the child walk.
    } else if matches!(
        kind,
        "subscript_expression" | "subscript" | "element_reference"
    ) {
        // PHP/Python/Ruby subscript access: `$_GET['cmd']`,
        // `os.environ['KEY']`, `params[:cmd]`. Try to classify the object
        // (before the `[`) as a source.
        //
        // Source-only on the receiver: a subscript reads a value from the
        // receiver, so a Sink label found on the receiver text (e.g.
        // `response.headers['content-type']`, where `response.headers`
        // matches the JS HEADER_INJECTION sink rule) describes the *target*
        // of a hypothetical write, not this read. Promoting it would fire
        // phantom sinks at every `body = response.headers["X"]`-shape line.
        // Sinks/Sanitizers reachable via callable positions (function-arg,
        // method-receiver) still flow through the outer recursive walk.
        let receiver = n
            .child_by_field_name("object")
            .or_else(|| n.child_by_field_name("value"))
            .or_else(|| n.child(0));
        if let Some(obj) = receiver {
            if let Some(txt) = text_of(obj, code) {
                if let Some(label @ DataLabel::Source(_)) = classify(lang, &txt, extra_labels) {
                    return Some(label);
                }
            }
            // Recurse into the object for nested member accesses, keeping
            // the same Source-only restriction by filtering the result.
            if let Some(label @ DataLabel::Source(_)) =
                first_member_label(obj, lang, code, extra_labels)
            {
                return Some(label);
            }
        }
        // Suppress further descent into this subscript node: the child walk
        // below would otherwise enter the receiver via the member-expression
        // branch and reattach a value-extraction Sink.
        return None;
    }

    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        let hit = first_member_label(child, lang, code, extra_labels);
        if hit.is_some() {
            return hit;
        }
    }
    None
}
/// Return the text of the first member expression found in `n`, searching
/// depth-first in source order.
///
/// A member expression yields its (depth-capped) dotted path; a subscript
/// yields the raw text of its receiver (`object` field, else `value` field,
/// else first child). Neither kind is searched any deeper once reached.
pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
    let kind = n.kind();

    if matches!(
        kind,
        "member_expression" | "attribute" | "selector_expression"
    ) {
        return member_expr_text(n, code);
    }

    if matches!(
        kind,
        "subscript_expression" | "subscript" | "element_reference"
    ) {
        let receiver = n
            .child_by_field_name("object")
            .or_else(|| n.child_by_field_name("value"))
            .or_else(|| n.child(0))?;
        return text_of(receiver, code);
    }

    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        let hit = first_member_text(child, code);
        if hit.is_some() {
            return hit;
        }
    }
    None
}
/// Collect function-expression nodes nested beneath `n`.
///
/// This finds anonymous functions / arrow functions / closures (for example
/// ones passed as arguments to a call) that should be analysed as separate
/// function scopes. Only the outermost function nodes are collected:
/// functions nested inside a collected function are not returned here, they
/// get handled when the outer function is recursed into.
pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
    let mut found: Vec<Node<'a>> = Vec::new();
    collect_nested_functions_rec(n, lang, &mut found, false);
    found
}
/// Recursive worker for [`collect_nested_function_nodes`].
///
/// Appends every outermost function node at or below `n` to `out`, in source
/// order, and never descends into a function node. When `inside_function` is
/// `true`, function nodes are skipped instead of collected; the flag is
/// forwarded unchanged to every recursive call.
pub(crate) fn collect_nested_functions_rec<'a>(
    n: Node<'a>,
    lang: &str,
    out: &mut Vec<Node<'a>>,
    inside_function: bool,
) {
    // Only treat as a function if it's a real function node (has children),
    // not a keyword token like `function` in JS which shares the same kind
    // name.
    let is_function_node = lookup(lang, n.kind()) == Kind::Function && n.child_count() > 0;
    if is_function_node {
        // Either way the walk stops here: nested functions of nested
        // functions are not this pass's concern.
        if !inside_function {
            out.push(n);
        }
        return;
    }

    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        collect_nested_functions_rec(child, lang, out, inside_function);
    }
}
/// Derive a binding name for an anonymous function literal from its syntactic
/// context. Returns `None` when no unambiguous binding exists (e.g. function
/// passed directly as a call argument, nested in a destructuring pattern, or
/// stored into a subscript expression).
///
/// Supported shapes (across JS/TS, Python, Ruby, Go, PHP, Rust):
/// * `var|let|const h = <fn>`          → `"h"`
/// * `h := <fn>`                        → `"h"`  (Go short-var)
/// * `h = <fn>`                         → `"h"`  (reassignment)
/// * `obj.prop = <fn>` / `obj::prop`    → `"prop"` (bind via rightmost member)
///
/// Parenthesised wrappers (`var h = (function(){})`) are transparently
/// skipped. If the immediate parent is not a binding site, the grandparent is
/// tried once, which covers grammars that wrap the right-hand side in an
/// intermediate node. The disambig start-byte on the generated FuncKey
/// prevents shadowed same-name bindings from colliding.
///
/// A derived name is rejected (`None`) when it is empty or contains any
/// character other than alphanumerics, `_` and `$`.
pub(crate) fn derive_anon_fn_name_from_context<'a>(
    func_node: Node<'a>,
    lang: &str,
    code: &'a [u8],
) -> Option<String> {
    // `lang` does not influence the derivation; this binding only keeps the
    // parameter from triggering an unused-variable warning.
    let _ = lang;

    // Text of a node that is itself a simple binding name; destructuring /
    // tuple patterns are ambiguous and yield `None`.
    let plain_ident = |node: Node<'a>| -> Option<String> {
        match node.kind() {
            "identifier" | "variable_name" | "simple_identifier" => text_of(node, code),
            _ => None,
        }
    };

    // Name bound by an assignment's left-hand side.
    let assignment_target = |lhs: Node<'a>| -> Option<String> {
        let lhs = unwrap_parens(lhs);
        match lhs.kind() {
            "identifier" | "variable_name" | "simple_identifier" => text_of(lhs, code),
            // `obj.prop = <fn>` → "prop" (JS/TS/Python/PHP/Ruby/Go)
            "member_expression"
            | "attribute"
            | "field_expression"
            | "selector_expression"
            | "scoped_identifier" => ["property", "field", "name"]
                .into_iter()
                .find_map(|field| lhs.child_by_field_name(field))
                .and_then(|leaf| text_of(leaf, code)),
            _ => None,
        }
    };

    // First direct `identifier` child of `list` whose text can be read.
    let first_identifier_child = |list: Node<'a>| -> Option<String> {
        let mut walker = list.walk();
        for child in list.children(&mut walker) {
            if child.kind() != "identifier" {
                continue;
            }
            if let Some(name) = text_of(child, code) {
                return Some(name);
            }
        }
        None
    };

    // Walk up past parenthesized wrappers so `var h = (fn)` works.
    let mut parent = func_node.parent()?;
    while parent.kind() == "parenthesized_expression" {
        parent = parent.parent()?;
    }

    let candidate = match parent.kind() {
        // JS/TS: `var h = fn`, Java/Rust: `let h = fn`, C++: `auto h = fn`,
        // PHP: `$h = fn` also lands here when the parent is
        // `variable_declarator`.
        "variable_declarator" | "init_declarator" | "let_declaration" => parent
            .child_by_field_name("name")
            .or_else(|| parent.child_by_field_name("pattern"))
            .and_then(plain_ident),
        // JS/TS: `h = fn`, `obj.prop = fn`
        // Ruby `assignment` / C `assignment_expression`
        // Python: `h = lambda: ...` parents as `assignment` too.
        "assignment_expression" | "assignment" => parent
            .child_by_field_name("left")
            .and_then(assignment_target),
        // Go: `h := fn` (short_var_declaration). The left child is an
        // expression_list with one identifier.
        "short_var_declaration" => first_identifier_child(parent.child_by_field_name("left")?),
        // Go: `var h = fn` → var_spec with names field.
        "var_spec" | "const_spec" => first_identifier_child(parent.child_by_field_name("name")?),
        // Python `default_parameter` (`def foo(x=lambda: 0)`) is ambiguous
        // and is not matched by either level.
        _ => {
            // Some grammars wrap the RHS in an `expression`,
            // `expression_list`, or similar node between the binding site
            // and the function literal. Do one more hop to catch these
            // without blowing past meaningful scopes (e.g. enclosing
            // function body / block).
            let grand = parent.parent()?;
            match grand.kind() {
                "variable_declarator" | "init_declarator" => {
                    grand.child_by_field_name("name").and_then(plain_ident)
                }
                "assignment_expression" | "assignment" => grand
                    .child_by_field_name("left")
                    .and_then(assignment_target),
                // Go: `run := func(){...}` → func_literal's parent is
                // `expression_list`, grandparent is `short_var_declaration`.
                "short_var_declaration" => {
                    first_identifier_child(grand.child_by_field_name("left")?)
                }
                // Go: `var run = func(){...}` wraps through var_spec via
                // expression_list in older grammar versions.
                "var_spec" | "const_spec" => {
                    first_identifier_child(grand.child_by_field_name("name")?)
                }
                _ => None,
            }
        }
    };

    // Guard against degenerate names that would collide with label rules or
    // produce unstable summary keys.
    candidate.filter(|name| {
        !name.is_empty()
            && name
                .chars()
                .all(|c| c.is_alphanumeric() || c == '_' || c == '$')
    })
}
/// Check whether any descendant of `n` is a call expression.
///
/// A descendant counts when its kind maps to `Kind::CallFn`,
/// `Kind::CallMethod` or `Kind::CallMacro` for `lang`. `n` itself is not
/// tested, only the nodes beneath it.
pub(crate) fn has_call_descendant(n: Node, lang: &str) -> bool {
    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        let child_is_call = matches!(
            lookup(lang, child.kind()),
            Kind::CallFn | Kind::CallMethod | Kind::CallMacro
        );
        if child_is_call || has_call_descendant(child, lang) {
            return true;
        }
    }
    false
}
/// Recursively collect identifiers AND full dotted member-expression paths.
///
/// For `member_expression` / `attribute` / `selector_expression` /
/// `field_expression` nodes the full dotted path (via `member_expr_text`) is
/// appended to `paths`, and the identifiers inside the node are appended to
/// `idents` as a fallback (via `collect_idents`). Plain identifiers go only
/// into `idents`; a `variable_name` has its leading `$` characters stripped.
/// Any other node is descended into.
pub(crate) fn collect_idents_with_paths(
    n: Node,
    code: &[u8],
    idents: &mut Vec<String>,
    paths: &mut Vec<String>,
) {
    let kind = n.kind();

    if matches!(
        kind,
        "member_expression" | "attribute" | "selector_expression" | "field_expression"
    ) {
        paths.extend(member_expr_text(n, code));
        collect_idents(n, code, idents);
        return;
    }

    if matches!(
        kind,
        "identifier"
            | "field_identifier"
            | "property_identifier"
            | "shorthand_property_identifier"
            | "shorthand_property_identifier_pattern"
    ) {
        idents.extend(text_of(n, code));
        return;
    }

    if kind == "variable_name" {
        let stripped = text_of(n, code).map(|txt| txt.trim_start_matches('$').to_string());
        idents.extend(stripped);
        return;
    }

    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        collect_idents_with_paths(child, code, idents, paths);
    }
}
/// Walk an array/tuple destructure pattern in source order and return
/// each simple-identifier binding paired with its position index.
///
/// The position index is the number of `,` tokens seen before the binding.
///
/// Recognises:
/// * JS/TS `array_pattern`: `const [a, b] = ...`, `const [, b] = ...`,
///   `const [a, ,] = ...`. Skip slots (commas with no binding between)
///   advance the position counter without emitting a binding.
/// * Rust `tuple_pattern`: `let (a, _, b) = ...`. The wildcard `_` emits no
///   binding; the commas around it still advance the position counter.
/// * Python `pattern_list` / `tuple_pattern`: `a, b = ...` and
///   `(a, b) = ...`. Python `_` is a normal identifier binding (not a
///   wildcard), so every `identifier` child emits a (name, position) entry.
/// * Ruby `left_assignment_list`: `a, b = ...`. Bare comma-list LHS produced
///   by `assignment` whose RHS is an array literal, a call return, or another
///   tuple-yielding expression. Ruby `_` is a normal identifier (matches
///   Python convention; `_` may still be referenced later in scope). Splat
///   (`*rest` parsed as `rest_assignment`) and parenthesised nested
///   destructure (`destructured_left_assignment`) hit the bail branch and
///   fall back to scalar union.
///
/// Returns an empty `SmallVec` when the pattern is not one of the above
/// kinds OR contains complex sub-patterns (`assignment_pattern` for
/// `[a = 1, b]`, `rest_pattern` for `[a, ...rest]`, Python
/// `list_splat_pattern` for `a, *rest = ...`, Ruby `rest_assignment` for
/// `a, *rest = ...`, nested `array_pattern`, `object_pattern`,
/// `destructured_left_assignment`). Callers treat the empty return as
/// "no position-aware rewrite available; fall back to scalar union".
pub(crate) fn collect_array_pattern_bindings_indexed(
    pat: Node,
    code: &[u8],
) -> SmallVec<[(String, usize); 4]> {
    let mut bindings: SmallVec<[(String, usize); 4]> = SmallVec::new();
    if !matches!(
        pat.kind(),
        "array_pattern" | "tuple_pattern" | "pattern_list" | "left_assignment_list"
    ) {
        return bindings;
    }

    let mut slot: usize = 0;
    let mut cursor = pat.walk();
    for child in pat.children(&mut cursor) {
        match child.kind() {
            "," => slot += 1,
            // Delimiters carry no binding. Neither does the Rust wildcard,
            // which tree-sitter-rust models as a leaf node whose `kind()` is
            // literally "_".
            "[" | "]" | "(" | ")" | "_" => {}
            "identifier" | "shorthand_property_identifier_pattern" => {
                if let Some(name) = text_of(child, code) {
                    bindings.push((name, slot));
                }
            }
            // Complex sub-pattern: bail with an empty result. The caller
            // treats empty as "no position-aware rewrite", preserving the
            // pre-existing scalar-union behavior for these shapes.
            _ => return SmallVec::new(),
        }
    }
    bindings
}
/// Walk an array-literal-shape RHS node and return one slot per source-order
/// element. Each slot is one of:
/// * `RhsArraySlot::Ident(name)`: bare identifier element (a leading `$` on a
///   `variable_name` is stripped).
/// * `RhsArraySlot::Literal`: syntactic literal (string, number, bool,
///   null/nil).
/// * `RhsArraySlot::Complex { uses, source_cap }`: call / binary / subscript /
///   member access / nested array literal / interpolated string / etc.
///   `uses` carries the inner identifier names (member-access paths first,
///   bare idents second) harvested from the slot's subtree via
///   `collect_idents_with_paths`; `source_cap` is the capability of the first
///   Source-labelled member expression found in the slot, or `Cap::empty()`.
///
/// Recognised RHS kinds:
/// * JS/TS / Ruby `array`: `[a, b]`
/// * `array_literal`
/// * Python `list`: `[a, b]`
/// * Python `tuple`: `(a, b)`
/// * Python `expression_list`: bare comma form `a, b`
/// * Rust `tuple_expression`: `(a, b)`
///
/// Grammar "extra" nodes (e.g. comments between elements) are not elements
/// and are skipped, so they never occupy a slot or shift later indices.
///
/// Bails (returns empty) when the RHS is not one of these kinds OR contains
/// a slot whose shape would shift index alignment (spread, list splat), or
/// when an identifier element's text cannot be read. Callers treat empty as
/// "no per-element rewrite available; fall back to scalar union".
pub(crate) fn collect_rhs_array_literal_elements(
    rhs: Node,
    lang: &str,
    code: &[u8],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> SmallVec<[crate::cfg::RhsArraySlot; 4]> {
    use crate::cfg::RhsArraySlot;
    use crate::labels::{Cap, DataLabel};

    // Per-slot source classification: when a slot's own subtree carries a
    // Source-labeled member-expression / subscript, capture the Cap so the
    // SSA destructure rewrite emits Source for THIS slot specifically and
    // lets sibling Complex slots stay slot-scoped Assign. Falls back to
    // Cap::empty() when no per-slot source is recognised; the lowering
    // path then consults the outer-node Source flag for conservative
    // preservation of legacy behavior on shapes whose source pattern
    // doesn't text-classify (e.g. a subscript on a tainted local).
    let slot_source_cap = |slot: Node| -> Cap {
        match first_member_label(slot, lang, code, extra_labels) {
            Some(DataLabel::Source(c)) => c,
            _ => Cap::empty(),
        }
    };

    // Build a Complex slot: member paths first, then every bare identifier
    // that is not already present in the list.
    let complex_slot = |slot: Node| -> RhsArraySlot {
        let mut idents = Vec::new();
        let mut paths = Vec::new();
        collect_idents_with_paths(slot, code, &mut idents, &mut paths);
        let mut uses: SmallVec<[String; 4]> = SmallVec::new();
        for path in paths {
            uses.push(path);
        }
        for ident in idents {
            if !uses.iter().any(|known| known == &ident) {
                uses.push(ident);
            }
        }
        let source_cap = slot_source_cap(slot);
        RhsArraySlot::Complex { uses, source_cap }
    };

    let mut out: SmallVec<[RhsArraySlot; 4]> = SmallVec::new();
    if !matches!(
        rhs.kind(),
        "array" | "array_literal" | "list" | "tuple" | "tuple_expression" | "expression_list"
    ) {
        return out;
    }

    let mut cursor = rhs.walk();
    for child in rhs.named_children(&mut cursor) {
        // Extras (comments and similar) are named nodes in many grammars but
        // are not array elements. Counting them would insert a phantom slot
        // and misalign every following element with its binding position.
        if child.is_extra() {
            continue;
        }
        match child.kind() {
            "identifier"
            | "shorthand_property_identifier"
            | "shorthand_property_identifier_pattern"
            | "field_identifier"
            | "property_identifier" => match text_of(child, code) {
                Some(txt) => out.push(RhsArraySlot::Ident(txt)),
                None => {
                    out.clear();
                    return out;
                }
            },
            "variable_name" => match text_of(child, code) {
                Some(txt) => out.push(RhsArraySlot::Ident(txt.trim_start_matches('$').to_string())),
                None => {
                    out.clear();
                    return out;
                }
            },
            // Syntactic literal slots: no ident, no taint contribution.
            // Names follow tree-sitter's per-grammar literal kinds across
            // the supported languages.
            "string"
            | "string_literal"
            | "raw_string_literal"
            | "interpreted_string_literal"
            | "concatenated_string"
            | "integer"
            | "integer_literal"
            | "float"
            | "float_literal"
            | "number"
            | "numeric_literal"
            | "true"
            | "false"
            | "boolean_literal"
            | "boolean"
            | "null"
            | "null_literal"
            | "nil"
            | "none"
            | "None"
            | "undefined" => {
                out.push(RhsArraySlot::Literal);
            }
            // Spread / list-splat shift index alignment unpredictably
            // (`[...arr, b]` may expand to N elements at index 0). Bail
            // so callers fall back to scalar union.
            "spread_element" | "list_splat" | "list_splat_pattern" | "splat_argument"
            | "unary_splat" | "splat_expression" => {
                out.clear();
                return out;
            }
            // Everything else is a "complex" slot: call, member access,
            // binary, subscript, unary, ternary, nested array literal, and
            // interpolated strings (`template_string`, `string_interpolation`,
            // `interpolation`, `encapsed_string`), which carry inner
            // identifier uses such as `${user.id}`. Harvest inner ident uses
            // so the SSA lowering can paint the binding with this slot's
            // contributions only, not the union of every ident on the RHS.
            _ => out.push(complex_slot(child)),
        }
    }
    out
}
/// Depth-first harvest of the identifier texts found beneath `n`, appended
/// to `out` in traversal order.
///
/// A node is treated as an identifier leaf (and is not descended into) when
/// its kind is one of:
/// - `identifier` (most languages),
/// - `field_identifier` (Go),
/// - `property_identifier` (JS/TS),
/// - `shorthand_property_identifier` / `shorthand_property_identifier_pattern`
///   (JS/TS object-literal shorthand uses and destructuring binding patterns),
/// - `name` (PHP bare identifier leaf),
/// - `variable_name` (PHP; pushed with every leading `$` removed).
///
/// Any other node is walked recursively through all of its children. A leaf
/// whose text cannot be produced by `text_of` is skipped silently.
pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
    let kind = n.kind();

    // PHP: `$x` is `variable_name` → `$` + `name`. Record the whole text
    // minus the sigil.
    if kind == "variable_name" {
        if let Some(raw) = text_of(n, code) {
            out.push(raw.trim_start_matches('$').to_string());
        }
        return;
    }

    // PHP `name` is the leaf node carrying the bare identifier text for
    // function/method names and similar grammar slots. Without it,
    // `function_definition` → `name` extraction returns empty for PHP,
    // demoting every named function to `<anon#N>` and breaking
    // cross-function summary lookup at the call site.
    let is_ident_leaf = matches!(
        kind,
        "identifier"
            | "field_identifier"
            | "property_identifier"
            | "shorthand_property_identifier"
            | "shorthand_property_identifier_pattern"
            | "name"
    );
    if is_ident_leaf {
        // `Option<String>` yields zero or one item.
        out.extend(text_of(n, code));
        return;
    }

    // Interior node: keep descending through every child, named or not.
    let mut walker = n.walk();
    for child in n.children(&mut walker) {
        collect_idents(child, code, out);
    }
}
/// Reports whether `kind` is one of the AST kind names used for subscript /
/// index expressions in the languages whose container-element flow we model.
///
/// Accepted kinds: `subscript_expression` (JS/TS and C/C++), `subscript`
/// (Python) and `index_expression` (Go). Other languages either lower
/// indexing through method calls (Rust slice indexing) or are out of scope
/// for the initial W5 rollout (Java/Ruby/PHP).
///
/// The comparison is exact and case-sensitive.
#[inline]
pub(crate) fn is_subscript_kind(kind: &str) -> bool {
    const SUBSCRIPT_KINDS: [&str; 3] = ["subscript_expression", "subscript", "index_expression"];
    SUBSCRIPT_KINDS.contains(&kind)
}
/// Resolve the subscript / index node that an assignment writes through.
///
/// Returns `Some(node)` when `lhs` is itself a subscript / index expression,
/// or, for Go only, when `lhs` is an `expression_list` wrapping exactly one
/// named child that is one. Returns `None` for multi-target Go
/// `expression_list`s, identifier LHSs, member-expression LHSs, etc.
pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option<Node<'a>> {
    let lhs_kind = lhs.kind();
    if is_subscript_kind(lhs_kind) {
        return Some(lhs);
    }

    // Only Go wraps the target: `assignment_statement.left` is an
    // `expression_list`, and a single-target subscript write (`m[k] = v`)
    // has exactly one named child, which is the `index_expression`.
    if lang != "go" || lhs_kind != "expression_list" {
        return None;
    }

    // Pull at most two named children: enough to tell "exactly one" apart
    // from "none" and "several".
    let mut walker = lhs.walk();
    let mut targets = lhs.named_children(&mut walker);
    let first = targets.next();
    let second = targets.next();
    match (first, second) {
        (Some(only), None) if is_subscript_kind(only.kind()) => Some(only),
        _ => None,
    }
}
/// Split a subscript / index AST node into `(array_text, index_text)`.
///
/// The array operand is looked up through the `object`, `operand` and
/// `value` fields in that order, falling back to the node's first child.
/// The index is looked up through the `index` and `subscript` fields,
/// falling back to the node's second named child.
///
/// Returns `None` when:
/// - `n` is not a subscript / index kind,
/// - either operand cannot be located,
/// - the array operand is not a plain identifier (`identifier`,
///   `variable_name` or `simple_identifier`), or
/// - the text of either operand cannot be extracted.
///
/// The plain-identifier restriction exists because `__index_get__` /
/// `__index_set__` calls are only synthesised when the receiver resolves
/// cleanly to a SSA-renamed local: the W2/W4 container hooks need a stable
/// receiver var_name to drive `pt(receiver)`.
pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(String, String)> {
    if !is_subscript_kind(n.kind()) {
        return None;
    }

    // Field names under which the array operand is looked up, in priority
    // order; the first child is the last resort.
    const RECEIVER_FIELDS: [&str; 3] = ["object", "operand", "value"];
    let receiver = RECEIVER_FIELDS
        .iter()
        .find_map(|field| n.child_by_field_name(*field))
        .or_else(|| n.child(0))?;

    let index = match n
        .child_by_field_name("index")
        .or_else(|| n.child_by_field_name("subscript"))
    {
        Some(node) => node,
        None => {
            // Fallback: take the second named child after the array.
            let mut walker = n.walk();
            let second = n.named_children(&mut walker).nth(1);
            second?
        }
    };

    // Anything other than a plain identifier gives no stable receiver name
    // to bind the synth Call to.
    let receiver_is_plain = matches!(
        receiver.kind(),
        "identifier" | "variable_name" | "simple_identifier"
    );
    if !receiver_is_plain {
        return None;
    }

    // PHP-style `$x` strip not needed here; the supported languages
    // don't use it for local array identifiers.
    Some((text_of(receiver, code)?, text_of(index, code)?))
}