nyx/src/cfg/mod.rs

4865 lines
204 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#![allow(
clippy::collapsible_if,
clippy::let_and_return,
clippy::unnecessary_map_or
)]
use petgraph::algo::dominators::{Dominators, simple_fast};
use petgraph::prelude::*;
use tracing::{debug, warn};
use tree_sitter::{Node, Tree};
use crate::labels::{
Cap, DataLabel, Kind, LangAnalysisRules, classify, classify_all, classify_gated_sink, lookup,
};
use crate::summary::FuncSummary;
use crate::symbol::{FuncKey, Lang};
use crate::utils::snippet::truncate_at_char_boundary;
use smallvec::SmallVec;
use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
mod blocks;
mod conditions;
mod decorators;
mod dto;
mod helpers;
mod hierarchy;
mod imports;
mod literals;
mod params;
use blocks::{build_begin_rescue, build_switch, build_try};
use helpers::{
collect_nested_function_nodes, derive_anon_fn_name_from_context, find_classifiable_inner_call,
first_call_ident_with_span, first_member_label, first_member_text, is_raii_factory,
is_subscript_kind, root_member_receiver, subscript_components, subscript_lhs_node,
};
// Re-exports so sibling submodules can keep using `super::name` for
// helpers that physically live in `helpers.rs`.
use conditions::{
build_condition_chain, build_ternary_diamond, classify_ternary_lhs,
detect_rust_let_match_guard, emit_rust_match_guard_if, find_ternary_rhs_wrapper,
is_boolean_operator, unwrap_parens,
};
use decorators::extract_auth_decorators;
pub(crate) use helpers::{
collect_idents, collect_idents_with_paths, find_constructor_type_child, first_call_ident,
has_call_descendant, member_expr_text, root_receiver_text, text_of,
};
use imports::{extract_import_bindings, extract_promisify_aliases};
#[cfg(test)]
use literals::has_sql_placeholders;
use literals::{
arg0_kind_and_interpolation, call_ident_of, def_use, detect_go_replace_call_sanitizer,
detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals,
extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg,
extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs,
extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep,
find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call,
java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
use params::{
compute_container_and_kind, extract_param_meta, inject_framework_param_sources,
is_configured_terminator,
};
/// Test-only re-export of [`extract_param_meta`] so the external
/// `tests/typed_extractors_audit.rs` harness can drive the per-param
/// classifier directly without spinning up the full scan pipeline.
pub fn extract_param_meta_for_test<'a>(
func_node: tree_sitter::Node<'a>,
lang: &str,
code: &'a [u8],
) -> Vec<(String, Option<crate::ssa::type_facts::TypeKind>)> {
extract_param_meta(func_node, lang, code)
}
/// Test-only helper to populate the per-file DTO class map without
/// running `build_cfg`. Used by the DTO audit harness in
/// `tests/typed_extractors_audit.rs` to verify that
/// `classify_param_type_*` resolves a same-file DTO via the
/// thread-local map.
pub fn populate_dto_classes_for_test(root: tree_sitter::Node<'_>, lang: &str, code: &[u8]) {
DTO_CLASSES.with(|cell| {
*cell.borrow_mut() = dto::collect_dto_classes(root, lang, code);
});
}
/// Test-only counterpart to [`populate_dto_classes_for_test`]. Always
/// call this at the end of a test that populated the map so per-thread
/// state never leaks into another test.
pub fn clear_dto_classes_for_test() {
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
}
// Per-file map of function-node start_byte → DFS preorder index. Stable
// against unrelated edits (inserting a line above a function doesn't
// change its index). Thread-local is safe, `build_cfg` is not
// re-entrant within a single rayon worker.
thread_local! {
static FN_DFS_INDICES: RefCell<HashMap<usize, u32>> = RefCell::new(HashMap::new());
/// Per-file DTO class definitions, populated at the top of
/// [`build_cfg`] so per-parameter classifiers can resolve typed
/// extractors against same-file DTOs.
pub(crate) static DTO_CLASSES: RefCell<HashMap<String, crate::ssa::type_facts::DtoFields>>
= RefCell::new(HashMap::new());
/// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` /
/// `Array<...>` / `T[]`) aliases, populated at the top of
/// [`build_cfg`]. Lets `classify_param_type_ts` resolve a
/// parameter typed `m: ElementsMap` to
/// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via
/// same-file alias lookup. Cross-file aliases are not yet
/// resolved.
pub(crate) static TYPE_ALIAS_LC: RefCell<std::collections::HashSet<String>>
= RefCell::new(std::collections::HashSet::new());
}
/// Populate the per-file DFS-index map from a preorder walk of the
/// tree-sitter AST. Every node classifying as `Kind::Function` gets
/// a monotonically increasing `u32` starting at 0.
fn populate_fn_dfs_indices(tree: &Tree, lang: &str) {
fn walk(n: Node, lang: &str, counter: &mut u32, map: &mut HashMap<usize, u32>) {
if lookup(lang, n.kind()) == Kind::Function {
map.insert(n.start_byte(), *counter);
*counter += 1;
}
let mut c = n.walk();
for child in n.children(&mut c) {
walk(child, lang, counter, map);
}
}
let mut map = HashMap::new();
let mut counter: u32 = 0;
walk(tree.root_node(), lang, &mut counter, &mut map);
FN_DFS_INDICES.with(|cell| *cell.borrow_mut() = map);
}
/// Clear the per-file DFS-index map. Called at the end of `build_cfg`
/// to avoid leaking state between files on the same thread.
fn clear_fn_dfs_indices() {
FN_DFS_INDICES.with(|cell| cell.borrow_mut().clear());
}
/// Lookup a function node's DFS index by its `start_byte`.
fn fn_dfs_index(start_byte: usize) -> Option<u32> {
FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied())
}
/// Synthetic name for an anonymous function: `<anon#N>` from the DFS
/// index when available, `<anon@OFFSET>` as fallback.
pub(crate) fn anon_fn_name(start_byte: usize) -> String {
match fn_dfs_index(start_byte) {
Some(idx) => format!("<anon#{idx}>"),
None => format!("<anon@{start_byte}>"),
}
}
/// True for any anonymous-function synthesis prefix.
pub(crate) fn is_anon_fn_name(name: &str) -> bool {
name.starts_with("<anon#") || name.starts_with("<anon@")
}
/// -------------------------------------------------------------------------
/// Public ASTtoCFG data structures
/// -------------------------------------------------------------------------
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
pub enum StmtKind {
Entry,
Exit,
#[default]
Seq,
If,
Loop,
Break,
Continue,
Return,
Throw,
Call,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EdgeKind {
Seq, // ordinary fallthrough
True, // `cond == true` branch
False, // `cond == false` branch
Back, // backedge that closes a loop
Exception, // from call/throw inside try body → catch entry
}
/// Maximum number of identifiers to store from a condition expression.
pub(super) const MAX_COND_VARS: usize = 8;
pub(super) const MAX_CONDITION_TEXT_LEN: usize = 256;
/// Binary operator extracted from the AST.
///
/// Only set when the SSA assignment maps one-to-one to a single
/// binary expression. Left `None` for nested, compound, or boolean
/// expressions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum BinOp {
Add,
Sub,
Mul,
Div,
Mod,
// Bitwise
BitAnd,
BitOr,
BitXor,
LeftShift,
RightShift,
// Comparison
Eq,
NotEq,
Lt,
LtEq,
Gt,
GtEq,
}
/// Call-related metadata for CFG nodes.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CallMeta {
pub callee: Option<String>,
/// Original textual callee path (e.g. `"c.mu.Lock"`) preserved for legacy
/// consumers when SSA lowering decomposes a chained-receiver call into a
/// `FieldProj` chain plus a bare-method `Call`.
///
/// CFG construction does NOT populate this field today (callee already
/// carries the full path). It is the canonical place to read the original
/// textual callee for **debug/display only**, analysis code should walk
/// SSA `FieldProj` receivers or use the
/// [`crate::labels::bare_method_name`] textual fallback.
#[doc(hidden)]
#[serde(default)]
pub callee_text: Option<String>,
/// When `find_classifiable_inner_call` overrides the primary callee
/// (e.g. `parts.add(req.getParameter("input"))` → callee becomes
/// "req.getParameter"), this field preserves the original outer callee
/// ("parts.add") so container propagation can still recognise it.
pub outer_callee: Option<String>,
/// Byte span of the inner call that supplied the classification, when
/// `find_classifiable_inner_call` overrode the outer callee. `None` when
/// the classification came from the outer AST node directly, in that
/// case `AstMeta.span` already points at the classified expression.
///
/// Consumers that want the location of the *labeled* call (sink/source/
/// sanitizer display, flow-step rendering, taint origin attribution)
/// should use [`NodeInfo::classification_span`] rather than reading this
/// field directly. `AstMeta.span` remains the authoritative "whole
/// statement" span, used by structural passes (unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup).
#[serde(default)]
pub callee_span: Option<(usize, usize)>,
/// Per-function call ordinal (0-based, only meaningful for Call nodes).
pub call_ordinal: u32,
/// Per-argument identifiers for Call nodes. Each inner Vec holds the
/// identifiers from one argument expression, in parameter-position order.
/// Empty for non-call nodes or when argument boundaries can't be determined.
pub arg_uses: Vec<Vec<String>>,
/// For `CallMethod` nodes: the receiver identifier (e.g. `tainted` in
/// `tainted.foo()`). `None` for non-method calls or complex receivers
/// (member expressions, call expressions, etc.).
pub receiver: Option<String>,
/// For gated sinks: which argument positions carry the tainted payload.
/// When `Some`, only variables from these `arg_uses` positions are checked
/// for taint. `None` = all arguments are payload (default).
pub sink_payload_args: Option<Vec<usize>>,
/// Keyword/named arguments attached to this call, in source order.
///
/// Each entry is `(keyword_name, uses)` where `uses` are the identifier
/// references from the keyword's value expression (same shape as an entry
/// in `arg_uses`). Populated for languages that expose named arguments
/// at the call site (e.g. Python `func(shell=True)`, Ruby hash-arg style).
/// Empty for languages without named arguments and for calls that use
/// only positional arguments.
pub kwargs: Vec<(String, Vec<String>)>,
/// String-literal value at each positional argument of this call, parallel
/// to `arg_uses`, `Some(s)` when the argument is a syntactic string
/// literal, `None` otherwise. Empty for non-call nodes or when positional
/// boundaries can't be determined. Consumed by the static-map abstract
/// analysis (and future literal-aware passes) so they don't need the
/// source bytes.
pub arg_string_literals: Vec<Option<String>>,
/// Destination-aware sink filter for outbound-HTTP gates.
///
/// When `Some(names)`, the SSA sink scan restricts taint checks to
/// identifiers whose `var_name` matches one of `names`. Populated by
/// gated sinks whose activation is [`crate::labels::GateActivation::Destination`]
/// with `object_destination_fields` set and whose positional destination
/// arg is an object literal: CFG walks the object literal, collects
/// identifiers from the named destination fields (url, host, path, …),
/// and stores them here so `fetch({url: taintedUrl, body: fixed})` fires
/// while `fetch({url: fixed, body: taintedData})` does not.
///
/// Takes priority over `sink_payload_args` in the SSA sink scan: when a
/// call has an object-literal destination arg, only idents under the
/// listed fields may contribute sink findings, not every ident in the
/// positional slot.
///
/// Legacy single-gate path: populated only when this call site matched
/// exactly one gate. When a callee carries multiple gates (e.g. `fetch`
/// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in
/// [`Self::gate_filters`] and this field is left `None`.
#[serde(default)]
pub destination_uses: Option<Vec<String>>,
/// Per-gate filters for callees that carry multiple gated-sink rules.
///
/// Each entry preserves one matching gate's `(label_caps, payload_args,
/// destination_uses)` so the SSA sink scan can attribute findings
/// per-cap. Empty when the call site matches zero or exactly one gate
/// (the single-gate case continues to use [`Self::sink_payload_args`] +
/// [`Self::destination_uses`]).
#[serde(default)]
pub gate_filters: Vec<GateFilter>,
/// True when this call expression is a constructor invocation
/// (e.g. JS/TS `new Stripe(key)`, PHP `new PDO(...)`). The SSA Call
/// transfer uses this to narrow the constructed value's caps: a wrapper
/// object instance is structurally not a path string, format string,
/// URL component, or JSON input, so out-of-process side-effect bits
/// (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on the arguments
/// must not survive into the constructed object.
#[serde(default)]
pub is_constructor: bool,
}
/// One gate's contribution at a call site whose callee matches multiple
/// gates. The SSA taint engine processes each filter independently so a
/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF
/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each
/// carrying its own cap mask rather than a conflated union.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct GateFilter {
/// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`).
pub label_caps: crate::labels::Cap,
/// Argument positions that carry the tainted payload for this gate.
pub payload_args: Vec<usize>,
/// Destination-aware filter: when `Some(names)`, the sink check only
/// considers SSA values whose `var_name` matches one of `names` (object-
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
pub destination_uses: Option<Vec<String>>,
/// Parallel to [`Self::destination_uses`]: for each entry, the
/// destination object-literal field name (e.g. `"body"`, `"headers"`,
/// `"json"`) where the corresponding ident was bound. Empty when
/// `destination_uses` is `None` or the gate had no
/// `object_destination_fields` configured. Consumed by diag rendering
/// to embed the destination field in `DATA_EXFIL` messages and SARIF
/// `properties.data_exfil_field`.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub destination_fields: Vec<String>,
}
/// Taint-classification and variable-flow metadata.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct TaintMeta {
pub labels: SmallVec<[DataLabel; 2]>, // taint classifications (multi-label)
/// Raw text of a constant/literal RHS when this node defines a variable
/// from a syntactic literal with no uses. Used by SSA constant propagation.
pub const_text: Option<String>,
pub defines: Option<String>, // variable written by this stmt
pub uses: Vec<String>, // variables read
/// Additional variable definitions from destructuring patterns.
/// E.g. `const { a, b, c } = source()` → defines="a", extra_defines=["b", "c"].
pub extra_defines: Vec<String>,
}
/// AST origin/location metadata.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct AstMeta {
pub span: (usize, usize), // byte offsets in the original file
/// Name of the enclosing function (set during CFG construction).
pub enclosing_func: Option<String>,
}
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct NodeInfo {
pub kind: StmtKind,
pub call: CallMeta,
pub taint: TaintMeta,
pub ast: AstMeta,
/// For If nodes: raw condition text (truncated to 256 chars). None for non-If nodes.
pub condition_text: Option<String>,
/// For If nodes: identifiers referenced in the condition (sorted, deduped, max 8).
pub condition_vars: Vec<String>,
/// For If nodes: whether the condition has a leading negation (`!` / `not`).
pub condition_negated: bool,
/// True when this is a Call node whose argument list contains only
/// syntactic literal values (strings, numbers, booleans, null/nil,
/// arrays/lists/tuples of literals). Also true for zero-argument calls
/// (no argument-carried taint vector).
///
/// This flag is scoped to taint-style sink suppression: it indicates
/// that no attacker-controlled data enters through the immediate
/// arguments. It does NOT mean the call is "safe" in general, other
/// detectors (resource lifecycle, structural analysis) may still
/// legitimately flag these calls.
pub all_args_literal: bool,
/// True for synthetic catch-parameter nodes injected at catch clause entry.
/// The taint transfer function uses this to conservatively taint the
/// caught exception variable.
pub catch_param: bool,
/// For Call nodes: the callee name of the call expression wrapping each
/// argument (per-position, matching arg_uses). For Assignment sink nodes:
/// the RHS call callee at position 0 (if the RHS is a call expression).
/// Used by SSA sink detection for interprocedural sanitizer resolution.
pub arg_callees: Vec<Option<String>>,
/// For cast/type-assertion expressions: the target type name extracted
/// from the AST. E.g. `(String) x` → `"String"`, `x as number` → `"number"`,
/// `x.(io.Reader)` → `"io.Reader"`. Used by type-flow constraint solving
/// to refine the type environment at the SSA level.
pub cast_target_type: Option<String>,
/// Arithmetic operator for binary expression assignments.
/// Only set when the CFG node is a single binary expression with a
/// clear one-to-one operator mapping. `None` for nested, compound,
/// boolean, or ambiguous expressions.
pub bin_op: Option<BinOp>,
/// Parsed literal operand from a binary expression.
/// When `bin_op` is set and one operand is a numeric literal (the other
/// being an identifier captured in `uses`), this holds the parsed value.
/// Enables abstract-domain transfer even when the SSA instruction has
/// only one use (the literal isn't an identifier and isn't in `uses`).
pub bin_op_const: Option<i64>,
/// True when this acquisition node is inside a language-managed cleanup
/// scope (Python `with`, Java try-with-resources, C# `using`).
/// Only meaningful on Call nodes that define a resource variable.
/// Leak detectors check this flag on the acquire site, not the variable.
pub managed_resource: bool,
/// True when this Call node is a deferred release (Go `defer f.Close()`).
/// Deferred releases are not processed as immediate closes; instead they
/// suppress leak findings (defer guarantees cleanup at function exit).
/// Only set on Call nodes, not on all nodes within a defer_statement.
pub in_defer: bool,
/// True when this is a SQL_QUERY sink whose first argument is a string
/// literal containing parameterized-query placeholders (`$1`, `?`, `%s`,
/// `:name`) AND the call has >= 2 arguments (the params array/tuple).
/// Both CFG analysis and SSA taint suppress findings on such nodes.
pub parameterized_query: bool,
/// Constant leading string prefix recovered from the node's RHS when it
/// is a template literal (JS/TS) with a leading `string_fragment` or an
/// equivalent constant-string-then-interpolation shape. Populated for
/// assignment-like nodes (`variable_declarator`, `assignment_expression`,
/// `lexical_declaration`). Consumed by the abstract string domain in
/// `transfer_abstract` to seed a `StringFact::from_prefix` on the result
/// SSA value so SSRF prefix-suppression can fire for values constructed
/// from template literals.
pub string_prefix: Option<String>,
/// True when this node is a binary equality/inequality expression whose
/// operator is `==` / `!=` / `===` / `!==` and exactly one operand is a
/// syntactic literal (string / number / null / boolean). The SSA taint
/// transfer uses this to suppress boolean-result taint propagation: the
/// boolean outcome of `x === 'literal'` carries no attacker-controlled
/// data, so downstream branches on it should not inherit x's caps.
pub is_eq_with_const: bool,
/// True when this node reads a numeric-length property on a container:
/// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()`
///, either as a pure property access or as a zero-arg method call.
/// Populated by inspecting the AST in `push_node` across JS/TS, Python,
/// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return
/// an integer. Consumed by the type-fact analysis (`ssa::type_facts`)
/// to infer `TypeKind::Int`, which drives HTML_ESCAPE / SQL_QUERY /
/// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric
/// payloads.
pub is_numeric_length_access: bool,
/// the field name read on the RHS of an assignment whose
/// RHS is a single member-access expression (e.g. `let x = dto.email`).
/// Set to `Some("email")` for that shape; left `None` otherwise.
/// Consumed by the type-fact analysis (`ssa::type_facts`) so reads
/// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick
/// up the field's declared `TypeKind`. Strictly additive, when
/// `None`, the legacy copy-prop semantics apply.
pub member_field: Option<String>,
/// True when this assignment / declaration's RHS is a function or
/// lambda literal (`obj.handler = (e) => {...}`, `let f = function(){}`).
/// State analysis uses this to suppress resource-ownership transfer:
/// storing a function reference into a property does not move the
/// resources captured by the closure body, so the lifecycle of those
/// captures must remain unchanged on the assignment node.
pub rhs_is_function_literal: bool,
}
impl NodeInfo {
/// Byte span of the *labeled* sub-expression in this CFG node.
///
/// When `find_classifiable_inner_call` found the source/sink/sanitizer
/// deep inside an enclosing statement (e.g. `escapeHtml(...)` buried in
/// a template literal whose outer node is the `overlay.innerHTML = ...`
/// assignment), `call.callee_span` pinpoints the inner call; otherwise
/// the whole node's span is the classification span.
///
/// Use this for **display and source-attribution**: taint finding sink
/// lines, flow-step rendering, symbolic witness extraction, debug views.
///
/// Use `ast.span` directly for **structural grain**: unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere
/// the enclosing statement is the meaningful unit.
#[inline]
pub fn classification_span(&self) -> (usize, usize) {
self.call.callee_span.unwrap_or(self.ast.span)
}
}
/// Intrafile function summary with graphlocal node indices.
///
/// Keeps all three cap dimensions independently so that a function that is
/// *both* a source and a sink (e.g. reads env then shells out) does not
/// lose information.
#[derive(Debug, Clone)]
pub struct LocalFuncSummary {
#[allow(dead_code)] // used for future intra-file graph traversal
pub entry: NodeIndex,
#[allow(dead_code)] // used for future intra-file graph traversal
pub exit: NodeIndex,
pub source_caps: Cap,
pub sanitizer_caps: Cap,
pub sink_caps: Cap,
pub param_count: usize,
pub param_names: Vec<String>,
/// Which parameter indices (0based) flow through to the return value.
pub propagating_params: Vec<usize>,
/// Which parameter indices flow to internal sinks.
pub tainted_sink_params: Vec<usize>,
/// Per-call-site metadata for every call inside this function body.
/// Each entry carries the callee's raw name plus arity, receiver,
/// qualifier, and ordinal so callers can resolve overloads and
/// method-call targets without re-parsing.
pub callees: Vec<crate::summary::CalleeSite>,
/// Identity discriminator: enclosing container path, `""` for free
/// top-level functions. Copied into `FuncSummary.container` at export.
pub container: String,
/// Identity discriminator: byte offset / occurrence index for disambiguating
/// same-name siblings (closures, duplicate defs).
pub disambig: Option<u32>,
/// Structural role of this definition.
pub kind: crate::symbol::FuncKind,
}
pub type Cfg = Graph<NodeInfo, EdgeKind>;
pub type FuncSummaries = HashMap<FuncKey, LocalFuncSummary>;
// -------------------------------------------------------------------------
// Per-body CFG types
// -------------------------------------------------------------------------
/// Opaque identifier for an executable body within a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct BodyId(pub u32);
/// Identifies the kind of executable body.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BodyKind {
TopLevel,
NamedFunction,
AnonymousFunction,
}
/// Metadata for a single executable body.
#[derive(Debug, Clone)]
pub struct BodyMeta {
pub id: BodyId,
pub kind: BodyKind,
pub name: Option<String>,
pub params: Vec<String>,
/// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from
/// decorators / annotations / static type text at CFG construction
/// time. Same length as `params`; positions with no recoverable
/// type info are `None`. Strictly additive, when every entry is
/// `None`, downstream behaviour is identical to the pre-Phase-1
/// engine.
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
pub param_count: usize,
pub span: (usize, usize),
pub parent_body_id: Option<BodyId>,
/// Canonical identity for this body.
///
/// `Some(..)` for named/anonymous function bodies, carrying the same
/// `FuncKey` under which `FileCfg::summaries` stores its
/// `LocalFuncSummary`. `None` for the synthetic top-level body.
///
/// All intra-file maps keyed on function identity (SSA summaries, callee
/// bodies, inline cache, callback bindings) use this key, never the bare
/// leaf `name`, which is collision-prone across (container, arity,
/// disambig, kind).
pub func_key: Option<FuncKey>,
/// Normalized auth-decorator/annotation/attribute names attached to this
/// function (Python `@login_required`, Java `@PreAuthorize`, Ruby class
/// `before_action :authenticate_user!`, etc.). Lowercased, bare names
/// without `@`, `#[..]`, `[[..]]` wrappers or argument tails. The state
/// machine consumes this to seed the entry `AuthLevel` for privileged-sink
/// checks. Empty for top-level and for functions without auth markers.
pub auth_decorators: Vec<String>,
}
/// A single executable body's CFG plus metadata.
#[derive(Debug)]
pub struct BodyCfg {
pub meta: BodyMeta,
pub graph: Cfg,
pub entry: NodeIndex,
pub exit: NodeIndex,
}
/// A single import alias binding: local alias → original exported name + module.
#[derive(Debug, Clone)]
pub struct ImportBinding {
/// The original exported symbol name (e.g. `getInput`).
pub original: String,
/// The module path (e.g. `./source`), if extractable.
pub module_path: Option<String>,
}
/// Per-file map from locally-bound alias name to its import origin.
/// Populated during CFG construction for ES6 `import { A as B }` and
/// CommonJS `const { A: B } = require(...)` patterns.
pub type ImportBindings = HashMap<String, ImportBinding>;
/// A single promisify alias binding: local name bound to `util.promisify(X)`
/// carries the labels of its wrapped callee `X`.
#[derive(Debug, Clone)]
pub struct PromisifyAlias {
/// The wrapped callee's canonical textual name (e.g. `child_process.exec`
/// or `fs.readFile`). Used directly for label classification so downstream
/// sink / source detection treats the alias the same as the original.
pub wrapped: String,
}
/// Per-file map from local binding name to its promisify wrap origin.
/// Populated for JS/TS files at CFG construction time for patterns like
/// `const alias = util.promisify(wrapped)` or `const alias = promisify(wrapped)`.
pub type PromisifyAliases = HashMap<String, PromisifyAlias>;
/// All CFGs for a file.
#[derive(Debug)]
pub struct FileCfg {
pub bodies: Vec<BodyCfg>,
pub summaries: FuncSummaries,
/// Import alias bindings: local alias → (original name, module path).
pub import_bindings: ImportBindings,
/// Promisify wrapper aliases: local name → wrapped callee name.
/// Only populated for JS/TS files.
pub promisify_aliases: PromisifyAliases,
/// per-file class / trait / interface hierarchy edges.
/// Each entry is `(sub_container, super_container)` after
/// language-specific normalisation. See
/// [`crate::cfg::hierarchy`] for the per-language extraction
/// rules and [`crate::callgraph::TypeHierarchyIndex`] for the
/// downstream consumer. Empty for languages without an
/// extractor (Go, C) and for files with no inheritance / impl
/// declarations.
pub hierarchy_edges: Vec<(String, String)>,
}
impl FileCfg {
/// The top-level / module body (always `BodyId(0)`).
pub fn toplevel(&self) -> &BodyCfg {
&self.bodies[0]
}
/// Look up a body by its `BodyId`.
pub fn body(&self, id: BodyId) -> &BodyCfg {
&self.bodies[id.0 as usize]
}
/// All non-top-level bodies (functions, closures, callbacks).
pub fn function_bodies(&self) -> &[BodyCfg] {
&self.bodies[1..]
}
/// The first function body, or top-level if no functions exist.
/// Useful for tests where source is wrapped in a single function.
pub fn first_body(&self) -> &BodyCfg {
if self.bodies.len() > 1 {
&self.bodies[1]
} else {
&self.bodies[0]
}
}
/// Total CFG node count across all bodies.
pub fn total_node_count(&self) -> usize {
self.bodies.iter().map(|b| b.graph.node_count()).sum()
}
}
/// Create a `NodeInfo` with only kind, span, and enclosing_func set.
/// All other fields are empty/default.
fn make_empty_node_info(
kind: StmtKind,
span: (usize, usize),
enclosing_func: Option<&str>,
) -> NodeInfo {
NodeInfo {
kind,
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_owned()),
},
..Default::default()
}
}
/// Create a fresh body-level `Cfg` with synthetic Entry and Exit nodes.
fn create_body_graph(
span_start: usize,
span_end: usize,
enclosing_func: Option<&str>,
) -> (Cfg, NodeIndex, NodeIndex) {
let mut g: Cfg = Graph::with_capacity(32, 64);
let entry = g.add_node(make_empty_node_info(
StmtKind::Entry,
(span_start, span_start),
enclosing_func,
));
let exit = g.add_node(make_empty_node_info(
StmtKind::Exit,
(span_end, span_end),
enclosing_func,
));
(g, entry, exit)
}
/// Extract raw condition metadata from an If AST node.
///
/// Returns `(condition_text, condition_vars, condition_negated)`.
/// The condition subtree is located via `child_by_field_name("condition")`
/// for most languages, with a positional fallback for Rust `if_expression`.
///
/// Negation is detected by checking for a leading unary `!` operator or
/// `not` keyword. Variables are sorted, deduped, and capped at
/// [`MAX_COND_VARS`].
fn extract_condition_raw<'a>(
ast: Node<'a>,
lang: &str,
code: &'a [u8],
) -> (Option<String>, Vec<String>, bool) {
// 1. Find the condition subtree.
let cond_node = ast.child_by_field_name("condition").or_else(|| {
// Rust `if_expression` uses positional children: the condition is
// the first child that is not a keyword, block, or `let` pattern.
let mut cursor = ast.walk();
ast.children(&mut cursor).find(|c| {
let k = c.kind();
!matches!(lookup(lang, k), Kind::Block | Kind::Trivia)
&& k != "if"
&& k != "else"
&& k != "let"
&& k != "{"
&& k != "}"
&& k != "("
&& k != ")"
})
});
let Some(cond) = cond_node else {
return (None, Vec::new(), false);
};
// 2. Detect leading negation (`!expr`, `not expr`, Ruby `unless`).
let (inner, negated) = detect_negation(cond, ast, lang);
// 3. Collect identifiers from the (inner) condition subtree.
let mut vars = Vec::new();
collect_idents(inner, code, &mut vars);
vars.sort();
vars.dedup();
vars.truncate(MAX_COND_VARS);
// 4. Extract text, truncated. UTF-8-safe, gogs (Gurmukhi) /
// discourse (Cyrillic) trip raw byte slices on regex literals.
let text = text_of(cond, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
(text, vars, negated)
}
/// Detect leading negation and return the inner expression.
///
/// Handles:
/// - `!expr` (unary_expression / prefix_unary_expression with `!` operator)
/// - `not expr` (Python `not_operator`, Ruby)
///
/// NOTE: Ruby `unless` is NOT handled here. The CFG builder already swaps
/// True/False edges for `unless` (cfg.rs lines 2076-2085), so the edge labels
/// encode the correct branch semantics. Setting `condition_negated=true` here
/// would cause a double-negation in `compute_succ_states`, applying validation
/// to the wrong branch.
pub(super) fn detect_negation<'a>(
cond: Node<'a>,
_if_ast: Node<'a>,
_lang: &str,
) -> (Node<'a>, bool) {
// Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens.
// This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`.
let cond = if cond.kind() == "parenthesized_expression" {
cond.child_by_field_name("expression")
.or_else(|| {
let mut cursor = cond.walk();
cond.children(&mut cursor)
.find(|c| c.kind() != "(" && c.kind() != ")")
})
.unwrap_or(cond)
} else {
cond
};
// `!expr` appears as unary_expression, not_operator, or prefix_unary_expression
// with a `!` or `not` operator child.
let is_negation_wrapper = matches!(
cond.kind(),
"unary_expression" | "not_operator" | "prefix_unary_expression" | "unary_not"
);
if is_negation_wrapper {
// Check if the first child is a `!` or `not` operator.
let has_not = cond
.child(0)
.is_some_and(|c| c.kind() == "!" || c.kind() == "not");
if has_not {
// Return the operand (inner expression after the `!` / `not`).
let inner = cond
.child_by_field_name("argument")
.or_else(|| cond.child_by_field_name("operand"))
.or_else(|| {
// Last non-operator child.
let mut cursor = cond.walk();
cond.children(&mut cursor)
.filter(|c| c.kind() != "!" && c.kind() != "not")
.last()
})
.unwrap_or(cond);
return (inner, true);
}
}
(cond, false)
}
/// Extract a binary operator from an AST node.
///
/// Covers arithmetic, bitwise, and comparison operators. Conservative
/// policy: only returns `Some(BinOp)` when the AST node directly IS a
/// binary expression or is an assignment/expression wrapper containing
/// a single binary expression as its immediate RHS. Returns `None` for
/// nested binary expressions, compound assignments (`+=`), boolean
/// operators (`&&`, `||`), and any ambiguous cases.
fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
// Find the binary expression node: either ast itself or immediate child.
let bin_expr = find_single_binary_expr(ast, lang)?;
// Walk children to find the operator token (anonymous node between operands).
let mut cursor = bin_expr.walk();
for child in bin_expr.children(&mut cursor) {
if child.is_named() {
continue; // Skip named children (operands)
}
let kind = child.kind();
return match kind {
"+" => Some(BinOp::Add),
"-" => Some(BinOp::Sub),
"*" => Some(BinOp::Mul),
"/" => Some(BinOp::Div),
"%" => Some(BinOp::Mod),
// Bitwise (single-char tokens, no conflict with && / ||)
"&" => Some(BinOp::BitAnd),
"|" => Some(BinOp::BitOr),
"^" => Some(BinOp::BitXor),
"<<" => Some(BinOp::LeftShift),
">>" => Some(BinOp::RightShift),
// Comparison (=== / !== are JS/TS strict equality)
"==" | "===" => Some(BinOp::Eq),
"!=" | "!==" => Some(BinOp::NotEq),
"<" => Some(BinOp::Lt),
"<=" => Some(BinOp::LtEq),
">" => Some(BinOp::Gt),
">=" => Some(BinOp::GtEq),
_ => None, // Boolean (&&, ||), assignment ops, etc.
};
}
None
}
/// Find the RHS value node of an assignment-like AST node (variable declarator,
/// lexical declaration, assignment expression). Used by helpers that need to
/// inspect what an identifier is being initialized to.
fn assignment_rhs<'a>(ast: Node<'a>) -> Option<Node<'a>> {
match ast.kind() {
"variable_declarator" | "assignment_expression" | "assignment" => ast
.child_by_field_name("value")
.or_else(|| ast.child_by_field_name("right")),
"variable_declaration" | "lexical_declaration" => {
// Walk direct children for the first variable_declarator with a value.
let mut w = ast.walk();
ast.named_children(&mut w)
.find(|c| c.kind() == "variable_declarator")
.and_then(|d| {
d.child_by_field_name("value")
.or_else(|| d.child_by_field_name("right"))
})
}
"expression_statement" => {
// expression_statement wraps an assignment_expression
let mut w = ast.walk();
ast.named_children(&mut w).find_map(|c| match c.kind() {
"assignment_expression" | "assignment" => c
.child_by_field_name("right")
.or_else(|| c.child_by_field_name("value")),
_ => None,
})
}
_ => None,
}
}
/// Extract a constant leading string prefix from an assignment-like node's
/// RHS when the RHS is a JS/TS template literal beginning with a
/// `string_fragment` or a binary `+` expression whose left operand is a string
/// literal. Returns `None` if the grammar does not expose such a shape.
///
/// The recovered prefix is used by the abstract string domain to seed a
/// `StringFact::from_prefix` on the result SSA value. For SSRF detection,
/// when the prefix contains `scheme://host/`, the sink is suppressed because
/// the attacker cannot reach a different host.
fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option<String> {
// Only JS/TS expose `template_string` nodes; cheap early exit elsewhere.
if !matches!(lang, "javascript" | "typescript") {
return None;
}
// Assignment-like node: inspect the RHS directly.
if let Some(rhs) = assignment_rhs(ast) {
if let Some(p) = prefix_of_expression(rhs, code) {
return Some(p);
}
}
// Call expression (including sink call nodes): inspect the first
// positional argument. Covers `axios.get(\`https://host/…${x}\`)` shape
// where the template literal is inline at the sink.
if matches!(ast.kind(), "call_expression" | "call" | "new_expression") {
let args = ast
.child_by_field_name("arguments")
.or_else(|| ast.child_by_field_name("argument_list"));
if let Some(args_node) = args {
let mut w = args_node.walk();
if let Some(first) = args_node.named_children(&mut w).next() {
if let Some(p) = prefix_of_expression(first, code) {
return Some(p);
}
}
}
}
None
}
/// Return the leading constant string of `node` if it is a template literal or
/// a left-associated `"lit" + x` binary expression. Used by
/// `extract_template_prefix` for both assignment RHS and call arguments.
///
/// Also descends through `await` / `yield` wrappers and into the first
/// argument of a call expression, this covers the common sink shape
/// `await axios.get(\`https://host/…${x}\`)` where the template literal lives
/// inside a call inside an `await` wrapper.
fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
// Unwrap trivial wrappers (parentheses, TS `as` / type assertions, await/yield).
let mut cur = node;
for _ in 0..6 {
match cur.kind() {
"parenthesized_expression" => {
cur = cur.named_child(0)?;
}
"as_expression" | "type_assertion" | "satisfies_expression" | "non_null_expression" => {
cur = cur
.child_by_field_name("expression")
.or_else(|| cur.named_child(0))?;
}
"await_expression" | "yield_expression" => {
cur = cur.named_child(0)?;
}
"call_expression" | "call" | "new_expression" => {
// Descend into the first positional argument (e.g.
// `axios.get(\`https://…${x}\`)`, the URL we want to lock
// is the template-literal first argument of the call).
let args = cur
.child_by_field_name("arguments")
.or_else(|| cur.child_by_field_name("argument_list"))?;
let mut w = args.walk();
cur = args.named_children(&mut w).next()?;
}
_ => break,
}
}
// Case 1: template literal, `\`scheme://host/…${x}…\``.
if cur.kind() == "template_string" {
let mut w = cur.walk();
let first_child = cur.named_children(&mut w).next()?;
// Leading fragment only counts when the very first piece is a literal
// text fragment (not an interpolation like `\`${x}…\``).
if first_child.kind() == "string_fragment" {
let frag = text_of(first_child, code)?;
if !frag.is_empty() {
return Some(frag);
}
}
return None;
}
// Case 2: `"scheme://host/" + x`, LHS is a string literal.
if cur.kind() == "binary_expression" {
let mut w2 = cur.walk();
let mut ops = cur.children(&mut w2).filter(|c| !c.is_named());
if !ops.any(|c| c.kind() == "+") {
return None;
}
let left = cur.named_child(0)?;
if matches!(left.kind(), "string" | "string_fragment") {
let raw = text_of(left, code)?;
let trimmed = if (raw.starts_with('"') && raw.ends_with('"'))
|| (raw.starts_with('\'') && raw.ends_with('\''))
|| (raw.starts_with('`') && raw.ends_with('`'))
{
if raw.len() >= 2 {
raw[1..raw.len() - 1].to_string()
} else {
raw
}
} else {
raw
};
if !trimmed.is_empty() {
return Some(trimmed);
}
}
}
None
}
/// Extract the numeric literal operand from a binary expression.
///
/// When a binary expression has one identifier operand (captured in `uses`)
/// and one numeric literal operand, this returns the parsed literal value.
/// Used for abstract-domain transfer when the SSA only has the identifier use.
fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option<i64> {
let bin_expr = find_single_binary_expr(ast, lang)?;
// Look for a numeric literal child
let left = bin_expr.named_child(0)?;
let right = bin_expr.named_child(1)?;
fn try_parse_number(n: Node, code: &[u8]) -> Option<i64> {
let kind = n.kind();
if kind == "number"
|| kind == "integer"
|| kind == "integer_literal"
|| kind == "number_literal"
|| kind == "float"
{
let text = std::str::from_utf8(&code[n.byte_range()]).ok()?.trim();
// Try standard decimal parse first
if let Ok(v) = text.parse::<i64>() {
return Some(v);
}
// Try hex (0x...), octal (0o...), binary (0b...) prefixed literals
if let Some(hex) = text.strip_prefix("0x").or_else(|| text.strip_prefix("0X")) {
return i64::from_str_radix(hex, 16).ok();
}
if let Some(oct) = text.strip_prefix("0o").or_else(|| text.strip_prefix("0O")) {
return i64::from_str_radix(oct, 8).ok();
}
if let Some(bin) = text.strip_prefix("0b").or_else(|| text.strip_prefix("0B")) {
return i64::from_str_radix(bin, 2).ok();
}
None
} else {
None
}
}
// Try left, then right, one of them should be a literal
try_parse_number(left, code).or_else(|| try_parse_number(right, code))
}
/// Detect whether the expression(s) in `ast` produce a boolean-only result
/// rooted in equality/inequality comparisons against literals.
///
/// True when `ast` is (or wraps) either:
/// - a direct equality comparison (`==` / `!=` / `===` / `!==`) with exactly
/// one literal operand, or
/// - a compound boolean expression (`&&`, `||`, `!`, `and`, `or`, `not`)
/// whose every leaf is a qualifying equality comparison.
///
/// Covers JS/TS `binary_expression`, Python `comparison_operator`, Ruby
/// `binary`, and languages that share the `binary_expression` kind (Java, Go,
/// PHP, C/C++, Rust). Compound chains like `a === 'x' || b === 'y'` qualify
/// because their result is provably a boolean even though the taint engine
/// sees all leaf operands on a single CFG Assign node.
///
/// The SSA taint transfer uses this flag to suppress propagation of operand
/// taint into the boolean result: the outcome carries no attacker-controlled
/// data, so downstream ternaries/branches should not inherit operand caps.
pub(super) fn detect_eq_with_const(ast: Node, lang: &str) -> bool {
// Prefer inspecting the RHS of assignment-like wrappers, so flags on e.g.
// `var ok = a === 'x' || b === 'y'` examine the full right-hand side.
let target = assignment_rhs(ast).unwrap_or(ast);
is_boolean_eq_const_tree(target, lang)
}
/// Recursive predicate: does `node` evaluate to a boolean whose value is
/// determined solely by equality comparisons against literals, joined by
/// boolean operators? Parentheses, `!`/`not`, and `&&`/`||`/`and`/`or` are
/// transparent; every leaf must be a direct equality-with-constant.
fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
match node.kind() {
"parenthesized_expression" => node
.named_child(0)
.is_some_and(|c| is_boolean_eq_const_tree(c, lang)),
"unary_expression" | "not_operator" => {
// `!` / `not`, operator is an anonymous child; operand is the
// single named child.
let mut w = node.walk();
let mut op_is_not = false;
for child in node.children(&mut w) {
if !child.is_named() && matches!(child.kind(), "!" | "not") {
op_is_not = true;
break;
}
}
if !op_is_not {
return false;
}
node.named_child(0)
.is_some_and(|c| is_boolean_eq_const_tree(c, lang))
}
"boolean_operator" => {
// Python `and`/`or`, operands are named children.
let l = node.named_child(0);
let r = node.named_child(1);
l.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
&& r.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
}
_ => {
if !is_binary_expr_kind(node.kind(), lang) {
return false;
}
let op = binary_operator_token(node);
match op.as_deref() {
Some("&&") | Some("||") | Some("and") | Some("or") => {
node.named_child(0)
.is_some_and(|l| is_boolean_eq_const_tree(l, lang))
&& node
.named_child(1)
.is_some_and(|r| is_boolean_eq_const_tree(r, lang))
}
Some("==") | Some("===") | Some("!=") | Some("!==") => {
let Some(left) = node.named_child(0) else {
return false;
};
let Some(right) = node.named_child(1) else {
return false;
};
let left_lit = is_equality_literal_kind(left.kind());
let right_lit = is_equality_literal_kind(right.kind());
// Exactly one side literal. Both-literal is constant-fold
// territory; neither-literal is a generic identity check
// whose operands may both be tainted.
left_lit ^ right_lit
}
_ => false,
}
}
}
}
/// Return the anonymous operator token text of a binary expression node.
fn binary_operator_token(node: Node) -> Option<String> {
let mut w = node.walk();
for child in node.children(&mut w) {
if !child.is_named() {
return Some(child.kind().to_string());
}
}
None
}
/// Property names whose value is provably an integer across the supported
/// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size`
/// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count`
/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length`
/// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This
/// list is intentionally narrow, only properties whose semantics across every
/// host we scan return an integer, so the `TypeKind::Int` fact is sound.
fn is_numeric_length_property(name: &str) -> bool {
matches!(name, "length" | "size" | "byteLength" | "count" | "len")
}
/// Detect whether this CFG node is a read of a numeric-length property on a
/// container. Covers both pure property access (`arr.length` as the RHS of
/// an assignment or declaration) and zero-argument method calls
/// (`list.size()`, `vec.len()`). Returns `true` when the relevant value
/// expression is a `member_expression` / `attribute` / `selector_expression`
/// / `field_expression` whose property leaf matches
/// [`is_numeric_length_property`], or a zero-arg call around such an
/// expression.
///
/// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to
/// infer `TypeKind::Int` on the defined value so sink-cap suppression can
/// treat `"row " + arr.length` as a non-injectable payload.
/// when the RHS of an assignment / declaration is a single
/// member-access expression (`let x = dto.email`, `x = obj.field`,
/// `let x = obj["field"]`), return the property name. The CFG type-fact
/// analysis uses the recovered name to look up the field's declared
/// [`crate::ssa::type_facts::TypeKind`] when the receiver is a
/// [`crate::ssa::type_facts::TypeKind::Dto`].
///
/// Returns `None` for any other shape (function calls, complex
/// expressions, computed-key subscripts, optional-chaining, etc.) so
/// the legacy copy-prop / Unknown propagation continues to apply.
fn detect_member_field_assignment(ast: Node, code: &[u8]) -> Option<String> {
// Pull the RHS the same way `detect_numeric_length_access` does so
// both detectors look at the same node grain.
let target = ast
.child_by_field_name("value")
.or_else(|| ast.child_by_field_name("right"))
.or_else(|| {
let mut cursor = ast.walk();
ast.named_children(&mut cursor)
.find(|c| matches!(c.kind(), "variable_declarator" | "init_declarator"))
.and_then(|d| {
d.child_by_field_name("value")
.or_else(|| d.child_by_field_name("initializer"))
})
})
.unwrap_or(ast);
extract_member_field_name(target, code)
}
fn extract_member_field_name(node: Node, code: &[u8]) -> Option<String> {
match node.kind() {
// JS / TS / Java / C / C++ / Go (selector) / Rust (field).
"member_expression"
| "member_access_expression"
| "field_expression"
| "selector_expression"
| "attribute" => {
let prop = node
.child_by_field_name("property")
.or_else(|| node.child_by_field_name("attribute"))
.or_else(|| node.child_by_field_name("field"))
.or_else(|| node.child_by_field_name("name"))?;
let text = text_of(prop, code)?;
// Defensive: reject anything that doesn't look like an
// identifier (e.g. numeric subscripts). Allows ASCII
// letters / digits / underscore.
if text.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') && !text.is_empty() {
Some(text)
} else {
None
}
}
_ => None,
}
}
fn detect_numeric_length_access(ast: Node, _lang: &str, code: &[u8]) -> bool {
// Pull the value expression for variable declarations / assignments.
// Other node shapes (e.g. plain member-expression reads) are checked
// as-is.
let target = ast
.child_by_field_name("value")
.or_else(|| ast.child_by_field_name("right"))
.or_else(|| {
let mut cursor = ast.walk();
ast.named_children(&mut cursor)
.find(|c| matches!(c.kind(), "variable_declarator" | "init_declarator"))
.and_then(|d| {
d.child_by_field_name("value")
.or_else(|| d.child_by_field_name("initializer"))
})
})
.unwrap_or(ast);
is_numeric_length_access_expr(target, code)
}
fn is_numeric_length_access_expr(node: Node, code: &[u8]) -> bool {
match node.kind() {
"member_expression"
| "attribute"
| "selector_expression"
| "field_expression"
| "member_access_expression" => {
let prop = node
.child_by_field_name("property")
.or_else(|| node.child_by_field_name("attribute"))
.or_else(|| node.child_by_field_name("field"))
.or_else(|| node.child_by_field_name("name"));
prop.and_then(|p| text_of(p, code))
.is_some_and(|t| is_numeric_length_property(&t))
}
// Zero-arg method call: `list.size()` / `vec.len()` / `str.length()`.
"call_expression" | "method_invocation" | "method_call_expression" | "call" => {
let args = node
.child_by_field_name("arguments")
.or_else(|| node.child_by_field_name("argument_list"));
let arity = args
.map(|a| {
let mut c = a.walk();
a.named_children(&mut c).count()
})
.unwrap_or(0);
if arity != 0 {
return false;
}
let callee = node
.child_by_field_name("function")
.or_else(|| node.child_by_field_name("name"))
.or_else(|| node.child_by_field_name("method"));
match callee {
Some(c) => is_numeric_length_access_expr(c, code),
None => false,
}
}
_ => false,
}
}
/// Literal kinds accepted for equality-with-constant detection. Conservatively
/// limited to scalar literals across the supported tree-sitter grammars.
fn is_equality_literal_kind(kind: &str) -> bool {
matches!(
kind,
// Strings
"string"
| "string_literal"
| "interpreted_string_literal"
| "raw_string_literal"
| "encapsed_string"
// Numbers
| "number"
| "integer"
| "float"
| "integer_literal"
| "float_literal"
| "number_literal"
| "decimal_integer_literal"
| "hex_integer_literal"
| "octal_integer_literal"
| "binary_integer_literal"
| "decimal_floating_point_literal"
| "hex_floating_point_literal"
// Null / nil / none / undefined
| "null"
| "null_literal"
| "nil"
| "none"
| "undefined"
// Booleans
| "true"
| "false"
| "boolean_literal"
)
}
/// Find a single binary expression node at or directly under `ast`.
///
/// Returns `None` if there are zero or multiple binary expressions
/// (ambiguous). Only descends one level into assignment/expression wrappers.
fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
let ast_kind = ast.kind();
// Check if ast itself is a binary expression
if is_binary_expr_kind(ast_kind, lang) {
// Verify it has exactly 2 named children (left, right), no nesting
let named_count = ast.named_child_count();
if named_count == 2 {
// Ensure neither child is itself a binary expression (that would
// mean the operator is for a compound expression like `a + b * c`)
let left = ast.named_child(0);
let right = ast.named_child(1);
let left_is_bin = left.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
let right_is_bin = right.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
if !left_is_bin && !right_is_bin {
return Some(ast);
}
}
return None; // Nested or complex
}
// Check one level down for assignment wrappers, expression statements, etc.
let wrapper_kinds = [
"expression_statement",
"assignment_expression",
"assignment",
"variable_declaration",
"variable_declarator",
"short_var_declaration",
"lexical_declaration",
];
if wrapper_kinds.contains(&ast_kind) || ast_kind.ends_with("_statement") {
let mut found: Option<Node<'a>> = None;
let mut cursor = ast.walk();
for child in ast.named_children(&mut cursor) {
if is_binary_expr_kind(child.kind(), lang) {
if found.is_some() {
return None; // Multiple binary expressions → ambiguous
}
// Same check: must have exactly 2 non-binary named children
if child.named_child_count() == 2 {
let l = child.named_child(0);
let r = child.named_child(1);
let l_bin = l.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
let r_bin = r.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
if !l_bin && !r_bin {
found = Some(child);
}
}
} else if wrapper_kinds.contains(&child.kind()) {
// Recurse one more level into nested wrappers (e.g.,
// variable_declaration → variable_declarator → binary_expression)
let mut inner_cursor = child.walk();
for grandchild in child.named_children(&mut inner_cursor) {
if is_binary_expr_kind(grandchild.kind(), lang) {
if found.is_some() {
return None;
}
if grandchild.named_child_count() == 2 {
let l = grandchild.named_child(0);
let r = grandchild.named_child(1);
let l_bin = l.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
let r_bin = r.is_some_and(|n| is_binary_expr_kind(n.kind(), lang));
if !l_bin && !r_bin {
found = Some(grandchild);
}
}
}
}
}
}
return found;
}
None
}
/// Check if an AST node kind is a binary expression in the given language.
///
/// Python uses `binary_operator` for arithmetic/bitwise and
/// `comparison_operator` for comparisons. Chained Python comparisons
/// (`a < b < c`) have 3+ named children and are rejected by the
/// `named_child_count() == 2` guard in `find_single_binary_expr`.
fn is_binary_expr_kind(kind: &str, lang: &str) -> bool {
match lang {
"python" => kind == "binary_operator" || kind == "comparison_operator",
"ruby" => kind == "binary",
_ => kind == "binary_expression",
}
}
/// Create a node in one short borrow and optionally attach a taint label.
#[allow(clippy::too_many_arguments)]
pub(super) fn push_node<'a>(
g: &mut Cfg,
kind: StmtKind,
ast: Node<'a>,
lang: &str,
code: &'a [u8],
enclosing_func: Option<&str>,
call_ordinal: u32,
analysis_rules: Option<&LangAnalysisRules>,
) -> NodeIndex {
/* ── 1. IDENTIFIER EXTRACTION ─────────────────────────────────────── */
// Primary guess (varies by AST kind)
let mut text = match lookup(lang, ast.kind()) {
// plain `foo(bar)` style call
Kind::CallFn => ast
.child_by_field_name("function")
.or_else(|| ast.child_by_field_name("method"))
.or_else(|| ast.child_by_field_name("name"))
.or_else(|| ast.child_by_field_name("type"))
// JS/TS `new_expression` uses `constructor` field.
.or_else(|| ast.child_by_field_name("constructor"))
// Fallback for constructors whose grammar lacks field names
// (e.g. PHP `object_creation_expression` has positional children).
.or_else(|| find_constructor_type_child(ast))
.and_then(|n| {
// IIFE: `(function(x){...})(arg)`, the called expression is a
// function literal with no identifier. Bind the call to the
// anonymous body's synthetic name so resolve_callee can find
// the extracted BodyCfg/summary. Without this, text_of() would
// return the function's full source slice, which matches no
// summary key.
let unwrapped = unwrap_parens(n);
if lookup(lang, unwrapped.kind()) == Kind::Function {
Some(anon_fn_name(unwrapped.start_byte()))
} else {
text_of(n, code)
}
})
.unwrap_or_default(),
// method / UFCS call `recv.method()` or `Type::func()`
Kind::CallMethod => {
let func = ast
.child_by_field_name("method")
.or_else(|| ast.child_by_field_name("name"))
.and_then(|n| text_of(n, code));
let recv = ast
.child_by_field_name("object")
.or_else(|| ast.child_by_field_name("receiver"))
.or_else(|| ast.child_by_field_name("scope"))
.and_then(|n| root_receiver_text(n, lang, code));
match (recv, func) {
(Some(r), Some(f)) => format!("{r}.{f}"),
(_, Some(f)) => f,
_ => String::new(),
}
}
// `my_macro!(…)`
Kind::CallMacro => ast
.child_by_field_name("macro")
.and_then(|n| text_of(n, code))
.unwrap_or_default(),
// Function definitions: use just the function name, not the full
// body text. The raw body text can spuriously match label rules
// (e.g. `def search\n find_by_sql(…)\nend` would suffix-match
// the `find_by_sql` sink via the `head = text.split('(')` logic
// in classify_all).
Kind::Function => ast
.child_by_field_name("name")
.or_else(|| ast.child_by_field_name("declarator"))
.and_then(|n| text_of(n, code))
.unwrap_or_default(),
// everything else fallback to raw slice
_ => text_of(ast, code).unwrap_or_default(),
};
// C++ new/delete: normalize callee to "new"/"delete" for resource pair
// matching. Without this, new_expression extracts the type name (e.g.
// "int") and delete_expression extracts the full expression text.
// Guarded to C++ only so JS/TS `new_expression` is unaffected.
if lang == "cpp" {
if ast.kind() == "new_expression" {
text = "new".to_string();
} else if ast.kind() == "delete_expression" {
text = "delete".to_string();
}
}
// Ruby backtick shell execution: the `subshell` AST node has no
// `function`/`method` field so the CallFn text extraction above yields
// "". Stamp a synthetic callee name so the Sink(SHELL_ESCAPE) rule in
// labels/ruby.rs fires.
if lang == "ruby" && ast.kind() == "subshell" {
text = "subshell".to_string();
}
// If this is a declaration/expression wrapper or an assignment that
// *contains* a call, prefer the first inner call identifier instead of
// the whole line. Track the inner call's byte span so we can populate
// `CallMeta.callee_span` once the labels settle, enabling narrow
// source-location reporting when the classified call lives several lines
// below the enclosing statement (e.g. call inside a multi-line template
// literal).
let mut inner_text_span: Option<(usize, usize)> = None;
if matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment | Kind::Return
) {
if let Some((inner, inner_span)) = first_call_ident_with_span(ast, lang, code) {
text = inner;
inner_text_span = Some(inner_span);
} else if matches!(lookup(lang, ast.kind()), Kind::CallWrapper) {
// Fallback for language-construct "calls" (e.g. PHP `echo_statement`,
// `print` expression): the first child is a keyword leaf (e.g. "echo")
// that acts as a callee but is not a function_call_expression.
let mut cursor = ast.walk();
if let Some(first) = ast.children(&mut cursor).next()
&& first.child_count() == 0
&& let Some(kw) = text_of(first, code)
&& kw.len() <= 16
{
text = kw;
inner_text_span = Some((first.start_byte(), first.end_byte()));
}
}
}
/* ── 2. LABEL LOOK-UP ───────────────────────────────────────────── */
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
let mut labels = classify_all(lang, &text, extra);
// Rust chain-text classification. The default `text` for a Rust
// CallMethod is `{root_receiver}.{method}`, where `root_receiver`
// is the leftmost identifier after walking through every nested
// call/method receiver. That convention loses the intermediate
// chain methods, so a body-binding chain like
// `Client::post(url).body(payload).send()` reduces to
// `Client::post.send` and rules keyed on `body.send` /
// `RequestBuilder.body` cannot fire.
//
// Reclassify against the call-AST's source text (with paren groups
// stripped) so suffix matchers covering chain shapes
// (`body.send`, `body_string`, `Request::builder.body`, ...) attach.
// Strictly additive: we union new labels with the existing ones,
// never override. Limited to Rust to avoid disturbing the other
// languages' chain conventions.
if lang == "rust" {
if let Some(cn) = find_call_node(ast, lang) {
if let Some(chain_raw) = text_of(cn, code) {
// Multi-line Rust chains (`Client::new()\n .post(url)\n
// .body(p)\n .send()`) preserve interior whitespace in
// the source slice, which would prevent suffix matchers
// like `body.send` from firing. Strip whitespace before
// normalizing paren groups, mirroring the same trick
// used by `find_chained_inner_call` for JS/TS chains.
let chain_compact: String =
chain_raw.chars().filter(|c| !c.is_whitespace()).collect();
let chain_text = crate::labels::normalize_chained_call_for_classify(&chain_compact);
if chain_text != text {
let chain_labels = classify_all(lang, &chain_text, extra);
for l in chain_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Also try classification against the chain with
// trailing identity methods peeled. Rust chains often
// end in `.unwrap()` / `.expect("...")` / `.await` /
// `.clone()` etc., which obscure the body-bind verb
// for suffix matchers. E.g. hyper's
// `Request::builder().method(..).uri(..).body(p).unwrap()`
// peels to `...body`, allowing a simpler `body` /
// `Request::builder.body` matcher to fire.
let peeled = crate::ssa::type_facts::peel_identity_suffix(&chain_text);
if peeled != chain_text && peeled != text {
let peeled_labels = classify_all(lang, &peeled, extra);
for l in peeled_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Pattern synthesis: the hyper request-builder chain
// (`hyper::Request::builder().method(..).uri(..).body(p)`)
// can interleave `.method`, `.uri`, `.header`, `.version`
// etc. between `Request::builder` and the body-bind step.
// Suffix matchers can't span those, so synthesise a
// DATA_EXFIL sink whenever the chain begins with
// `Request::builder` and ends in a body-binding verb.
// Strictly additive: no labels are removed, only added,
// and the synthesis only fires when an explicit Sink
// hasn't already attached.
let chain_for_synth = if peeled != chain_text {
&peeled
} else {
&chain_text
};
if !labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::DATA_EXFIL)))
&& (chain_for_synth.contains("Request::builder.")
|| chain_for_synth.contains("hyper::Request::builder."))
{
let last_seg =
chain_for_synth.rsplit('.').next().unwrap_or(chain_for_synth);
if matches!(
last_seg,
"body" | "body_mut" | "body_string" | "body_json" | "body_bytes"
) {
labels.push(DataLabel::Sink(crate::labels::Cap::DATA_EXFIL));
}
}
}
}
}
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is.
// When the callee is overridden, save the original for container ops
// (e.g. `parts.add(req.getParameter(...))`, callee becomes
// "req.getParameter" but outer_callee preserves "parts.add").
//
// Statement-level calls in languages without a separate
// `expression_statement` wrapper (Ruby, where `body_statement` directly
// contains the call AST node) reach `push_node` with `ast.kind() ==
// "call"` (`Kind::CallMethod`) rather than `Kind::CallWrapper`. Without
// including the call kinds in the gate, an unclassified outer wrapper
// around a sink (e.g. `YAML.safe_load(File.read(filename))` or
// `String.new(File.read(x))`) loses the inner sink's classification
// entirely — the outer call becomes a non-sink node, and the inner call
// is not emitted as a standalone CFG node because it sits inside the
// outer's `argument_list`. Cross-function summary extraction then
// misses the `param_to_sink` for the wrapper helper, breaking detection
// of every chain-style sink wrapper used in real Ruby CVEs (rswag
// CVE-2023-38337, the Marshal/JSON/YAML-of-File.read pattern, etc.).
let mut outer_callee: Option<String> = None;
let mut inner_callee_span: Option<(usize, usize)> = None;
if labels.is_empty()
&& matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper
| Kind::Assignment
| Kind::Return
| Kind::CallFn
| Kind::CallMethod
| Kind::CallMacro
)
&& let Some((inner_text, inner_label, inner_span)) =
find_classifiable_inner_call(ast, lang, code, extra)
{
labels.push(inner_label);
outer_callee = Some(text.clone());
text = inner_text;
inner_callee_span = Some(inner_span);
}
// For assignments like `element.innerHTML = value`, the inner-call heuristic
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
// If that didn't produce a label, check the LHS property name, it may be a
// sink like `innerHTML`.
//
// This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper`
// nodes (expression_statement) that wrap an assignment.
if labels.is_empty() {
let assign_node = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
Some(ast)
} else if matches!(lookup(lang, ast.kind()), Kind::CallWrapper) {
// Walk children to find a nested assignment_expression
let mut cursor = ast.walk();
ast.children(&mut cursor)
.find(|c| matches!(lookup(lang, c.kind()), Kind::Assignment))
} else {
None
};
if let Some(assign) = assign_node
&& let Some(lhs) = assign.child_by_field_name("left")
{
// Try full member expression first (e.g. "location.href"), more
// specific and avoids false positives on `a.href`.
if let Some(full) = member_expr_text(lhs, code) {
if let Some(l) = classify(lang, &full, extra) {
labels.push(l);
}
}
// Fall back to property-only (e.g. "innerHTML") for sinks that
// don't need object context.
if labels.is_empty()
&& let Some(prop) = lhs.child_by_field_name("property")
&& let Some(prop_text) = text_of(prop, code)
{
if let Some(l) = classify(lang, &prop_text, extra) {
labels.push(l);
}
}
}
}
// For declarations/assignments whose RHS is a member expression (not a call),
// try to classify the member expression text as a source.
// This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python),
// and similar property-access-based source patterns.
// Skip when the assignment's RHS is itself a function/lambda literal ,
// labels found by `first_member_label` would come from inside the
// closure body and shouldn't tag the outer wrapper (e.g. Go's
// `run := func() { exec.Command(...) }` would otherwise inherit
// `exec.Command`'s Sink label). The function literal is handled as
// its own scope by `collect_nested_function_nodes`.
if labels.is_empty()
&& matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment
)
&& !rhs_is_function_literal(ast, lang)
&& let Some(found) = first_member_label(ast, lang, code, extra)
{
labels.push(found);
// Update text so the callee name reflects the source.
// Preserve the original callee in outer_callee so inter-procedural
// summary resolution can still find the wrapping function
// (e.g. `storeInto(req.query.input, items)` → callee="req.query.input"
// but outer_callee="storeInto").
if let Some(member_text) = first_member_text(ast, code) {
if outer_callee.is_none() && text != member_text {
outer_callee = Some(text.clone());
}
text = member_text;
}
}
// For `if let` / `while let` patterns: try to classify the value expression
// in the let-condition as a source/sink. E.g. `if let Ok(cmd) = env::var("CMD")`
// should recognise `env::var` as a taint source and label this node accordingly.
if labels.is_empty()
&& matches!(lookup(lang, ast.kind()), Kind::If | Kind::While)
&& let Some(cond) = ast.child_by_field_name("condition")
&& cond.kind() == "let_condition"
&& let Some(val) = cond.child_by_field_name("value")
{
if let Some((ident, ident_span)) = first_call_ident_with_span(val, lang, code)
&& let Some(l) = classify(lang, &ident, extra)
{
labels.push(l);
text = ident;
if inner_text_span.is_none() {
inner_text_span = Some(ident_span);
}
}
if labels.is_empty()
&& let Some(ident_text) = text_of(val, code)
&& let Some(l) = classify(lang, &ident_text, extra)
{
labels.push(l);
text = ident_text;
}
}
// Hoist call-node lookup: reused for gated sinks and arg_uses.
let mut call_ast = find_call_node(ast, lang);
// Chained-call inner-gate rebinding. When the outer call is a method-
// chain wrapper whose receiver is itself a call to a known gated sink
// (e.g. `http.get(uri, cb).on('error', e => ...)` or
// `axios.get(url).then(handler).catch(handler)`), the outer callee
// (`.on`, `.catch`) doesn't classify and the inner sink is invisible to
// gate classification + arg-use extraction. Rebind to the inner call
// so its sink fires and its args are checked.
//
// Only fires when:
// * `labels.is_empty()` (the outer call is non-classified)
// * the chain has a real inner call_expression
// * that inner callee actually matches a gate matcher for this lang
//
// Motivated by CVE-2025-64430 (Parse Server SSRF).
if labels.is_empty()
&& let Some(outer) = call_ast
&& let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code)
&& !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty()
{
call_ast = Some(inner);
outer_callee = Some(text.clone());
text = inner_callee_text;
inner_callee_span = Some((inner.start_byte(), inner.end_byte()));
}
// Gated sinks: argument-sensitive classification (e.g., setAttribute).
// Runs for any node containing a classifiable call, regardless of StmtKind.
//
// Prefer the shallow `call_ast` from `find_call_node` when available, but
// fall back to a deeper walk (up to 4 levels) so wrapped calls still reach
// the gate. This is necessary for forms like `var r = await fetch(url)`
// (variable_declaration > variable_declarator > await_expression >
// call_expression) where the call sits at depth 3. When using the deeper
// walker we must also derive the callee text from the inner call node, not
// the outer statement `text`, so gate matcher names like `"fetch"` hit.
let mut sink_payload_args: Option<Vec<usize>> = None;
let mut destination_uses: Option<Vec<String>> = None;
let mut gate_filters: Vec<GateFilter> = Vec::new();
// Gates run when no flat `Sink` label is already present, OR when a
// matching gate restricts the payload-arg set on top of an existing flat
// sink. Source / Sanitizer labels are orthogonal — a callee like
// Python's `requests.post` is a `Source` for its response object AND a
// gated `Sink` for its URL/body argument positions; both should attach.
//
// Payload-arg refinement: when a flat sink matches a callee that ALSO
// has a gate entry restricting `payload_args`, the gate's `payload_args`
// are propagated to `sink_payload_args` so only those positions are
// taint-checked. Example: `execSync(cmd, { env: process.env })` matches
// the bare `execSync` flat `Sink(SHELL_ESCAPE)` AND the gate `=execSync`
// with `payload_args: &[0]`; without the refinement, the flat rule's
// implicit "all args" would flag `process.env` flowing into the options
// object's `env` field. The gate's labels themselves are deduped so a
// single capability never double-attributes.
let has_sink_label = labels.iter().any(|l| matches!(l, DataLabel::Sink(_)));
{
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(cn) = gate_call {
let gate_callee_text = if call_ast.is_some() {
text.clone()
} else {
// Inner call reached via wrapper, use the call-expression's
// function name directly. Falls back to `text` so non-call-
// expression kinds (method calls, Ruby `call` nodes, macros)
// still have a usable callee string.
cn.child_by_field_name("function")
.or_else(|| cn.child_by_field_name("method"))
.or_else(|| cn.child_by_field_name("name"))
.and_then(|f| text_of(f, code))
.unwrap_or_else(|| text.clone())
};
let matches = classify_gated_sink(
lang,
&gate_callee_text,
|idx| {
extract_const_string_arg(cn, idx, code).or_else(|| {
// C/C++ preprocessor macros and PHP `define`d constants
// surface as identifier nodes, not string literals.
// Falling back to the macro-arg extractor for those
// languages lets gates like `curl_easy_setopt` /
// `curl_setopt` activate on a `CURLOPT_POSTFIELDS`
// ident match instead of firing conservatively on
// every positional arg.
if matches!(lang, "c" | "cpp" | "c++" | "php") {
extract_const_macro_arg(cn, idx, code)
} else {
None
}
})
},
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
);
if !matches.is_empty() {
// Per-gate filter accumulation. Each match contributes:
// * its label (added to `labels` so `resolve_sink_caps`
// downstream sees the union),
// * a `GateFilter` carrying that gate's specific
// `(label_caps, payload_args, destination_uses)` so
// the SSA sink scan can attribute taint per-cap.
//
// When a flat sink already matches, gate labels are deduped
// so the same capability isn't attributed twice (once flat,
// once gated). Their `payload_args` still flow into
// `sink_payload_args` so the gate's arg-position restriction
// applies on top of the flat sink.
let mut union_payload: Vec<usize> = Vec::new();
for gm in &matches {
if has_sink_label {
if !labels.contains(&gm.label) {
labels.push(gm.label);
}
} else {
labels.push(gm.label);
}
let mut payload_vec: Vec<usize> =
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual
// call arity so `collect_tainted_sink_values` checks
// each one.
let arity = extract_arg_uses(cn, code).len();
(0..arity).collect()
} else {
gm.payload_args.to_vec()
};
// Destination-aware gates: when the gate declares
// destination-bearing object fields and a payload-position
// arg is an object literal at call time, narrow sink-taint
// checks to identifiers under those fields. Non-object
// arg forms return `None` from the extractor and the gate
// falls back to whole-arg positional filtering.
//
// The pair form preserves which object-literal field each
// ident was bound to (e.g. `body` vs `headers` vs `json`)
// so diag rendering can attribute `DATA_EXFIL` findings to
// a specific destination field.
let mut dest_uses: Option<Vec<String>> = None;
let mut dest_fields: Vec<String> = Vec::new();
if !gm.object_destination_fields.is_empty() {
let mut all_pairs: Vec<(String, String)> = Vec::new();
let mut had_object_match = false;
for &pos in gm.payload_args {
if let Some(pairs) = extract_destination_field_pairs(
cn,
pos,
gm.object_destination_fields,
code,
) {
all_pairs.extend(pairs);
had_object_match = true;
break;
}
}
// Direct kwargs: languages where destination-bearing
// fields are passed as `keyword_argument` siblings of
// the positional args (Python `data=`, Ruby kwargs).
// SSA lowering folds kwarg idents into the implicit
// args group at index `arity`, so we expand
// `payload_vec` to include that position; the
// `destination_filter` then narrows to the kwarg
// ident's `var_name`.
let kwarg_pairs =
extract_destination_kwarg_pairs(cn, gm.object_destination_fields, code);
if !kwarg_pairs.is_empty() {
let arity = extract_arg_uses(cn, code).len();
if !payload_vec.contains(&arity) {
payload_vec.push(arity);
}
for pair in kwarg_pairs {
if !all_pairs.iter().any(|(_, v)| v == &pair.1) {
all_pairs.push(pair);
}
}
}
if had_object_match || !all_pairs.is_empty() {
let (fields, vars): (Vec<String>, Vec<String>) =
all_pairs.into_iter().unzip();
dest_uses = Some(vars);
dest_fields = fields;
}
}
let label_caps = match gm.label {
crate::labels::DataLabel::Sink(c) => c,
_ => crate::labels::Cap::empty(),
};
for &p in &payload_vec {
if !union_payload.contains(&p) {
union_payload.push(p);
}
}
gate_filters.push(GateFilter {
label_caps,
payload_args: payload_vec,
destination_uses: dest_uses,
destination_fields: dest_fields,
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path keeps `destination_uses` populated so
// the SSA fast-path (one filter) continues to work without
// consulting `gate_filters`. When multiple gates match,
// per-position filters live in `gate_filters` and the legacy
// field is intentionally left `None`.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
}
// ── Inline shell-array sink synthesis ────────────────────────────────
//
// Recognise `[<shell>, "-c", <payload>]` (and `cmd /c <payload>`)
// appearing as an argument to *any* call. The shell-array shape itself
// is the gate, regardless of callee, so this fires through user-defined
// wrappers like `execInContainer(id, ["bash", "-c", `echo ${tainted}`])`
// without needing per-wrapper summary annotations. Only fires for JS/TS
// because the array-literal grammar (`array` node) and shell-form usage
// are JS/TS conventions; other languages use different shapes for
// shell-exec wrappers.
//
// The inner array also covers Dockerode's
// `container.exec({Cmd: [shell, "-c", payload]})`: the helper looks
// inside object-literal args for shell-array values under any field.
//
// Existing FP carve-outs are preserved. `["ls", "-la"]` doesn't match
// (element 0 is not a known shell). `untaintedArrayVariable` doesn't
// match (variable, not literal). `execSync(cmd, { env: process.env })`
// doesn't match (string + object args, no shell-array literal). When
// the payload elements are constant strings the helper returns no
// match, so a literal `["bash", "-c", "ls -la"]` doesn't fire either.
if matches!(lang, "javascript" | "js" | "typescript" | "ts") {
if let Some(cn) = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)) {
let shell_matches = extract_shell_array_payload_idents(cn, code);
if !shell_matches.is_empty() {
let shell_label = DataLabel::Sink(Cap::SHELL_ESCAPE);
let already_has_shell_sink = labels.iter().any(|l| match l {
DataLabel::Sink(c) => c.contains(Cap::SHELL_ESCAPE),
_ => false,
});
if !already_has_shell_sink {
labels.push(shell_label);
}
let mut union_payload: Vec<usize> = sink_payload_args.clone().unwrap_or_default();
for sm in shell_matches {
if !union_payload.contains(&sm.arg_position) {
union_payload.push(sm.arg_position);
}
gate_filters.push(GateFilter {
label_caps: Cap::SHELL_ESCAPE,
payload_args: vec![sm.arg_position],
destination_uses: Some(sm.payload_idents),
destination_fields: Vec::new(),
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path: when this is the only gate filter,
// populate the top-level destination_uses too so the SSA
// fast-path stays consistent with the multi-gate behaviour.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
}
// Pattern-based sanitizer synthesis: recognise a Rust
// `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain that provably strips
// path-traversal or HTML metacharacters. The CFG collapses the whole
// chain into a single call node, so detection must inspect the AST of
// that node directly. Only fires when no Sanitizer label already
// classifies this node, existing label rules win.
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
if let Some(cn) = call_ast {
if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" {
if let Some(caps) = detect_rust_replace_chain_sanitizer(cn, code) {
labels.push(DataLabel::Sanitizer(caps));
}
}
}
}
// Pattern-based sanitizer synthesis for Go's `strings.Replace` /
// `strings.ReplaceAll`. When the call's OLD literal contains a known
// dangerous payload (shell metachars, path-traversal, HTML, SQL) and
// the NEW literal does not reintroduce one, treat the call as a
// Sanitizer over the matching caps. Same precedence as the Rust
// chain synthesis: explicit Sanitizer labels win, but otherwise the
// synthesised label feeds the standard sanitizer pathway in the
// taint engine. Motivated by helpers like
// `func validate(s string) string { return strings.ReplaceAll(s, ";", "") }`
// whose return is appended to a slice that later flows into
// `exec.Command(slice[i])`.
if lang == "go" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
if let Some(cn) = call_ast {
if cn.kind() == "call_expression" {
if let Some(caps) = detect_go_replace_call_sanitizer(cn, code) {
labels.push(DataLabel::Sanitizer(caps));
}
}
}
}
// Shape-based sanitizer synthesis for Ruby ActiveRecord query methods.
// The static label table marks `where` / `order` / `pluck` / `group` /
// `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation
// form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same
// methods are intrinsically parameterised when arg 0 is a hash, symbol,
// array, or non-interpolated string, Rails escapes the values. Rather
// than dropping the sink (which would lose the genuine TPs), synthesise
// a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears
// SQL taint at the call and reflexively dominates the sink, suppressing
// both `taint-unsanitised-flow` and `cfg-unguarded-sink` for the safe
// forms while leaving the dangerous ones to fire.
//
// Chained calls (`Model.where(...).preload(...).to_a`) collapse into a
// single CFG node whose outer `call_ast` may be `to_a` (no args). The
// shape inspection has to walk the receiver chain to reach the AR query
// call itself, `ruby_chain_arg0_for_method` does that walk.
if (lang == "ruby" || lang == "rb")
&& labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& !labels
.iter()
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
{
// Identify the matched AR query method from the callee `text`
// (e.g. "Issue.where" → "where", "joins(:project).where" → "where").
let leaf = text.rsplit(['.', ':']).next().unwrap_or(&text);
const AR_QUERY_METHODS: &[&str] = &["where", "order", "group", "having", "joins", "pluck"];
if AR_QUERY_METHODS.contains(&leaf) {
// Try the outer call's arg 0 first (handles direct calls);
// fall back to walking the receiver chain for collapsed
// chained-call CFG nodes.
let shape = call_ast
.and_then(arg0_kind_and_interpolation)
.or_else(|| ruby_chain_arg0_for_method(ast, &[leaf], code));
if let Some((arg0_kind, has_interp)) = shape
&& crate::labels::ruby::ar_query_safe_shape(&text, &arg0_kind, has_interp)
{
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
}
}
}
// Shape-based sanitizer synthesis for Java JPA / JDBC parameterised
// execute calls. `executeUpdate` and `executeQuery` are labelled
// `Sink(SQL_QUERY)` because the JDBC `Statement.executeUpdate(String)`
// and `Statement.executeQuery(String)` overloads are real injection
// sinks when given a concatenated SQL string. But the same method
// names on JPA `javax.persistence.Query` and JDBC `PreparedStatement`
// are zero-arg, they execute SQL that was bound upstream by
// `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`,
// and any bind values went through `setParameter` / `setString`
// (which the JDBC/JPA driver escapes). Walk the receiver chain to
// find the SQL-binding call and verify its arg 0 is a string literal;
// if so, synthesise a same-node `Sanitizer(SQL_QUERY)` which
// reflexively dominates the sink, suppressing both
// `cfg-unguarded-sink` and `taint-unsanitised-flow` for the safe
// chain shape while leaving `Statement.executeUpdate(concat)` and
// `createQuery(concat)` to fire as real findings.
if lang == "java"
&& labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& !labels
.iter()
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
{
let leaf = text.rsplit('.').next().unwrap_or(&text);
if matches!(leaf, "executeUpdate" | "executeQuery") {
// Outer call must be zero-arg (the prepared/parameterised
// execute shape). The N-arg overload `Statement.executeUpdate(SQL)`
// is a real sink and must continue to fire.
let outer_zero_arg = call_ast
.and_then(|cn| cn.child_by_field_name("arguments"))
.map(|args| {
let mut c = args.walk();
args.named_children(&mut c).count() == 0
})
.unwrap_or(false);
if outer_zero_arg {
// Walk the receiver chain to find a SQL-binding call
// (`createQuery` / `createNativeQuery` / `prepareStatement`)
// and require its arg 0 to be a string literal. Anything
// else (binary concat, identifier, method call) leaves
// the sink in place, we cannot prove the SQL is
// parameterised, so the structural finding stands.
const JPA_BIND_METHODS: &[&str] = &[
"createQuery",
"createNativeQuery",
"createNamedQuery",
"prepareStatement",
"prepareCall",
];
if let Some(call_node) = call_ast
&& let Some(arg0_kind) =
java_chain_arg0_kind_for_method(call_node, JPA_BIND_METHODS, code)
&& arg0_kind == "string_literal"
{
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
}
}
}
}
// Shape-based sanitizer synthesis for JS/TS ORM-accessor chains.
// The static label table marks `db.query` / `connection.query` /
// `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)`
// because the bare `connection.query("SELECT ..." + name)` form is a
// real SQLi sink. But the same `db.query` method on Strapi-style ORMs
// takes a model UID literal and returns a chainable model accessor:
// `strapi.db.query('admin::api-token').findOne({ where: whereParams })`.
// The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)`
// calls are intrinsically parameterised, the actual SQL is generated
// by the ORM, and the per-call values arrive through field-keyed object
// literals that the ORM driver escapes.
//
// Recognition rule: when the CFG node's classified text reaches a sink
// with `SQL_QUERY` cap, walk the receiver chain looking for an inner
// `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal
// and whose result has at least one chained method call appended whose
// name is in the ORM-accessor whitelist. If both hold, synthesise a
// same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare
// `connection.query("SELECT ...")` (no chained method) and
// `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the
// sink in place, both are genuine SQLi shapes.
if (lang == "javascript"
|| lang == "js"
|| lang == "typescript"
|| lang == "ts"
|| lang == "tsx")
&& labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& !labels
.iter()
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
{
const QUERY_TARGETS: &[&str] = &["query", "execute"];
// ORM-accessor methods that take object-literal args and return
// promises of rows / row counts. Promise methods (`then`, `catch`,
// `finally`) deliberately excluded, they don't prove ORM shape.
const ORM_CHAIN_METHODS: &[&str] = &[
"findOne",
"findMany",
"findFirst",
"findUnique",
"findById",
"find",
"create",
"createMany",
"update",
"updateMany",
"upsert",
"delete",
"deleteMany",
"count",
"aggregate",
"distinct",
"save",
];
// Fall back to a deeper walk (up to 4 levels) for await/return-
// wrapped calls (e.g. `const x = await db.query(...).findOne(...)` ,
// call sits at depth 3 inside lexical_declaration > variable_declarator
// > await_expression > call_expression).
let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(call_node) = chain_call {
// Outer method must be in the ORM whitelist *and* the chain must
// have a deeper inner call to a `query`/`execute` whose arg 0 is
// a string literal. Both checks gate the synthesis.
let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code);
let outer_is_orm = outer_method
.as_deref()
.is_some_and(|m| ORM_CHAIN_METHODS.contains(&m));
if outer_is_orm
&& let Some((arg0_kind, has_interp)) =
js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code)
&& !has_interp
&& matches!(
arg0_kind.as_str(),
"string" | "string_fragment" | "template_string"
)
{
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
}
}
}
let span = (ast.start_byte(), ast.end_byte());
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
let (defines, uses, extra_defines) = def_use(ast, lang, code);
// Capture constant text for SSA constant propagation: when this node
// defines a variable from a syntactic literal (no identifier uses),
// extract the raw literal text from the AST. Also capture the
// argument of a const-return (`return []`) so the SSA const-return
// synthesis can emit `Const(Some(text))` instead of `Const(None)`,
// surfacing the literal text to downstream container-literal
// detection.
let const_text = if (defines.is_some() && uses.is_empty())
|| (kind == StmtKind::Return && uses.is_empty())
{
extract_literal_rhs(ast, lang, code)
} else {
None
};
let callee = if kind == StmtKind::Call || !labels.is_empty() {
Some(text.clone())
} else {
None
};
// Extract condition metadata for If nodes.
let (condition_text, condition_vars, condition_negated) = if kind == StmtKind::If {
extract_condition_raw(ast, lang, code)
} else {
(None, Vec::new(), false)
};
// Extract per-argument identifiers for Call nodes.
// Also extract for gated-sink nodes so payload-arg filtering works.
let arg_uses = if kind == StmtKind::Call || sink_payload_args.is_some() {
call_ast
.map(|cn| extract_arg_uses(cn, code))
.unwrap_or_default()
} else {
Vec::new()
};
// String-literal values at each positional argument, parallel to
// `arg_uses`. Populated whenever there is a call AST so downstream
// passes (static-map, symex, sink suppression) can consume literals
// without re-accessing source bytes.
let arg_string_literals = call_ast
.map(|cn| extract_arg_string_literals(cn, code))
.unwrap_or_default();
// Extract keyword / named arguments for Call and gated-sink nodes.
// Languages whose grammar doesn't produce `keyword_argument` / `named_argument`
// children return an empty Vec, so this costs nothing for C/Java/Go/etc.
let kwargs = if kind == StmtKind::Call || sink_payload_args.is_some() {
call_ast
.map(|cn| extract_kwargs(cn, code))
.unwrap_or_default()
} else {
Vec::new()
};
// Check whether all arguments are syntactic literals (for taint sink suppression).
let all_args_literal = if kind == StmtKind::Call {
call_ast
.map(|cn| has_only_literal_args(cn, code))
.unwrap_or(false)
} else {
false
};
// Detect parameterized SQL queries: arg 0 is a string literal with
// placeholder patterns ($1, ?, %s, :name) and >= 2 args present.
// Uses a deeper recursive search than `call_ast` (which only goes 2
// levels) to handle await-wrapped calls inside declarations.
let parameterized_query = labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& call_ast
.or_else(|| find_call_node_deep(ast, lang, 5))
.is_some_and(|cn| is_parameterized_query_call(cn, code));
// Extract per-argument inner call callees for interprocedural sanitizer resolution.
let mut arg_callees = if kind == StmtKind::Call {
call_ast
.map(|cn| extract_arg_callees(cn, lang, code))
.unwrap_or_default()
} else {
Vec::new()
};
// For assignment sinks (including CallWrapper-wrapped assignments like
// `element.innerHTML = clean(name)`), also extract the RHS callee.
// This runs regardless of kind because a CallWrapper node may have
// kind=Call (for the contained getElementById call) yet the actual
// sink is the assignment to innerHTML.
if !labels.is_empty() {
let assign_node = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
Some(ast)
} else if matches!(lookup(lang, ast.kind()), Kind::CallWrapper) {
let mut cursor = ast.walk();
ast.children(&mut cursor)
.find(|c| matches!(lookup(lang, c.kind()), Kind::Assignment))
} else {
None
};
if let Some(asgn) = assign_node
&& let Some(rhs) = asgn.child_by_field_name("right")
&& let Some(callee_name) = call_ident_of(rhs, lang, code)
{
arg_callees.push(Some(callee_name));
}
}
// For method-style calls, extract the receiver identifier as a separate
// channel on `CallMeta.receiver`. The receiver is **not** prepended to
// `arg_uses`: `arg_uses` contains positional-argument identifiers only,
// and the receiver is carried as its own typed channel end-to-end
// (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`).
//
// Two cases:
// 1. Kind::CallMethod, native method call AST (Java method_invocation,
// Rust method_call_expression, Ruby call, PHP member_call_expression).
// Receiver is exposed via "object"/"receiver"/"scope" field on the call.
// 2. Kind::CallFn whose function child is a member_expression (JS/TS) or
// attribute (Python). These grammars model `obj.method(x)` as a plain
// call_expression/call with a dotted-name function child. Without this
// branch the structured `receiver` stays `None` and type-qualified
// resolution loses its anchor.
let receiver = if let Some(cn) = call_ast {
match lookup(lang, cn.kind()) {
Kind::CallMethod => {
let recv_node = cn
.child_by_field_name("object")
.or_else(|| cn.child_by_field_name("receiver"))
.or_else(|| cn.child_by_field_name("scope"))
// Rust `method_call_expression` names the receiver "value".
.or_else(|| cn.child_by_field_name("value"));
if let Some(rn) = recv_node
&& matches!(rn.kind(), "identifier" | "variable_name")
&& let Some(recv_text) = text_of(rn, code)
{
Some(recv_text)
} else if let Some(rn) = recv_node {
// Complex receiver (chain / field access / nested call).
// Drill through member/field/call nodes to the leftmost
// plain identifier so var_stacks lookup resolves the SSA
// value, which is what type-qualified resolution
// anchors on. Falls back to `root_receiver_text` (which
// returns raw text like "conn.execute") only if drilling
// fails, preserving prior behavior for types we can't
// structurally reduce.
root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code))
} else {
None
}
}
Kind::CallFn => {
// JS/TS `obj.method(x)`: call_expression.function = member_expression.
// Python `obj.method(x)`: call.function = attribute.
// Rust `obj.method(x)`: call_expression.function = field_expression
// (field on `value`, not `object`, value can be another call
// for chained forms like `Connection::open(p).unwrap().execute(...)`).
// Pull the receiver from the object/attribute-owner field.
let func_child = cn.child_by_field_name("function");
let recv_node = match func_child {
Some(fc) if fc.kind() == "member_expression" || fc.kind() == "attribute" => {
fc.child_by_field_name("object")
}
Some(fc) if fc.kind() == "field_expression" => fc.child_by_field_name("value"),
_ => None,
};
if let Some(rn) = recv_node {
if matches!(rn.kind(), "identifier" | "variable_name" | "this" | "self") {
text_of(rn, code)
} else {
// Complex receiver (nested attribute, chained call, subscript).
// Drill to the leftmost plain identifier; when the chain is
// purely member_expression/attribute nodes, we want the base
// identifier (e.g. `request` for `request.args.get`).
root_member_receiver(rn, code)
.or_else(|| root_receiver_text(rn, lang, code))
}
} else {
None
}
}
_ => None,
}
} else {
None
};
// Extract cast/type-assertion target type from AST node.
let cast_target_type = match ast.kind() {
// Java: (Type) expr
"cast_expression" => ast
.child_by_field_name("type")
.filter(|n| matches!(n.kind(), "type_identifier" | "scoped_type_identifier"))
.and_then(|n| text_of(n, code)),
// TypeScript: expr as Type
"as_expression" => ast
.child_by_field_name("type")
.filter(|n| matches!(n.kind(), "type_identifier" | "predefined_type"))
.and_then(|n| text_of(n, code)),
// TypeScript: <Type>expr (angle-bracket syntax)
"type_assertion" => ast
.child(0)
.filter(|n| matches!(n.kind(), "type_identifier" | "predefined_type"))
.and_then(|n| text_of(n, code)),
// Go: expr.(Type)
"type_assertion_expression" => ast
.child_by_field_name("type")
.filter(|n| matches!(n.kind(), "type_identifier" | "qualified_type"))
.and_then(|n| text_of(n, code)),
_ => None,
};
// RAII-managed resource detection: tag acquire nodes whose resources
// are automatically cleaned up by language semantics (ownership/drop,
// smart pointers). Follows the same pattern as `managed_resource` for
// Python `with` and Java try-with-resources.
let is_raii_managed = is_raii_factory(lang, &text);
// Ruby block form auto-close: `File.open(path) { |f| f.read }` ,
// the block parameter receives the resource and Ruby guarantees close
// at block exit. If assigned (`f = File.open(p) { ... }`), the
// variable holds the block's return value, not an open resource.
let is_ruby_block_managed = lang == "ruby"
&& call_ast.is_some_and(|cn| {
let mut c = cn.walk();
cn.children(&mut c)
.any(|ch| ch.kind() == "do_block" || ch.kind() == "block")
});
let string_prefix = extract_template_prefix(ast, lang, code)
.or_else(|| call_ast.and_then(|cn| extract_template_prefix(cn, lang, code)));
// Prefer the span of the call found by `find_classifiable_inner_call`
// (deeper, classification-driven) over the one from `first_call_ident`
// (shallower, text-override-driven). Only record `callee_span` when it
// actually narrows against `ast.span`, storing a redundant copy would
// just bloat every labeled Call node.
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
// Constructor detection: a `new X(...)` call carries different cap
// semantics than a plain function call. The SSA Call transfer uses
// this flag to narrow the constructed value's caps so out-of-process
// side-effect bits (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on
// the arguments don't survive into a wrapper-object instance.
// Recognised forms:
// * JS/TS `new_expression`
// * Java/C++ `object_creation_expression`
// * PHP `object_creation_expression`
let is_constructor = ast.kind() == "new_expression"
|| ast.kind() == "object_creation_expression"
|| call_ast
.is_some_and(|cn| matches!(cn.kind(), "new_expression" | "object_creation_expression"));
let idx = g.add_node(NodeInfo {
kind,
call: CallMeta {
callee,
callee_text: None,
outer_callee,
callee_span,
call_ordinal,
arg_uses,
receiver,
sink_payload_args,
kwargs,
arg_string_literals,
destination_uses,
gate_filters,
is_constructor,
},
taint: TaintMeta {
labels,
const_text,
defines,
uses,
extra_defines,
},
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
condition_text,
condition_vars,
condition_negated,
all_args_literal,
catch_param: false,
arg_callees,
cast_target_type,
bin_op: extract_bin_op(ast, lang),
bin_op_const: extract_bin_op_const(ast, lang, code),
managed_resource: is_raii_managed || is_ruby_block_managed,
in_defer: false,
parameterized_query,
string_prefix,
is_eq_with_const: detect_eq_with_const(ast, lang),
is_numeric_length_access: detect_numeric_length_access(ast, lang, code),
member_field: detect_member_field_assignment(ast, code),
rhs_is_function_literal: rhs_is_function_literal(ast, lang),
});
debug!(
target: "cfg",
"node {} ← {:?} txt=`{}` span={:?} labels={:?}",
idx.index(),
kind,
text,
span,
g[idx].taint.labels
);
idx
}
/// Add the same edge (of the same kind) from every node in `froms` to `to`.
#[inline]
pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind: EdgeKind) {
for &f in froms {
debug!(target: "cfg", "edge {} → {} ({:?})", f.index(), to.index(), kind);
g.add_edge(f, to, kind);
}
}
/// Pre-emit dedicated Source CFG nodes for call arguments that contain source
/// member expressions.
///
/// **Two-step API**, Source nodes must be created *before* the Call node so
/// they receive lower graph indices. This is critical because the If handler
/// uses `NodeIndex::new(g.node_count())` to capture the first node built in a
/// branch and wires a True/False edge to it. If the Source node has a lower
/// index than the Call node, the True edge lands on the Source node, and the
/// engine's redundant-Seq-edge skip logic correctly drops the parallel Seq
/// edge from the condition. Without this ordering, the Seq edge would bypass
/// the auth-elevation transfer on the True edge and send Unauthed state into
/// the branch body.
///
/// True when `ast` is an assignment / declaration whose RHS is a
/// function or lambda literal, i.e. shapes like
/// * Go `run := func() { ... }`
/// * JS/TS `var run = function() { ... }` / `const run = () => ...`
/// * Python `run = lambda x: ...`
/// * Ruby `run = ->() { ... }` / `run = proc { ... }`
///
/// Detected by walking the assignment's `right` / `value` field (or the
/// `init` field for declarators) and checking whether the resolved RHS
/// node classifies as `Kind::Function`. Conservative: when the assignment
/// shape isn't recognised the function returns `false`.
///
/// Used by `push_node`'s RHS member-text fallback to suppress source/sink
/// label propagation from inside the literal's body up onto the outer
/// wrapper assignment. The literal is processed as its own scope by
/// `collect_nested_function_nodes`.
fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
use conditions::unwrap_parens;
// Find the RHS node across the languages we support. Most grammars
// expose `right` (assignment_statement, assignment_expression,
// short_var_declaration); JS / Java use `value` on
// `variable_declarator` / `init_declarator`; Rust uses `value` on
// `let_declaration`.
let mut candidate = ast.child_by_field_name("right");
if candidate.is_none() {
// Walk one level into declarations whose direct child is the
// declarator (variable_declaration → variable_declarator →
// value), or expression-statement wrappers whose direct child is
// an assignment_expression / assignment with a `right` field
// (JS `expression_statement > assignment_expression`, Python
// `expression_statement > assignment`).
let mut cursor = ast.walk();
for c in ast.children(&mut cursor) {
if matches!(
c.kind(),
"variable_declarator" | "init_declarator" | "let_declaration"
) {
candidate = c
.child_by_field_name("value")
.or_else(|| c.child_by_field_name("init"));
if candidate.is_some() {
break;
}
} else if matches!(lookup(lang, c.kind()), Kind::Assignment) {
candidate = c.child_by_field_name("right");
if candidate.is_some() {
break;
}
}
}
}
if candidate.is_none() {
// Some grammars wrap the RHS in `expression_list` or similar.
// Search recursively for a Kind::Function descendant of the
// direct RHS-bearing fields.
candidate = ast
.child_by_field_name("value")
.or_else(|| ast.child_by_field_name("init"));
}
let Some(rhs) = candidate else { return false };
let rhs = unwrap_parens(rhs);
if matches!(lookup(lang, rhs.kind()), Kind::Function) && rhs.child_count() > 0 {
return true;
}
// Go's `expression_list` wrapping for short_var_declaration's RHS.
if rhs.kind() == "expression_list" {
let mut cursor = rhs.walk();
for c in rhs.named_children(&mut cursor) {
let c = unwrap_parens(c);
if matches!(lookup(lang, c.kind()), Kind::Function) && c.child_count() > 0 {
return true;
}
}
}
false
}
/// when `ast` is (or wraps) an assignment whose
/// LHS is a single subscript / index expression with a plain-identifier
/// receiver, emit a synthetic `__index_set__` Call node and return its
/// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target
/// assignments, complex receivers, or when the RHS contains a call
/// (those still flow through the existing has_call_descendant path).
///
/// Gated on `pointer::is_enabled()` by the caller.
fn try_lower_subscript_write(
ast: Node,
preds: &[NodeIndex],
g: &mut Cfg,
lang: &str,
code: &[u8],
enclosing_func: Option<&str>,
call_ordinal: &mut u32,
) -> Option<NodeIndex> {
// Locate the assignment node, `ast` may be the assignment itself
// (Go `assignment_statement`) or a wrapper (`expression_statement`
// containing JS `assignment_expression` / Python `assignment`).
let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
ast
} else {
let mut cursor = ast.walk();
ast.children(&mut cursor)
.find(|c| matches!(lookup(lang, c.kind()), Kind::Assignment))?
};
let lhs = assign_ast.child_by_field_name("left")?;
if has_call_descendant(assign_ast, lang) {
return None;
}
let subscript_node = subscript_lhs_node(lhs, lang)?;
let (arr_text, idx_text) = subscript_components(subscript_node, code)?;
let rhs = assign_ast.child_by_field_name("right")?;
let mut rhs_uses: Vec<String> = Vec::new();
collect_idents(rhs, code, &mut rhs_uses);
let span = (ast.start_byte(), ast.end_byte());
let ord = *call_ordinal;
*call_ordinal += 1;
let mut uses_all: Vec<String> = vec![arr_text.clone(), idx_text.clone()];
uses_all.extend(rhs_uses.iter().cloned());
let n = g.add_node(NodeInfo {
kind: StmtKind::Call,
call: CallMeta {
callee: Some("__index_set__".to_string()),
receiver: Some(arr_text.clone()),
arg_uses: vec![vec![idx_text.clone()], rhs_uses.clone()],
call_ordinal: ord,
..Default::default()
},
taint: TaintMeta {
uses: uses_all,
..Default::default()
},
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, preds, n, EdgeKind::Seq);
Some(n)
}
/// Step 1 (`pre_emit_arg_source_nodes`): scan the AST, create Source nodes,
/// wire them to `preds`, and return (effective_preds, synth_bindings,
/// uses_only_synth_names).
///
/// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be
/// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`.
/// `uses_only_synth_names` carry synth names that should *only* be
/// appended to `taint.uses`, used for chain-inner-arg sources where the
/// synth value is not a positional argument of the OUTER call but still
/// participates in the call's implicit dependency chain (e.g. `r.Body`
/// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver).
///
/// Step 2 (`apply_arg_source_bindings`): after `push_node` creates the Call
/// node, add the synthetic variable names to its `arg_uses` and `uses`.
type PreEmitArgSourceResult = (SmallVec<[NodeIndex; 4]>, Vec<(usize, String)>, Vec<String>);
fn pre_emit_arg_source_nodes(
g: &mut Cfg,
ast: Node,
lang: &str,
code: &[u8],
enclosing_func: Option<&str>,
analysis_rules: Option<&LangAnalysisRules>,
preds: &[NodeIndex],
) -> PreEmitArgSourceResult {
let mut effective_preds: SmallVec<[NodeIndex; 4]> = SmallVec::from_slice(preds);
let mut bindings: Vec<(usize, String)> = Vec::new();
let mut uses_only: Vec<String> = Vec::new();
let extra = analysis_rules.and_then(|r| {
if r.extra_labels.is_empty() {
None
} else {
Some(r.extra_labels.as_slice())
}
});
let Some(call_ast) = find_call_node(ast, lang) else {
return (effective_preds, bindings, uses_only);
};
let Some(args_node) = call_ast.child_by_field_name("arguments") else {
return (effective_preds, bindings, uses_only);
};
// Collect children first (can't borrow cursor across mutable graph ops).
let children: Vec<_> = {
let mut cursor = args_node.walk();
args_node.named_children(&mut cursor).collect()
};
// Bail on spread/splat/keyword arguments where positional mapping is unreliable.
for child in &children {
let k = child.kind();
if k == "spread_element"
|| k == "dictionary_splat"
|| k == "list_splat"
|| k == "keyword_argument"
|| k == "splat_argument"
|| k == "hash_splat_argument"
|| k == "named_argument"
{
return (effective_preds, bindings, uses_only);
}
}
let pointer_on = crate::pointer::is_enabled();
for (pos, child) in children.iter().enumerate() {
let src_label = first_member_label(*child, lang, code, extra);
if let Some(DataLabel::Source(caps)) = src_label {
// Use the *current* node count as a unique token, it equals the
// index the new Source node will receive.
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos);
let member_text = first_member_text(*child, code);
let span = (child.start_byte(), child.end_byte());
let mut src_labels: SmallVec<[DataLabel; 2]> = SmallVec::new();
src_labels.push(DataLabel::Source(caps));
let src_idx = g.add_node(NodeInfo {
kind: StmtKind::Seq,
call: CallMeta {
callee: member_text,
..Default::default()
},
taint: TaintMeta {
labels: src_labels,
defines: Some(synth_name.clone()),
..Default::default()
},
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, &effective_preds, src_idx, EdgeKind::Seq);
effective_preds.clear();
effective_preds.push(src_idx);
bindings.push((pos, synth_name));
continue;
}
//pre-emit `__index_get__` Call nodes for
// subscript / index-expression args when pointer analysis is
// enabled. This lets the W2/W4 container ELEM read hook fire
// on the synth call, propagating must/may/caps from the cell
// to the consuming sink call's argument.
//
// Gated on `pointer::is_enabled()` so the env-var=0 path keeps
// CFG shapes bit-identical to today's output. Only fires when
// the array operand resolves to a plain identifier, see
// `subscript_components` for the bail conditions.
if pointer_on
&& is_subscript_kind(child.kind())
&& let Some((arr_text, idx_text)) = subscript_components(*child, code)
{
let synth_name = format!("__nyx_idxget_{}_{}", g.node_count(), pos);
let span = (child.start_byte(), child.end_byte());
let idx_node = g.add_node(NodeInfo {
kind: StmtKind::Call,
call: CallMeta {
callee: Some("__index_get__".to_string()),
receiver: Some(arr_text.clone()),
arg_uses: vec![vec![idx_text.clone()]],
..Default::default()
},
taint: TaintMeta {
defines: Some(synth_name.clone()),
uses: vec![arr_text, idx_text],
..Default::default()
},
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, &effective_preds, idx_node, EdgeKind::Seq);
effective_preds.clear();
effective_preds.push(idx_node);
bindings.push((pos, synth_name));
}
}
// Chain-shape source pre-emission: walk the receiver chain of `call_ast`
// and emit synth Source nodes for any source-labeled inner-call ARGs.
//
// This is what carries `r.Body` into the OUTER call's implicit-uses
// group for shapes like `json.NewDecoder(r.Body).Decode(emoji)`, where
// the outer callee text (`json.NewDecoder.Decode` after chain
// normalisation) doesn't classify as a Source on its own. Without
// this, the writeback receiver-resolution path has nothing to read
// from and the CVE-2024-31450 chain stays clean.
//
// Gated to Go and to writeback-shaped outer callees (`Decode` /
// `Unmarshal`) because the synth-source emission is only useful when
// a downstream writeback consumer reads from the chain's tainted
// receiver, broader gating risks emitting synth sources whose taint
// never propagates and whose presence trips Layer B AST-pattern
// suppression on unrelated sinks (see
// `tests/fixtures/real_world/go/taint/func_literal_capture.go`).
// Synth names land in `uses_only` (not `bindings`) because they
// don't correspond to a positional outer-call argument; they surface
// only via `info.taint.uses`.
let outer_method_is_writeback = call_ast
.child_by_field_name("function")
.or_else(|| call_ast.child_by_field_name("method"))
.and_then(|f| {
f.child_by_field_name("field")
.or_else(|| f.child_by_field_name("property"))
.or_else(|| f.child_by_field_name("name"))
})
.and_then(|n| text_of(n, code))
.is_some_and(|name| name == "Decode" || name == "Unmarshal");
if lang == "go" && outer_method_is_writeback {
let mut inner_args: Vec<Node> = Vec::new();
walk_chain_inner_call_args(call_ast, lang, &mut inner_args);
for arg in inner_args {
let k = arg.kind();
// Mirror the splat/keyword bail from the outer-args pass.
if k == "spread_element"
|| k == "dictionary_splat"
|| k == "list_splat"
|| k == "keyword_argument"
|| k == "splat_argument"
|| k == "hash_splat_argument"
|| k == "named_argument"
{
continue;
}
let src_label = first_member_label(arg, lang, code, extra);
if let Some(DataLabel::Source(caps)) = src_label {
let synth_name = format!("__nyx_chainsrc_{}_{}", g.node_count(), uses_only.len());
let member_text = first_member_text(arg, code);
let span = (arg.start_byte(), arg.end_byte());
let mut src_labels: SmallVec<[DataLabel; 2]> = SmallVec::new();
src_labels.push(DataLabel::Source(caps));
let src_idx = g.add_node(NodeInfo {
kind: StmtKind::Seq,
call: CallMeta {
callee: member_text,
..Default::default()
},
taint: TaintMeta {
labels: src_labels,
defines: Some(synth_name.clone()),
..Default::default()
},
ast: AstMeta {
span,
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, &effective_preds, src_idx, EdgeKind::Seq);
effective_preds.clear();
effective_preds.push(src_idx);
uses_only.push(synth_name);
}
}
}
(effective_preds, bindings, uses_only)
}
/// Step 2: wire synthetic variable names from pre-emitted Source nodes into
/// the Call node's `arg_uses` and `uses`. `uses_only` synth names are
/// appended only to `taint.uses`, used for chain-inner-arg sources whose
/// synth value is not a positional outer-call argument.
fn apply_arg_source_bindings(
g: &mut Cfg,
call_node: NodeIndex,
bindings: &[(usize, String)],
uses_only: &[String],
) {
for (pos, synth_name) in bindings {
let arg_uses = &mut g[call_node].call.arg_uses;
if *pos < arg_uses.len() {
arg_uses[*pos].push(synth_name.clone());
} else {
while arg_uses.len() < *pos {
arg_uses.push(vec![]);
}
arg_uses.push(vec![synth_name.clone()]);
}
g[call_node].taint.uses.push(synth_name.clone());
}
for synth_name in uses_only {
g[call_node].taint.uses.push(synth_name.clone());
}
}
// -------------------------------------------------------------------------
// The recursive *workhorse* that converts an AST node into a CFG slice.
// Returns the set of *exit* nodes that need to be wired further.
// -------------------------------------------------------------------------
#[allow(clippy::too_many_arguments)]
pub(super) fn build_sub<'a>(
ast: Node<'a>,
preds: &[NodeIndex], // predecessor frontier
g: &mut Cfg,
lang: &str,
code: &'a [u8],
summaries: &mut FuncSummaries,
file_path: &str,
enclosing_func: Option<&str>,
call_ordinal: &mut u32,
analysis_rules: Option<&LangAnalysisRules>,
break_targets: &mut Vec<NodeIndex>,
continue_targets: &mut Vec<NodeIndex>,
throw_targets: &mut Vec<NodeIndex>,
bodies: &mut Vec<BodyCfg>,
next_body_id: &mut u32,
current_body_id: BodyId,
) -> Vec<NodeIndex> {
match lookup(lang, ast.kind()) {
// ─────────────────────────────────────────────────────────────────
// IF/ELSE: two branches that remerge afterwards
// ─────────────────────────────────────────────────────────────────
Kind::If => {
// Some grammars (Go `if init; cond {}`, sibling C-style forms)
// attach an init / "initializer" subtree that runs before the
// condition. Tree-sitter exposes it under the `initializer`
// field. Without lowering it, side-effecting calls in the
// init (e.g. Owncast CVE-2024-31450's
// `if err := json.NewDecoder(r.Body).Decode(emoji); err != nil`)
// disappear from the CFG and downstream taint never sees the
// call. Languages that don't expose `initializer` here return
// None and the post-init `preds` is bit-identical to the
// pre-fix behaviour. The init's exits become the predecessors
// for the condition so its side effects are visible to both
// branches.
let init_exits_owned = ast.child_by_field_name("initializer").map(|init| {
build_sub(
init,
preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
)
});
let preds: &[NodeIndex] = match &init_exits_owned {
Some(exits) => exits.as_slice(),
None => preds,
};
// Check if condition contains a boolean operator for short-circuit decomposition.
let cond_subtree = ast.child_by_field_name("condition").or_else(|| {
// Rust `if_expression` uses positional children
let mut cursor = ast.walk();
ast.children(&mut cursor).find(|c| {
let k = c.kind();
!matches!(lookup(lang, k), Kind::Block | Kind::Trivia)
&& k != "if"
&& k != "else"
&& k != "let"
&& k != "{"
&& k != "}"
&& k != "("
&& k != ")"
})
});
let has_short_circuit = cond_subtree
.map(|c| is_boolean_operator(unwrap_parens(c)).is_some())
.unwrap_or(false);
// Check for negation wrapping the entire condition (e.g. `!(a && b)`)
//, if present, skip short-circuit decomposition (De Morgan out of scope).
let has_short_circuit = has_short_circuit
&& cond_subtree.map_or(false, |c| {
let unwrapped = unwrap_parens(c);
!matches!(
unwrapped.kind(),
"unary_expression"
| "not_operator"
| "prefix_unary_expression"
| "unary_not"
)
});
let is_unless = ast.kind() == "unless";
// Determine true/false exit sets for wiring branches.
let (true_exits, false_exits) = if has_short_circuit {
let cond_ast = cond_subtree.unwrap();
build_condition_chain(
cond_ast,
preds,
EdgeKind::Seq,
g,
lang,
code,
enclosing_func,
)
} else {
// Single-node path (original behavior)
let cond = push_node(
g,
StmtKind::If,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, cond, EdgeKind::Seq);
(vec![cond], vec![cond])
};
// For `unless`, swap: body runs when condition is false.
let (then_preds, else_preds) = if is_unless {
(&false_exits, &true_exits)
} else {
(&true_exits, &false_exits)
};
let (then_edge, else_edge) = if is_unless {
(EdgeKind::False, EdgeKind::True)
} else {
(EdgeKind::True, EdgeKind::False)
};
// Locate then & else blocks using field-based lookup first,
// then positional fallback (Rust uses positional blocks).
let (then_block, else_block) = {
let field_then = ast
.child_by_field_name("consequence")
.or_else(|| ast.child_by_field_name("body"));
let field_else = ast.child_by_field_name("alternative");
if field_then.is_some() || field_else.is_some() {
(field_then, field_else)
} else {
// Fallback: positional block children (Rust `if_expression`)
let mut cursor = ast.walk();
let blocks: Vec<_> = ast
.children(&mut cursor)
.filter(|n| lookup(lang, n.kind()) == Kind::Block)
.collect();
(blocks.first().copied(), blocks.get(1).copied())
}
};
// THEN branch
let then_first_node = NodeIndex::new(g.node_count());
let then_exits = if let Some(b) = then_block {
let exits = build_sub(
b,
then_preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
// Add True/False edge from condition exit(s) to first node of then-branch.
if then_first_node.index() < g.node_count() {
connect_all(g, then_preds, then_first_node, then_edge);
} else if let Some(&first) = exits.first() {
connect_all(g, then_preds, first, then_edge);
}
exits
} else {
then_preds.to_vec()
};
// ELSE branch
let else_first_node = NodeIndex::new(g.node_count());
let else_exits = if let Some(b) = else_block {
let exits = build_sub(
b,
else_preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
if else_first_node.index() < g.node_count() {
connect_all(g, else_preds, else_first_node, else_edge);
} else if let Some(&first) = exits.first() {
connect_all(g, else_preds, first, else_edge);
}
exits
} else {
// No explicit else → create a synthetic pass-through node
// for the false path.
let pass = g.add_node(NodeInfo {
kind: StmtKind::Seq,
ast: AstMeta {
span: (ast.end_byte(), ast.end_byte()),
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, else_preds, pass, else_edge);
vec![pass]
};
// Frontier = union of both branches
then_exits.into_iter().chain(else_exits).collect()
}
Kind::InfiniteLoop => {
// Synthetic header node
let header = push_node(
g,
StmtKind::Loop,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, header, EdgeKind::Seq);
// Fresh break/continue targets scoped to this loop
let mut loop_breaks = Vec::new();
let mut loop_continues = Vec::new();
// The body is the single `block` child
let body = match ast.child_by_field_name("body") {
Some(b) => b,
None => {
warn!(
"loop without body (error recovery?): kind={} byte={}",
ast.kind(),
ast.start_byte()
);
return vec![header];
}
};
let body_exits = build_sub(
body,
&[header],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
&mut loop_breaks,
&mut loop_continues,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
// Back-edge from every linear exit to header
for &e in &body_exits {
connect_all(g, &[e], header, EdgeKind::Back);
}
// Wire continue targets as back edges to header
for &c in &loop_continues {
connect_all(g, &[c], header, EdgeKind::Back);
}
// Break targets become exits of the loop
if loop_breaks.is_empty() {
// No break → infinite loop; header is the only exit for
// downstream code (fallthrough semantics)
vec![header]
} else {
loop_breaks
}
}
// ─────────────────────────────────────────────────────────────────
// WHILE / FOR: classic loop with a back edge.
// ─────────────────────────────────────────────────────────────────
Kind::While | Kind::For => {
let header = push_node(
g,
StmtKind::Loop,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, header, EdgeKind::Seq);
// Check for short-circuit condition
let cond_subtree = ast.child_by_field_name("condition");
let has_short_circuit = cond_subtree
.map(|c| {
let unwrapped = unwrap_parens(c);
is_boolean_operator(unwrapped).is_some()
&& !matches!(
unwrapped.kind(),
"unary_expression"
| "not_operator"
| "prefix_unary_expression"
| "unary_not"
)
})
.unwrap_or(false);
// Fresh break/continue targets scoped to this loop
let mut loop_breaks = Vec::new();
let mut loop_continues = Vec::new();
// Body = first (and usually only) block child.
let body = ast
.child_by_field_name("body")
.or_else(|| {
let mut c = ast.walk();
ast.children(&mut c)
.find(|n| lookup(lang, n.kind()) == Kind::Block)
})
.expect("loop without body");
if has_short_circuit {
let cond_ast = cond_subtree.unwrap();
let (true_exits, false_exits) = build_condition_chain(
cond_ast,
&[header],
EdgeKind::Seq,
g,
lang,
code,
enclosing_func,
);
// Wire body from true_exits
let body_first = NodeIndex::new(g.node_count());
let body_exits = build_sub(
body,
&true_exits,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
&mut loop_breaks,
&mut loop_continues,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
// Add True edges from condition chain to body
if body_first.index() < g.node_count() {
connect_all(g, &true_exits, body_first, EdgeKind::True);
}
// Back-edges go to header (not into the condition chain)
for &e in &body_exits {
connect_all(g, &[e], header, EdgeKind::Back);
}
for &c in &loop_continues {
connect_all(g, &[c], header, EdgeKind::Back);
}
// Loop exits = false_exits + breaks
let mut exits: Vec<NodeIndex> = false_exits;
exits.extend(loop_breaks);
exits
} else {
let body_exits = build_sub(
body,
&[header],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
&mut loop_breaks,
&mut loop_continues,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
// Backedge for every linear exit → header.
for &e in &body_exits {
connect_all(g, &[e], header, EdgeKind::Back);
}
// Wire continue targets as back edges to header
for &c in &loop_continues {
connect_all(g, &[c], header, EdgeKind::Back);
}
// Falling out of the loop = headers false branch +
// any break targets that exit the loop.
let mut exits = vec![header];
exits.extend(loop_breaks);
exits
}
}
// ─────────────────────────────────────────────────────────────────
// Control-flow sinks (return / break / continue).
// ─────────────────────────────────────────────────────────────────
Kind::Return => {
if has_call_descendant(ast, lang) {
// Return-call bug fix: emit a Call node BEFORE the Return so
// that callee labels (source/sanitizer/sink) are applied.
let ord = *call_ordinal;
*call_ordinal += 1;
let (effective_preds, src_bindings, src_uses_only) = pre_emit_arg_source_nodes(
g,
ast,
lang,
code,
enclosing_func,
analysis_rules,
preds,
);
let call_idx = push_node(
g,
StmtKind::Call,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
apply_arg_source_bindings(g, call_idx, &src_bindings, &src_uses_only);
connect_all(g, &effective_preds, call_idx, EdgeKind::Seq);
let ret = push_node(
g,
StmtKind::Return,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, &[call_idx], ret, EdgeKind::Seq);
// Recurse into any function expressions nested inside the
// returned call's arguments (e.g.
// `return new Promise((res, rej) => { ... })`). Without this
// the executor and any further inner callbacks are silently
// swallowed and the gated sinks they contain become invisible
// to classification. Mirrors the same recursion done by the
// CallWrapper / CallFn arms. Motivated by CVE-2025-64430.
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[call_idx],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
Vec::new()
} else {
let ret = push_node(
g,
StmtKind::Return,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, ret, EdgeKind::Seq);
Vec::new() // terminates this path
}
}
Kind::Throw => {
if has_call_descendant(ast, lang) {
let ord = *call_ordinal;
*call_ordinal += 1;
let (effective_preds, src_bindings, src_uses_only) = pre_emit_arg_source_nodes(
g,
ast,
lang,
code,
enclosing_func,
analysis_rules,
preds,
);
let call_idx = push_node(
g,
StmtKind::Call,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
apply_arg_source_bindings(g, call_idx, &src_bindings, &src_uses_only);
connect_all(g, &effective_preds, call_idx, EdgeKind::Seq);
let ret = push_node(
g,
StmtKind::Throw,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, &[call_idx], ret, EdgeKind::Seq);
throw_targets.push(ret);
// Same nested-function recursion as the Return arm: a
// `throw new Promise(() => { ... })` would otherwise lose
// any inner gated sinks.
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[call_idx],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
Vec::new()
} else {
let ret = push_node(
g,
StmtKind::Throw,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, ret, EdgeKind::Seq);
throw_targets.push(ret);
Vec::new()
}
}
Kind::Try => build_try(
ast,
preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
),
Kind::Break => {
let brk = push_node(
g,
StmtKind::Break,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, brk, EdgeKind::Seq);
break_targets.push(brk);
Vec::new()
}
Kind::Continue => {
let cont = push_node(
g,
StmtKind::Continue,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, cont, EdgeKind::Seq);
continue_targets.push(cont);
Vec::new()
}
Kind::Switch => build_switch(
ast,
preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
),
// ─────────────────────────────────────────────────────────────────
// BLOCK: statements execute sequentially
// ─────────────────────────────────────────────────────────────────
Kind::SourceFile | Kind::Block => {
// Ruby body_statement with rescue/ensure = implicit begin/rescue
if lang == "ruby" && ast.kind() == "body_statement" {
let mut check = ast.walk();
if ast
.children(&mut check)
.any(|c| c.kind() == "rescue" || c.kind() == "ensure")
{
return build_begin_rescue(
ast,
preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
}
let mut cursor = ast.walk();
let mut frontier = preds.to_vec();
// With per-body CFGs, function definitions become placeholder
// nodes that always have exactly one exit. The frontier never
// empties due to a function's internal return. We still keep a
// last-live fallback for preprocessor dangling-else edge cases.
let mut last_live_frontier = preds.to_vec();
let mut prev_was_preproc = false;
for child in ast.children(&mut cursor) {
let child_preds = if frontier.is_empty() && prev_was_preproc {
last_live_frontier.clone()
} else {
frontier.clone()
};
// Go `defer`: record node count before recursing so we can
// mark the deferred Call node(s) afterward.
let is_defer = lang == "go" && child.kind() == "defer_statement";
let defer_first_idx = if is_defer { g.node_count() } else { 0 };
let child_exits = build_sub(
child,
&child_preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
// Mark only Call nodes inside the defer as deferred releases.
if is_defer {
for raw in defer_first_idx..g.node_count() {
let idx = NodeIndex::new(raw);
if g[idx].kind == StmtKind::Call {
g[idx].in_defer = true;
}
}
}
let is_preproc = child.kind().starts_with("preproc_");
if !child_exits.is_empty() {
last_live_frontier = child_exits.clone();
}
frontier = child_exits;
prev_was_preproc = is_preproc;
}
frontier
}
// Function item create a header and dive into its body
Kind::Function => {
// ── 1) Extract function name ──────────────────────────────────────
// Lambda expressions don't have meaningful names; force the
// synthetic anon name to avoid C++ lambdas picking up parameter
// names via "declarator".
let fn_name = if ast.kind() == "lambda_expression" {
anon_fn_name(ast.start_byte())
} else {
ast.child_by_field_name("name")
.or_else(|| ast.child_by_field_name("declarator"))
.and_then(|n| {
let mut tmp = Vec::new();
collect_idents(n, code, &mut tmp);
tmp.into_iter().next()
})
.unwrap_or_else(|| anon_fn_name(ast.start_byte()))
};
// When the grammar-level name is anonymous, try to derive a binding
// name from the surrounding declaration or assignment. This lets
// `var h = function(x){...}` / `this.run = () => {...}` participate
// in callback resolution, callers referencing `h` or `run` can
// find the body via `resolve_local_func_key` and intra-file calls
// like `h()` can resolve to the anonymous body's summary. Without
// this, the body is keyed with the synthetic anon name and there
// is no path from the variable identifier to the body.
let fn_name = if is_anon_fn_name(&fn_name) {
derive_anon_fn_name_from_context(ast, lang, code).unwrap_or(fn_name)
} else {
fn_name
};
let is_anon = is_anon_fn_name(&fn_name);
let param_meta = extract_param_meta(ast, lang, code);
let param_count = param_meta.len();
let param_names: Vec<String> = param_meta.iter().map(|(n, _)| n.clone()).collect();
let param_types: Vec<Option<crate::ssa::type_facts::TypeKind>> =
param_meta.iter().map(|(_, t)| t.clone()).collect();
// ── 1b) Compute identity discriminators ───────────────────────────
let (fn_container, fn_kind) =
compute_container_and_kind(ast, ast.kind(), &fn_name, code);
// Disambiguator: depth-first preorder index of this function node
// within the file. Always populated so two same-name, same-
// container definitions never collide (e.g. duplicate defs in a
// file, overload-like patterns, nested defs with identical names
// in sibling scopes). Stable against unrelated edits above the
// function. Falls back to the start byte when the DFS-index
// map is absent (tests bypassing build_cfg).
let fn_disambig: Option<u32> =
Some(fn_dfs_index(ast.start_byte()).unwrap_or(ast.start_byte() as u32));
// ── 2) Create a separate body graph for this function ─────────────
let (mut fn_graph, fn_entry, fn_exit) =
create_body_graph(ast.start_byte(), ast.end_byte(), Some(&fn_name));
let body_ast = match ast.child_by_field_name("body").or_else(|| {
let mut c = ast.walk();
ast.children(&mut c)
.find(|n| matches!(lookup(lang, n.kind()), Kind::Block | Kind::SourceFile))
}) {
Some(b) => b,
None => {
warn!(
"fn without body (forward decl / abstract / error recovery): kind={} name={}",
ast.kind(),
fn_name
);
// Insert placeholder in parent graph and skip body processing
let placeholder = g.add_node(make_empty_node_info(
StmtKind::Seq,
(ast.start_byte(), ast.end_byte()),
enclosing_func,
));
connect_all(g, preds, placeholder, EdgeKind::Seq);
return vec![placeholder];
}
};
// Allocate a BodyId for this function
let fn_body_id = BodyId(*next_body_id);
*next_body_id += 1;
let entry_preds = inject_framework_param_sources(
ast,
code,
analysis_rules,
&mut fn_graph,
fn_entry,
Some(&fn_name),
);
let mut fn_call_ordinal: u32 = 0;
let mut fn_breaks = Vec::new();
let mut fn_continues = Vec::new();
let mut fn_throws = Vec::new();
let body_exits = build_sub(
body_ast,
&entry_preds,
&mut fn_graph,
lang,
code,
summaries,
file_path,
Some(&fn_name),
&mut fn_call_ordinal,
analysis_rules,
&mut fn_breaks,
&mut fn_continues,
&mut fn_throws,
bodies,
next_body_id,
fn_body_id,
);
// ── 3) Wire exits to Exit node ────────────────────────────────────
for &b in &body_exits {
connect_all(&mut fn_graph, &[b], fn_exit, EdgeKind::Seq);
}
// Wire internal Return/Throw nodes to Exit (both terminate this body)
for idx in fn_graph.node_indices().collect::<Vec<_>>() {
if matches!(fn_graph[idx].kind, StmtKind::Return | StmtKind::Throw)
&& idx != fn_exit
&& !fn_graph.contains_edge(idx, fn_exit)
{
connect_all(&mut fn_graph, &[idx], fn_exit, EdgeKind::Seq);
}
}
// ── 4) Light-weight dataflow on the body graph ────────────────────
let mut var_taint = HashMap::<String, Cap>::new();
let mut node_bits = HashMap::<NodeIndex, Cap>::new();
let mut fn_src_bits = Cap::empty();
let mut fn_sani_bits = Cap::empty();
let mut fn_sink_bits = Cap::empty();
let mut callees = Vec::<crate::summary::CalleeSite>::new();
let mut tainted_sink_params: Vec<usize> = Vec::new();
for idx in fn_graph.node_indices() {
let info = &fn_graph[idx];
if let Some(callee) = &info.call.callee {
let site = build_callee_site(callee, info, lang);
// Dedup by (name, arity, receiver, qualifier, ordinal). A
// single function may legitimately contain multiple distinct
// calls to the same callee (e.g. different ordinals or
// different receivers); all of those are kept.
if !callees.iter().any(|c| {
c.name == site.name
&& c.arity == site.arity
&& c.receiver == site.receiver
&& c.qualifier == site.qualifier
&& c.ordinal == site.ordinal
}) {
callees.push(site);
}
}
for lbl in &info.taint.labels {
match *lbl {
DataLabel::Source(bits) => fn_src_bits |= bits,
DataLabel::Sanitizer(bits) => fn_sani_bits |= bits,
DataLabel::Sink(bits) => {
fn_sink_bits |= bits;
for u in &info.taint.uses {
if let Some(pos) = param_names.iter().position(|p| p == u)
&& !tainted_sink_params.contains(&pos)
{
tainted_sink_params.push(pos);
}
}
}
}
}
let mut in_bits = Cap::empty();
for u in &info.taint.uses {
if let Some(b) = var_taint.get(u) {
in_bits |= *b;
}
}
let mut out_bits = in_bits;
for lab in &info.taint.labels {
match *lab {
DataLabel::Source(bits) => out_bits |= bits,
DataLabel::Sanitizer(bits) => out_bits &= !bits,
DataLabel::Sink(_) => {}
}
}
if let Some(def) = &info.taint.defines {
if out_bits.is_empty() {
var_taint.remove(def);
} else {
var_taint.insert(def.clone(), out_bits);
}
}
node_bits.insert(idx, out_bits);
}
for (&idx, &bits) in &node_bits {
if fn_graph[idx].kind == StmtKind::Return {
fn_src_bits |= bits;
}
}
for &pred in &body_exits {
if let Some(&bits) = node_bits.get(&pred) {
fn_src_bits |= bits;
}
}
// ── propagating_params ────────────────────────────────────────────
let propagating_params = {
let mut params = Vec::new();
for (i, pname) in param_names.iter().enumerate() {
let mut flows = false;
for &idx in node_bits.keys() {
if fn_graph[idx].kind == StmtKind::Return {
for u in &fn_graph[idx].taint.uses {
if u == pname {
flows = true;
}
if let Some(bits) = var_taint.get(u)
&& !bits.is_empty()
&& var_taint.contains_key(pname)
{
flows = true;
}
}
}
}
if !flows {
for &exit_pred in &body_exits {
let info = &fn_graph[exit_pred];
for u in &info.taint.uses {
if u == pname {
flows = true;
}
}
if let Some(def) = &info.taint.defines
&& def == pname
{
flows = true;
}
}
}
if flows {
params.push(i);
}
}
params
};
tainted_sink_params.sort_unstable();
tainted_sink_params.dedup();
// ── 5) Store summary (entry/exit are body-local) ──────────────────
let key = FuncKey {
lang: Lang::from_slug(lang).unwrap_or(Lang::Rust),
namespace: file_path.to_owned(),
container: fn_container.clone(),
name: fn_name.clone(),
arity: Some(param_count),
disambig: fn_disambig,
kind: fn_kind,
};
let body_func_key = key.clone();
summaries.insert(
key,
LocalFuncSummary {
entry: fn_entry,
exit: fn_exit,
source_caps: fn_src_bits,
sanitizer_caps: fn_sani_bits,
sink_caps: fn_sink_bits,
param_count,
param_names: param_names.clone(),
propagating_params,
tainted_sink_params,
callees,
container: fn_container,
disambig: fn_disambig,
kind: fn_kind,
},
);
// ── 6) Push BodyCfg ───────────────────────────────────────────────
let auth_decorators = extract_auth_decorators(ast, lang, code);
bodies.push(BodyCfg {
meta: BodyMeta {
id: fn_body_id,
kind: if is_anon {
BodyKind::AnonymousFunction
} else {
BodyKind::NamedFunction
},
name: if is_anon { None } else { Some(fn_name.clone()) },
params: param_names,
param_types,
param_count,
span: (ast.start_byte(), ast.end_byte()),
parent_body_id: Some(current_body_id),
func_key: Some(body_func_key),
auth_decorators,
},
graph: fn_graph,
entry: fn_entry,
exit: fn_exit,
});
// ── 7) Insert placeholder in parent graph ─────────────────────────
// Declaration-marker only: no defines, uses, callee, or labels.
let placeholder = g.add_node(make_empty_node_info(
StmtKind::Seq,
(ast.start_byte(), ast.end_byte()),
enclosing_func,
));
connect_all(g, preds, placeholder, EdgeKind::Seq);
vec![placeholder]
}
// Statements that **may** contain a call ---------------------------------
Kind::CallWrapper => {
let mut cursor = ast.walk();
// Recurse into divergent control-flow constructs nested inside
// an expression-statement wrapper. Rust's `expression_statement`
// wraps `return_expression` / `break_expression` /
// `continue_expression`; without this delegation the wrapper
// would lower the return as a plain `StmtKind::Call`, losing
// the return semantics and letting fall-through Seq edges
// survive into the SSA terminator (the OR-chain rejection-arm
// defect, see `or_chain_rejection_block_terminates_with_return`).
if let Some(inner) = ast.children(&mut cursor).find(|c| {
matches!(
lookup(lang, c.kind()),
Kind::InfiniteLoop
| Kind::While
| Kind::For
| Kind::If
| Kind::Return
| Kind::Throw
| Kind::Break
| Kind::Continue
)
}) {
return build_sub(
inner,
preds,
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
// JS/TS ternary-RHS split: `var x = c ? a : b;` and
// `obj.prop = c ? a : b;` lower to a real diamond CFG so the
// condition is control-flow (not a data-flow `uses` entry).
if matches!(lang, "javascript" | "typescript" | "tsx")
&& let Some((lhs_ast, ternary_ast)) = find_ternary_rhs_wrapper(ast)
{
let (lhs_text, lhs_labels) =
classify_ternary_lhs(lhs_ast, lang, code, analysis_rules);
return build_ternary_diamond(
lhs_text,
lhs_labels,
ternary_ast,
preds,
EdgeKind::Seq,
g,
lang,
code,
enclosing_func,
call_ordinal,
analysis_rules,
);
}
//subscript-write lowering when the
// CallWrapper's inner expression is `arr[i] = v` (JS/TS,
// Python). See `try_lower_subscript_write` for shape +
// bail matrix.
if crate::pointer::is_enabled()
&& let Some(n) = try_lower_subscript_write(
ast,
preds,
g,
lang,
code,
enclosing_func,
call_ordinal,
)
{
return vec![n];
}
let has_call = has_call_descendant(ast, lang);
let kind = if has_call {
StmtKind::Call
} else {
StmtKind::Seq
};
let ord = if kind == StmtKind::Call {
let o = *call_ordinal;
*call_ordinal += 1;
o
} else {
0
};
// Pre-emit Source nodes for call arguments containing source
// member expressions (e.g. `req.body.returnTo` inside
// `res.redirect(req.body.returnTo)`). Created BEFORE the Call
// node so they get lower indices, see doc comment on
// `pre_emit_arg_source_nodes` for why this ordering matters.
let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call {
pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds)
} else {
(SmallVec::from_slice(preds), Vec::new(), Vec::new())
};
let node = push_node(
g,
kind,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
apply_arg_source_bindings(g, node, &src_bindings, &src_uses_only);
// Python `with_item`: acquisition inside a context manager.
// Only mark if this is actually an acquisition (Call + defines).
if ast.kind() == "with_item"
&& g[node].kind == StmtKind::Call
&& g[node].taint.defines.is_some()
{
g[node].managed_resource = true;
}
connect_all(g, &effective_preds, node, EdgeKind::Seq);
// If the callee is a configured terminator, treat as a dead end
if kind == StmtKind::Call
&& let Some(callee) = &g[node].call.callee
&& is_configured_terminator(callee, analysis_rules)
{
return Vec::new();
}
// Recurse into any function expressions nested in arguments
// (e.g. `app.get('/path', function(req, res) { ... })`)
// so that they get proper function summaries.
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[node],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
// Rust match-guard synthesis: `let <name> = match <scrutinee> { <arm> if <guard> => .., ... }`
// collapses to this single Call node, hiding the guard from the predicate-classification
// pipeline. Append a synthetic If node (condition_vars includes <name>) so validation
// predicates like `.chars().all(|c| c.is_ascii_*())` narrow taint on the guarded branch.
if lang == "rust"
&& let Some((guard, let_name)) = detect_rust_let_match_guard(ast, code)
{
let if_node = emit_rust_match_guard_if(g, guard, &let_name, code, enclosing_func);
connect_all(g, &[node], if_node, EdgeKind::Seq);
let true_gate = g.add_node(NodeInfo {
kind: StmtKind::Seq,
ast: AstMeta {
span: (ast.end_byte(), ast.end_byte()),
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
let false_gate = g.add_node(NodeInfo {
kind: StmtKind::Seq,
ast: AstMeta {
span: (ast.end_byte(), ast.end_byte()),
enclosing_func: enclosing_func.map(|s| s.to_string()),
},
..Default::default()
});
connect_all(g, &[if_node], true_gate, EdgeKind::True);
connect_all(g, &[if_node], false_gate, EdgeKind::False);
return vec![true_gate, false_gate];
}
vec![node]
}
// Direct call nodes (Ruby `call`, Python `call`, etc. when they appear
// as direct children of a block rather than wrapped in expression_statement)
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => {
let ord = *call_ordinal;
*call_ordinal += 1;
let (effective_preds, src_bindings, src_uses_only) = pre_emit_arg_source_nodes(
g,
ast,
lang,
code,
enclosing_func,
analysis_rules,
preds,
);
let n = push_node(
g,
StmtKind::Call,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
apply_arg_source_bindings(g, n, &src_bindings, &src_uses_only);
connect_all(g, &effective_preds, n, EdgeKind::Seq);
// If the callee is a configured terminator, treat as a dead end
if let Some(callee) = &g[n].call.callee
&& is_configured_terminator(callee, analysis_rules)
{
return Vec::new();
}
// Recurse into any function expressions nested in arguments.
// Each nested function hits Kind::Function and becomes a separate body.
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[n],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
throw_targets,
bodies,
next_body_id,
current_body_id,
);
}
vec![n]
}
// Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`)
Kind::Assignment => {
// JS/TS ternary-RHS split, same rationale as the CallWrapper branch.
if matches!(lang, "javascript" | "typescript" | "tsx")
&& let (Some(left), Some(right)) = (
ast.child_by_field_name("left"),
ast.child_by_field_name("right"),
)
{
let rhs = unwrap_parens(right);
if rhs.kind() == "ternary_expression" {
let (lhs_text, lhs_labels) =
classify_ternary_lhs(left, lang, code, analysis_rules);
return build_ternary_diamond(
lhs_text,
lhs_labels,
rhs,
preds,
EdgeKind::Seq,
g,
lang,
code,
enclosing_func,
call_ordinal,
analysis_rules,
);
}
}
//subscript-write lowering. See
// `try_lower_subscript_write` for the per-language shape
// matrix and bail conditions.
if crate::pointer::is_enabled()
&& let Some(n) = try_lower_subscript_write(
ast,
preds,
g,
lang,
code,
enclosing_func,
call_ordinal,
)
{
return vec![n];
}
let has_call = has_call_descendant(ast, lang);
let kind = if has_call {
StmtKind::Call
} else {
StmtKind::Seq
};
let ord = if kind == StmtKind::Call {
let o = *call_ordinal;
*call_ordinal += 1;
o
} else {
0
};
let n = push_node(
g,
kind,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
connect_all(g, preds, n, EdgeKind::Seq);
vec![n]
}
// Trivia we drop completely ---------------------------------------------
Kind::Trivia => preds.to_vec(),
// ─────────────────────────────────────────────────────────────────
// Every other node = simple sequential statement
// ─────────────────────────────────────────────────────────────────
_ => {
let n = push_node(
g,
StmtKind::Seq,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, n, EdgeKind::Seq);
vec![n]
}
}
}
/// Build an intraprocedural CFG and return (graph, entry_node).
///
/// * Walks the TreeSitter AST.
/// * Creates `StmtKind::*` nodes only for *statementlevel* constructs to keep
/// the graph compact.
/// * Wires a synthetic `Entry` node in front and a synthetic `Exit` node after
/// all real sinks.
pub(crate) fn build_cfg<'a>(
tree: &'a Tree,
code: &'a [u8],
lang: &str,
file_path: &str,
analysis_rules: Option<&LangAnalysisRules>,
) -> FileCfg {
debug!(target: "cfg", "Building CFG for {:?}", tree.root_node());
// Populate the per-file structural DFS-index map before any build_sub
// call reads from it. Cleared unconditionally at the end of this
// function so thread-local state never leaks between files.
populate_fn_dfs_indices(tree, lang);
// harvest DTO class definitions before any param classifier
// runs. Empty for languages without a collector. Cleared
// alongside the DFS map at end-of-build_cfg.
DTO_CLASSES.with(|cell| {
*cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code);
});
// harvest same-file `type X = Map<...>` / `Set<...>` / `T[]`
// aliases so JS/TS param classifiers resolve `m: ElementsMap`
// to `LocalCollection`. Empty for non-JS/TS languages.
TYPE_ALIAS_LC.with(|cell| {
*cell.borrow_mut() =
dto::collect_type_alias_local_collections(tree.root_node(), lang, code);
});
// Create the top-level body graph (BodyId(0)).
let (mut g, entry, exit) = create_body_graph(0, code.len(), None);
let mut summaries = FuncSummaries::new();
let mut bodies: Vec<BodyCfg> = Vec::new();
// BodyId(0) is reserved for top-level; function bodies start at 1.
let mut next_body_id: u32 = 1;
// Build the body below the synthetic ENTRY.
let mut top_ordinal: u32 = 0;
let mut top_breaks = Vec::new();
let mut top_continues = Vec::new();
let mut top_throws = Vec::new();
let exits = build_sub(
tree.root_node(),
&[entry],
&mut g,
lang,
code,
&mut summaries,
file_path,
None,
&mut top_ordinal,
analysis_rules,
&mut top_breaks,
&mut top_continues,
&mut top_throws,
&mut bodies,
&mut next_body_id,
BodyId(0),
);
debug!(target: "cfg", "exits: {:?}", exits);
// Wire every real exit to our synthetic EXIT node.
for e in exits {
connect_all(&mut g, &[e], exit, EdgeKind::Seq);
}
debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
if cfg!(debug_assertions) {
for idx in g.node_indices() {
debug!(target: "cfg", " node {:>3}: {:?}", idx.index(), g[idx]);
}
for e in g.edge_references() {
debug!(
target: "cfg",
" edge {:>3} → {:<3} ({:?})",
e.source().index(),
e.target().index(),
e.weight()
);
}
let mut reachable: HashSet<NodeIndex> = Default::default();
let mut bfs = Bfs::new(&g, entry);
while let Some(nx) = bfs.next(&g) {
reachable.insert(nx);
}
debug!(
target: "cfg",
"reachable nodes: {}/{}",
reachable.len(),
g.node_count()
);
if reachable.len() != g.node_count() {
let unreachable: Vec<_> = g
.node_indices()
.filter(|i| !reachable.contains(i))
.collect();
debug!(target: "cfg", "‼︎ unreachable nodes: {:?}", unreachable);
}
let doms: Dominators<_> = simple_fast(&g, entry);
debug!(target: "cfg", "dominator tree computed (len = {:?})", doms);
}
// Insert top-level body at position 0.
let toplevel = BodyCfg {
meta: BodyMeta {
id: BodyId(0),
kind: BodyKind::TopLevel,
name: None,
params: Vec::new(),
param_types: Vec::new(),
param_count: 0,
span: (0, code.len()),
parent_body_id: None,
func_key: None,
auth_decorators: Vec::new(),
},
graph: g,
entry,
exit,
};
bodies.insert(0, toplevel);
// Sort by BodyId so that bodies[i].meta.id == BodyId(i).
// Nested functions are pushed before their parents during build_sub,
// so the Vec may be out of order before this sort.
bodies.sort_by_key(|b| b.meta.id);
// Extract import alias bindings for JS/TS files.
let import_bindings = if matches!(
lang,
"javascript" | "typescript" | "tsx" | "python" | "php" | "rust"
) {
extract_import_bindings(tree, code)
} else {
HashMap::new()
};
// Extract promisify-alias bindings (JS/TS only). Applies a post-pass
// over every call node whose callee is a recorded alias so the wrapped
// function's labels (source/sanitizer/sink) carry through to the alias.
let promisify_aliases = if matches!(lang, "javascript" | "typescript" | "tsx") {
extract_promisify_aliases(tree, code)
} else {
HashMap::new()
};
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
if !promisify_aliases.is_empty() {
apply_promisify_labels(&mut bodies, &promisify_aliases, lang, extra);
}
// Clear the per-file DFS-index map so it does not leak to the next
// file built on this thread.
clear_fn_dfs_indices();
// same hygiene for the DTO map.
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear());
// collect every
// declared inheritance / impl / implements relationship in the
// file. Per-language extractor in `cfg::hierarchy`; empty for
// Go and C. Each `(sub, super)` pair gets duplicated onto every
// FuncSummary produced for the file by
// `crate::cfg::export_summaries` so the information persists
// through SQLite round-trips and re-merges into
// `crate::callgraph::TypeHierarchyIndex` at call-graph build time.
let hierarchy_edges = hierarchy::collect_hierarchy_edges(tree.root_node(), lang, code);
FileCfg {
bodies,
summaries,
import_bindings,
promisify_aliases,
hierarchy_edges,
}
}
/// Walk every CFG node in every body; for Call nodes whose callee matches a
/// promisify alias, classify the wrapped callee and union the resulting labels
/// into `info.taint.labels` (dedup by variant+caps). The displayed callee
/// text is left unchanged so diagnostics still surface the alias name.
fn apply_promisify_labels(
bodies: &mut [BodyCfg],
aliases: &PromisifyAliases,
lang: &str,
extra: Option<&[crate::labels::RuntimeLabelRule]>,
) {
for body in bodies.iter_mut() {
let indices: Vec<NodeIndex> = body.graph.node_indices().collect();
for idx in indices {
let Some(callee) = body.graph[idx].call.callee.clone() else {
continue;
};
let Some(alias) = aliases.get(&callee) else {
continue;
};
// Inherit both flat and gated labels from the wrapped callee.
// Gated sinks (e.g. `child_process.exec`) carry the same
// capability semantics as flat sinks, just with arg-position
// filtering at the call site; the promisify alias should
// surface the wrapped function's sink class regardless of
// which arm originally classified it.
let mut wrapped_labels: Vec<crate::labels::DataLabel> =
classify_all(lang, &alias.wrapped, extra)
.into_iter()
.collect();
for gm in
classify_gated_sink(lang, &alias.wrapped, |_| None, |_| None, |_| false).iter()
{
if !wrapped_labels.contains(&gm.label) {
wrapped_labels.push(gm.label);
}
}
if wrapped_labels.is_empty() {
continue;
}
let info = &mut body.graph[idx];
for lbl in wrapped_labels {
if !info.taint.labels.contains(&lbl) {
info.taint.labels.push(lbl);
}
}
}
}
}
/// Build a `CalleeSite` carrying the richer per-call-site metadata for a
/// CFG node.
///
/// * `arity`, positional argument count. `None` when `extract_arg_uses`
/// bailed out on splats/keyword-args (length 0 does not distinguish
/// zero-arg calls from unknown; we treat 0 as a concrete zero). The
/// receiver is a separate channel via `CallMeta.receiver` and is not
/// represented in `arg_uses`, so `arity == arg_uses.len()` for calls.
/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already
/// normalized to the root identifier).
/// * `qualifier`, the segment(s) before the leaf identifier of the callee.
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g.
/// `"crate::auth::token"` for `crate::auth::token::validate`) so that
/// cross-file `use`-map resolution in `callgraph.rs` has everything it
/// needs to walk an import chain. For every other language the qualifier
/// remains the single segment immediately before the leaf (back-compat
/// with the legacy heuristic). For method calls the qualifier is
/// redundant with `receiver` and is left `None`.
fn build_callee_site(callee: &str, info: &NodeInfo, lang: &str) -> crate::summary::CalleeSite {
use crate::summary::CalleeSite;
let receiver = info.call.receiver.clone();
let arity = if info.kind == StmtKind::Call || receiver.is_some() {
Some(info.call.arg_uses.len())
} else {
None
};
let qualifier = if receiver.is_some() {
None
} else if let Some(pos) = callee.rfind("::") {
let prefix = &callee[..pos];
if lang == "rust" {
// Rust: preserve the full module path prefix so use-map
// resolution can follow `use ...` chains without re-parsing.
Some(prefix.to_string()).filter(|s| !s.is_empty())
} else {
Some(prefix.rsplit("::").next().unwrap_or(prefix).to_string()).filter(|s| !s.is_empty())
}
} else if let Some(pos) = callee.rfind('.') {
let prefix = &callee[..pos];
Some(prefix.rsplit('.').next().unwrap_or(prefix).to_string()).filter(|s| !s.is_empty())
} else {
None
};
CalleeSite {
name: callee.to_string(),
arity,
receiver,
qualifier,
ordinal: info.call.call_ordinal,
}
}
/// Convert the graphlocal `FuncSummaries` into serialisable [`FuncSummary`]
/// values suitable for crossfile persistence.
pub(crate) fn export_summaries(
summaries: &FuncSummaries,
file_path: &str,
lang: &str,
) -> Vec<FuncSummary> {
summaries
.iter()
.map(|(key, local)| FuncSummary {
name: key.name.clone(),
file_path: file_path.to_owned(),
lang: lang.to_owned(),
param_count: local.param_count,
param_names: local.param_names.clone(),
source_caps: local.source_caps.bits(),
sanitizer_caps: local.sanitizer_caps.bits(),
sink_caps: local.sink_caps.bits(),
propagating_params: local.propagating_params.clone(),
propagates_taint: false,
tainted_sink_params: local.tainted_sink_params.clone(),
// Primary sink-location attribution: the legacy
// `export_summaries` runs without tree/bytes access, so
// cannot resolve sink node spans to line/col/snippet.
// `ParsedFile::export_summaries_with_root` is responsible
// for populating this field when it has tree access.
param_to_sink: Vec::new(),
callees: local.callees.clone(),
container: local.container.clone(),
disambig: local.disambig,
kind: local.kind,
// Rust use-map metadata is attached later in
// `ParsedFile::export_summaries_with_root`, which has access to
// the file's tree and scan root. Leaving these `None` here keeps
// `export_summaries` a pure graph→summary transform.
module_path: None,
rust_use_map: None,
rust_wildcards: None,
// Hierarchy edges live on `FileCfg`, not on the
// graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root`
// attaches them after this transform returns.
hierarchy_edges: Vec::new(),
})
.collect()
}
// pub(crate) fn dump_cfg(g: &Cfg) {
// debug!(target: "taint", "CFG DUMP: nodes = {}, edges = {}", g.node_count(), g.edge_count());
// for idx in g.node_indices() {
// debug!(target: "taint", " node {:>3}: {:?}", idx.index(), g[idx]);
// }
// for e in g.edge_references() {
// debug!(
// target: "taint",
// " edge {:>3} → {:<3} ({:?})",
// e.source().index(),
// e.target().index(),
// e.weight()
// );
// }
// }
#[cfg(test)]
mod cfg_tests;