// nyx/src/taint/mod.rs (2961 lines, 126 KiB, Rust)

//! Forward SSA taint analysis: the primary vulnerability detection engine.
//!
//! Tracks untrusted data from **sources** (where it enters the program) through
//! assignments and calls to **sinks** (where it is used dangerously). A finding
//! fires when the flow reaches a sink without passing a matching **sanitizer**.
//!
//! The engine is a monotone forward dataflow over a finite lattice with
//! guaranteed termination. It is flow-sensitive within a function and
//! interprocedural across files via persisted [`crate::summary::FuncSummary`]
//! and [`crate::summary::ssa_summary::SsaFuncSummary`] values.
//!
//! # Rule ID
//!
//! ```text
//! taint-unsanitised-flow (source <line>:<col>)
//! taint-data-exfiltration (source <line>:<col>)
//! ```
//!
//! The source location is part of the ID so sibling paths to the same sink
//! get distinct IDs. Suppressions can target either the base ID or the full
//! string.
//!
//! # Capabilities
//!
//! Sources, sanitizers, and sinks are linked by [`crate::labels::Cap`] bits.
//! A sanitizer only clears the cap it declares; a sink only fires when the
//! remaining taint still carries its required cap.
//!
//! | Cap | Typical source | Typical sanitizer | Typical sink |
//! |-----|----------------|-------------------|--------------|
//! | `env_var` | `env::var`, `getenv`, `process.env` | | |
//! | `html_escape` | | `html.escape`, `DOMPurify.sanitize` | `innerHTML`, `document.write` |
//! | `shell_escape` | | `shlex.quote`, `shell_escape::escape` | `system`, `Command::new` |
//! | `url_encode` | | `encodeURIComponent` | HTTP client URL arg |
//! | `file_io` | | `realpath`, `filepath.Clean` | `open`, `fs::read_to_string` |
//! | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` |
//! | `deserialize` | | | `pickle.loads`, `Marshal.load` |
//! | `ssrf` | | URL-prefix locks | `fetch` URL arg, outbound HTTP |
//! | `code_exec` | | | `eval`, `exec`, `system` |
//! | `crypto` | | | weak-algorithm constructors |
//! | `data_exfil` | cookies, headers, env, db rows (Sensitive tier) | | `fetch` body/json/headers |
//!
//! Sources typically carry `Cap::all()` so they match any sink.
//!
//! # Source sensitivity
//!
//! Each source carries a [`crate::labels::SourceKind`] and a derived tier:
//!
//! - `Plain` — direct attacker input (`UserInput`): request bodies, query
//! strings, argv, stdin.
//! - `Sensitive` — operator-bound state: cookies, headers, env, files, DB rows,
//! caught exceptions.
//!
//! `Cap::DATA_EXFIL` only fires on `Sensitive`-tier sources. Plain user input
//! flowing into an outbound request body is suppressed — the canonical false
//! positive for API gateways that proxy `req.body`.
//!
//! # Confidence signals
//!
//! Higher confidence: source and sink both present in evidence, `source_kind:
//! user_input`, `path_validated: false`, symbolic witness produced.
//!
//! Lower confidence: path-validated taint, source is a database read or
//! internal file, any non-informational `EngineNote` (e.g.
//! `SsaLoweringBailed`, `PredicateStateWidened`, `WorklistCapped`).
//!
//! # Submodules
//!
//! - [`domain`]: taint lattice types (`VarTaint`, `TaintOrigin`, `SmallBitSet`,
//! `PredicateSummary`)
//! - [`ssa_transfer`]: SSA taint transfer functions and the forward worklist
//! (`SsaTaintState`, `SsaTaintTransfer`, `run_ssa_taint`)
//! - [`path_state`]: predicate classification for branch-sensitive propagation
//! - [`backwards`]: demand-driven backwards walk from sinks (off by default)
#![allow(clippy::collapsible_if, clippy::too_many_arguments)]
pub mod backwards;
pub mod domain;
pub mod path_state;
pub mod ssa_transfer;
use crate::cfg::{BodyCfg, BodyId, Cfg, FileCfg, FuncSummaries};
use crate::engine_notes::EngineNote;
use crate::interop::InteropEdge;
use crate::labels::SourceKind;
use crate::state::engine::MAX_TRACKED_VARS;
use crate::state::symbol::SymbolInterner;
use crate::summary::GlobalSummaries;
use crate::symbol::{FuncKey, FuncKind, Lang};
use path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use petgraph::visit::IntoNodeReferences;
use smallvec::SmallVec;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::atomic::{AtomicUsize, Ordering};
/// Upper bound on the number of JS/TS in-file pass-2 convergence rounds.
///
/// Pass 2 is a Jacobi-style iteration over every non-toplevel body of a
/// JS/TS file: each body's exit state (restricted to top-level keys) is
/// folded into a shared seed, and the non-toplevel bodies are re-run
/// until that seed stops changing. Taint that has to travel through `k`
/// top-level bindings via `k` helper functions can need up to `k`
/// rounds. An earlier hardcoded limit of `3` silently cut off any
/// 4-stage chain without a warning.
///
/// The intent matches `scan::SCC_FIXPOINT_SAFETY_CAP`: because the
/// lattice is monotone and of finite height, the true fixed point is
/// reached within a small multiple of the chain depth. 64 comfortably
/// covers every realistic JS/TS file seen so far while still bounding
/// the worst-case cost.
const JS_TS_PASS2_SAFETY_CAP: usize = 64;

/// Test-only replacement for [`JS_TS_PASS2_SAFETY_CAP`].
///
/// A non-zero value is used by the pass-2 loop in place of the constant.
/// The default of `0` means "no override", so production behaviour is
/// unaffected.
static JS_TS_PASS2_CAP_OVERRIDE: AtomicUsize = AtomicUsize::new(0);

/// Observability hook holding the number of pass-2 iterations used by
/// the most recent [`analyse_file`] invocation.
///
/// The counter is reset at the start of every call so convergence
/// regression tests always read a fresh value. `1` means only the
/// initial lexical-containment pass ran; larger values mean the
/// iterative convergence loop ran that many times without detecting
/// convergence (so the `iters`th iteration was the last round actually
/// executed). `1` is the common case for non-JS/TS languages and for
/// JS/TS files with no cross-body globals.
static LAST_JS_TS_PASS2_ITERATIONS: AtomicUsize = AtomicUsize::new(0);

/// Installs (or removes) the test-only JS/TS pass-2 cap override.
///
/// Passing `cap = 0` restores the default cap. Meant only for
/// integration tests that must force cap-hit behaviour on small
/// fixtures.
#[doc(hidden)]
pub fn set_js_ts_pass2_cap_override(cap: usize) {
    JS_TS_PASS2_CAP_OVERRIDE.store(cap, Ordering::Relaxed);
}

/// Reports the pass-2 iteration count recorded during the most recent
/// [`analyse_file`] invocation. Meant for tests and diagnostics.
pub fn last_js_ts_pass2_iterations() -> usize {
    LAST_JS_TS_PASS2_ITERATIONS.load(Ordering::Relaxed)
}

/// Effective pass-2 cap: the test override when one is installed,
/// otherwise [`JS_TS_PASS2_SAFETY_CAP`].
fn js_ts_pass2_cap() -> usize {
    match JS_TS_PASS2_CAP_OVERRIDE.load(Ordering::Relaxed) {
        // Zero is the "no override" sentinel.
        0 => JS_TS_PASS2_SAFETY_CAP,
        forced => forced,
    }
}
// ── Perf-audit sub-stage timers (lower_all_functions_from_bodies) ───────
//
// Seven accumulators, all in microseconds:
//   [0] lower_to_ssa_with_params                    (summed over bodies)
//   [1] extract_ssa_func_summary                    (summed over bodies,
//                                                    includes per-param probes)
//   [2] optimize_ssa_with_param_types               (summed over bodies)
//   [3] typed_call_receivers + pointer fact extraction (summed over bodies)
//   [4] augment_summaries_with_child_sinks
//   [5] rerun_extraction_with_augmented_summaries
//   [6] per-body misc (FuncKey resolve, HashMap insert, interner ctor)
//
// The timers only accumulate while the cell holds `Some`. The production
// path leaves it at `None`, so each measured chunk costs one thread-local
// access plus a check for `None`.
thread_local! {
    static PERF_LOWER_TIMINGS: std::cell::Cell<Option<[u128; 7]>> =
        const { std::cell::Cell::new(None) };
}

/// Arms the timers on the current thread, zeroing all seven slots.
#[doc(hidden)]
pub fn perf_lower_timings_start() {
    PERF_LOWER_TIMINGS.with(|timers| timers.set(Some([0u128; 7])));
}

/// Disarms the timers on the current thread and hands back whatever was
/// accumulated, or `None` if they were not armed.
#[doc(hidden)]
pub fn perf_lower_timings_take() -> Option<[u128; 7]> {
    PERF_LOWER_TIMINGS.with(|timers| timers.replace(None))
}

/// Adds `micros` to accumulator `slot` (saturating). Does nothing while
/// the timers are disarmed. `slot` indexes the seven-entry array
/// directly, so it must be in `0..7`.
#[inline]
fn perf_lower_record(slot: usize, micros: u128) {
    PERF_LOWER_TIMINGS.with(|timers| {
        // `Cell` hands out a copy; update it and store it back.
        let Some(mut totals) = timers.get() else {
            return;
        };
        let bumped = totals[slot].saturating_add(micros);
        totals[slot] = bumped;
        timers.set(Some(totals));
    });
}
/// Test-only switch for the Gauss-Seidel toggle.
///
/// * `0`: defer to the `NYX_JS_GAUSS_SEIDEL` env var (default production
///   behaviour).
/// * `1`: force Jacobi; the env var is ignored.
/// * `2`: force Gauss-Seidel; the env var is ignored.
///
/// Exists only for integration tests that need to assert both variants
/// produce equal findings without per-test process isolation.
static JS_TS_GAUSS_SEIDEL_OVERRIDE: AtomicUsize = AtomicUsize::new(0);

/// Forces Jacobi (`1`) or Gauss-Seidel (`2`) from test code. `0` removes
/// the override and returns to env-var-driven behaviour.
#[doc(hidden)]
pub fn set_js_ts_gauss_seidel_override(mode: usize) {
    JS_TS_GAUSS_SEIDEL_OVERRIDE.store(mode, Ordering::Relaxed);
}

/// Reports whether JS/TS pass-2 should run its Gauss-Seidel variant.
///
/// Default: **Jacobi** (order-independent, reproducible, one round per
/// chain hop). Setting `NYX_JS_GAUSS_SEIDEL=1` selects **Gauss-Seidel**
/// (in-place updates: a body's exit becomes visible to later bodies in
/// the same round, typically halving iteration count on chain-shaped
/// code). Any value other than the empty string, `0`, or `false`
/// enables it. The env var is read once and the answer is cached for
/// the rest of the process.
///
/// Gauss-Seidel is opt-in on purpose: its result depends on the
/// traversal order of bodies, which can affect reproducibility for
/// scanners whose output feeds CI gates. Making it the default is
/// blocked on the Phase-A corpus run showing that chain depth ≥4 is
/// common enough to justify the complexity.
///
/// An override installed via [`set_js_ts_gauss_seidel_override`] wins
/// over the env var.
///
/// See `tests/gauss_seidel_tests.rs` for the determinism test guarding
/// the invariant "same fixture → same findings under both variants".
pub fn js_ts_gauss_seidel_enabled() -> bool {
    // Cached env-var verdict; only consulted when no override is active.
    static ENV_OPT_IN: std::sync::OnceLock<bool> = std::sync::OnceLock::new();

    match JS_TS_GAUSS_SEIDEL_OVERRIDE.load(Ordering::Relaxed) {
        1 => false, // force Jacobi
        2 => true,  // force Gauss-Seidel
        _ => *ENV_OPT_IN.get_or_init(|| {
            std::env::var("NYX_JS_GAUSS_SEIDEL")
                .is_ok_and(|raw| !matches!(raw.as_str(), "" | "0" | "false"))
        }),
    }
}
/// A raw flow step at CFG level (before line/col resolution).
///
/// Steps are collected in [`Finding::flow_steps`] to describe the
/// reconstructed path from source to sink.
#[derive(Debug, Clone)]
pub struct FlowStepRaw {
/// CFG node this step refers to.
pub cfg_node: NodeIndex,
/// Variable name attached to this step, if one is known.
pub var_name: Option<String>,
/// Classification of the step; see [`crate::evidence::FlowStepKind`].
pub op_kind: crate::evidence::FlowStepKind,
}
/// Resolved source-location of the primary (callee-internal) sink instruction.
///
/// Populated on [`Finding`] when the sink was resolved via a callee summary
/// that recorded a [`crate::summary::SinkSite`]. This is data-only
/// attribution of the primary sink location: downstream formatters (SARIF,
/// JSON, diag) still report the caller's call-site until they opt in.
#[derive(Debug, Clone, PartialEq)]
pub struct SinkLocation {
/// Callee file path relative to the workspace root. Matches the
/// `FuncKey::namespace` convention used in [`crate::summary::SinkSite`].
pub file_rel: String,
/// 1-based line of the sink instruction inside the callee body.
pub line: u32,
/// 1-based column of the sink instruction inside the callee body.
pub col: u32,
/// Trimmed source line at the sink, copied from the upstream
/// [`crate::summary::SinkSite`]. Empty when the extractor had no
/// tree/bytes context. Lets formatters display the primary location
/// without re-reading the callee file.
pub snippet: String,
}
/// A detected taint finding with both source and sink locations.
#[derive(Debug, Clone)]
pub struct Finding {
/// Identifies which body's graph the `NodeIndex` values reference.
pub body_id: BodyId,
/// The CFG node where tainted data reaches a dangerous operation.
pub sink: NodeIndex,
/// The CFG node where taint originated (may be Entry if the source is
/// cross-file and couldn't be pinpointed to a specific node).
pub source: NodeIndex,
/// The full path from source to sink through the CFG.
#[allow(dead_code)] // used for future detailed diagnostics / path display
pub path: Vec<NodeIndex>,
/// The kind of source that originated the taint.
pub source_kind: SourceKind,
/// Whether all tainted sink variables are guarded by a validation
/// predicate on this path (metadata only; does not change severity).
pub path_validated: bool,
/// The kind of validation guard protecting this path, if any.
pub guard_kind: Option<PredicateKind>,
/// Number of SSA blocks between source and sink (0 = same block).
pub hop_count: u16,
/// Capability specificity: number of matching cap bits between source and sink.
/// Higher = more specific match (e.g. SQL_QUERY→SQL_QUERY vs broad `Cap::all()`).
pub cap_specificity: u8,
/// Whether this finding was resolved via a function summary (cross-function)
/// rather than direct intra-function flow.
pub uses_summary: bool,
/// Reconstructed flow path from source to sink (CFG-level, pre-resolution).
pub flow_steps: Vec<FlowStepRaw>,
/// Symbolic constraint analysis verdict, if attempted.
pub symbolic: Option<crate::evidence::SymbolicVerdict>,
/// Original source byte span, preserved when the origin was remapped
/// across body boundaries. `None` for intra-body findings
/// (use `cfg[source].classification_span()`).
pub source_span: Option<usize>,
/// Source-location of the callee-internal dangerous instruction when the
/// sink was resolved via a function summary carrying a
/// [`crate::summary::SinkSite`] with concrete coordinates for primary
/// sink-location attribution. `None` for:
/// * intra-procedural / label-based sinks: the caller's `cfg[sink]`
/// span already names the dangerous instruction;
/// * summary-resolved sinks whose `SinkSite` was cap-only (no tree or
/// bytes context at extraction time).
///
/// # Invariant
///
/// `primary_location.is_some()` ⇒ the inner [`SinkLocation`] has
/// `line != 0`. `file_rel` may be empty for single-file scans where
/// the scan root is the file itself (every namespace normalizes to
/// `""`); consumers resolve an empty `file_rel` against the file under
/// analysis. Enforced at `ssa_events_to_findings` by a
/// `debug_assert!`; upstream filters drop cap-only sites before
/// they reach this field.
///
/// Deliberately independent of `uses_summary`: that flag tracks whether
/// the **taint chain** used a callee summary, not whether the **sink**
/// was summary-resolved. A local source can reach a cross-file sink,
/// yielding `uses_summary == false` alongside a populated
/// `primary_location`.
pub primary_location: Option<SinkLocation>,
/// Engine provenance notes recorded during the analysis that produced
/// this finding. Populated when an internal budget/cap was hit; see
/// [`crate::engine_notes::EngineNote`]. Empty for the typical
/// under-budget finding.
pub engine_notes: SmallVec<[EngineNote; 2]>,
/// Stable hash of the intermediate-variable sequence between `source`
/// and `sink`. Used to keep distinct paths through different
/// variables as separate findings during deduplication: two
/// `(body_id, sink, source)` siblings with different `path_hash`
/// values represent flows along different data paths and are
/// preserved as alternatives rather than collapsed.
///
/// Derived from the `cfg_node` indices in `flow_steps` at the time
/// the finding is emitted; stable for a given scan but not
/// necessarily stable across AST/CFG changes.
pub path_hash: u64,
/// Stable identifier for this finding, derived from
/// `(body_id, source.index, sink.index, path_hash, path_validated)`.
/// Populated after `body_id` is set so the ID is consistent across
/// the lifetime of the finding and can be used to cross-reference
/// alternative paths via [`Self::alternative_finding_ids`]. Empty
/// string before the post-analysis linking pass runs.
pub finding_id: String,
/// Stable identifiers of sibling findings that share
/// `(body_id, sink, source)` but differ in `path_validated` or
/// `path_hash`. Populated by `link_alternative_paths`, which runs
/// after all findings for the file have been collected, deduplicated,
/// and given their `finding_id`.
///
/// The canonical case is a guarded/unguarded pair: if an `exec(x)`
/// call is reachable from the same source `x` through both a
/// whitelisted branch and an unguarded branch, both findings
/// survive dedup and each lists the other here so downstream
/// formatters can present them as "this flow … and N alternative
/// path(s)" rather than silently dropping one.
pub alternative_finding_ids: SmallVec<[String; 2]>,
/// Sink-cap mask that this specific finding fired against. Carries the
/// per-event `sink_caps` from the multi-gate dispatch (e.g.
/// `Cap::SSRF` for a URL-flow finding on `fetch`, `Cap::DATA_EXFIL`
/// for a body-flow finding on the same call). Used by `ast.rs` to
/// route the finding to a cap-specific rule id rather than the
/// generic `taint-unsanitised-flow` bucket.
pub effective_sink_caps: crate::labels::Cap,
}
impl Finding {
    /// Builder-style variant of [`Self::merge_note`]: records `note`
    /// (unless an equal note is already present) and returns the finding,
    /// so construction sites can tag a new finding inline.
    pub fn with_note(mut self, note: EngineNote) -> Self {
        self.merge_note(note);
        self
    }

    /// Adds `note` to `engine_notes` in place, skipping duplicates.
    pub fn merge_note(&mut self, note: EngineNote) {
        let notes = &mut self.engine_notes;
        crate::engine_notes::push_unique(notes, note);
    }
}
/// Derive module aliases from an unoptimized SSA body (JS/TS only).
///
/// Constant propagation is run first to obtain constant values; those
/// values are then handed to `collect_module_aliases`, which detects
/// `require()` calls to known modules and follows them through
/// phis/copies. The result makes module aliases available during summary
/// extraction. Every language other than JavaScript/TypeScript yields an
/// empty map without touching the body.
fn compute_module_aliases_for_summary(
    ssa: &crate::ssa::SsaBody,
    lang: Lang,
) -> std::collections::HashMap<crate::ssa::SsaValue, smallvec::SmallVec<[String; 2]>> {
    match lang {
        Lang::JavaScript | Lang::TypeScript => {
            let const_facts = crate::ssa::const_prop::const_propagate(ssa);
            crate::ssa::const_prop::collect_module_aliases(ssa, &const_facts.values)
        }
        _ => std::collections::HashMap::new(),
    }
}
/// Build a per-file cross-package import lookup for Phase 09 cross-file IPA.
///
/// For each [`crate::resolve::ImportBinding`] whose resolver verdict
/// produced a concrete `(resolved_file, exported_name)` pair, builds the
/// canonical [`FuncKey`] of the imported function in its own file's
/// scan-root-relative namespace and stores it under the caller-file's
/// local binding name.
///
/// Returns an empty map when the file has no resolved imports (non-JS/TS
/// files, scans without a `ModuleGraph`, side-effect-only imports, or
/// builtin/unresolved specifiers). The caller passes `None` to
/// `SsaTaintTransfer::cross_package_imports` in that case.
///
/// `module_graph` aligns the target [`FuncKey::namespace`] with the
/// package-prefixed form that `FuncSummary::func_key_with_resolver`
/// produces on the cross-file storage side: when the resolved file lies
/// inside a discovered package the namespace becomes
/// `"@scope/name::src/file.ts"`, otherwise it falls back to plain
/// `normalize_namespace`. Step 0.7 of `resolve_callee_full` looks up
/// `(lang, namespace, name)` against `GlobalSummaries::ssa_by_key`
/// where the SSA-side keys are now produced via the same
/// `namespace_with_package` shape (callers in `crate::ast::ParsedFile`
/// pre-compute the package-prefixed namespace before invoking
/// `lower_all_functions_from_bodies`), so the two sides agree even
/// when two packages share a project-relative file path.
///
/// `module_graph = None` (single-package scans, non-JS/TS files, unit
/// tests, indexed-mode SQLite fallback) collapses to the historical
/// `normalize_namespace` behaviour, keeping the migration strictly
/// additive for any consumer that does not opt in.
///
/// The constructed key intentionally leaves `container`, `arity`,
/// `disambig`, and `kind` at their defaults — the resolver verdict only
/// fixes the `(lang, namespace, name)` triple, and step 0.7 of
/// `resolve_callee_full` matches against `GlobalSummaries::ssa_by_key`
/// using only those three fields plus an arity hint when available.
pub fn build_cross_package_func_keys(
resolved_imports: &[crate::resolve::ImportBinding],
scan_root: Option<&str>,
module_graph: Option<&crate::resolve::ModuleGraph>,
caller_lang: Lang,
) -> HashMap<String, FuncKey> {
let mut out: HashMap<String, FuncKey> = HashMap::new();
for binding in resolved_imports {
let Some(ref resolved_file) = binding.resolved_file else {
continue;
};
let Some(ref exported_name) = binding.exported_name else {
continue;
};
if exported_name.is_empty()
|| exported_name == "*"
|| exported_name == "default"
|| binding.local_name.is_empty()
{
// Side-effect / namespace / default imports do not map to a
// single named export; step 0.7 needs a concrete leaf name.
continue;
}
let target_lang = resolved_file
.extension()
.and_then(|e| e.to_str())
.and_then(Lang::from_extension)
.unwrap_or(caller_lang);
let abs = resolved_file.to_string_lossy();
let namespace = crate::symbol::namespace_with_package(&abs, scan_root, module_graph);
let key = FuncKey {
lang: target_lang,
namespace,
container: String::new(),
name: exported_name.clone(),
arity: None,
disambig: None,
kind: FuncKind::Function,
};
out.insert(binding.local_name.clone(), key);
}
out
}
/// Run taint analysis on all bodies in a file.
///
/// Uses a unified multi-body analysis for all languages:
/// 1. Lexical containment propagation: parent body exit state seeds child bodies.
/// 2. JS/TS iterative convergence: functions that modify globals can feed taint
/// back to other functions. The number of rounds is bounded by the
/// `JS_TS_PASS2_SAFETY_CAP` safety cap; other languages run a single round.
///
/// This entry point lowers every body itself via
/// `lower_all_functions_from_bodies` and then delegates to
/// [`analyse_file_with_lowered`] with no cross-package import map. It
/// returns the findings produced by that call.
pub fn analyse_file(
file_cfg: &FileCfg,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
caller_lang: Lang,
caller_namespace: &str,
interop_edges: &[InteropEdge],
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Vec<Finding> {
// Reset BEFORE lowering: per-parameter probes inside
// `lower_all_functions_from_bodies` may record path-safe sink spans
// (via `record_path_safe_suppressed_span`). Resetting here keeps the
// historical contract that "the span set starts empty for each file"
// while letting both the probe phase and the taint flow phase
// accumulate into the same set, which is what
// `take_path_safe_suppressed_spans` then drains for state analysis.
// The all-validated span set (cap-agnostic, drained by AST-pattern
// suppression in `TaintSuppressionCtx::build`) follows the same
// lifecycle.
ssa_transfer::reset_path_safe_suppressed_spans();
ssa_transfer::reset_all_validated_spans();
// No locator: pass-2 intra-file summaries are transient (not persisted)
// and behavior depends on SinkSite.cap only, which is always populated.
// Both scoped helpers publish per-file state (local imports, safe lookup
// fields) for the duration of the closure they wrap.
crate::ssa::type_facts::with_file_imports(Some(&file_cfg.local_imports), || {
crate::cfg::safe_fields::with_safe_lookup_fields(Some(&file_cfg.safe_lookup_fields), || {
let (ssa_summaries, callee_bodies) = lower_all_functions_from_bodies(
file_cfg,
caller_lang,
caller_namespace,
local_summaries,
global_summaries,
None,
None,
None,
);
analyse_file_with_lowered(
file_cfg,
local_summaries,
global_summaries,
caller_lang,
caller_namespace,
interop_edges,
extra_labels,
&ssa_summaries,
&callee_bodies,
None, // cross_package_imports: not available from this entry point
)
})
})
}
/// Same as [`analyse_file`] but takes pre-lowered SSA summaries + callee
/// bodies. Used by [`crate::ast::analyse_file_fused`] to share a single
/// `lower_all_functions_from_bodies` invocation across the taint engine and
/// the SSA-artifact extractor; the bare [`analyse_file`] entry-point keeps
/// its prior signature for any caller that does not have a pre-lowered
/// result handy.
///
/// `cross_package_imports` is the optional Phase-09 lookup map built via
/// [`build_cross_package_func_keys`]. `None` (the public-API default)
/// disables cross-package step 0.7 in `resolve_callee_full`.
///
/// Unlike [`analyse_file`], this function does not reset the
/// path-safe-suppressed or all-validated span sets; callers are expected
/// to have done so before lowering.
#[allow(clippy::too_many_arguments)]
pub(crate) fn analyse_file_with_lowered(
file_cfg: &FileCfg,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
caller_lang: Lang,
caller_namespace: &str,
interop_edges: &[InteropEdge],
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
ssa_summaries: &std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
callee_bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
cross_package_imports: Option<&std::collections::HashMap<String, FuncKey>>,
) -> Vec<Finding> {
// Tracing span covering the whole per-file taint run.
let _span = tracing::debug_span!("taint_analyse_file").entered();
// Publish the per-file local-import view so the ORM TypeKind gate
// inside [`crate::ssa::type_facts::constructor_type`] can read it
// during downstream `optimize_ssa_with_param_types` passes. The
// outer `analyse_file` already wraps this for its own
// `lower_all_functions_from_bodies` pre-pass; wrapping here too
// keeps direct callers (e.g. [`crate::ast::analyse_file_fused`])
// covered. Idempotent under nesting: the inner guard restores
// the outer value on drop.
crate::ssa::type_facts::with_file_imports(Some(&file_cfg.local_imports), || {
crate::cfg::safe_fields::with_safe_lookup_fields(Some(&file_cfg.safe_lookup_fields), || {
analyse_file_with_lowered_inner(
file_cfg,
local_summaries,
global_summaries,
caller_lang,
caller_namespace,
interop_edges,
extra_labels,
ssa_summaries,
callee_bodies,
cross_package_imports,
)
})
})
}
/// Core of the per-file taint run: prepares the optional inputs for
/// `analyse_multi_body`, runs it, then deduplicates the findings, assigns
/// their stable IDs, and cross-links alternative paths.
#[allow(clippy::too_many_arguments)]
fn analyse_file_with_lowered_inner(
    file_cfg: &FileCfg,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    caller_lang: Lang,
    caller_namespace: &str,
    interop_edges: &[InteropEdge],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
    ssa_summaries: &std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    callee_bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
    cross_package_imports: Option<&std::collections::HashMap<String, FuncKey>>,
) -> Vec<Finding> {
    // The path-safe-suppressed span set is deliberately NOT reset here;
    // that is the caller's job. Per-parameter probes in the lowering
    // phase (`lower_all_functions_from_bodies`) may already have
    // published spans through `record_path_safe_suppressed_span`, and a
    // reset at this point would discard them before
    // `take_path_safe_suppressed_spans` drains the set for state
    // analysis. `analyse_file` (lowers internally) and
    // `analyse_file_fused` (lowers up-front) both reset before lowering.

    // An empty summary map is passed on as "no summaries".
    let ssa_sums_ref = (!ssa_summaries.is_empty()).then_some(ssa_summaries);

    // Context-sensitive inline analysis. The toggle lives at
    // `analysis.engine.context_sensitive` in `nyx.conf` (or the
    // `--context-sensitive / --no-context-sensitive` CLI flag). Callee
    // bodies and the inline cache are only handed on when it is enabled.
    let context_sensitive = crate::utils::analysis_options::current().context_sensitive;
    let inline_cache = std::cell::RefCell::new(std::collections::HashMap::new());
    let callee_bodies_ref =
        (context_sensitive && !callee_bodies.is_empty()).then_some(callee_bodies);
    let inline_cache_ref = context_sensitive.then_some(&inline_cache);

    // Unified multi-body analysis with lexical containment propagation.
    //
    // `max_iterations` is a safety cap rather than an expected depth: the
    // pass-2 loop stops on seed equality (monotone lattice, finite
    // height) and only reaches the cap when convergence legitimately
    // needs more rounds than the cap allows. See
    // [`JS_TS_PASS2_SAFETY_CAP`] for the rationale.
    let max_iterations = match caller_lang {
        Lang::JavaScript | Lang::TypeScript => js_ts_pass2_cap(),
        _ => 1,
    };

    // Zero the observability counter so tests never read a value left
    // over from a previous file. This function does not write it again;
    // the per-run count is presumably recorded by `analyse_multi_body`
    // (not visible here).
    LAST_JS_TS_PASS2_ITERATIONS.store(0, Ordering::Relaxed);

    let import_bindings_ref =
        (!file_cfg.import_bindings.is_empty()).then_some(&file_cfg.import_bindings);

    // Cross-file bodies come from `GlobalSummaries` and are threaded
    // through the transfer for context-sensitive resolution. `None` when
    // there are no global summaries or they expose no bodies.
    let cross_file_bodies_ref = global_summaries.and_then(|gs| gs.bodies_by_key());
    if let Some(map) = cross_file_bodies_ref {
        tracing::debug!(
            cross_file_bodies = map.len(),
            file = %caller_namespace,
            "taint: cross-file bodies available for pass 2"
        );
    }

    let mut all_findings = analyse_multi_body(
        file_cfg,
        caller_lang,
        caller_namespace,
        local_summaries,
        global_summaries,
        interop_edges,
        extra_labels,
        ssa_sums_ref,
        callee_bodies_ref,
        inline_cache_ref,
        max_iterations,
        import_bindings_ref,
        cross_file_bodies_ref,
        cross_package_imports,
    );

    // Deduplicate with a key rich enough to keep distinct flows apart.
    //
    // Historically this step was
    //
    //   sort_by_key(|f| (body_id, sink.index(), source.index(), !path_validated));
    //   dedup_by_key(|f| (body_id, sink, source));
    //
    // which silently collapsed an *unguarded* flow into a guarded flow
    // reaching the same `(sink, source)`: the `!path_validated` sort put
    // `path_validated == true` first, so the exploitable branch was the
    // one dropped.
    //
    // The key used now is
    //   (body_id, sink, source, path_validated, path_hash, effective_sink_caps).
    // Findings that differ in `path_validated`, in `path_hash` (i.e. they
    // traverse different intermediate variables), or in the sink caps
    // they fired against all survive. `link_alternative_paths` later
    // fills `alternative_finding_ids` so formatters can render
    // "… and N alternative path(s)".
    all_findings.sort_by_key(|finding| {
        (
            finding.body_id.0,
            finding.sink.index(),
            finding.source.index(),
            !finding.path_validated,
            finding.path_hash,
            finding.effective_sink_caps.bits(),
        )
    });
    all_findings.dedup_by_key(|finding| {
        (
            finding.body_id,
            finding.sink,
            finding.source,
            finding.path_validated,
            finding.path_hash,
            finding.effective_sink_caps.bits(),
        )
    });

    // Assign stable IDs now that `body_id` is set and dedup has settled
    // the final set of flows. The IDs are what
    // `Finding.alternative_finding_ids` refers to.
    all_findings
        .iter_mut()
        .for_each(|finding| finding.finding_id = make_finding_id(finding));

    // For every group sharing `(body_id, sink, source)`, publish each
    // member's ID into the other members' `alternative_finding_ids`.
    link_alternative_paths(&mut all_findings);

    all_findings
}
/// Render the stable identifier of a [`Finding`].
///
/// Layout: `taint-<body_id>-<source_idx>-<sink_idx>-<path_hash_hex>-<v|u>`.
/// The 16-digit hex hash separates distinct intermediate paths, and the
/// trailing `v` (validated) / `u` (unvalidated) separates flows that
/// agree on `(body, sink, source, path_hash)`. Neither component depends
/// on caller-side formatters, so the ID survives serialization to
/// JSON/SARIF unchanged.
///
/// NOTE(review): `effective_sink_caps` is part of the dedup key in
/// `analyse_file_with_lowered_inner` but not of this ID, so two findings
/// that survive dedup only because their sink caps differ receive the
/// same ID. Confirm whether that is intended before relying on ID
/// uniqueness.
fn make_finding_id(f: &Finding) -> String {
    let body = f.body_id.0;
    let source_idx = f.source.index();
    let sink_idx = f.sink.index();
    let validation_tag = if f.path_validated { 'v' } else { 'u' };
    format!(
        "taint-{}-{}-{}-{:016x}-{}",
        body, source_idx, sink_idx, f.path_hash, validation_tag,
    )
}
/// Cross-link findings that share `(body_id, sink, source)`.
///
/// For every group of two or more such findings, each member's
/// `alternative_finding_ids` is replaced with the
/// [`Finding::finding_id`]s of the other members whose ID differs from
/// its own, so a guarded flow points at its unguarded sibling and vice
/// versa. Findings without a sibling are left untouched. The slice order
/// is not changed.
fn link_alternative_paths(findings: &mut [Finding]) {
    // Bucket positions by (body_id, sink, source). Sorting would disturb
    // the order the caller sees, so a hash map is used instead.
    let mut buckets: HashMap<(BodyId, NodeIndex, NodeIndex), Vec<usize>> = HashMap::new();
    for (pos, finding) in findings.iter().enumerate() {
        let group_key = (finding.body_id, finding.sink, finding.source);
        buckets.entry(group_key).or_default().push(pos);
    }

    for (_, siblings) in buckets {
        if siblings.len() >= 2 {
            // Snapshot the group's IDs first so the findings can be
            // mutated below without holding a borrow on them.
            let mut sibling_ids: Vec<String> = Vec::with_capacity(siblings.len());
            for &pos in &siblings {
                sibling_ids.push(findings[pos].finding_id.clone());
            }

            for (nth, &pos) in siblings.iter().enumerate() {
                let own_id = &sibling_ids[nth];
                let alternatives = &mut findings[pos].alternative_finding_ids;
                alternatives.clear();
                alternatives.extend(sibling_ids.iter().filter(|id| *id != own_id).cloned());
            }
        }
    }
}
/// Order bodies so that every parent precedes its children.
///
/// Performs a breadth-first walk starting from the roots (bodies with no
/// parent). A body is therefore always emitted after its parent, which
/// lexical seed propagation relies on. The result holds indices into the
/// `bodies` slice in processing order. Only bodies reachable from a root
/// are included: a body whose `parent_body_id` matches no body's
/// `meta.id` does not appear in the result.
fn containment_order(bodies: &[BodyCfg]) -> Vec<usize> {
    let mut kids_of: HashMap<BodyId, Vec<usize>> = HashMap::new();
    let mut pending: VecDeque<usize> = VecDeque::new();

    // Split the bodies into roots (queued immediately) and children
    // (indexed by their parent's id).
    for (pos, body) in bodies.iter().enumerate() {
        if let Some(parent_id) = body.meta.parent_body_id {
            kids_of.entry(parent_id).or_default().push(pos);
        } else {
            pending.push_back(pos);
        }
    }

    let mut visit_order = Vec::with_capacity(bodies.len());
    while let Some(pos) = pending.pop_front() {
        visit_order.push(pos);
        let own_id = &bodies[pos].meta.id;
        if let Some(kids) = kids_of.get(own_id) {
            pending.extend(kids);
        }
    }
    visit_order
}
/// Build a `var_name → TypeKind` map from a body's optimised SSA + type-fact
/// result. Used by [`analyse_multi_body`] to forward closure-captured types
/// from a parent body into its children, so that bound-variable receiver
/// idioms (`const c = ldap.createClient(...); function f() { c.search(...) }`)
/// pick up `TypeKind::LdapClient` on the inner reference via the
/// [`ssa_transfer::resolve_type_qualified_labels`] receiver scan.
///
/// Conflict policy: if the same `var_name` reaches multiple SSA values with
/// distinct `TypeKind`s the entry is dropped — propagating an ambiguous type
/// into a child body would fabricate facts, while dropping it just falls back
/// to the existing structural resolution paths.
fn extract_named_type_facts(
ssa: &crate::ssa::SsaBody,
type_facts: &crate::ssa::type_facts::TypeFactResult,
) -> HashMap<String, crate::ssa::type_facts::TypeKind> {
use crate::ssa::type_facts::TypeKind;
let mut acc: HashMap<String, TypeKind> = HashMap::new();
let mut conflicts: HashSet<String> = HashSet::new();
for block in &ssa.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
let Some(name) = inst.var_name.as_deref() else {
continue;
};
if conflicts.contains(name) {
continue;
}
let Some(kind) = type_facts.get_type(inst.value) else {
continue;
};
if matches!(kind, TypeKind::Unknown) {
continue;
}
match acc.get(name) {
Some(existing) if existing != kind => {
acc.remove(name);
conflicts.insert(name.to_string());
}
Some(_) => {}
None => {
acc.insert(name.to_string(), kind.clone());
}
}
}
}
acc
}
/// Copy parent-known closure-capture types into a per-body
/// [`crate::ssa::type_facts::TypeFactResult`].
///
/// Scoped lowering ([`crate::ssa::lower_to_ssa_with_params`]) injects a
/// `SsaOp::Param` (or `SsaOp::SelfParam`) at the entry block for every
/// free / closure-captured variable read by the body. The per-body type
/// analysis can only seed declared formal-parameter types (via
/// `BodyMeta.param_types`); free variables are left as `TypeKind::Unknown`
/// because their definition lives in an enclosing body whose SSA is not in
/// scope.
///
/// This pass scans the entry block's instructions and, for each
/// `Param` / `SelfParam` whose name resolves in `parent_var_types`, inserts
/// the matching [`crate::ssa::type_facts::TypeFact`] into `type_facts.facts`.
///
/// Existing facts with a concrete (non-`Unknown`) kind are never replaced, so
/// a type declared on a real formal wins over a same-named parent binding.
/// A missing fact or an `Unknown` placeholder is filled in. The inserted fact
/// is marked `nullable` only when the inherited kind is `TypeKind::Null`.
///
/// Does nothing when `parent_var_types` is empty or the body has no blocks.
fn inject_external_type_facts(
    ssa: &crate::ssa::SsaBody,
    type_facts: &mut crate::ssa::type_facts::TypeFactResult,
    parent_var_types: &HashMap<String, crate::ssa::type_facts::TypeKind>,
) {
    use crate::ssa::ir::SsaOp;
    use crate::ssa::type_facts::{TypeFact, TypeKind};

    if ssa.blocks.is_empty() || parent_var_types.is_empty() {
        return;
    }

    for inst in ssa.blocks[0].body.iter() {
        let is_param = matches!(inst.op, SsaOp::Param { .. } | SsaOp::SelfParam);
        if !is_param {
            continue;
        }

        // `analyze_types_with_param_types` may already have typed this value
        // from `BodyMeta.param_types`; a concrete declaration wins. It also
        // seeds `Unknown` placeholders for unparameterised Param ops, and
        // those must still be overridable.
        let has_concrete_fact = type_facts
            .facts
            .get(&inst.value)
            .is_some_and(|fact| !matches!(fact.kind, TypeKind::Unknown));
        if has_concrete_fact {
            continue;
        }

        let inherited = inst
            .var_name
            .as_deref()
            .and_then(|name| parent_var_types.get(name));
        if let Some(kind) = inherited {
            type_facts.facts.insert(
                inst.value,
                TypeFact {
                    kind: kind.clone(),
                    nullable: matches!(kind, TypeKind::Null),
                },
            );
        }
    }
}
/// Apply entry-kind-derived overrides to a body's `param_types` vector.
///
/// Today only `EntryKind::AppRouteHandler` triggers an override: the first
/// formal of a Next.js App Router handler always carries a Web `Request`,
/// regardless of the user's TypeScript annotation. Returns `Some(vec)` when
/// the override changes the vector, `None` otherwise. Folding the rule into
/// one helper keeps the two consumers (`analyse_body_with_seed` and
/// `lower_all_functions_from_bodies_inner`) in lockstep.
fn entry_kind_param_type_override(
entry_kind: Option<&crate::entry_points::EntryKind>,
param_types: &[Option<crate::ssa::type_facts::TypeKind>],
) -> Option<Vec<Option<crate::ssa::type_facts::TypeKind>>> {
if matches!(
entry_kind,
Some(crate::entry_points::EntryKind::AppRouteHandler { .. })
) {
let mut pt = param_types.to_vec();
if pt.is_empty() {
pt.push(Some(crate::ssa::type_facts::TypeKind::Request));
} else {
pt[0] = Some(crate::ssa::type_facts::TypeKind::Request);
}
Some(pt)
} else {
None
}
}
/// Analyse a single body with an optional parent seed.
///
/// Shared logic extracted from `analyse_multi_body` to avoid deep nesting.
#[allow(clippy::type_complexity)]
fn analyse_body_with_seed(
body: &BodyCfg,
lang: Lang,
namespace: &str,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
interop_edges: &[InteropEdge],
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
ssa_summaries: Option<
&std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
>,
callee_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
inline_cache: Option<&std::cell::RefCell<ssa_transfer::InlineCache>>,
seed: Option<&HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>>,
import_bindings: Option<&crate::cfg::ImportBindings>,
cross_file_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
parent_var_types: Option<&HashMap<String, crate::ssa::type_facts::TypeKind>>,
cross_package_imports: Option<&std::collections::HashMap<String, FuncKey>>,
) -> (
Vec<Finding>,
Option<HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>>,
Option<HashMap<String, crate::ssa::type_facts::TypeKind>>,
) {
let cfg = &body.graph;
let entry = body.entry;
let body_id = body.meta.id;
let interner = SymbolInterner::from_cfg(cfg);
if interner.len() > MAX_TRACKED_VARS {
tracing::warn!(
symbols = interner.len(),
max = MAX_TRACKED_VARS,
"taint analysis: too many variables, some will be ignored"
);
}
// Per-body graphs contain only the body's own nodes.
// For non-toplevel bodies, use lower_to_ssa_with_params with scope to
// create SsaOp::Param ops for external/captured variables and formal
// parameters, required for global_seed to inject taint from the parent.
// Top-level bodies use lower_to_ssa with scope_all=true (no Param ops).
let is_toplevel = body.meta.parent_body_id.is_none();
// JS/TS function bodies always use scoped lowering to create Param ops
// for captured variables (globals that flow via seed between bodies).
// Other languages: scoped lowering only when the parent seed is non-empty,
// i.e. the parent body actually has taint to propagate. Without a seed,
// Param ops would just introduce unused SSA values.
let has_nonempty_seed = seed.is_some_and(|s| !s.is_empty());
// Scoped lowering creates SsaOp::Param ops for formal parameters, required
// for handler-param auto-seeding to fire. Java lambda bodies need this too
// so that `cmd -> Runtime.exec(cmd)` picks up `cmd` as a handler param.
let is_java_lambda =
lang == Lang::Java && body.meta.kind == crate::cfg::BodyKind::AnonymousFunction;
// Java methods tagged with a Spring/JaxRs entry-point annotation need
// scoped lowering so the formal parameters (`@RequestParam String name`,
// `@PathParam Long id`, ...) materialise as `SsaOp::Param` ops that
// the entry-point seeding pass paints as `Source(UserInput)`. Restricted
// to Java because (a) JS/TS already use scoped lowering above, (b) Go
// and Ruby handlers introduce request-OBJECT formals (`r *http.Request`,
// implicit `params`) whose Cap::all() seeding triggers FPs at sinks
// that take the bare object (e.g. `http.Redirect(w, r, safe, code)`
// where `r` is the request, not the URL), and (c) Python free-name
// captures (`request`, `b64decode`) bubble up as synthetic externals
// and shift source attribution. Java methods don't have those
// free-capture shapes (every reference is via explicit qualification),
// so the precision-vs-recall trade lands on the precision side.
let is_java_entry_method = lang == Lang::Java
&& body.meta.kind == crate::cfg::BodyKind::NamedFunction
&& body.meta.func_key.as_ref().is_some_and(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries
.and_then(|m| m.get(&k))
.is_some_and(|s| s.entry_kind.is_some())
});
// Rust framework handlers (axum, actix-web, Rocket) need scoped
// lowering so the typed-extractor formals (`Query<T>`, `Json<T>`,
// `Form<T>`, `Path<T>`) materialise as `SsaOp::Param` ops that the
// entry-point seeding pass paints as `Source(UserInput)`. The
// per-formal seed decision is gated on a recovered `TypeKind` from
// `BodyMeta.param_types`: extractor-wrapped formals get
// `Some(TypeKind::Int|String|Bool|...)` (or a DTO type) via
// `rust_type_to_kind`, while denylist wrappers (`State<T>`,
// `Extension<T>`, `Pool<T>`, ...) and bare primitives stay `None`
// and are skipped at seed time. This keeps DI handles
// server-side without painting the database pool as adversary input.
let is_rust_entry_method = lang == Lang::Rust
&& body.meta.kind == crate::cfg::BodyKind::NamedFunction
&& body.meta.func_key.as_ref().is_some_and(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries.and_then(|m| m.get(&k)).is_some_and(|s| {
matches!(
s.entry_kind,
Some(crate::entry_points::EntryKind::AxumHandler)
| Some(crate::entry_points::EntryKind::ActixHandler)
| Some(crate::entry_points::EntryKind::RocketRoute)
)
})
});
// Python Flask handlers need scoped lowering so the route-bound formal
// parameters (`@app.route("/users/<name>")` + `def view(name):`)
// materialise as `SsaOp::Param` ops the entry-point seeding pass paints
// as `Source(UserInput)`. The per-formal seed decision is gated against
// `BodyMeta.param_route_capture`, so only formals whose names appear as
// path captures in the routing decorator are painted; implicit globals
// (`request`, `g`, `session`) and DI-injected formals stay un-seeded.
// Restricted to Flask (`FlaskRoute`) here because FastAPI / Django
// free-name capture shapes (`request`, `b64decode`) bubble up as
// synthetic externals under scoped lowering and shift source
// attribution, while Flask handlers have all formals = path captures
// (precision lands cleanly).
let is_python_flask_route = lang == Lang::Python
&& body.meta.kind == crate::cfg::BodyKind::NamedFunction
&& body
.meta
.param_route_capture
.iter()
.any(|captured| *captured)
&& body.meta.func_key.as_ref().is_some_and(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries.and_then(|m| m.get(&k)).is_some_and(|s| {
matches!(
s.entry_kind,
Some(crate::entry_points::EntryKind::FlaskRoute { .. })
)
})
});
// Ruby Sinatra route handlers need scoped lowering so the block
// parameters (`get "/u/:name" do |name| ... end`) materialise as
// `SsaOp::Param` ops the entry-point seeding pass paints as
// `Source(UserInput)`. Sinatra body bodies are anonymous (the
// `do_block` AST node has no name field), so `BodyKind` is
// `AnonymousFunction`; the gate accepts both anonymous and named.
// Per-formal seed decision is gated against
// `BodyMeta.param_route_capture`, so only block formals whose
// names appear as `:name` segments in the routing path are
// painted. Block formals not in the capture set fall back to
// existing label rules.
let is_ruby_sinatra_route = lang == Lang::Ruby
&& matches!(
body.meta.kind,
crate::cfg::BodyKind::NamedFunction | crate::cfg::BodyKind::AnonymousFunction
)
&& body
.meta
.param_route_capture
.iter()
.any(|captured| *captured)
&& body.meta.func_key.as_ref().is_some_and(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries.and_then(|m| m.get(&k)).is_some_and(|s| {
matches!(
s.entry_kind,
Some(crate::entry_points::EntryKind::SinatraRoute { .. })
)
})
});
// Python FastAPI / Starlette handlers need scoped lowering so the
// route-bound and typed-extractor formals materialise as `SsaOp::Param`
// ops that the entry-point seeding pass paints as `Source(UserInput)`.
// The per-formal decision in `ssa_transfer` consults BOTH
// `BodyMeta.param_route_capture` (for `{name}` brace-segment captures)
// and `type_facts.get_type(value)` (for `Annotated[T, Path()/Query()/Body()
// /Header()/Cookie()/Form()/File()]` typed extractors). Formals without
// either signal — `db: Session = Depends(get_db)`, `request: Request`,
// bare `session` — stay un-seeded, matching the Hard Rule 3 policy that
// unannotated formals are not adversary input.
//
// Gated on "at least one formal qualifies" to mirror the Flask gate:
// a handler with zero path captures and zero typed extractors gets the
// existing label-rule treatment (free-name captures of `request`,
// `b64decode`, etc. bubble up as synthetic externals without scoped
// lowering shifting attribution).
let is_python_fastapi_route = lang == Lang::Python
&& body.meta.kind == crate::cfg::BodyKind::NamedFunction
&& (body
.meta
.param_route_capture
.iter()
.any(|captured| *captured)
|| body.meta.param_types.iter().any(|t| t.is_some()))
&& body.meta.func_key.as_ref().is_some_and(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries.and_then(|m| m.get(&k)).is_some_and(|s| {
matches!(
s.entry_kind,
Some(crate::entry_points::EntryKind::FastApiRoute { .. })
)
})
});
let use_scoped_lowering = !is_toplevel
&& (matches!(lang, Lang::JavaScript | Lang::TypeScript)
|| has_nonempty_seed
|| is_java_lambda
|| is_java_entry_method
|| is_rust_entry_method
|| is_python_flask_route
|| is_python_fastapi_route
|| is_ruby_sinatra_route);
let ssa_result = if use_scoped_lowering {
let func_name = body.meta.name.clone().unwrap_or_else(|| {
body.meta
.func_key
.as_ref()
.and_then(|k| k.disambig.map(|d| format!("<anon#{d}>")))
.unwrap_or_else(|| format!("<anon@{}>", body.meta.span.0))
});
crate::ssa::lower_to_ssa_with_params(cfg, entry, Some(&func_name), false, &body.meta.params)
} else {
crate::ssa::lower_to_ssa(cfg, entry, None, true)
};
// Clear per-body engine-note collector before the body's analysis;
// any WorklistCapped / OriginsTruncated notes recorded during
// transfer land in this bucket and are attached to every finding
// emitted from the body once analysis is done.
ssa_transfer::reset_body_engine_notes();
match ssa_result {
Ok(mut ssa_body) => {
// App Router handlers carry a Web `Request` as their first
// formal. Override `param_types[0]` so the type-fact pass tags
// the formal as `TypeKind::Request` and receiver-method reads
// (`req.json()`, ...) rewrite to `Request.<method>` for
// type-qualified label resolution.
let body_entry_kind = body.meta.func_key.as_ref().and_then(|k| {
let mut k = k.clone();
k.namespace = namespace.to_string();
ssa_summaries
.and_then(|m| m.get(&k))
.and_then(|s| s.entry_kind.clone())
});
let overridden_param_types =
entry_kind_param_type_override(body_entry_kind.as_ref(), &body.meta.param_types);
let param_types_ref = overridden_param_types
.as_deref()
.unwrap_or(body.meta.param_types.as_slice());
let mut opt = crate::ssa::optimize_ssa_with_param_types(
&mut ssa_body,
cfg,
Some(lang),
param_types_ref,
);
// Forward parent-body type facts onto closure-captured Param ops
// before any consumer reads `opt.type_facts`. This is the lever
// that makes bound-variable receiver idioms work in scoped bodies
// (`let c = ldap.createClient(...); function f() { c.search(...) }`)
// — without it the inner `c` SSA value stays Unknown because the
// per-body type-fact pass cannot see the enclosing definition.
if let Some(pvt) = parent_var_types {
inject_external_type_facts(&ssa_body, &mut opt.type_facts, pvt);
}
if tracing::enabled!(tracing::Level::TRACE) {
tracing::trace!(
func = body.meta.name.as_deref().unwrap_or("<anon>"),
ssa = %ssa_body,
"SSA body lowered",
);
for block in &ssa_body.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
if let Some(t) = opt.type_facts.get_type(inst.value) {
tracing::trace!(value = inst.value.0, ty = ?t, "type fact");
}
}
}
}
let dynamic_pts = std::cell::RefCell::new(std::collections::HashMap::new());
// Static-map abstract analysis: recognises provably-bounded
// lookup idioms (e.g. `map.get(x).unwrap_or("safe")`) so the SSA
// taint engine can clear command-injection findings whose payload
// is a finite set of literal strings.
let static_map =
crate::ssa::static_map::analyze(&ssa_body, cfg, Some(lang), &opt.const_values);
let static_map_opt = if static_map.is_empty() {
None
} else {
Some(static_map)
};
// Per-body field-sensitive points-to facts. Cost is
// amortised across field-write read-back, container ELEM
// cells, and the cross-call resolver.
let pointer_facts = if crate::pointer::is_enabled() {
Some(crate::pointer::analyse_body(&ssa_body, body.meta.id))
} else {
None
};
let transfer = ssa_transfer::SsaTaintTransfer {
lang,
namespace,
interner: &interner,
local_summaries,
global_summaries,
interop_edges,
owner_body_id: body.meta.id,
parent_body_id: body.meta.parent_body_id,
global_seed: seed,
param_seed: None,
receiver_seed: None,
const_values: Some(&opt.const_values),
type_facts: Some(&opt.type_facts),
xml_parser_config: Some(&opt.xml_parser_config),
xpath_config: Some(&opt.xpath_config),
ssa_summaries,
extra_labels,
base_aliases: Some(&opt.alias_result),
callee_bodies,
inline_cache,
context_depth: 0,
callback_bindings: None,
points_to: Some(&opt.points_to),
dynamic_pts: Some(&dynamic_pts),
import_bindings,
promisify_aliases: None,
module_aliases: if opt.module_aliases.is_empty() {
None
} else {
Some(&opt.module_aliases)
},
static_map: static_map_opt.as_ref(),
auto_seed_handler_params: matches!(lang, Lang::JavaScript | Lang::TypeScript)
|| (lang == Lang::Java
&& body.meta.kind == crate::cfg::BodyKind::AnonymousFunction),
cross_file_bodies,
pointer_facts: pointer_facts.as_ref(),
cross_package_imports,
// Phase 10 — Next.js entry-point seeding (looked up
// above when overriding `param_types`).
entry_kind: body_entry_kind,
param_route_capture: if body.meta.param_route_capture.is_empty() {
None
} else {
Some(body.meta.param_route_capture.as_slice())
},
recording_summary: false,
};
let (events, block_states) =
ssa_transfer::run_ssa_taint_full(&ssa_body, cfg, &transfer);
let mut findings = ssa_transfer::ssa_events_to_findings(&events, &ssa_body, cfg);
let body_notes = ssa_transfer::take_body_engine_notes();
for f in &mut findings {
f.body_id = body_id;
for note in &body_notes {
f.merge_note(note.clone());
}
}
if crate::symex::is_enabled() {
let symex_ctx = crate::symex::SymexContext {
ssa: &ssa_body,
cfg,
const_values: &opt.const_values,
type_facts: &opt.type_facts,
global_summaries,
lang,
namespace,
points_to: Some(&opt.points_to),
callee_bodies,
scc_membership: None,
cross_file_bodies: global_summaries,
};
crate::symex::annotate_findings(&mut findings, &symex_ctx);
}
// After forward taint + symex have produced a final
// `Finding.symbolic` shape, run the demand-driven backwards pass
// and layer its verdict on top. Placing this *after* symex
// (which overwrites `symbolic`) preserves any symex witness
// while still annotating `backwards-confirmed` / `-infeasible`
// onto the `cutoff_notes` vector. Gated by
// `analysis.engine.backwards_analysis` (default off).
if crate::utils::analysis_options::current().backwards_analysis {
let bctx = backwards::BackwardsCtx {
ssa: &ssa_body,
cfg,
lang,
global_summaries,
intra_file_bodies: callee_bodies,
depth_budget: backwards::DEFAULT_BACKWARDS_DEPTH,
};
for finding in &mut findings {
let Some(sink_val) = ssa_body.cfg_node_map.get(&finding.sink).copied() else {
continue;
};
let sink_caps = cfg[finding.sink].taint.labels.iter().fold(
crate::labels::Cap::empty(),
|acc, l| match l {
crate::labels::DataLabel::Sink(c) => acc | *c,
_ => acc,
},
);
let caps = if sink_caps.is_empty() {
crate::labels::Cap::all()
} else {
sink_caps
};
let flows =
backwards::analyse_sink_backwards(&bctx, sink_val, finding.sink, caps);
let verdict = backwards::aggregate_verdict(&flows);
backwards::annotate_finding(finding, verdict);
}
}
// Extract exit state for seeding child bodies. Tag every
// entry with the owner body's id so a later join (e.g. the
// JS/TS two-level `combined_exit`) cannot silently alias
// same-named bindings from different bodies.
let exit_state = ssa_transfer::extract_ssa_exit_state(
&block_states,
&ssa_body,
cfg,
&transfer,
body_id,
);
// Snapshot named TypeKinds so child bodies can pick up
// closure-captured types (e.g. an outer `LdapClient` flowing
// into an inner function via free-variable read).
let named_types = extract_named_type_facts(&ssa_body, &opt.type_facts);
let named_types = if named_types.is_empty() {
None
} else {
Some(named_types)
};
(findings, Some(exit_state), named_types)
}
Err(e) => {
// SSA lowering produced no analyzable body. We still surface
// the event so downstream tooling can tell "we tried and gave
// up" from "we ran clean", a TRACE-level log records the
// reason (no synthetic Finding is manufactured because a
// diag pointing at no source location would be misleading).
tracing::trace!(
body_id = body_id.0,
body_name = ?body.meta.name,
error = %e,
"SSA lowering bailed; emitting engine note",
);
ssa_transfer::record_engine_note(crate::engine_notes::EngineNote::SsaLoweringBailed {
reason: format!("{e}"),
});
// Drain the collector so the note does not bleed into the
// next body (which will call reset on entry, but be explicit).
let _ = ssa_transfer::take_body_engine_notes();
(Vec::new(), None, None)
}
}
}
/// Unified multi-body taint analysis with lexical containment propagation.
///
/// Pass 1: process all bodies in containment-topological order (parent before
/// child), seeding each child body with its parent's exit state.
///
/// Pass 2 (JS/TS only, `max_iterations > 1`): iterative convergence for
/// functions that modify global state, feeding taint back to other functions.
fn analyse_multi_body(
file_cfg: &FileCfg,
lang: Lang,
namespace: &str,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
interop_edges: &[InteropEdge],
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
ssa_summaries: Option<
&std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
>,
callee_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
inline_cache: Option<&std::cell::RefCell<ssa_transfer::InlineCache>>,
max_iterations: usize,
import_bindings: Option<&crate::cfg::ImportBindings>,
cross_file_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
cross_package_imports: Option<&std::collections::HashMap<String, FuncKey>>,
) -> Vec<Finding> {
let order = containment_order(&file_cfg.bodies);
let mut all_findings: Vec<Finding> = Vec::new();
// Exit states per body, used to seed children.
let mut body_exit_states: HashMap<
BodyId,
HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
> = HashMap::new();
// Per-body `var_name → TypeKind` snapshots, used to forward closure-
// captured types from parent bodies into their children's type-fact
// results. Only populated when a body produces a non-empty set of
// typed named values, i.e. it has at least one named SSA value with
// a concrete `TypeKind` after optimisation.
let mut body_var_types: HashMap<BodyId, HashMap<String, crate::ssa::type_facts::TypeKind>> =
HashMap::new();
// ── Pass 1: lexical containment propagation ──────────────────────
for &idx in &order {
let body = &file_cfg.bodies[idx];
// Determine seed from parent body's exit state.
let parent_seed = body
.meta
.parent_body_id
.and_then(|pid| body_exit_states.get(&pid));
let parent_var_types = body
.meta
.parent_body_id
.and_then(|pid| body_var_types.get(&pid));
let (findings, exit_state, var_types) = analyse_body_with_seed(
body,
lang,
namespace,
local_summaries,
global_summaries,
interop_edges,
extra_labels,
ssa_summaries,
callee_bodies,
inline_cache,
parent_seed,
import_bindings,
cross_file_bodies,
parent_var_types,
cross_package_imports,
);
tracing::debug!(
body_id = body.meta.id.0,
body_name = ?body.meta.name,
findings = findings.len(),
graph_nodes = body.graph.node_count(),
has_seed = parent_seed.is_some(),
"analyse_multi_body: body analysed"
);
all_findings.extend(findings);
if let Some(es) = exit_state {
body_exit_states.insert(body.meta.id, es);
}
if let Some(vt) = var_types {
body_var_types.insert(body.meta.id, vt);
}
}
// ── Pass 2: JS/TS iterative convergence ──────────────────────────
// Only for JS/TS: functions that modify global variables can feed taint
// back to other functions. Iterate until the top-level seed stabilises.
//
// `iters_used` counts how many rounds of the convergence loop
// actually ran (not including the initial lexical-containment pass
// above). It is used to detect cap-hit after the loop exits: a
// cap-hit is the case where we exhausted the budget without the
// `combined_exit == current_seed` break firing.
let mut converged_early = true;
let mut iters_used: usize = 0;
// Trajectory of per-round seed-delta sizes; populated inside the
// max_iterations > 1 branch and read on cap-hit. Default empty
// → classifier returns `Unknown`, which is the correct outcome
// for non-JS/TS languages (no iterative loop ran).
let mut convergence_trajectory: smallvec::SmallVec<[u32; 4]> = smallvec::SmallVec::new();
if max_iterations > 1 {
let top = file_cfg.toplevel();
let top_cfg = &top.graph;
// Collect top-level binding keys for seed filtering. Always
// keyed under `BodyId(0)`, `filter_seed_to_toplevel` matches
// by name and re-keys every surviving entry to `BodyId(0)`
// anyway, so the body_id on the probe keys is informational.
let toplevel_keys: HashSet<ssa_transfer::BindingKey> = {
let mut keys = HashSet::new();
for (_idx, info) in top_cfg.node_references() {
if let Some(ref d) = info.taint.defines {
keys.insert(ssa_transfer::BindingKey::new(d.as_str(), BodyId(0)));
}
for u in &info.taint.uses {
keys.insert(ssa_transfer::BindingKey::new(u.as_str(), BodyId(0)));
}
}
keys
};
// Phase-B (body granularity): precompute per-body read-set of
// top-level binding names. A non-toplevel body only needs
// re-analysis when a name it reads via Param or via the
// global_seed ancestor-lookup path has actually changed in
// the combined seed. `reads` is a superset of the body's
// top-level dependencies, we err on the side of over-running
// (false dirty) rather than missing a dependency.
let body_reads: HashMap<BodyId, HashSet<String>> = {
let mut m: HashMap<BodyId, HashSet<String>> = HashMap::new();
for body in &file_cfg.bodies {
if body.meta.parent_body_id.is_none() {
continue; // top-level has no global_seed lookups
}
let mut names: HashSet<String> = HashSet::new();
for (_idx, info) in body.graph.node_references() {
for u in &info.taint.uses {
names.insert(u.to_string());
}
}
m.insert(body.meta.id, names);
}
m
};
// Initial seed is the top-level exit state.
let mut current_seed = body_exit_states
.get(&BodyId(0))
.cloned()
.unwrap_or_default();
// Phase-B per-body findings cache: retains the most-recent
// round's findings for each body. Round N re-runs only dirty
// bodies; clean bodies keep their round N-1 findings. This
// replaces the previous "drop all non-toplevel findings, run
// everything, repeat" pattern.
let mut findings_by_body: HashMap<BodyId, Vec<Finding>> = HashMap::new();
// Seed the cache with the pass-1 findings so round 0 of the
// worklist has a consistent starting state. We partition
// `all_findings` into "toplevel" (kept verbatim) and
// "non-toplevel" (moved into the cache, keyed by body).
let mut toplevel_findings: Vec<Finding> = Vec::new();
for f in std::mem::take(&mut all_findings) {
let body = file_cfg.bodies.get(f.body_id.0 as usize);
if body.is_some_and(|b| b.meta.parent_body_id.is_none()) {
toplevel_findings.push(f);
} else {
findings_by_body
.entry(BodyId(f.body_id.0))
.or_default()
.push(f);
}
}
let rounds = max_iterations.saturating_sub(1);
converged_early = rounds == 0;
let use_gauss_seidel = js_ts_gauss_seidel_enabled();
for round in 0..rounds {
iters_used = round + 1;
// Combine function body exits filtered to top-level scope.
let mut combined_exit = current_seed.clone();
for &idx in &order {
let body = &file_cfg.bodies[idx];
if body.meta.parent_body_id.is_none() {
continue; // skip top-level itself
}
if let Some(es) = body_exit_states.get(&body.meta.id) {
let filtered = ssa_transfer::filter_seed_to_toplevel(es, &toplevel_keys);
combined_exit = ssa_transfer::join_seed_maps(&combined_exit, &filtered);
}
}
// Record seed-delta for cap-hit classification. Count the
// number of keys whose value differs between current_seed
// and combined_exit. This mirrors scan.rs's diff helpers
// but at BindingKey granularity.
let iter_delta = seed_delta_size(&current_seed, &combined_exit);
if convergence_trajectory.len() == 4 {
convergence_trajectory.remove(0);
}
convergence_trajectory.push(iter_delta as u32);
// Converged: seed didn't change.
if combined_exit == current_seed {
converged_early = true;
break;
}
// Phase-B: compute which binding names changed so we can
// skip bodies whose read-set is disjoint from the change
// set.
let changed_names = changed_binding_names(&current_seed, &combined_exit);
current_seed = combined_exit;
// Re-run non-toplevel bodies with updated seed.
body_exit_states.insert(BodyId(0), current_seed.clone());
// Phase-C: Gauss-Seidel variant, as each body is
// re-analysed, merge its new exit into `current_seed`
// immediately so subsequent bodies in the same round see
// the fresh value. Order matters here; we pin to
// `order` (containment-topological) for reproducibility.
// The Jacobi path leaves `current_seed` untouched for
// the rest of the round.
for &idx in &order {
let body = &file_cfg.bodies[idx];
if body.meta.parent_body_id.is_none() {
continue; // don't re-run top-level
}
// Skip clean bodies: nothing this body reads has
// changed, so re-analysis would produce byte-identical
// output. The cached findings from the previous
// round (or pass-1) remain correct.
if let Some(reads) = body_reads.get(&body.meta.id) {
if reads.is_disjoint(&changed_names) {
continue;
}
}
let parent_seed = body
.meta
.parent_body_id
.and_then(|pid| body_exit_states.get(&pid));
let parent_var_types = body
.meta
.parent_body_id
.and_then(|pid| body_var_types.get(&pid));
let (findings, exit_state, var_types) = analyse_body_with_seed(
body,
lang,
namespace,
local_summaries,
global_summaries,
interop_edges,
extra_labels,
ssa_summaries,
callee_bodies,
inline_cache,
parent_seed,
import_bindings,
cross_file_bodies,
parent_var_types,
cross_package_imports,
);
// Phase-B: replace (not append) this body's findings
// in the cache. Previous rounds' findings for this
// body are superseded by the new round's output.
findings_by_body.insert(body.meta.id, findings);
if let Some(vt) = var_types {
body_var_types.insert(body.meta.id, vt);
}
if let Some(es) = exit_state {
// Phase-C Gauss-Seidel: immediately publish this
// body's filtered exit into `current_seed` and
// `body_exit_states[BodyId(0)]` so the next body
// in this same round sees the updated seed via
// its `global_seed` ancestor lookup.
if use_gauss_seidel {
let filtered = ssa_transfer::filter_seed_to_toplevel(&es, &toplevel_keys);
current_seed = ssa_transfer::join_seed_maps(&current_seed, &filtered);
body_exit_states.insert(BodyId(0), current_seed.clone());
}
body_exit_states.insert(body.meta.id, es);
}
}
}
// After the loop, flatten per-body caches back into
// `all_findings`, preserving the toplevel findings we set
// aside earlier.
all_findings = toplevel_findings;
for body in &file_cfg.bodies {
if body.meta.parent_body_id.is_none() {
continue;
}
if let Some(fs) = findings_by_body.remove(&body.meta.id) {
all_findings.extend(fs);
}
}
}
// Record observability counter. `iters_used == 0` covers the
// non-JS/TS path (`max_iterations == 1`) and the JS/TS case where
// the convergence loop did not enter, report `1` so the counter
// always reflects "at least the lexical-containment pass ran".
let reported_iters = if iters_used == 0 { 1 } else { iters_used };
LAST_JS_TS_PASS2_ITERATIONS.store(reported_iters, Ordering::Relaxed);
// Convergence telemetry: record this file's pass-2 outcome.
// No-op unless `NYX_CONVERGENCE_TELEMETRY=1`. Only emitted for
// JS/TS (`max_iterations > 1`) where a pass-2 loop actually ran;
// single-iteration languages do not produce a convergence event.
if max_iterations > 1 {
let non_toplevel_bodies = file_cfg
.bodies
.iter()
.filter(|b| b.meta.parent_body_id.is_some())
.count();
crate::convergence_telemetry::record(
crate::convergence_telemetry::ConvergenceEvent::InFilePass2(
crate::convergence_telemetry::InFilePass2Record {
schema: crate::convergence_telemetry::SCHEMA_VERSION,
namespace: namespace.to_string(),
body_count: non_toplevel_bodies,
iterations: iters_used,
cap: max_iterations,
converged: converged_early,
trajectory: convergence_trajectory.clone(),
},
),
);
}
// Cap-hit: the loop exhausted `max_iterations` without the
// `combined_exit == current_seed` break firing. Tag every finding
// produced by this file so downstream consumers know the results
// may be under-reported. Only meaningful for JS/TS
// (`max_iterations > 1`); single-iteration languages always
// converge trivially by definition.
if max_iterations > 1 && !converged_early {
// Trajectory is captured in the convergence loop above; empty
// when the loop never entered the delta-push path (rounds ==
// 0, non-JS/TS, etc.). Classifier defaults to `Unknown` for
// <2 samples.
let reason = crate::engine_notes::CapHitReason::classify(&convergence_trajectory);
tracing::warn!(
file = %namespace,
iterations = iters_used,
cap = max_iterations,
reason = reason.tag(),
"JS/TS pass-2 in-file fixpoint did not converge within safety cap — \
results may be imprecise. This usually indicates a very deep chain \
of top-level bindings threaded through helper functions; please \
file a bug with a reproducer."
);
let note = EngineNote::InFileFixpointCapped {
iterations: iters_used as u32,
reason,
};
for f in &mut all_findings {
f.merge_note(note.clone());
}
}
all_findings
}
/// Collect the binding **names** whose taint differs between `before`
/// and `after`.
///
/// A name is reported when its key is new in `after`, has a different
/// value in `after`, or was present in `before` but is gone from `after`.
/// The Phase-B body-level worklist uses the result to pick which
/// non-toplevel bodies need another run.
///
/// The result is deliberately a set of plain names rather than full
/// `BindingKey`s: `filter_seed_to_toplevel` re-keys every surviving entry
/// to `BodyId(0)`, and per-body read sets are identifier strings from the
/// SSA IR, so comparing by name avoids spurious mismatches between
/// differently-scoped keys for the same binding.
fn changed_binding_names(
    before: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
    after: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
) -> HashSet<String> {
    // Entries of `after` that are either new or carry a different value.
    let added_or_altered = after
        .iter()
        .filter(|(key, new_taint)| before.get(*key) != Some(*new_taint))
        .map(|(key, _)| key.name.to_string());
    // Entries that existed in `before` but were dropped from `after`.
    let removed = before
        .keys()
        .filter(|key| !after.contains_key(*key))
        .map(|key| key.name.to_string());
    added_or_altered.chain(removed).collect()
}
/// Number of [`ssa_transfer::BindingKey`]s on which the two seed maps
/// disagree.
///
/// A key contributes one to the total when it is new in `after`, maps to
/// a different `VarTaint` in `after`, or exists only in `before`.
fn seed_delta_size(
    before: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
    after: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
) -> usize {
    // Keys of `after` that are missing from `before` or whose value moved.
    let added_or_altered = after
        .iter()
        .filter(|(key, new_taint)| before.get(*key) != Some(*new_taint))
        .count();
    // Keys that only `before` knows about.
    let removed = before
        .keys()
        .filter(|key| !after.contains_key(*key))
        .count();
    added_or_altered + removed
}
/// Find function entry nodes: (func_name, entry_node) pairs.
///
/// A function entry is the first node with a given `enclosing_func` value.
fn find_function_entries(cfg: &Cfg) -> Vec<(String, NodeIndex)> {
let mut seen = HashSet::new();
let mut entries = Vec::new();
for (idx, info) in cfg.node_references() {
if let Some(ref func_name) = info.ast.enclosing_func
&& seen.insert(func_name.clone())
{
entries.push((func_name.clone(), idx));
}
}
entries
}
/// Fetch the formal parameter names recorded for `func_name` in the
/// CFG-level local summaries.
///
/// Matching is by bare function name only: the `param_names` of the first
/// entry whose key name equals `func_name` are cloned and returned. When no
/// entry matches, the result is an empty vector.
fn lookup_formal_params(local_summaries: &FuncSummaries, func_name: &str) -> Vec<String> {
    for (key, summary) in local_summaries.iter() {
        if key.name == func_name {
            return summary.param_names.clone();
        }
    }
    Vec::new()
}
/// Turn a bare function name plus parameter count into a canonical
/// [`FuncKey`], using the FuncKey-keyed `local_summaries` as the source of
/// truth.
///
/// Resolution is tiered:
/// 1. If exactly one key matches on `lang`, name and arity, that key is
///    returned with its full identity (container / disambig / kind).
/// 2. Otherwise the first key that matches on `lang` and name alone is
///    returned.
/// 3. Otherwise a free-function key is synthesised from the arguments so the
///    caller always receives a well-formed key.
///
/// Tiers 2 and 3 are only reached from legacy discovery paths that cannot
/// tell same-name siblings apart; those paths were already collision-prone.
/// New intra-file analysis code should prefer [`BodyMeta::func_key`].
fn lookup_canonical_func_key(
    local_summaries: &FuncSummaries,
    lang: Lang,
    namespace: &str,
    func_name: &str,
    param_count: usize,
) -> FuncKey {
    // `local_summaries` is file-local, so every stored key already carries
    // the namespace `build_cfg` wrote (raw file path). The caller's
    // `namespace` is therefore only consulted when a key has to be
    // synthesised as a last resort.
    let wanted_arity = Some(param_count);
    let same_name = |key: &FuncKey| key.lang == lang && key.name == func_name;

    // Tier 1: a unique (lang, name, arity) match.
    let mut arity_hits = local_summaries
        .keys()
        .filter(|key| same_name(*key) && key.arity == wanted_arity);
    if let Some(candidate) = arity_hits.next() {
        if arity_hits.next().is_none() {
            return candidate.clone();
        }
    }

    // Tier 2: any (lang, name) match; tier 3: synthesise a free function.
    match local_summaries.keys().find(|key| same_name(*key)) {
        Some(by_name) => by_name.clone(),
        None => FuncKey {
            lang,
            namespace: namespace.to_string(),
            container: String::new(),
            name: func_name.to_string(),
            arity: Some(param_count),
            disambig: None,
            kind: FuncKind::Function,
        },
    }
}
/// Build precise SSA function summaries for every function found in a
/// single-graph `cfg`.
///
/// Each function is lowered to SSA on its own, has its constant dead
/// branches folded, and is then probed parameter by parameter to produce an
/// `SsaFuncSummary`. Functions that fail to lower are skipped, and only
/// summaries with at least one observable effect are kept. The returned map
/// is keyed by canonical [`FuncKey`] so that same-name functions on
/// different containers in the same file get distinct entries.
#[allow(dead_code)] // Used by tests; production code uses extract_ssa_artifacts
pub(crate) fn extract_intra_file_ssa_summaries(
    cfg: &Cfg,
    interner: &SymbolInterner,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
) -> std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary> {
    let entries = find_function_entries(cfg);
    let mut collected = std::collections::HashMap::new();

    for (func_name, entry_node) in &entries {
        let formals = lookup_formal_params(local_summaries, func_name);
        let Ok(mut ssa_body) = crate::ssa::lower_to_ssa_with_params(
            cfg,
            *entry_node,
            Some(func_name),
            false,
            &formals,
        ) else {
            continue;
        };

        // Same preparation as the `_from_bodies` path: drop dead constant
        // branches before probing (see `prefold_dead_branches_for_summary`).
        prefold_dead_branches_for_summary(&mut ssa_body, cfg);

        // The formal list is authoritative even when empty. SSA lowering
        // also emits Param ops for external captures, and counting those as
        // arity would make zero-arg functions look like synthetic overloads.
        let arity = formals.len();

        // Zero-parameter functions are still probed: a zero-arg factory
        // (`function makeBag() { return []; }`) has one observable
        // cross-file effect, a fresh container allocation on return. Whether
        // the result is kept is decided by the observable-effects check
        // below.
        //
        // Module aliases for JS/TS come from a read-only const-prop pass.
        let aliases = compute_module_aliases_for_summary(&ssa_body, lang);
        let aliases_opt = (!aliases.is_empty()).then_some(&aliases);

        let summary = ssa_transfer::extract_ssa_func_summary(
            &ssa_body,
            cfg,
            local_summaries,
            global_summaries,
            lang,
            namespace,
            interner,
            arity,
            aliases_opt,
            None,
            Some(&formals),
            None,
            None,
        );

        // Keep the summary only when it says something. A void helper whose
        // sole effect is a parameter-to-parameter alias
        // (e.g. `fn set(t, v) { t.x = v; }`) must survive so cross-file call
        // sites can replay the alias edges; that case, and zero-param
        // factories flagged `returns_fresh_alloc`, are covered by the
        // `points_to` leg, whose `is_empty()` is false when the fresh-alloc
        // flag is set.
        let has_observable_effect = !summary.param_to_return.is_empty()
            || !summary.param_to_sink.is_empty()
            || !summary.source_caps.is_empty()
            || !summary.param_container_to_return.is_empty()
            || !summary.param_to_container_store.is_empty()
            || summary.return_abstract.is_some()
            || !summary.points_to.is_empty();
        if !has_observable_effect {
            continue;
        }

        let key = lookup_canonical_func_key(local_summaries, lang, namespace, func_name, arity);
        collected.insert(key, summary);
    }

    if !collected.is_empty() {
        tracing::debug!(
            count = collected.len(),
            "SSA summary extraction: produced intra-file summaries"
        );
    }
    collected
}
/// Prune definite-constant dead branches on a freshly-lowered body *before*
/// its interprocedural summary is extracted.
///
/// Summary extraction ([`ssa_transfer::extract_ssa_func_summary`]) runs on the
/// pre-optimisation SSA, so without this a helper whose body returns a constant
/// only because a dead `else x = param` branch is never taken would still emit
/// a `param → return` transform — re-tainting the caller's `bar =
/// helper(param)` and defeating the in-body branch fold. Only
/// [`crate::ssa::const_prop::fold_constant_branches`] is applied (no copy-prop /
/// DCE), so the change is limited to provably-dead arithmetic-comparison
/// branches; the body's value numbering is otherwise untouched.
fn prefold_dead_branches_for_summary(func_ssa: &mut crate::ssa::SsaBody, cfg: &crate::cfg::Cfg) {
    // Constant propagation only supplies the value table; the fold is the
    // single mutation performed on `func_ssa` by this function.
    let cp = crate::ssa::const_prop::const_propagate(func_ssa);
    crate::ssa::const_prop::fold_constant_branches(func_ssa, cfg, &cp.values);
}

/// Lower all function bodies from `FileCfg` to produce SSA summaries + cached
/// bodies. Each body's own graph is used directly, no scope filtering needed.
///
/// Both returned maps are keyed by each body's canonical [`FuncKey`] (carried
/// on [`crate::cfg::BodyMeta::func_key`]). This is the most collision-
/// resistant identity we have: same-name methods on different classes, same-
/// name overloads with different arity, and anonymous bodies at distinct
/// source spans all get distinct keys.
///
/// This entry point only sets up context: it runs
/// `lower_all_functions_from_bodies_inner` inside the
/// `with_file_imports(Some(&file_cfg.local_imports), ..)` and
/// `with_safe_lookup_fields(Some(&file_cfg.safe_lookup_fields), ..)` scopes
/// and returns the inner result unchanged.
#[allow(clippy::too_many_arguments)]
pub(crate) fn lower_all_functions_from_bodies(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
    scan_root: Option<&str>,
    module_graph: Option<&crate::resolve::ModuleGraph>,
) -> (
    std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) {
    crate::ssa::type_facts::with_file_imports(Some(&file_cfg.local_imports), || {
        crate::cfg::safe_fields::with_safe_lookup_fields(Some(&file_cfg.safe_lookup_fields), || {
            lower_all_functions_from_bodies_inner(
                file_cfg,
                lang,
                namespace,
                local_summaries,
                global_summaries,
                locator,
                scan_root,
                module_graph,
            )
        })
    })
}
/// Worker behind [`lower_all_functions_from_bodies`]: lowers every function
/// body in `file_cfg` to SSA and returns `(summaries, bodies)`, both keyed by
/// `FuncKey` with the namespace replaced by the caller-supplied `namespace`.
///
/// Per body, in order: lower to SSA (bodies that fail to lower are skipped
/// entirely), fold constant dead branches, extract the per-parameter summary,
/// run `optimize_ssa_with_param_types`, record typed call receivers and (when
/// the pointer analysis is enabled) field points-to facts, then cache the
/// resulting SSA body. After the per-body loop two file-wide passes run over
/// the collected maps: `augment_summaries_with_child_sinks` followed by
/// `rerun_extraction_with_augmented_summaries`.
///
/// Elapsed time of each stage is reported through `perf_lower_record` under
/// slots 0 (lowering), 1 (summary extraction), 2 (optimisation), 3 (typed
/// receivers + field points-to), 4 (child-sink augmentation),
/// 5 (re-extraction) and 6 (remaining per-body bookkeeping).
#[allow(clippy::too_many_arguments)]
fn lower_all_functions_from_bodies_inner(
file_cfg: &FileCfg,
lang: Lang,
namespace: &str,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
locator: Option<&crate::summary::SinkSiteLocator<'_>>,
scan_root: Option<&str>,
module_graph: Option<&crate::resolve::ModuleGraph>,
) -> (
std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) {
let mut summaries = std::collections::HashMap::new();
let mut bodies = std::collections::HashMap::new();
// Build the file's cross-package import map once and share it
// across every body produced from this file. The map mirrors what
// `analyse_file_with_lowered` builds at pass-2 entry, but storing
// it on each `CalleeSsaBody` lets the inline-analysis frame inside
// another file resolve the callee's local import names against
// the callee's own package boundary (Phase 09 step 0.7) instead of
// skipping the lookup entirely.
let cross_package_imports_arc = {
let map = build_cross_package_func_keys(
&file_cfg.resolved_imports,
scan_root,
module_graph,
lang,
);
std::sync::Arc::new(map)
};
for body in file_cfg.function_bodies() {
let _t_misc = std::time::Instant::now();
// Anonymous bodies get a synthetic name: `<anon#N>` from the key's
// disambiguator when one is present, otherwise `<anon@S>` built from
// the first component of the body's span.
let func_name = body.meta.name.clone().unwrap_or_else(|| {
body.meta
.func_key
.as_ref()
.and_then(|k| k.disambig.map(|d| format!("<anon#{d}>")))
.unwrap_or_else(|| format!("<anon@{}>", body.meta.span.0))
});
let interner = SymbolInterner::from_cfg(&body.graph);
let formal_params = &body.meta.params;
perf_lower_record(6, _t_misc.elapsed().as_micros());
let _t_lower = std::time::Instant::now();
// A body that fails to lower contributes neither a summary nor a
// cached body; the error value itself is discarded.
let mut func_ssa = match crate::ssa::lower_to_ssa_with_params(
&body.graph,
body.entry,
Some(&func_name),
false,
formal_params,
) {
Ok(ssa) => ssa,
Err(_) => continue,
};
perf_lower_record(0, _t_lower.elapsed().as_micros());
// Prune dead constant branches before the summary probe so a helper's
// dead `else x = param` does not surface as a spurious param→return.
prefold_dead_branches_for_summary(&mut func_ssa, &body.graph);
// Declared formals win; with no declared formals the count falls back
// to the number of `Param` ops present in the lowered SSA.
// NOTE(review): `extract_intra_file_ssa_summaries` treats an empty
// formal list as authoritative because Param ops also cover external
// captures — confirm this fallback is intended here.
let param_count = if !formal_params.is_empty() {
formal_params.len()
} else {
func_ssa
.blocks
.iter()
.flat_map(|b| b.phis.iter().chain(b.body.iter()))
.filter(|i| matches!(i.op, crate::ssa::ir::SsaOp::Param { .. }))
.count()
};
// Canonical FuncKey: prefer the identity attached to the body at
// CFG-construction time; otherwise fall back to matching in
// `local_summaries`.
//
// `body.meta.func_key` carries the raw file-path namespace that
// `build_cfg` wrote. The caller passes `namespace` already normalized
// against `scan_root`, which is what FuncSummary keys use on the
// cross-file side (`FuncSummary::func_key`). Overriding the namespace
// here keeps both sides of `GlobalSummaries` in agreement; otherwise
// `resolve_callee` resolves to the normalized FuncSummary key and
// misses the raw-path SSA entry.
let mut key = body.meta.func_key.clone().unwrap_or_else(|| {
lookup_canonical_func_key(local_summaries, lang, namespace, &func_name, param_count)
});
key.namespace = namespace.to_string();
// Run the extractor even for zero-param functions so factories
// (`returns_fresh_alloc = true`) emit a summary the caller can
// replay. A completely empty summary is still inserted for
// non-zero-param functions (see the rationale at the insertion site
// below), but zero-param cases without the factory flag stay out of
// the map to avoid cluttering `GlobalSummaries` with trivially-empty
// entries.
{
let _t_extract = std::time::Instant::now();
let mod_aliases = compute_module_aliases_for_summary(&func_ssa, lang);
let mod_aliases_ref = if mod_aliases.is_empty() {
None
} else {
Some(&mod_aliases)
};
// Empty metadata vectors are passed as `None` rather than as empty
// slices.
let formal_destructured = if !body.meta.param_destructured_fields.is_empty() {
Some(body.meta.param_destructured_fields.as_slice())
} else {
None
};
let param_types_ref = if !body.meta.param_types.is_empty() {
Some(body.meta.param_types.as_slice())
} else {
None
};
let summary = ssa_transfer::extract_ssa_func_summary(
&func_ssa,
&body.graph,
local_summaries,
global_summaries,
lang,
namespace,
&interner,
param_count,
mod_aliases_ref,
locator,
Some(formal_params),
formal_destructured,
param_types_ref,
);
// Phase 10 — annotate entry-point summaries. The pass-2
// taint engine reads `entry_kind` to seed the function's
// formals as `TaintOrigin::Source` at SSA entry, mirroring
// an HTTP handler's adversary-controlled inputs. Always
// recorded even on empty summaries so caller-side resolution
// sees the entry classification through cross-file lookups.
let mut summary = summary;
summary.entry_kind = file_cfg.entry_kinds.get(&body.meta.span).cloned();
// Always insert the summary for functions with parameters, even when
// all fields are empty/default. An empty summary tells resolve_callee
// "this function exists and has no taint effects", preventing
// fallthrough to the less precise old FuncSummary, which may report
// false source_caps from internal sources.
// For zero-param functions we only insert when the summary carries
// the fresh-container signal (the only observable effect worth
// persisting for a parameter-less body).
//
// An entry-kind tag also keeps the summary in the map even
// for zero-param entry points so cross-file resolvers see it.
if param_count > 0
|| summary.points_to.returns_fresh_alloc
|| summary.entry_kind.is_some()
{
summaries.insert(key.clone(), summary);
}
perf_lower_record(1, _t_extract.elapsed().as_micros());
}
let _t_opt = std::time::Instant::now();
// Override `param_types[0]` for entry-kind-tagged formals (e.g. App
// Router handlers receive a Web `Request`). Other entry kinds keep
// the ambient param-type vector unchanged. See
// `entry_kind_param_type_override` for the full rule set.
let entry_kind_for_body = file_cfg.entry_kinds.get(&body.meta.span);
let overridden_param_types =
entry_kind_param_type_override(entry_kind_for_body, &body.meta.param_types);
let param_types_ref = overridden_param_types
.as_deref()
.unwrap_or(body.meta.param_types.as_slice());
// Optimisation runs after summary extraction, so the summary above was
// computed on the pre-optimisation (branch-folded) SSA.
let opt = crate::ssa::optimize_ssa_with_param_types(
&mut func_ssa,
&body.graph,
Some(lang),
param_types_ref,
);
perf_lower_record(2, _t_opt.elapsed().as_micros());
let _t_typed = std::time::Instant::now();
// For every SSA method call, look up the receiver's TypeKind
// and record `(call_ordinal, container_name)` so devirtualisation
// in `build_call_graph` can narrow the edge to the receiver-typed
// container. Free-function calls and unknown types fall back to
// bare-name resolution.
let typed_receivers = collect_typed_call_receivers(&func_ssa, &body.graph, &opt.type_facts);
if !typed_receivers.is_empty() {
// Zero-param/no-fresh-alloc bodies are skipped above;
// force-insert (default summary) so receiver-type info still
// reaches build_call_graph.
let entry = summaries.entry(key.clone()).or_default();
entry.typed_call_receivers = typed_receivers;
}
// Populate `field_points_to` from the body's pointer facts.
// `extract_field_points_to` covers both reads (FieldProj walks)
// and writes (`field_writes` side-table) in one pass.
if crate::pointer::is_enabled() {
let facts = crate::pointer::analyse_body(&func_ssa, body.meta.id);
let fpt = crate::pointer::extract_field_points_to(&func_ssa, &facts);
if !fpt.is_empty() {
// Same force-insert as for typed receivers: creates a default
// summary when the body was not inserted above.
let entry = summaries.entry(key.clone()).or_default();
entry.field_points_to = fpt;
}
}
perf_lower_record(3, _t_typed.elapsed().as_micros());
let _t_misc2 = std::time::Instant::now();
// Cache the SSA body (as left by the optimiser) together with a clone
// of its graph and a shared handle to the file's cross-package import
// map. `node_meta` starts out empty here.
bodies.insert(
key,
ssa_transfer::CalleeSsaBody {
ssa: func_ssa,
opt,
param_count,
node_meta: std::collections::HashMap::new(),
body_graph: Some(body.graph.clone()),
cross_package_imports: std::sync::Arc::clone(&cross_package_imports_arc),
},
);
perf_lower_record(6, _t_misc2.elapsed().as_micros());
}
// ── Closure-capture summary augmentation ─────────────────────────
//
// Lift child-body sinks into the parent's `param_to_sink` for
// every parent body with lexically contained children. This
// handles the direct-wrapper case
// `f(x) { return new Promise((res, rej) => sink(x)) }`: the
// executor's gated http.get sink becomes visible to callers of
// `f` via `f.summary.param_to_sink`.
//
// Without this pass, `f.summary.param_to_sink` stays empty
// because the sink lives in a separately-extracted child body
// that the parent's pass-1 probe never sees. The
// lexical-containment propagation in `analyse_multi_body`
// carries seeded taint into child bodies for the production
// analysis path, but the single-body summary extractor in
// `extract_ssa_func_summary` does not. This pass reproduces that
// propagation at summary-extraction time so cross-call
// resolution sees the sink at every caller of `f`.
//
// Strict-additive: only ADDs `param_to_sink` entries, never
// removes or modifies existing data, so it cannot regress
// detection. Bounded: each parent-param probe runs each child
// body's analysis exactly once.
let _t_aug = std::time::Instant::now();
augment_summaries_with_child_sinks(
file_cfg,
lang,
namespace,
local_summaries,
global_summaries,
&bodies,
&mut summaries,
);
perf_lower_record(4, _t_aug.elapsed().as_micros());
// ── Second extraction pass: transitive cross-function summary lift ───
//
// The augment pass populates direct sink-wrapper summaries
// (`f(x) { Promise(() => sink(x)) }`). This second pass then
// re-runs every body's per-parameter probe with the augmented
// `summaries` map plumbed through to the probe transfer's
// `ssa_summaries` field, so callers of those wrappers (e.g. an
// `addFileDataIfNeeded` whose body calls a `downloadFileFromURI`
// sink wrapper) see the augmented `param_to_sink` at step 0 of
// `resolve_callee_full` and propagate it onto their own summary.
//
// OR-merge: only adds `param_to_sink` / `param_to_sink_param`
// entries to existing summaries. Existing entries (return
// transforms, source caps, augment-populated sinks, etc.) are
// preserved. Strict-additive, so it cannot regress detection.
let _t_rerun = std::time::Instant::now();
rerun_extraction_with_augmented_summaries(
file_cfg,
lang,
namespace,
local_summaries,
global_summaries,
locator,
&bodies,
&mut summaries,
);
perf_lower_record(5, _t_rerun.elapsed().as_micros());
if !summaries.is_empty() {
tracing::debug!(
count = summaries.len(),
bodies = bodies.len(),
"lower_all_functions_from_bodies: produced summaries + cached bodies"
);
}
(summaries, bodies)
}
/// Second extraction pass: re-probe bodies with the augmented `summaries`
/// map visible to callee resolution.
///
/// A body is re-extracted through `extract_ssa_func_summary_full` only when
/// it has a `func_key`, a cached SSA body with a graph, at least one
/// parameter, and at least one `Call` whose callee name matches a summary
/// already in the map. The fresh result is folded into the existing entry
/// via `merge_sink_fields`, which adds missing `param_to_sink`,
/// `param_to_sink_param` and `validated_params_to_return` entries and leaves
/// every other field as the first pass produced it. Bounded: at most one
/// re-extraction per body.
#[allow(clippy::too_many_arguments)]
fn rerun_extraction_with_augmented_summaries(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
    bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
    summaries: &mut std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
) {
    use crate::ssa::ir::SsaOp;
    use crate::state::symbol::SymbolInterner;

    // Nothing to resolve against: with no SSA summaries at all, no body can
    // gain anything from a second probe. This is the common case for files
    // of unrelated functions or with only cross-file callees.
    if summaries.is_empty() {
        return;
    }

    // Probes resolve callees against a frozen copy, because `summaries`
    // itself is mutated by the merge step while the loop is running.
    let frozen: std::collections::HashMap<
        FuncKey,
        crate::summary::ssa_summary::SsaFuncSummary,
    > = summaries.clone();

    // Bare names of every function that already has an in-file summary.
    // The extractor only consults `ssa_summaries` when resolving a Call, so
    // a body that calls none of these names would reproduce its first-pass
    // summary. SSA `Call::callee` holds the bare method name after lowering
    // decomposes chained-receiver calls, which lines up with `FuncKey::name`.
    let summarised_names: std::collections::HashSet<&str> =
        frozen.keys().map(|k| k.name.as_str()).collect();

    for body in file_cfg.function_bodies() {
        let Some(mut key) = body.meta.func_key.clone() else {
            continue;
        };
        key.namespace = namespace.to_string();

        let Some(cached) = bodies.get(&key) else {
            continue;
        };
        if cached.param_count == 0 {
            continue;
        }
        let Some(body_graph) = cached.body_graph.as_ref() else {
            continue;
        };

        // Only bodies that reference an in-file summary by name can see a
        // different result on the second probe.
        let calls_summarised_fn = cached
            .ssa
            .blocks
            .iter()
            .flat_map(|block| block.body.iter())
            .any(|inst| {
                matches!(
                    &inst.op,
                    SsaOp::Call { callee, .. } if summarised_names.contains(callee.as_str())
                )
            });
        if !calls_summarised_fn {
            continue;
        }

        let interner = SymbolInterner::from_cfg(body_graph);
        let aliases = compute_module_aliases_for_summary(&cached.ssa, lang);
        let aliases_opt = (!aliases.is_empty()).then_some(&aliases);

        // Empty metadata vectors are handed over as `None`.
        let destructured = &body.meta.param_destructured_fields;
        let destructured_opt = (!destructured.is_empty()).then(|| destructured.as_slice());
        let declared_types = &body.meta.param_types;
        let declared_types_opt = (!declared_types.is_empty()).then(|| declared_types.as_slice());

        let reprobed = ssa_transfer::extract_ssa_func_summary_full(
            &cached.ssa,
            body_graph,
            local_summaries,
            global_summaries,
            lang,
            namespace,
            &interner,
            cached.param_count,
            aliases_opt,
            locator,
            Some(&body.meta.params),
            Some(&frozen),
            destructured_opt,
            declared_types_opt,
            Some(&cached.opt.alias_result),
        );

        // Additive merge into the live map; creates a default entry when
        // the first pass stored nothing for this key.
        merge_sink_fields(summaries.entry(key).or_default(), &reprobed);
    }
}
/// Additively merge `param_to_sink`, `param_to_sink_param` and
/// `validated_params_to_return` from `src` into `dst`. Nothing already in
/// `dst` is removed or rewritten; only entries `dst` does not yet have are
/// appended.
///
/// `validated_params_to_return` therefore only ever grows across extraction
/// rounds: a parameter shown to be validated by any pass (the augmented
/// second pass typically resolves more cross-function summaries than the
/// first) stays validated. Dropping entries here would silently lose the
/// CVE-2026-25544-class precision the re-extraction pass exists to recover.
fn merge_sink_fields(
    dst: &mut crate::summary::ssa_summary::SsaFuncSummary,
    src: &crate::summary::ssa_summary::SsaFuncSummary,
) {
    // Per-parameter sink sites: extend an existing parameter's list with
    // sites whose dedup key is new, or add the whole list for a parameter
    // `dst` has not seen.
    for (param_idx, incoming_sites) in &src.param_to_sink {
        let slot = dst
            .param_to_sink
            .iter_mut()
            .find(|(known_idx, _)| known_idx == param_idx);
        match slot {
            Some((_, known_sites)) => {
                for site in incoming_sites {
                    let fingerprint = site.dedup_key();
                    let already_known = known_sites.iter().any(|s| s.dedup_key() == fingerprint);
                    if !already_known {
                        known_sites.push(site.clone());
                    }
                }
            }
            None => dst.param_to_sink.push((*param_idx, incoming_sites.clone())),
        }
    }

    // (param, sink position, caps) triples: append the ones not yet present.
    for triple in &src.param_to_sink_param {
        if !dst.param_to_sink_param.contains(triple) {
            dst.param_to_sink_param.push(*triple);
        }
    }

    // Validated parameter indices: append the ones not yet present.
    for &validated in &src.validated_params_to_return {
        if dst.validated_params_to_return.contains(&validated) {
            continue;
        }
        dst.validated_params_to_return.push(validated);
    }
}
/// Walk lexical-containment children of every parent body and lift
/// their sinks into the parent's [`SsaFuncSummary::param_to_sink`].
///
/// For each parent body P with at least one lexically contained
/// child:
/// - For each formal parameter `p_i` of P:
/// - Seed a probe with `{ p_i → Cap::all() }`, run P's SSA
/// analysis, extract P's exit state.
/// - For every descendant child body C of P, run C's SSA
/// analysis with the parent's exit state seeded as
/// `global_seed`. Collect sink events.
/// - For each event whose `sink_caps` is non-empty, append a
/// cap-only [`SinkSite`] under `p_i` on P's summary
/// (deduplicated by cap-mask so repeat probes don't inflate
/// the entry).
///
/// Strict-additive: only inserts new `param_to_sink` entries; never
/// modifies `param_return_paths`, `points_to`, `source_caps`, etc.
fn augment_summaries_with_child_sinks(
file_cfg: &FileCfg,
lang: Lang,
namespace: &str,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
summaries: &mut std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
) {
use crate::cfg::BodyId;
use crate::labels::{Cap, SourceKind};
use crate::summary::SinkSite;
use crate::taint::domain::{TaintOrigin, VarTaint};
use ssa_transfer::BindingKey;
// ── Build lexical-containment relationships ──────────────────────
// Map parent BodyId → list of descendant body indices. Reverse-walk
// each body's `parent_body_id` chain so a grand-child's sinks are
// attributed to every ancestor in its containment chain.
let body_id_to_idx: std::collections::HashMap<BodyId, usize> = file_cfg
.bodies
.iter()
.enumerate()
.map(|(i, b)| (b.meta.id, i))
.collect();
let mut descendants: std::collections::HashMap<BodyId, Vec<usize>> =
std::collections::HashMap::new();
for (idx, body) in file_cfg.bodies.iter().enumerate() {
// Walk up the parent chain, registering this body as a descendant
// of every ancestor.
let mut cur = body.meta.parent_body_id;
while let Some(pid) = cur {
descendants.entry(pid).or_default().push(idx);
cur = body_id_to_idx
.get(&pid)
.and_then(|i| file_cfg.bodies[*i].meta.parent_body_id);
}
}
// ── Map each parent body to its FuncKey and the SSA body cache ──
// Skip bodies with no formal params (nothing to probe) and bodies
// whose SSA was never lowered (lowering errors logged earlier).
for parent_body in &file_cfg.bodies {
let Some(parent_key) = parent_body.meta.func_key.clone() else {
continue;
};
let mut parent_key = parent_key;
parent_key.namespace = namespace.to_string();
let Some(parent_callee) = bodies.get(&parent_key) else {
continue;
};
if parent_callee.param_count == 0 {
continue;
}
let Some(child_indices) = descendants.get(&parent_body.meta.id) else {
continue;
};
if child_indices.is_empty() {
continue;
}
let parent_ssa = &parent_callee.ssa;
let parent_cfg = match parent_callee.body_graph.as_ref() {
Some(g) => g,
None => continue,
};
let parent_interner = crate::state::symbol::SymbolInterner::from_cfg(parent_cfg);
// Collect (formal_param_idx, var_name, ssa_value) for the parent's
// formal params, mirrors `extract_ssa_func_summary`'s param scan.
let mut parent_param_info: Vec<(usize, String)> = Vec::new();
for block in &parent_ssa.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
if let crate::ssa::ir::SsaOp::Param { index } = &inst.op {
if *index < parent_callee.param_count {
if let Some(name) = inst.var_name.as_ref() {
parent_param_info.push((*index, name.clone()));
}
}
}
}
}
for (param_idx, param_name) in &parent_param_info {
// Seed parent's probe with this single param tainted to all caps.
let mut seed: std::collections::HashMap<BindingKey, VarTaint> =
std::collections::HashMap::new();
seed.insert(
BindingKey::new(param_name.as_str(), BodyId(0)),
VarTaint {
caps: Cap::all(),
origins: smallvec::SmallVec::from_elem(
TaintOrigin {
node: petgraph::graph::NodeIndex::new(0),
source_kind: SourceKind::UserInput,
source_span: None,
},
1,
),
uses_summary: false,
},
);
let parent_transfer = ssa_transfer::SsaTaintTransfer {
lang,
namespace,
interner: &parent_interner,
local_summaries,
global_summaries,
interop_edges: &[],
owner_body_id: BodyId(0),
parent_body_id: None,
global_seed: Some(&seed),
param_seed: None,
receiver_seed: None,
const_values: None,
type_facts: None,
xml_parser_config: None,
xpath_config: None,
ssa_summaries: Some(summaries),
extra_labels: None,
base_aliases: None,
callee_bodies: None,
inline_cache: None,
context_depth: 0,
callback_bindings: None,
points_to: None,
dynamic_pts: None,
import_bindings: None,
promisify_aliases: None,
module_aliases: None,
static_map: None,
auto_seed_handler_params: false,
cross_file_bodies: None,
pointer_facts: None,
cross_package_imports: None,
entry_kind: None,
param_route_capture: None,
recording_summary: false,
};
let (_parent_events, parent_block_states) =
ssa_transfer::run_ssa_taint_full(parent_ssa, parent_cfg, &parent_transfer);
let parent_exit = ssa_transfer::extract_ssa_exit_state(
&parent_block_states,
parent_ssa,
parent_cfg,
&parent_transfer,
BodyId(0),
);
if parent_exit.is_empty() {
continue;
}
for &child_idx in child_indices {
let child_body = &file_cfg.bodies[child_idx];
let Some(child_key) = child_body.meta.func_key.clone() else {
continue;
};
let mut child_key = child_key;
child_key.namespace = namespace.to_string();
let Some(child_callee) = bodies.get(&child_key) else {
continue;
};
let child_ssa = &child_callee.ssa;
let Some(child_cfg) = child_callee.body_graph.as_ref() else {
continue;
};
let child_interner = crate::state::symbol::SymbolInterner::from_cfg(child_cfg);
let child_transfer = ssa_transfer::SsaTaintTransfer {
lang,
namespace,
interner: &child_interner,
local_summaries,
global_summaries,
interop_edges: &[],
owner_body_id: BodyId(0),
parent_body_id: None,
global_seed: Some(&parent_exit),
param_seed: None,
receiver_seed: None,
const_values: None,
type_facts: None,
xml_parser_config: None,
xpath_config: None,
ssa_summaries: Some(summaries),
extra_labels: None,
base_aliases: None,
callee_bodies: None,
inline_cache: None,
context_depth: 0,
callback_bindings: None,
points_to: None,
dynamic_pts: None,
import_bindings: None,
promisify_aliases: None,
module_aliases: None,
static_map: None,
auto_seed_handler_params: false,
cross_file_bodies: None,
pointer_facts: None,
cross_package_imports: None,
entry_kind: None,
param_route_capture: None,
recording_summary: false,
};
let (child_events, _child_block_states) =
ssa_transfer::run_ssa_taint_full(child_ssa, child_cfg, &child_transfer);
if child_events.is_empty() {
continue;
}
// Aggregate sink caps across all child events into one
// entry per parent param (cap-only SinkSite, the
// exact location lives in the child body's CFG and is
// not directly addressable from the parent's summary).
let mut union_caps = Cap::empty();
for ev in &child_events {
union_caps |= ev.sink_caps;
}
if union_caps.is_empty() {
continue;
}
let entry = summaries.entry(parent_key.clone()).or_default();
let new_site = SinkSite::cap_only(union_caps);
let new_key = new_site.dedup_key();
if let Some((_, sites)) = entry
.param_to_sink
.iter_mut()
.find(|(i, _)| *i == *param_idx)
{
if !sites.iter().any(|s| s.dedup_key() == new_key) {
sites.push(new_site);
}
} else {
entry
.param_to_sink
.push((*param_idx, smallvec::smallvec![new_site]));
}
// Mirror cap-only attribution into `param_to_sink_param`
// so the call-site emission path that consults it (the
// engine's primary sink-site picker uses
// `param_to_sink_param` for arg-position filtering)
// sees this captured-flow sink. Position 0 is a
// best-effort placeholder, the actual filtering at
// the caller is by SSRF cap, not arg position, when
// the wrapper is itself non-gated.
if !entry
.param_to_sink_param
.iter()
.any(|(i, _, c)| *i == *param_idx && *c == union_caps)
{
entry.param_to_sink_param.push((*param_idx, 0, union_caps));
}
}
}
}
}
/// Collect `(call_ordinal, container_name)` pairs for every SSA `Call`
/// instruction in `ssa` whose receiver has a usable static type.
///
/// A call contributes an entry only when all of the following hold:
///
/// * it has a receiver (free-function calls, `receiver: None`, are skipped);
/// * `type_facts` knows a [`crate::ssa::type_facts::TypeKind`] for the
///   receiver's SSA value;
/// * that kind yields a
///   [`crate::ssa::type_facts::TypeKind::container_name`];
/// * the instruction's CFG node is still present in `cfg`.
///
/// Calls that fail any of these checks are silently dropped; the cross-file
/// call-graph builder falls back to today's name-only resolution for them,
/// which preserves the "subset of today's targets, never a superset"
/// invariant from `docs/typed-call-graph-prompt.md`.
///
/// The ordinal is read from the CFG node's
/// [`crate::cfg::CallMeta::call_ordinal`] so that it lines up with
/// [`crate::summary::CalleeSite::ordinal`] at consumer time.
///
/// The returned vector is sorted by ordinal and contains each ordinal at
/// most once.
fn collect_typed_call_receivers(
    ssa: &crate::ssa::ir::SsaBody,
    cfg: &crate::cfg::Cfg,
    type_facts: &crate::ssa::type_facts::TypeFactResult,
) -> Vec<(u32, String)> {
    use crate::ssa::ir::SsaOp;

    // Ordinals already emitted. One SSA call is expected to map 1:1 to a CFG
    // call node, so duplicates should not occur; the set is a guard in case
    // lowering ever produces a second SSA `Call` on the same `cfg_node`.
    // The first occurrence in block/instruction order wins.
    let mut claimed_ordinals = std::collections::HashSet::<u32>::new();

    let mut typed_receivers: Vec<(u32, String)> = ssa
        .blocks
        .iter()
        .flat_map(|block| block.body.iter())
        .filter_map(|inst| {
            // Only method-style calls carry a receiver worth devirtualising.
            let SsaOp::Call {
                receiver: Some(receiver_val),
                ..
            } = &inst.op
            else {
                return None;
            };
            // Unknown type or scalar/unknown container: leave this call to
            // name-only resolution.
            let container = type_facts.get_type(*receiver_val)?.container_name()?;
            // No CFG node behind the instruction means no ordinal to report.
            let ordinal = cfg.node_weight(inst.cfg_node)?.call.call_ordinal;
            claimed_ordinals
                .insert(ordinal)
                .then_some((ordinal, container))
        })
        .collect();

    typed_receivers.sort_by_key(|&(ordinal, _)| ordinal);
    typed_receivers
}
/// Maximum blocks for a callee body to be eligible for cross-file persistence.
///
/// [`build_eligible_bodies`] skips any body whose SSA has strictly more
/// blocks than this limit; a body with exactly this many blocks is kept.
const MAX_CROSS_FILE_BODY_BLOCKS: usize = 100;
/// Per-function SSA summaries keyed by [`FuncKey`]; the first element of the
/// pair returned by [`extract_ssa_artifacts_from_file_cfg`].
type SsaArtifactSummaries =
std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>;
/// `(key, body)` pairs for the callee SSA bodies that passed the
/// cross-file eligibility filter in [`build_eligible_bodies`].
type EligibleCalleeBodies = Vec<(FuncKey, ssa_transfer::CalleeSsaBody)>;
/// Extract SSA artifacts for one file from its [`FileCfg`].
///
/// Lowering is done per body (not per function entry) by
/// `lower_all_functions_from_bodies`, using each body's recorded
/// entry/params. This path is equivalent to what `analyse_file` uses at
/// taint time, so the SSA summaries produced here line up exactly with what
/// pass 2 will consult.
///
/// Returns the SSA summaries together with the subset of lowered bodies that
/// [`build_eligible_bodies`] accepts for cross-file use. All arguments other
/// than `file_cfg` are forwarded unchanged to the lowering step.
#[allow(clippy::too_many_arguments)]
pub(crate) fn extract_ssa_artifacts_from_file_cfg(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
    scan_root: Option<&str>,
    module_graph: Option<&crate::resolve::ModuleGraph>,
) -> (SsaArtifactSummaries, EligibleCalleeBodies) {
    // Single lowering pass: yields both the summaries and the raw bodies.
    let (ssa_summaries, lowered_bodies) = lower_all_functions_from_bodies(
        file_cfg,
        lang,
        namespace,
        local_summaries,
        global_summaries,
        locator,
        scan_root,
        module_graph,
    );

    // Narrow the raw bodies to the cross-file-eligible subset.
    (ssa_summaries, build_eligible_bodies(file_cfg, lowered_bodies))
}
/// Reduce already-lowered SSA bodies to the cross-file-eligible subset,
/// populating per-node metadata against the original CFG along the way.
///
/// This is separate from [`extract_ssa_artifacts_from_file_cfg`] so that
/// callers which already hold a freshly-lowered `bodies` map (specifically
/// `analyse_file_fused`, which lowers once and feeds both the taint engine
/// and this filter) don't pay for a second lowering pass.
///
/// A body is kept only when all of the following hold:
///
/// * cross-file symex is enabled (otherwise the result is empty);
/// * its SSA has at most [`MAX_CROSS_FILE_BODY_BLOCKS`] blocks;
/// * a body in `file_cfg` carries a structurally matching `FuncKey`;
/// * `populate_node_meta` succeeds against that body's graph.
pub(crate) fn build_eligible_bodies(
    file_cfg: &FileCfg,
    bodies: std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) -> EligibleCalleeBodies {
    if !crate::symex::cross_file_symex_enabled() {
        return Vec::new();
    }

    bodies
        .into_iter()
        .filter(|(_, callee)| callee.ssa.blocks.len() <= MAX_CROSS_FILE_BODY_BLOCKS)
        .filter_map(|(key, mut callee)| {
            // Node metadata must be populated against the per-body graph
            // whose NodeIndex space the SSA was produced on, otherwise
            // cross-file replay can't find the original CFG nodes.
            //
            // `key.namespace` was already normalised against `scan_root` in
            // `lower_all_functions_from_bodies`, while
            // `body.meta.func_key.namespace` still carries the raw
            // `build_cfg` file path. Match on structural identity
            // (everything *but* namespace) so the two agree even when the
            // namespace representations differ.
            let owning_body = file_cfg.bodies.iter().find(|candidate| {
                candidate.meta.func_key.as_ref().is_some_and(|theirs| {
                    theirs.lang == key.lang
                        && theirs.container == key.container
                        && theirs.name == key.name
                        && theirs.arity == key.arity
                        && theirs.disambig == key.disambig
                        && theirs.kind == key.kind
                })
            })?;

            if ssa_transfer::populate_node_meta(&mut callee, &owning_body.graph) {
                Some((key, callee))
            } else {
                None
            }
        })
        .collect()
}
// Unit tests for this module live in the out-of-line `tests` submodule,
// which is only compiled for test builds.
#[cfg(test)]
mod tests;