nyx/src/taint/mod.rs

#![allow(clippy::collapsible_if, clippy::too_many_arguments)]

pub mod backwards;
pub mod domain;
pub mod path_state;
pub mod ssa_transfer;

use crate::cfg::{BodyCfg, BodyId, Cfg, FileCfg, FuncSummaries};
use crate::engine_notes::EngineNote;
use crate::interop::InteropEdge;
use crate::labels::SourceKind;
use crate::state::engine::MAX_TRACKED_VARS;
use crate::state::symbol::SymbolInterner;
use crate::summary::GlobalSummaries;
use crate::symbol::{FuncKey, FuncKind, Lang};
use path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use petgraph::visit::IntoNodeReferences;
use smallvec::SmallVec;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Upper bound on JS/TS in-file pass-2 convergence rounds.
///
/// Pass 2 is a Jacobi-style iteration over every non-toplevel body of a
/// JS/TS file: each body's exit state (restricted to top-level keys) is
/// folded into a shared seed, and the non-toplevel bodies are re-run
/// until that seed stops changing.  Taint that has to walk a chain of
/// `k` top-level bindings through `k` helper functions can need up to
/// `k` rounds; the previous hardcoded limit of `3` silently cut off any
/// 4-stage chain without a warning.
///
/// Same intent as `scan::SCC_FIXPOINT_SAFETY_CAP`: because the lattice
/// is monotone and of finite height, the true fixed point is reached in
/// a small multiple of the chain depth.  64 comfortably covers every
/// realistic JS/TS file seen so far while keeping the worst case
/// bounded.
const JS_TS_PASS2_SAFETY_CAP: usize = 64;

/// Test-only replacement for [`JS_TS_PASS2_SAFETY_CAP`].  A non-zero
/// value is used by the pass-2 loop in place of the const cap; the
/// default `0` means "no override" and keeps production behaviour
/// unchanged.
static JS_TS_PASS2_CAP_OVERRIDE: AtomicUsize = AtomicUsize::new(0);

/// Observability hook holding the number of pass-2 iterations used by
/// the most recent [`analyse_file`] invocation.  It is reset at the
/// start of each call so convergence regression tests read a fresh
/// value.
///
/// * `1` — the initial lexical-containment pass completed.  This is the
///   common case for non-JS/TS languages and for JS/TS files with no
///   cross-body globals.
/// * higher values — the iterative convergence loop ran that many
///   times without detecting convergence (so the `iters`th iteration
///   was the last round actually executed).
static LAST_JS_TS_PASS2_ITERATIONS: AtomicUsize = AtomicUsize::new(0);

/// Install (or remove) the test-only JS/TS pass-2 cap override.
///
/// Passing `cap = 0` restores the default cap.  Meant only for
/// integration tests that must force cap-hit behaviour on small
/// fixtures.
#[doc(hidden)]
pub fn set_js_ts_pass2_cap_override(cap: usize) {
    JS_TS_PASS2_CAP_OVERRIDE.store(cap, Ordering::Relaxed);
}

/// Pass-2 iteration count recorded during the most recent
/// [`analyse_file`] invocation.  Meant for tests and diagnostics.
pub fn last_js_ts_pass2_iterations() -> usize {
    LAST_JS_TS_PASS2_ITERATIONS.load(Ordering::Relaxed)
}

/// Effective pass-2 cap: the test override when one is installed
/// (non-zero), otherwise [`JS_TS_PASS2_SAFETY_CAP`].
fn js_ts_pass2_cap() -> usize {
    match JS_TS_PASS2_CAP_OVERRIDE.load(Ordering::Relaxed) {
        0 => JS_TS_PASS2_SAFETY_CAP,
        forced => forced,
    }
}

// ── Perf-audit sub-stage timers (lower_all_functions_from_bodies) ───────
//
// Accumulated microseconds, one slot per sub-stage:
//   [0] lower_to_ssa_with_params (per-body sum)
//   [1] extract_ssa_func_summary (per-body sum, includes per-param probes)
//   [2] optimize_ssa_with_param_types (per-body sum)
//   [3] typed_call_receivers + pointer fact extraction (per-body sum)
//   [4] augment_summaries_with_child_sinks
//   [5] rerun_extraction_with_augmented_summaries
//   [6] per-body misc (FuncKey resolve, HashMap insert, interner ctor)
//
// Recording is live only while the thread-local holds `Some`.  The
// production path leaves it `None`, so each measured chunk costs one
// thread-local access plus a `None` check.
thread_local! {
    static PERF_LOWER_TIMINGS: std::cell::Cell<Option<[u128; 7]>> =
        const { std::cell::Cell::new(None) };
}

/// Enable timing collection on the current thread, zeroing every slot.
#[doc(hidden)]
pub fn perf_lower_timings_start() {
    PERF_LOWER_TIMINGS.with(|cell| cell.set(Some([0u128; 7])));
}

/// Hand back the accumulated timings for the current thread and turn
/// collection off again.  Returns `None` when collection was not active.
#[doc(hidden)]
pub fn perf_lower_timings_take() -> Option<[u128; 7]> {
    PERF_LOWER_TIMINGS.with(|cell| cell.take())
}

/// Add `micros` to timing slot `slot` (saturating).  A no-op while
/// collection is inactive.  `slot` must be a valid index into the
/// 7-entry table; an out-of-range index panics when collection is on.
#[inline]
fn perf_lower_record(slot: usize, micros: u128) {
    PERF_LOWER_TIMINGS.with(|cell| {
        let Some(mut totals) = cell.get() else {
            return;
        };
        totals[slot] = totals[slot].saturating_add(micros);
        cell.set(Some(totals));
    });
}

/// Test-only override for the Gauss-Seidel toggle.
///
/// | value | effect                                                      |
/// |-------|-------------------------------------------------------------|
/// | `0`   | honour the `NYX_JS_GAUSS_SEIDEL` env var (production default) |
/// | `1`   | force Jacobi, env ignored                                   |
/// | `2`   | force Gauss-Seidel, env ignored                             |
///
/// Exists solely so integration tests can assert that both variants
/// produce equal findings without per-test process isolation.
static JS_TS_GAUSS_SEIDEL_OVERRIDE: AtomicUsize = AtomicUsize::new(0);

/// Force Jacobi (`1`) or Gauss-Seidel (`2`) from test code.  Passing
/// `0` removes the override and goes back to env-var-driven behaviour.
#[doc(hidden)]
pub fn set_js_ts_gauss_seidel_override(mode: usize) {
    JS_TS_GAUSS_SEIDEL_OVERRIDE.store(mode, Ordering::Relaxed);
}

/// Reports whether JS/TS pass-2 should run its Gauss-Seidel variant.
///
/// The default is **Jacobi** (order-independent, reproducible, one
/// round per chain hop).  Setting `NYX_JS_GAUSS_SEIDEL=1` switches to
/// **Gauss-Seidel** (in-place updates: a body's exit becomes visible to
/// later bodies in the same round, typically halving iteration count on
/// chain-shaped code).  Any env value other than the empty string,
/// `0`, or `false` counts as "on"; the env var is read once and cached
/// for the lifetime of the process.
///
/// Gauss-Seidel is deliberately opt-in: its result depends on the
/// traversal order of bodies, which can affect reproducibility for
/// scanners whose output feeds CI gates.  Before flipping it on by
/// default we need the Phase-A corpus run to prove chain-depth ≥4 is
/// common enough to justify the complexity.
///
/// A test override installed via [`set_js_ts_gauss_seidel_override`]
/// takes precedence over the env var.
///
/// See `tests/gauss_seidel_tests.rs` for the determinism test guarding
/// the invariant "same fixture → same findings under both variants".
pub fn js_ts_gauss_seidel_enabled() -> bool {
    use std::sync::OnceLock;
    static FROM_ENV: OnceLock<bool> = OnceLock::new();

    // Test override wins over the environment.
    let forced = JS_TS_GAUSS_SEIDEL_OVERRIDE.load(Ordering::Relaxed);
    if forced == 1 {
        return false; // force Jacobi
    }
    if forced == 2 {
        return true; // force Gauss-Seidel
    }

    // Unset (or unreadable) env var means "off".
    *FROM_ENV.get_or_init(|| {
        std::env::var("NYX_JS_GAUSS_SEIDEL")
            .map(|raw| !(raw.is_empty() || raw == "0" || raw == "false"))
            .unwrap_or(false)
    })
}

/// A raw flow step at CFG level (before line/col resolution).
///
/// Stored in [`Finding::flow_steps`] as the reconstructed source-to-sink
/// flow; the `cfg_node` indices also feed [`Finding::path_hash`].
#[derive(Debug, Clone)]
pub struct FlowStepRaw {
    /// CFG node this step refers to.
    pub cfg_node: NodeIndex,
    /// Variable name attached to this step, when one was recorded.
    pub var_name: Option<String>,
    /// Kind of operation this step represents
    /// (see `crate::evidence::FlowStepKind`).
    pub op_kind: crate::evidence::FlowStepKind,
}

/// Resolved source-location of the primary (callee-internal) sink instruction.
///
/// Populated on [`Finding`] (as [`Finding::primary_location`]) when the
/// sink was resolved via a callee summary that recorded a
/// [`crate::summary::SinkSite`].  Data-only primary sink-location
/// attribution: downstream formatters (SARIF, JSON, diag) still report
/// the caller's call-site until they opt in.
#[derive(Debug, Clone, PartialEq)]
pub struct SinkLocation {
    /// Callee file path relative to the workspace root.  Matches the
    /// `FuncKey::namespace` convention used in [`crate::summary::SinkSite`].
    /// May be empty for single-file scans (see the invariant documented
    /// on [`Finding::primary_location`]).
    pub file_rel: String,
    /// 1-based line of the sink instruction inside the callee body.
    pub line: u32,
    /// 1-based column of the sink instruction inside the callee body.
    pub col: u32,
    /// Trimmed source line at the sink, copied from the upstream
    /// [`crate::summary::SinkSite`].  Empty when the extractor had no
    /// tree/bytes context.  Used by formatters so the primary-location
    /// display does not need to re-read the callee file.
    pub snippet: String,
}

/// A detected taint finding with both source and sink locations.
///
/// `NodeIndex` fields (`sink`, `source`, `path`, and the nodes inside
/// `flow_steps`) are indices into the graph of the body named by
/// `body_id`.
#[derive(Debug, Clone)]
pub struct Finding {
    /// Identifies which body's graph the NodeIndex values reference.
    pub body_id: BodyId,
    /// The CFG node where tainted data reaches a dangerous operation.
    pub sink: NodeIndex,
    /// The CFG node where taint originated (may be Entry if source is
    /// cross-file and couldn't be pinpointed to a specific node).
    pub source: NodeIndex,
    /// The full path from source to sink through the CFG.
    #[allow(dead_code)] // used for future detailed diagnostics / path display
    pub path: Vec<NodeIndex>,
    /// The kind of source that originated the taint.
    pub source_kind: SourceKind,
    /// Whether all tainted sink variables are guarded by a validation
    /// predicate on this path (metadata only — does not change severity).
    /// Part of both the dedup key and the finding ID, so a guarded and an
    /// unguarded flow to the same sink survive as separate findings.
    pub path_validated: bool,
    /// The kind of validation guard protecting this path, if any.
    pub guard_kind: Option<PredicateKind>,
    /// Number of SSA blocks between source and sink (0 = same block).
    pub hop_count: u16,
    /// Capability specificity: number of matching cap bits between source and sink.
    /// Higher = more specific match (e.g. SQL_QUERY→SQL_QUERY vs broad Cap::all()).
    pub cap_specificity: u8,
    /// Whether this finding was resolved via a function summary (cross-function)
    /// rather than direct intra-function flow.
    pub uses_summary: bool,
    /// Reconstructed flow path from source to sink (CFG-level, pre-resolution).
    pub flow_steps: Vec<FlowStepRaw>,
    /// Symbolic constraint analysis verdict, if attempted.
    pub symbolic: Option<crate::evidence::SymbolicVerdict>,
    /// Original source byte span, preserved when origin was remapped across
    /// body boundaries.  `None` for intra-body findings
    /// (use `cfg[source].classification_span()`).
    ///
    /// NOTE(review): typed as a single `usize`, so presumably a byte
    /// offset rather than a `(start, end)` pair — confirm against the
    /// producer.
    pub source_span: Option<usize>,
    /// Source-location of the callee-internal dangerous instruction when the
    /// sink was resolved via a function summary carrying a
    /// [`crate::summary::SinkSite`] with concrete coordinates for primary
    /// sink-location attribution.  `None` for:
    /// * intra-procedural / label-based sinks — the caller's `cfg[sink]`
    ///   span already names the dangerous instruction;
    /// * summary-resolved sinks whose `SinkSite` was cap-only (no tree or
    ///   bytes context at extraction time).
    ///
    /// # Invariant
    ///
    /// `primary_location.is_some()` ⇒ the inner [`SinkLocation`] has
    /// `line != 0`.  `file_rel` may be empty for single-file scans where
    /// the scan root is the file itself (every namespace normalizes to
    /// `""`); consumers resolve empty `file_rel` against the file under
    /// analysis.  Enforced at `ssa_events_to_findings` by a
    /// `debug_assert!` — upstream filters drop cap-only sites before
    /// they reach this field.
    ///
    /// Deliberately independent of `uses_summary`: that flag tracks whether
    /// the **taint chain** used a callee summary, not whether the **sink**
    /// was summary-resolved.  A local source can reach a cross-file sink,
    /// yielding `uses_summary == false` alongside a populated
    /// `primary_location`.
    pub primary_location: Option<SinkLocation>,
    /// Engine provenance notes recorded during the analysis that produced
    /// this finding.  Populated when an internal budget/cap was hit — see
    /// [`crate::engine_notes::EngineNote`].  Empty for the typical
    /// under-budget finding.  Append via [`Self::with_note`] or
    /// [`Self::merge_note`] to keep the list free of duplicates.
    pub engine_notes: SmallVec<[EngineNote; 2]>,
    /// Stable hash of the intermediate-variable sequence between `source`
    /// and `sink`.  Used to keep distinct paths through different
    /// variables as separate findings during deduplication — two
    /// `(body_id, sink, source)` siblings with different `path_hash`
    /// values represent flows along different data paths and are
    /// preserved as alternatives rather than collapsed.
    ///
    /// Derived from the `cfg_node` indices in `flow_steps` at the time
    /// the finding is emitted; stable for a given scan but not
    /// necessarily stable across AST/CFG changes.
    pub path_hash: u64,
    /// Stable identifier for this finding, derived from
    /// `(body_id, source.index, sink.index, path_hash, path_validated)`
    /// by `make_finding_id`.  Assigned after deduplication, once
    /// `body_id` is set, so the ID is consistent across the lifetime of
    /// the finding and can be used to cross-reference alternative paths
    /// via [`Self::alternative_finding_ids`].  Empty string until that
    /// post-analysis step runs.
    pub finding_id: String,
    /// Stable identifiers of sibling findings that share
    /// `(body_id, sink, source)` but differ in `path_validated` or
    /// `path_hash`.  Populated by `link_alternative_paths`, which runs
    /// after dedup and ID assignment once all findings for the file
    /// are collected.
    ///
    /// The canonical case is a guarded/unguarded pair: if an `exec(x)`
    /// call is reachable from the same source `x` through both a
    /// whitelisted branch and an unguarded branch, both findings
    /// survive dedup and each lists the other here so downstream
    /// formatters can present them as "this flow … and N alternative
    /// path(s)" rather than silently dropping one.
    pub alternative_finding_ids: SmallVec<[String; 2]>,
}

impl Finding {
    /// Builder-style variant of [`Self::merge_note`]: consumes the
    /// finding, records `note` unless an equal note is already attached,
    /// and returns the finding so construction sites can tag it inline.
    pub fn with_note(mut self, note: EngineNote) -> Self {
        self.merge_note(note);
        self
    }

    /// Record `note` in `engine_notes` in place, skipping duplicates
    /// (delegates to `crate::engine_notes::push_unique`).
    pub fn merge_note(&mut self, note: EngineNote) {
        let notes = &mut self.engine_notes;
        crate::engine_notes::push_unique(notes, note);
    }
}

/// Derive JS/TS module aliases from an unoptimized SSA body.
///
/// For JavaScript and TypeScript, constant propagation is run over
/// `ssa` (the body itself is only borrowed, never modified) and the
/// resulting constant values are handed to `collect_module_aliases`,
/// which detects `require()` calls to known modules and follows them
/// through phis/copies.  The aliases are needed during summary
/// extraction, before the optimizer has run.
///
/// Every other language yields an empty map.
fn compute_module_aliases_for_summary(
    ssa: &crate::ssa::SsaBody,
    lang: Lang,
) -> std::collections::HashMap<crate::ssa::SsaValue, smallvec::SmallVec<[String; 2]>> {
    match lang {
        Lang::JavaScript | Lang::TypeScript => {
            let propagated = crate::ssa::const_prop::const_propagate(ssa);
            crate::ssa::const_prop::collect_module_aliases(ssa, &propagated.values)
        }
        // Module aliasing via `require()` is a JS/TS-only concern.
        _ => std::collections::HashMap::new(),
    }
}

/// Run taint analysis over every body in a file.
///
/// Thin entry point: it resets the per-file suppression span sets,
/// lowers all function bodies to SSA (producing intra-file summaries and
/// callee bodies), and delegates to [`analyse_file_with_lowered`].
///
/// The delegated analysis is a unified multi-body pass for all
/// languages:
/// 1. Lexical containment propagation: a parent body's exit state seeds
///    its child bodies.
/// 2. JS/TS iterative convergence: functions that modify globals can
///    feed taint back to other functions.  The number of rounds is
///    bounded by the JS/TS pass-2 safety cap
///    (`JS_TS_PASS2_SAFETY_CAP`, or the test override); other languages
///    run a single round.
///
/// Returns the deduplicated findings with IDs assigned and alternative
/// paths cross-linked.
pub fn analyse_file(
    file_cfg: &FileCfg,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    caller_lang: Lang,
    caller_namespace: &str,
    interop_edges: &[InteropEdge],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Vec<Finding> {
    // The resets must happen BEFORE lowering.  Per-parameter probes that
    // run inside `lower_all_functions_from_bodies` may already record
    // path-safe sink spans (via `record_path_safe_suppressed_span`).
    // Resetting first preserves the historical contract that the span
    // set starts empty for each file, while letting the probe phase and
    // the taint flow phase accumulate into one set — the set that
    // `take_path_safe_suppressed_spans` later drains for state analysis.
    // The all-validated span set (cap-agnostic, drained by AST-pattern
    // suppression in `TaintSuppressionCtx::build`) has the same
    // lifecycle.
    ssa_transfer::reset_path_safe_suppressed_spans();
    ssa_transfer::reset_all_validated_spans();

    // Lower without a locator (the trailing `None`): pass-2 intra-file
    // summaries are transient (not persisted), and behaviour depends on
    // SinkSite.cap only, which is always populated.
    let (lowered_summaries, lowered_bodies) = lower_all_functions_from_bodies(
        file_cfg,
        caller_lang,
        caller_namespace,
        local_summaries,
        global_summaries,
        None,
    );

    analyse_file_with_lowered(
        file_cfg,
        local_summaries,
        global_summaries,
        caller_lang,
        caller_namespace,
        interop_edges,
        extra_labels,
        &lowered_summaries,
        &lowered_bodies,
    )
}

/// Same as [`analyse_file`] but takes pre-lowered SSA summaries + callee
/// bodies.  Used by [`crate::ast::analyse_file_fused`] to share a single
/// `lower_all_functions_from_bodies` invocation across the taint engine and
/// the SSA-artifact extractor; the bare [`analyse_file`] entry-point keeps
/// its prior signature for any caller that does not have a pre-lowered
/// result handy.
///
/// Steps performed here:
/// * set up the optional context-sensitive inlining inputs;
/// * reset the pass-2 iteration counter to `1` and run the unified
///   multi-body analysis (`analyse_multi_body`);
/// * sort + deduplicate the findings on
///   `(body_id, sink, source, path_validated, path_hash)`;
/// * assign each surviving finding its stable ID and cross-link
///   alternative paths.
///
/// The caller is responsible for resetting the path-safe / all-validated
/// span sets *before* lowering; this function never resets them.
#[allow(clippy::too_many_arguments)]
pub(crate) fn analyse_file_with_lowered(
    file_cfg: &FileCfg,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    caller_lang: Lang,
    caller_namespace: &str,
    interop_edges: &[InteropEdge],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
    ssa_summaries: &std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    callee_bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) -> Vec<Finding> {
    let _span = tracing::debug_span!("taint_analyse_file").entered();

    // NOTE: the path-safe-suppressed span set is reset by the caller, not
    // here.  Per-parameter probes inside the lowering phase
    // (`lower_all_functions_from_bodies`) can already publish spans via
    // `record_path_safe_suppressed_span`; resetting here would wipe them
    // before `take_path_safe_suppressed_spans` drains the set for state
    // analysis.  Both `analyse_file` (which lowers internally) and
    // `analyse_file_fused` (which lowers up-front) reset the set before
    // their lowering call.

    // 1. An empty summary map is passed on as "no summaries".
    let ssa_sums_ref = if ssa_summaries.is_empty() {
        None
    } else {
        Some(ssa_summaries)
    };

    // 2. Context-sensitive inline analysis setup.  Toggle lives at
    //    `analysis.engine.context_sensitive` in `nyx.conf` (or the
    //    `--context-sensitive / --no-context-sensitive` CLI flag).
    //    Callee bodies and the inline cache are only handed to the
    //    transfer when the toggle is on.
    let context_sensitive = crate::utils::analysis_options::current().context_sensitive;
    let inline_cache = std::cell::RefCell::new(std::collections::HashMap::new());
    let callee_bodies_ref = if context_sensitive && !callee_bodies.is_empty() {
        Some(callee_bodies)
    } else {
        None
    };
    let inline_cache_ref = if context_sensitive {
        Some(&inline_cache)
    } else {
        None
    };

    // 3. Unified multi-body analysis with lexical containment propagation.
    //
    // `max_iterations` is the safety cap, not an expected depth — the
    // pass-2 loop breaks on seed equality (monotone lattice, finite
    // height) and only rides the cap when convergence legitimately
    // needs more rounds than the cap allows.  See
    // [`JS_TS_PASS2_SAFETY_CAP`] for the rationale.
    let max_iterations = if matches!(caller_lang, Lang::JavaScript | Lang::TypeScript) {
        js_ts_pass2_cap()
    } else {
        1
    };
    // Reset the observability counter before this scan so tests always
    // read a fresh value.  The reset value is `1`, not `0`: the
    // lexical-containment pass counts as a single round, and non-JS/TS
    // languages (which never enter the convergence loop) must leave the
    // counter at `1` as documented on `LAST_JS_TS_PASS2_ITERATIONS`.
    LAST_JS_TS_PASS2_ITERATIONS.store(1, Ordering::Relaxed);
    let import_bindings_ref = if file_cfg.import_bindings.is_empty() {
        None
    } else {
        Some(&file_cfg.import_bindings)
    };
    // Cross-file bodies come from GlobalSummaries. Threaded through the
    // transfer for context-sensitive resolution; plumbing only when no
    // reader is configured, preserving prior behaviour byte-for-byte.
    let cross_file_bodies_ref = global_summaries.and_then(|gs| gs.bodies_by_key());
    if let Some(map) = cross_file_bodies_ref {
        tracing::debug!(
            cross_file_bodies = map.len(),
            file = %caller_namespace,
            "taint: cross-file bodies available for pass 2"
        );
    }

    let mut all_findings = analyse_multi_body(
        file_cfg,
        caller_lang,
        caller_namespace,
        local_summaries,
        global_summaries,
        interop_edges,
        extra_labels,
        ssa_sums_ref,
        callee_bodies_ref,
        inline_cache_ref,
        max_iterations,
        import_bindings_ref,
        cross_file_bodies_ref,
    );

    // 4. Deduplicate findings using a richer key that preserves distinct
    //    flows.
    //
    //    The historical dedup at this point was:
    //
    //        sort_by_key(|f| (body_id, sink.index(), source.index(), !path_validated));
    //        dedup_by_key(|f| (body_id, sink, source));
    //
    //    which silently collapsed an *unguarded* flow reaching the same
    //    `(sink, source)` as a guarded flow — the `!path_validated` sort
    //    ordered `path_validated == true` first, so the exploitable
    //    branch was the one that got dropped.
    //
    //    New behaviour: the dedup key is
    //        (body_id, sink, source, path_validated, path_hash).
    //    Findings that differ on `path_validated` *or* on `path_hash`
    //    (i.e. traverse different intermediate variables) are kept as
    //    distinct findings.  `link_alternative_paths` then populates
    //    `alternative_finding_ids` on each finding so downstream
    //    formatters can render "… and N alternative path(s)".
    //
    //    The sort key covers every component of the dedup key, so equal
    //    findings are adjacent when `dedup_by_key` runs.
    all_findings.sort_by_key(|f| {
        (
            f.body_id.0,
            f.sink.index(),
            f.source.index(),
            !f.path_validated,
            f.path_hash,
        )
    });
    all_findings.dedup_by_key(|f| (f.body_id, f.sink, f.source, f.path_validated, f.path_hash));

    // 5. Assign stable finding IDs now that `body_id` has been set and
    //    the dedup has picked the final set of distinct flows.  The ID
    //    is used to cross-reference siblings via
    //    `Finding.alternative_finding_ids`.
    for f in &mut all_findings {
        f.finding_id = make_finding_id(f);
    }

    // 6. Link alternative paths: for every group of findings that share
    //    `(body_id, sink, source)`, publish each finding's ID into the
    //    other findings' `alternative_finding_ids` list.
    link_alternative_paths(&mut all_findings);

    all_findings
}

/// Render the stable identifier of a [`Finding`].
///
/// Layout: `taint-<body_id>-<source_idx>-<sink_idx>-<path_hash_hex>-<v|u>`.
///
/// * `<path_hash_hex>` is `path_hash` as 16 zero-padded lowercase hex
///   digits; it separates flows that take different intermediate paths.
/// * The trailing `v` (validated) / `u` (unvalidated) separates flows
///   that agree on `(body, sink, source, path_hash)` but differ in
///   `path_validated`.
///
/// Nothing in the ID depends on caller-side formatters, so it survives
/// serialization to JSON/SARIF unchanged.
fn make_finding_id(f: &Finding) -> String {
    let validation_tag = if f.path_validated { 'v' } else { 'u' };
    let source_idx = f.source.index();
    let sink_idx = f.sink.index();
    format!(
        "taint-{}-{}-{}-{:016x}-{}",
        f.body_id.0, source_idx, sink_idx, f.path_hash, validation_tag,
    )
}

/// Cross-link findings that share `(body_id, sink, source)` but differ
/// on `path_validated` or `path_hash`.  After this call each such
/// finding's `alternative_finding_ids` lists every sibling's
/// [`Finding::finding_id`] — so a guarded flow links to the unguarded
/// sibling and vice versa.  Isolated findings (no sibling) get an
/// empty list.
fn link_alternative_paths(findings: &mut [Finding]) {
    // Group indices by (body_id, sink, source).  A simple O(n log n)
    // sort would clobber the caller-visible order; use a hashmap instead.
    let mut groups: HashMap<(BodyId, NodeIndex, NodeIndex), Vec<usize>> = HashMap::new();
    for (idx, f) in findings.iter().enumerate() {
        groups
            .entry((f.body_id, f.sink, f.source))
            .or_default()
            .push(idx);
    }
    for (_, members) in groups {
        if members.len() < 2 {
            continue;
        }
        // Collect IDs once, then distribute to every member *except self*.
        let ids: Vec<String> = members
            .iter()
            .map(|&i| findings[i].finding_id.clone())
            .collect();
        for &member_idx in &members {
            let own_id = findings[member_idx].finding_id.clone();
            findings[member_idx].alternative_finding_ids.clear();
            findings[member_idx]
                .alternative_finding_ids
                .extend(ids.iter().filter(|id| **id != own_id).cloned());
        }
    }
}

/// Order bodies so that every parent body precedes its children.
///
/// Breadth-first walk over the lexical containment relation, starting
/// from the root bodies (those with no `parent_body_id`) in slice
/// order.  Processing a body strictly after its parent is what lexical
/// seed propagation relies on.
///
/// Returns indices into `bodies` in processing order.
fn containment_order(bodies: &[BodyCfg]) -> Vec<usize> {
    // parent id -> positions of its direct children; parentless bodies
    // go straight onto the work queue.
    let mut kids_of: HashMap<BodyId, Vec<usize>> = HashMap::new();
    let mut pending: VecDeque<usize> = VecDeque::new();
    for (pos, body) in bodies.iter().enumerate() {
        if let Some(parent) = body.meta.parent_body_id {
            kids_of.entry(parent).or_default().push(pos);
        } else {
            pending.push_back(pos);
        }
    }

    let mut visit_order = Vec::with_capacity(bodies.len());
    while let Some(pos) = pending.pop_front() {
        visit_order.push(pos);
        // Children are queued only once their parent has been emitted.
        if let Some(kids) = kids_of.get(&bodies[pos].meta.id) {
            pending.extend(kids.iter().copied());
        }
    }
    visit_order
}

/// Analyse a single body with an optional parent seed.
///
/// Shared logic extracted from `analyse_multi_body` to avoid deep nesting.
fn analyse_body_with_seed(
    body: &BodyCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    interop_edges: &[InteropEdge],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
    ssa_summaries: Option<
        &std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    >,
    callee_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
    inline_cache: Option<&std::cell::RefCell<ssa_transfer::InlineCache>>,
    seed: Option<&HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>>,
    import_bindings: Option<&crate::cfg::ImportBindings>,
    cross_file_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
) -> (
    Vec<Finding>,
    Option<HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>>,
) {
    let cfg = &body.graph;
    let entry = body.entry;
    let body_id = body.meta.id;

    let interner = SymbolInterner::from_cfg(cfg);
    if interner.len() > MAX_TRACKED_VARS {
        tracing::warn!(
            symbols = interner.len(),
            max = MAX_TRACKED_VARS,
            "taint analysis: too many variables, some will be ignored"
        );
    }

    // Per-body graphs contain only the body's own nodes.
    // For non-toplevel bodies, use lower_to_ssa_with_params with scope to
    // create SsaOp::Param ops for external/captured variables and formal
    // parameters — required for global_seed to inject taint from the parent.
    // Top-level bodies use lower_to_ssa with scope_all=true (no Param ops).
    let is_toplevel = body.meta.parent_body_id.is_none();
    // JS/TS function bodies always use scoped lowering to create Param ops
    // for captured variables (globals that flow via seed between bodies).
    // Other languages: scoped lowering only when the parent seed is non-empty,
    // i.e. the parent body actually has taint to propagate.  Without a seed,
    // Param ops would just introduce unused SSA values.
    let has_nonempty_seed = seed.is_some_and(|s| !s.is_empty());
    // Scoped lowering creates SsaOp::Param ops for formal parameters, required
    // for handler-param auto-seeding to fire. Java lambda bodies need this too
    // so that `cmd -> Runtime.exec(cmd)` picks up `cmd` as a handler param.
    let is_java_lambda =
        lang == Lang::Java && body.meta.kind == crate::cfg::BodyKind::AnonymousFunction;
    let use_scoped_lowering = !is_toplevel
        && (matches!(lang, Lang::JavaScript | Lang::TypeScript)
            || has_nonempty_seed
            || is_java_lambda);
    let ssa_result = if use_scoped_lowering {
        let func_name = body.meta.name.clone().unwrap_or_else(|| {
            body.meta
                .func_key
                .as_ref()
                .and_then(|k| k.disambig.map(|d| format!("<anon#{d}>")))
                .unwrap_or_else(|| format!("<anon@{}>", body.meta.span.0))
        });
        crate::ssa::lower_to_ssa_with_params(cfg, entry, Some(&func_name), false, &body.meta.params)
    } else {
        crate::ssa::lower_to_ssa(cfg, entry, None, true)
    };

    // Clear per-body engine-note collector before the body's analysis;
    // any WorklistCapped / OriginsTruncated notes recorded during
    // transfer land in this bucket and are attached to every finding
    // emitted from the body once analysis is done.
    ssa_transfer::reset_body_engine_notes();

    match ssa_result {
        Ok(mut ssa_body) => {
            let opt = crate::ssa::optimize_ssa_with_param_types(
                &mut ssa_body,
                cfg,
                Some(lang),
                &body.meta.param_types,
            );
            if tracing::enabled!(tracing::Level::TRACE) {
                tracing::trace!(
                    func = body.meta.name.as_deref().unwrap_or("<anon>"),
                    ssa = %ssa_body,
                    "SSA body lowered",
                );
                for block in &ssa_body.blocks {
                    for inst in block.phis.iter().chain(block.body.iter()) {
                        if let Some(t) = opt.type_facts.get_type(inst.value) {
                            tracing::trace!(value = inst.value.0, ty = ?t, "type fact");
                        }
                    }
                }
            }
            let dynamic_pts = std::cell::RefCell::new(std::collections::HashMap::new());
            // Static-map abstract analysis: recognises provably-bounded
            // lookup idioms (e.g. `map.get(x).unwrap_or("safe")`) so the SSA
            // taint engine can clear command-injection findings whose payload
            // is a finite set of literal strings.
            let static_map =
                crate::ssa::static_map::analyze(&ssa_body, cfg, Some(lang), &opt.const_values);
            let static_map_opt = if static_map.is_empty() {
                None
            } else {
                Some(static_map)
            };
            // Pointer-Phase 3 / W1+W2+W3: per-body field-sensitive points-to
            // facts.  Computed only when `NYX_POINTER_ANALYSIS=1`; the
            // per-body `analyse_body` cost is amortised across the three
            // hooks (W1 field-write read-back, W2 container ELEM cells,
            // W3 cross-call resolver).  Strict-additive: `None` keeps
            // pointer-disabled behaviour bit-identical.
            let pointer_facts = if crate::pointer::is_enabled() {
                Some(crate::pointer::analyse_body(&ssa_body, body.meta.id))
            } else {
                None
            };
            let transfer = ssa_transfer::SsaTaintTransfer {
                lang,
                namespace,
                interner: &interner,
                local_summaries,
                global_summaries,
                interop_edges,
                owner_body_id: body.meta.id,
                parent_body_id: body.meta.parent_body_id,
                global_seed: seed,
                param_seed: None,
                receiver_seed: None,
                const_values: Some(&opt.const_values),
                type_facts: Some(&opt.type_facts),
                ssa_summaries,
                extra_labels,
                base_aliases: Some(&opt.alias_result),
                callee_bodies,
                inline_cache,
                context_depth: 0,
                callback_bindings: None,
                points_to: Some(&opt.points_to),
                dynamic_pts: Some(&dynamic_pts),
                import_bindings,
                promisify_aliases: None,
                module_aliases: if opt.module_aliases.is_empty() {
                    None
                } else {
                    Some(&opt.module_aliases)
                },
                static_map: static_map_opt.as_ref(),
                auto_seed_handler_params: matches!(lang, Lang::JavaScript | Lang::TypeScript)
                    || (lang == Lang::Java
                        && body.meta.kind == crate::cfg::BodyKind::AnonymousFunction),
                cross_file_bodies,
                pointer_facts: pointer_facts.as_ref(),
            };
            let (events, block_states) =
                ssa_transfer::run_ssa_taint_full(&ssa_body, cfg, &transfer);
            let mut findings = ssa_transfer::ssa_events_to_findings(&events, &ssa_body, cfg);
            let body_notes = ssa_transfer::take_body_engine_notes();
            for f in &mut findings {
                f.body_id = body_id;
                for note in &body_notes {
                    f.merge_note(note.clone());
                }
            }
            if crate::symex::is_enabled() {
                let symex_ctx = crate::symex::SymexContext {
                    ssa: &ssa_body,
                    cfg,
                    const_values: &opt.const_values,
                    type_facts: &opt.type_facts,
                    global_summaries,
                    lang,
                    namespace,
                    points_to: Some(&opt.points_to),
                    callee_bodies,
                    scc_membership: None,
                    cross_file_bodies: global_summaries,
                };
                crate::symex::annotate_findings(&mut findings, &symex_ctx);
            }
            // After forward taint + symex have produced a final
            // `Finding.symbolic` shape, run the demand-driven backwards pass
            // and layer its verdict on top.  Placing this *after* symex
            // (which overwrites `symbolic`) preserves any symex witness
            // while still annotating `backwards-confirmed` / `-infeasible`
            // onto the `cutoff_notes` vector.  Gated by
            // `analysis.engine.backwards_analysis` (default off).
            if crate::utils::analysis_options::current().backwards_analysis {
                let bctx = backwards::BackwardsCtx {
                    ssa: &ssa_body,
                    cfg,
                    lang,
                    global_summaries,
                    intra_file_bodies: callee_bodies,
                    depth_budget: backwards::DEFAULT_BACKWARDS_DEPTH,
                };
                for finding in &mut findings {
                    let Some(sink_val) = ssa_body.cfg_node_map.get(&finding.sink).copied() else {
                        continue;
                    };
                    let sink_caps = cfg[finding.sink].taint.labels.iter().fold(
                        crate::labels::Cap::empty(),
                        |acc, l| match l {
                            crate::labels::DataLabel::Sink(c) => acc | *c,
                            _ => acc,
                        },
                    );
                    let caps = if sink_caps.is_empty() {
                        crate::labels::Cap::all()
                    } else {
                        sink_caps
                    };
                    let flows =
                        backwards::analyse_sink_backwards(&bctx, sink_val, finding.sink, caps);
                    let verdict = backwards::aggregate_verdict(&flows);
                    backwards::annotate_finding(finding, verdict);
                }
            }
            // Extract exit state for seeding child bodies.  Tag every
            // entry with the owner body's id so a later join (e.g. the
            // JS/TS two-level `combined_exit`) cannot silently alias
            // same-named bindings from different bodies.
            let exit_state = ssa_transfer::extract_ssa_exit_state(
                &block_states,
                &ssa_body,
                cfg,
                &transfer,
                body_id,
            );
            (findings, Some(exit_state))
        }
        Err(e) => {
            // SSA lowering produced no analyzable body.  We still surface
            // the event so downstream tooling can tell "we tried and gave
            // up" from "we ran clean" — a TRACE-level log records the
            // reason (no synthetic Finding is manufactured because a
            // diag pointing at no source location would be misleading).
            tracing::trace!(
                body_id = body_id.0,
                body_name = ?body.meta.name,
                error = %e,
                "SSA lowering bailed; emitting engine note",
            );
            ssa_transfer::record_engine_note(crate::engine_notes::EngineNote::SsaLoweringBailed {
                reason: format!("{e}"),
            });
            // Drain the collector so the note does not bleed into the
            // next body (which will call reset on entry, but be explicit).
            let _ = ssa_transfer::take_body_engine_notes();
            (Vec::new(), None)
        }
    }
}

/// Unified multi-body taint analysis with lexical containment propagation.
///
/// Pass 1: process all bodies in containment-topological order (parent before
/// child), seeding each child body with its parent's exit state.
///
/// Pass 2 (JS/TS only, `max_iterations > 1`): iterative convergence for
/// functions that modify global state, feeding taint back to other functions.
/// Each round joins every non-toplevel body's exit state (filtered to
/// top-level keys) into the shared seed, then re-runs only the bodies whose
/// read-set intersects the names that changed.  The loop stops when the seed
/// is stable or after `max_iterations - 1` rounds.
///
/// In Gauss-Seidel mode a body's exit is merged into the seed as soon as the
/// body finishes.  Names published that way are tracked explicitly: they are
/// added to the current round's dirty set (so later bodies in `order` re-run)
/// and carried into the next round (so bodies that were visited *before* the
/// publication re-run too, and convergence is not declared against a seed
/// that some reader has never observed).
///
/// Returns the findings of all bodies.  Side effects visible in this
/// function: stores the iteration count in `LAST_JS_TS_PASS2_ITERATIONS`,
/// records a convergence-telemetry event when `max_iterations > 1`, and on a
/// cap-hit logs a warning and merges an `InFileFixpointCapped` note into
/// every returned finding.
fn analyse_multi_body(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    interop_edges: &[InteropEdge],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
    ssa_summaries: Option<
        &std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    >,
    callee_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
    inline_cache: Option<&std::cell::RefCell<ssa_transfer::InlineCache>>,
    max_iterations: usize,
    import_bindings: Option<&crate::cfg::ImportBindings>,
    cross_file_bodies: Option<&std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>>,
) -> Vec<Finding> {
    let order = containment_order(&file_cfg.bodies);
    let mut all_findings: Vec<Finding> = Vec::new();

    // Exit states per body, used to seed children.
    let mut body_exit_states: HashMap<
        BodyId,
        HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
    > = HashMap::new();

    // ── Pass 1: lexical containment propagation ──────────────────────
    for &idx in &order {
        let body = &file_cfg.bodies[idx];
        // Determine seed from parent body's exit state.
        let parent_seed = body
            .meta
            .parent_body_id
            .and_then(|pid| body_exit_states.get(&pid));

        let (findings, exit_state) = analyse_body_with_seed(
            body,
            lang,
            namespace,
            local_summaries,
            global_summaries,
            interop_edges,
            extra_labels,
            ssa_summaries,
            callee_bodies,
            inline_cache,
            parent_seed,
            import_bindings,
            cross_file_bodies,
        );
        tracing::debug!(
            body_id = body.meta.id.0,
            body_name = ?body.meta.name,
            findings = findings.len(),
            graph_nodes = body.graph.node_count(),
            has_seed = parent_seed.is_some(),
            "analyse_multi_body: body analysed"
        );
        all_findings.extend(findings);
        if let Some(es) = exit_state {
            body_exit_states.insert(body.meta.id, es);
        }
    }

    // ── Pass 2: JS/TS iterative convergence ──────────────────────────
    // Only for JS/TS: functions that modify global variables can feed taint
    // back to other functions.  Iterate until the top-level seed stabilises.
    //
    // `iters_used` counts how many rounds of the convergence loop
    // actually ran (not including the initial lexical-containment pass
    // above).  It is used to detect cap-hit after the loop exits: a
    // cap-hit is the case where we exhausted the budget without the
    // convergence break firing.
    let mut converged_early = true;
    let mut iters_used: usize = 0;
    // Trajectory of per-round seed-delta sizes; populated inside the
    // max_iterations > 1 branch and read on cap-hit.  Default empty
    // → classifier returns `Unknown`, which is the correct outcome
    // for non-JS/TS languages (no iterative loop ran).
    let mut convergence_trajectory: smallvec::SmallVec<[u32; 4]> = smallvec::SmallVec::new();
    if max_iterations > 1 {
        let top = file_cfg.toplevel();
        let top_cfg = &top.graph;

        // Collect top-level binding keys for seed filtering.  Always
        // keyed under `BodyId(0)` — `filter_seed_to_toplevel` matches
        // by name and re-keys every surviving entry to `BodyId(0)`
        // anyway, so the body_id on the probe keys is informational.
        let toplevel_keys: HashSet<ssa_transfer::BindingKey> = {
            let mut keys = HashSet::new();
            for (_idx, info) in top_cfg.node_references() {
                if let Some(ref d) = info.taint.defines {
                    keys.insert(ssa_transfer::BindingKey::new(d.as_str(), BodyId(0)));
                }
                for u in &info.taint.uses {
                    keys.insert(ssa_transfer::BindingKey::new(u.as_str(), BodyId(0)));
                }
            }
            keys
        };

        // Phase-B (body granularity): precompute per-body read-set of
        // top-level binding names.  A non-toplevel body only needs
        // re-analysis when a name it reads via Param or via the
        // global_seed ancestor-lookup path has actually changed in
        // the combined seed.  `reads` is a superset of the body's
        // top-level dependencies — we err on the side of over-running
        // (false dirty) rather than missing a dependency.
        let body_reads: HashMap<BodyId, HashSet<String>> = {
            let mut m: HashMap<BodyId, HashSet<String>> = HashMap::new();
            for body in &file_cfg.bodies {
                if body.meta.parent_body_id.is_none() {
                    continue; // top-level has no global_seed lookups
                }
                let mut names: HashSet<String> = HashSet::new();
                for (_idx, info) in body.graph.node_references() {
                    for u in &info.taint.uses {
                        names.insert(u.to_string());
                    }
                }
                m.insert(body.meta.id, names);
            }
            m
        };

        // Initial seed is the top-level exit state.
        let mut current_seed = body_exit_states
            .get(&BodyId(0))
            .cloned()
            .unwrap_or_default();

        // Phase-B per-body findings cache: retains the most-recent
        // round's findings for each body.  Round N re-runs only dirty
        // bodies; clean bodies keep their round N-1 findings.  This
        // replaces the previous "drop all non-toplevel findings, run
        // everything, repeat" pattern.
        let mut findings_by_body: HashMap<BodyId, Vec<Finding>> = HashMap::new();

        // Seed the cache with the pass-1 findings so round 0 of the
        // worklist has a consistent starting state.  We partition
        // `all_findings` into "toplevel" (kept verbatim) and
        // "non-toplevel" (moved into the cache, keyed by body).
        let mut toplevel_findings: Vec<Finding> = Vec::new();
        for f in std::mem::take(&mut all_findings) {
            let body = file_cfg.bodies.get(f.body_id.0 as usize);
            if body.is_some_and(|b| b.meta.parent_body_id.is_none()) {
                toplevel_findings.push(f);
            } else {
                findings_by_body
                    .entry(BodyId(f.body_id.0))
                    .or_default()
                    .push(f);
            }
        }

        let rounds = max_iterations.saturating_sub(1);
        converged_early = rounds == 0;
        let use_gauss_seidel = js_ts_gauss_seidel_enabled();
        // Names published into `current_seed` by Gauss-Seidel merges
        // during the previous round.  Bodies visited before such a
        // publication (or skipped as clean) have not observed the new
        // value, so these names must be treated as changed in the next
        // round.  Always empty on the Jacobi path.
        let mut gs_pending: HashSet<String> = HashSet::new();
        for round in 0..rounds {
            iters_used = round + 1;
            // Combine function body exits filtered to top-level scope.
            let mut combined_exit = current_seed.clone();
            for &idx in &order {
                let body = &file_cfg.bodies[idx];
                if body.meta.parent_body_id.is_none() {
                    continue; // skip top-level itself
                }
                if let Some(es) = body_exit_states.get(&body.meta.id) {
                    let filtered = ssa_transfer::filter_seed_to_toplevel(es, &toplevel_keys);
                    combined_exit = ssa_transfer::join_seed_maps(&combined_exit, &filtered);
                }
            }

            // Record seed-delta for cap-hit classification.  Count the
            // number of keys whose value differs between current_seed
            // and combined_exit.  This mirrors scan.rs's diff helpers
            // but at BindingKey granularity.  The conversion saturates
            // instead of silently truncating.
            let iter_delta = seed_delta_size(&current_seed, &combined_exit);
            if convergence_trajectory.len() == 4 {
                convergence_trajectory.remove(0);
            }
            convergence_trajectory.push(u32::try_from(iter_delta).unwrap_or(u32::MAX));

            // Converged: seed didn't change AND no Gauss-Seidel
            // publication from the previous round is still unobserved
            // by some body.  Without the second condition the check
            // would compare against a seed that already absorbed the
            // mid-round merges and terminate one round too early.
            if combined_exit == current_seed && gs_pending.is_empty() {
                converged_early = true;
                break;
            }

            // Phase-B: compute which binding names changed so we can
            // skip bodies whose read-set is disjoint from the change
            // set.  Pending Gauss-Seidel publications count as changed.
            let mut changed_names = changed_binding_names(&current_seed, &combined_exit);
            changed_names.extend(gs_pending.drain());
            current_seed = combined_exit;

            // Re-run non-toplevel bodies with updated seed.
            body_exit_states.insert(BodyId(0), current_seed.clone());
            // Phase-C: Gauss-Seidel variant — as each body is
            // re-analysed, merge its new exit into `current_seed`
            // immediately so subsequent bodies in the same round see
            // the fresh value.  Order matters here; we pin to
            // `order` (containment-topological) for reproducibility.
            // The Jacobi path leaves `current_seed` untouched for
            // the rest of the round.
            for &idx in &order {
                let body = &file_cfg.bodies[idx];
                if body.meta.parent_body_id.is_none() {
                    continue; // don't re-run top-level
                }
                // Skip clean bodies: nothing this body reads has
                // changed, so re-analysis would produce byte-identical
                // output.  The cached findings from the previous
                // round (or pass-1) remain correct.
                if let Some(reads) = body_reads.get(&body.meta.id) {
                    if reads.is_disjoint(&changed_names) {
                        continue;
                    }
                }
                let parent_seed = body
                    .meta
                    .parent_body_id
                    .and_then(|pid| body_exit_states.get(&pid));

                let (findings, exit_state) = analyse_body_with_seed(
                    body,
                    lang,
                    namespace,
                    local_summaries,
                    global_summaries,
                    interop_edges,
                    extra_labels,
                    ssa_summaries,
                    callee_bodies,
                    inline_cache,
                    parent_seed,
                    import_bindings,
                    cross_file_bodies,
                );
                // Phase-B: replace (not append) this body's findings
                // in the cache.  Previous rounds' findings for this
                // body are superseded by the new round's output.
                findings_by_body.insert(body.meta.id, findings);
                if let Some(es) = exit_state {
                    // Phase-C Gauss-Seidel: immediately publish this
                    // body's filtered exit into `current_seed` and
                    // `body_exit_states[BodyId(0)]` so the next body
                    // in this same round sees the updated seed via
                    // its `global_seed` ancestor lookup.
                    if use_gauss_seidel {
                        let filtered = ssa_transfer::filter_seed_to_toplevel(&es, &toplevel_keys);
                        let merged = ssa_transfer::join_seed_maps(&current_seed, &filtered);
                        // Names whose value this merge actually
                        // changed: mark them dirty for the rest of
                        // this round and keep them pending for the
                        // next one (over-running is safe, missing a
                        // reader is not).
                        let published = changed_binding_names(&current_seed, &merged);
                        if !published.is_empty() {
                            changed_names.extend(published.iter().cloned());
                            gs_pending.extend(published);
                        }
                        current_seed = merged;
                        body_exit_states.insert(BodyId(0), current_seed.clone());
                    }
                    body_exit_states.insert(body.meta.id, es);
                }
            }
        }

        // After the loop, flatten per-body caches back into
        // `all_findings`, preserving the toplevel findings we set
        // aside earlier.
        all_findings = toplevel_findings;
        for body in &file_cfg.bodies {
            if body.meta.parent_body_id.is_none() {
                continue;
            }
            if let Some(fs) = findings_by_body.remove(&body.meta.id) {
                all_findings.extend(fs);
            }
        }
    }

    // Record observability counter.  `iters_used == 0` covers the
    // non-JS/TS path (`max_iterations == 1`) and the JS/TS case where
    // the convergence loop did not enter — report `1` so the counter
    // always reflects "at least the lexical-containment pass ran".
    let reported_iters = if iters_used == 0 { 1 } else { iters_used };
    LAST_JS_TS_PASS2_ITERATIONS.store(reported_iters, Ordering::Relaxed);

    // Convergence telemetry: record this file's pass-2 outcome.
    // No-op unless `NYX_CONVERGENCE_TELEMETRY=1`.  Only emitted for
    // JS/TS (`max_iterations > 1`) where a pass-2 loop actually ran;
    // single-iteration languages do not produce a convergence event.
    if max_iterations > 1 {
        let non_toplevel_bodies = file_cfg
            .bodies
            .iter()
            .filter(|b| b.meta.parent_body_id.is_some())
            .count();
        crate::convergence_telemetry::record(
            crate::convergence_telemetry::ConvergenceEvent::InFilePass2(
                crate::convergence_telemetry::InFilePass2Record {
                    schema: crate::convergence_telemetry::SCHEMA_VERSION,
                    namespace: namespace.to_string(),
                    body_count: non_toplevel_bodies,
                    iterations: iters_used,
                    cap: max_iterations,
                    converged: converged_early,
                    trajectory: convergence_trajectory.clone(),
                },
            ),
        );
    }

    // Cap-hit: the loop exhausted `max_iterations` without the
    // convergence break firing.  Tag every finding produced by this
    // file so downstream consumers know the results may be
    // under-reported.  Only meaningful for JS/TS
    // (`max_iterations > 1`); single-iteration languages always
    // converge trivially by definition.
    if max_iterations > 1 && !converged_early {
        // Trajectory is captured in the convergence loop above; empty
        // when the loop never entered the delta-push path (rounds ==
        // 0, non-JS/TS, etc.).  Classifier defaults to `Unknown` for
        // <2 samples.
        let reason = crate::engine_notes::CapHitReason::classify(&convergence_trajectory);
        tracing::warn!(
            file = %namespace,
            iterations = iters_used,
            cap = max_iterations,
            reason = reason.tag(),
            "JS/TS pass-2 in-file fixpoint did not converge within safety cap — \
             results may be imprecise. This usually indicates a very deep chain \
             of top-level bindings threaded through helper functions; please \
             file a bug with a reproducer."
        );
        let note = EngineNote::InFileFixpointCapped {
            iterations: u32::try_from(iters_used).unwrap_or(u32::MAX),
            reason,
        };
        for f in &mut all_findings {
            f.merge_note(note.clone());
        }
    }

    all_findings
}

/// Collect the binding **names** whose taint value is not the same in
/// `before` and `after`.  The Phase-B body-level worklist uses the
/// result to decide which non-toplevel bodies have to be re-analysed.
///
/// A name is reported when its key is present in `after` with a value
/// that is absent from or different in `before`, or when its key exists
/// only in `before`.
///
/// The result holds plain names rather than full `BindingKey`s:
/// `filter_seed_to_toplevel` re-keys every surviving entry to
/// `BodyId(0)` anyway, and per-body reads are plain identifier strings
/// from the SSA IR, so collapsing to names avoids a spurious mismatch
/// when the same binding appears under different body-scoped keys.
fn changed_binding_names(
    before: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
    after: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
) -> HashSet<String> {
    // Keys of `after` that are new or carry a different value.
    let modified_or_added = after
        .iter()
        .filter(|(key, new_taint)| before.get(*key) != Some(*new_taint))
        .map(|(key, _)| key.name.to_string());

    // Keys that existed in `before` but are gone from `after`.
    let dropped = before
        .keys()
        .filter(|key| !after.contains_key(*key))
        .map(|key| key.name.to_string());

    modified_or_added.chain(dropped).collect()
}

/// Number of [`ssa_transfer::BindingKey`]s on which two seed maps
/// disagree.
///
/// A key contributes one to the total when it is present in `after`
/// with a value that is missing from or unequal to the one in `before`,
/// and also when it is present in `before` but absent from `after`.
fn seed_delta_size(
    before: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
    after: &HashMap<ssa_transfer::BindingKey, crate::taint::domain::VarTaint>,
) -> usize {
    // Entries of `after` that are new or hold a different value.
    let modified_or_added = after
        .iter()
        .filter(|(key, new_taint)| before.get(*key) != Some(*new_taint))
        .count();

    // Entries of `before` that no longer exist in `after`.
    let dropped = before
        .keys()
        .filter(|key| !after.contains_key(*key))
        .count();

    modified_or_added + dropped
}

/// Find function entry nodes: (func_name, entry_node) pairs.
///
/// A function entry is the first node with a given `enclosing_func` value.
fn find_function_entries(cfg: &Cfg) -> Vec<(String, NodeIndex)> {
    let mut seen = HashSet::new();
    let mut entries = Vec::new();

    for (idx, info) in cfg.node_references() {
        if let Some(ref func_name) = info.ast.enclosing_func
            && seen.insert(func_name.clone())
        {
            entries.push((func_name.clone(), idx));
        }
    }

    entries
}

/// Fetch the formal parameter names of `func_name` from the CFG-level
/// local summaries.
///
/// The first summary whose key name equals `func_name` supplies the
/// (cloned) `param_names`; when no summary matches, an empty vector is
/// returned.
fn lookup_formal_params(local_summaries: &FuncSummaries, func_name: &str) -> Vec<String> {
    for (key, summary) in local_summaries.iter() {
        if key.name == func_name {
            return summary.param_names.clone();
        }
    }
    Vec::new()
}

/// Map a bare function name plus parameter count onto a canonical
/// [`FuncKey`], using the FuncKey-keyed `local_summaries` as the source
/// of truth.
///
/// Resolution proceeds in three steps:
///
/// 1. If exactly one key matches on language, name and arity, that key
///    is returned with its full identity (container / disambig / kind
///    preserved).
/// 2. Otherwise (no arity match, or several), the first key that matches
///    on language and name alone is returned.
/// 3. If nothing matches by name either, a free-function key is
///    synthesised from the arguments so the caller still receives a
///    well-formed key.
///
/// The fallbacks can only trigger in legacy discovery paths that cannot
/// see through same-name siblings; those paths were already
/// collision-prone before this refactor.  New intra-file analysis code
/// should prefer [`BodyMeta::func_key`].
fn lookup_canonical_func_key(
    local_summaries: &FuncSummaries,
    lang: Lang,
    namespace: &str,
    func_name: &str,
    param_count: usize,
) -> FuncKey {
    // `local_summaries` is file-local, so the namespace already stored on
    // each key is whatever `build_cfg` wrote (raw file path).  The
    // `namespace` argument therefore only matters for the synthesised
    // last-resort key at the bottom.
    let is_candidate = |key: &FuncKey| key.lang == lang && key.name == func_name;

    // Step 1: a unique (lang, name, arity) match wins outright.
    let mut exact_hits = local_summaries
        .keys()
        .filter(|key| is_candidate(*key) && key.arity == Some(param_count));
    if let (Some(only), None) = (exact_hits.next(), exact_hits.next()) {
        return only.clone();
    }

    // Step 2: settle for the first key that agrees on lang + name.
    if let Some(by_name) = local_summaries.keys().find(|key| is_candidate(*key)) {
        return by_name.clone();
    }

    // Step 3: nothing known under that name — build a free-function key.
    FuncKey {
        lang,
        namespace: namespace.to_string(),
        container: String::new(),
        name: func_name.to_string(),
        arity: Some(param_count),
        disambig: None,
        kind: FuncKind::Function,
    }
}

/// Produce precise SSA function summaries for every function in a file.
///
/// Every function entry discovered in `cfg` is lowered to SSA on its own
/// and handed to the summary extractor, yielding an `SsaFuncSummary`.
/// Functions whose lowering fails are skipped, as are summaries with no
/// observable effect.  The returned map is keyed by canonical
/// [`FuncKey`], so same-name functions on different containers in the
/// same file end up under distinct entries.
#[allow(dead_code)] // Used by tests; production code uses extract_ssa_artifacts
pub(crate) fn extract_intra_file_ssa_summaries(
    cfg: &Cfg,
    interner: &SymbolInterner,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
) -> std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary> {
    let mut summaries = std::collections::HashMap::new();

    for (func_name, func_entry) in find_function_entries(cfg) {
        let formal_params = lookup_formal_params(local_summaries, &func_name);

        // A function that cannot be lowered contributes no summary.
        let Ok(func_ssa) = crate::ssa::lower_to_ssa_with_params(
            cfg,
            func_entry,
            Some(&func_name),
            false,
            &formal_params,
        ) else {
            continue;
        };

        // Arity comes from the declared formals when the local summary
        // knows them; with no formals on record, count the
        // `SsaOp::Param` instructions in the lowered body instead.
        let param_count = match formal_params.len() {
            0 => func_ssa
                .blocks
                .iter()
                .flat_map(|block| block.phis.iter().chain(block.body.iter()))
                .filter(|inst| matches!(inst.op, crate::ssa::ir::SsaOp::Param { .. }))
                .count(),
            declared => declared,
        };

        // Zero-parameter functions are not filtered out up front: a
        // zero-arg factory (`function makeBag() { return []; }`) still
        // has one observable cross-file effect, namely that its return
        // is a fresh container allocation.  The extractor runs for
        // every function and the observable-effects filter further
        // down decides what is kept.
        //
        // Module aliases for JS/TS are pre-computed here (read-only
        // const prop pass) and only forwarded when non-empty.
        let mod_aliases = compute_module_aliases_for_summary(&func_ssa, lang);
        let mod_aliases_ref = (!mod_aliases.is_empty()).then_some(&mod_aliases);

        let summary = ssa_transfer::extract_ssa_func_summary(
            &func_ssa,
            cfg,
            local_summaries,
            global_summaries,
            lang,
            namespace,
            interner,
            param_count,
            mod_aliases_ref,
            None,
            Some(&formal_params),
        );

        // Keep the summary only when it carries an observable effect.
        // With `points_to` support, a void helper whose sole observable
        // behaviour is a parameter-to-parameter alias
        // (e.g. `fn set(t, v) { t.x = v; }`) has to pass this filter so
        // cross-file call sites can replay the alias edges.  Zero-param
        // factories pass through the `points_to` leg as well:
        // `is_empty()` reports false once the fresh-alloc flag is set.
        let has_observable_effects = !summary.param_to_return.is_empty()
            || !summary.param_to_sink.is_empty()
            || !summary.source_caps.is_empty()
            || !summary.param_container_to_return.is_empty()
            || !summary.param_to_container_store.is_empty()
            || summary.return_abstract.is_some()
            || !summary.points_to.is_empty();
        if !has_observable_effects {
            continue;
        }

        let key =
            lookup_canonical_func_key(local_summaries, lang, namespace, &func_name, param_count);
        summaries.insert(key, summary);
    }

    if !summaries.is_empty() {
        tracing::debug!(
            count = summaries.len(),
            "SSA summary extraction: produced intra-file summaries"
        );
    }

    summaries
}

/// Lower all function bodies from `FileCfg` to produce SSA summaries + cached
/// bodies.  Each body's own graph is used directly — no scope filtering needed.
///
/// Both returned maps are keyed by each body's canonical [`FuncKey`] (carried
/// on [`crate::cfg::BodyMeta::func_key`]).  This is the most collision-
/// resistant identity we have: same-name methods on different classes, same-
/// name overloads with different arity, and anonymous bodies at distinct
/// source spans all get distinct keys.  The key's `namespace` is always
/// overwritten with the caller-supplied `namespace`.
///
/// Per body, in order:
///   1. lower the body graph to SSA (bodies that fail to lower are skipped
///      and logged at debug level; they appear in neither returned map),
///   2. extract the SSA function summary from the *un-optimised* SSA,
///   3. optimise the SSA,
///   4. attach typed call receivers and (when pointer analysis is enabled)
///      field points-to facts to the summary,
///   5. cache the optimised body together with a clone of its graph.
///
/// After the per-body loop two file-level passes run over the collected
/// maps: the closure-capture augmentation and the second extraction pass.
///
/// Returns `(summaries, bodies)`.
pub(crate) fn lower_all_functions_from_bodies(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
) -> (
    std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
    std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) {
    let mut summaries = std::collections::HashMap::new();
    let mut bodies = std::collections::HashMap::new();

    for body in file_cfg.function_bodies() {
        let _t_misc = std::time::Instant::now();
        // Display name: the declared name when present, otherwise a synthetic
        // `<anon#…>` / `<anon@…>` label built from the key's disambiguator or
        // the body's span start.
        let func_name = body.meta.name.clone().unwrap_or_else(|| {
            body.meta
                .func_key
                .as_ref()
                .and_then(|k| k.disambig.map(|d| format!("<anon#{d}>")))
                .unwrap_or_else(|| format!("<anon@{}>", body.meta.span.0))
        });

        let interner = SymbolInterner::from_cfg(&body.graph);
        let formal_params = &body.meta.params;
        perf_lower_record(6, _t_misc.elapsed().as_micros());

        let _t_lower = std::time::Instant::now();
        let mut func_ssa = match crate::ssa::lower_to_ssa_with_params(
            &body.graph,
            body.entry,
            Some(&func_name),
            false,
            formal_params,
        ) {
            Ok(ssa) => ssa,
            Err(_) => {
                // Best-effort: a body that cannot be lowered contributes
                // neither a summary nor a cached body.  Leave a trace so the
                // omission is diagnosable instead of silently vanishing.
                tracing::debug!(
                    func = %func_name,
                    "lower_all_functions_from_bodies: SSA lowering failed; skipping body"
                );
                continue;
            }
        };
        perf_lower_record(0, _t_lower.elapsed().as_micros());

        // Prefer the declared formal parameter list; when it is empty, fall
        // back to counting `Param` ops in the lowered SSA.
        let param_count = if !formal_params.is_empty() {
            formal_params.len()
        } else {
            func_ssa
                .blocks
                .iter()
                .flat_map(|b| b.phis.iter().chain(b.body.iter()))
                .filter(|i| matches!(i.op, crate::ssa::ir::SsaOp::Param { .. }))
                .count()
        };

        // Canonical FuncKey: prefer the identity attached to the body at
        // CFG-construction time; otherwise fall back to matching in
        // `local_summaries`.
        //
        // `body.meta.func_key` carries the raw file-path namespace that
        // `build_cfg` wrote. The caller passes `namespace` already normalized
        // against `scan_root`, which is what FuncSummary keys use on the
        // cross-file side (`FuncSummary::func_key`). Overriding the namespace
        // here keeps both sides of `GlobalSummaries` agreement — otherwise
        // `resolve_callee` resolves to the normalized FuncSummary key and
        // misses the raw-path SSA entry.
        let mut key = body.meta.func_key.clone().unwrap_or_else(|| {
            lookup_canonical_func_key(local_summaries, lang, namespace, &func_name, param_count)
        });
        key.namespace = namespace.to_string();

        // Run the extractor even for zero-param functions so factories
        // (`returns_fresh_alloc = true`) emit a summary the caller can
        // replay.  A completely empty summary is still inserted for
        // non-zero-param functions (see the existing rationale below) but
        // zero-param cases without the factory flag stay out of the map
        // to avoid cluttering `GlobalSummaries` with trivially-empty
        // entries.
        {
            let _t_extract = std::time::Instant::now();
            let mod_aliases = compute_module_aliases_for_summary(&func_ssa, lang);
            let mod_aliases_ref = if mod_aliases.is_empty() {
                None
            } else {
                Some(&mod_aliases)
            };
            let summary = ssa_transfer::extract_ssa_func_summary(
                &func_ssa,
                &body.graph,
                local_summaries,
                global_summaries,
                lang,
                namespace,
                &interner,
                param_count,
                mod_aliases_ref,
                locator,
                Some(formal_params),
            );

            // Always insert the summary, even when all fields are empty/default.
            // An empty summary tells resolve_callee "this function exists and has
            // no taint effects" — preventing fallthrough to the less precise old
            // FuncSummary which may report false source_caps from internal sources.
            // For zero-param functions we only insert when the summary carries
            // the fresh-container signal (the only observable effect worth
            // persisting for a parameter-less body).
            if param_count > 0 || summary.points_to.returns_fresh_alloc {
                summaries.insert(key.clone(), summary);
            }
            perf_lower_record(1, _t_extract.elapsed().as_micros());
        }

        let _t_opt = std::time::Instant::now();
        let opt = crate::ssa::optimize_ssa_with_param_types(
            &mut func_ssa,
            &body.graph,
            Some(lang),
            &body.meta.param_types,
        );
        perf_lower_record(2, _t_opt.elapsed().as_micros());

        let _t_typed = std::time::Instant::now();
        // Phase 2 (typed call-graph devirtualisation): walk every SSA
        // method call in this body, look up the receiver SSA value's
        // [`crate::ssa::type_facts::TypeKind`] in the just-computed
        // `opt.type_facts`, and record `(call_ordinal, container_name)`
        // on the matching summary so Phase 3 in `build_call_graph` can
        // narrow the indirect-method-call edge to the receiver-typed
        // container.  Free-function calls (`receiver: None`) and
        // unknown receiver types are silently skipped — the bare-name
        // resolution path applies unchanged in that case.
        let typed_receivers = collect_typed_call_receivers(&func_ssa, &body.graph, &opt.type_facts);
        if !typed_receivers.is_empty() {
            // The summary may not have been inserted above (zero-param,
            // no-fresh-alloc bodies are skipped).  Force-insert in that
            // case so the receiver-type info reaches Phase 3 — without
            // it, the cross-file devirtualisation signal would be lost
            // for any method invoked inside a parameterless caller.
            let entry = summaries.entry(key.clone()).or_default();
            entry.typed_call_receivers = typed_receivers;
        }

        // Pointer-Phase 5 / W3: populate `field_points_to` from the
        // body's pointer facts when the analysis is enabled.  Strict
        // opt-in via `NYX_POINTER_ANALYSIS=1`; off-by-default keeps
        // bit-for-bit identity with the pre-W3 behaviour.
        //
        // `extract_field_points_to` covers both reads (via
        // `SsaOp::FieldProj` walks) and writes (via the W1
        // `field_writes` side-table on the body) in a single pass.
        if crate::pointer::is_enabled() {
            let facts = crate::pointer::analyse_body(&func_ssa, body.meta.id);
            let fpt = crate::pointer::extract_field_points_to(&func_ssa, &facts);
            if !fpt.is_empty() {
                let entry = summaries.entry(key.clone()).or_default();
                entry.field_points_to = fpt;
            }
        }

        // Note: this timer covers both the typed-receiver walk and the
        // optional pointer-analysis step above.
        perf_lower_record(3, _t_typed.elapsed().as_micros());

        let _t_misc2 = std::time::Instant::now();
        // Cache the optimised SSA.  `node_meta` starts empty here and the
        // body keeps its own clone of the graph it was lowered from.
        bodies.insert(
            key,
            ssa_transfer::CalleeSsaBody {
                ssa: func_ssa,
                opt,
                param_count,
                node_meta: std::collections::HashMap::new(),
                body_graph: Some(body.graph.clone()),
            },
        );
        perf_lower_record(6, _t_misc2.elapsed().as_micros());
    }

    // ── Closure-capture summary augmentation ─────────────────────────
    //
    // Lift child-body sinks into the parent's `param_to_sink` for
    // every parent body with lexically contained children. This
    // handles the direct-wrapper case
    // `f(x) { return new Promise((res, rej) => sink(x)) }` — the
    // executor's gated http.get sink becomes visible to callers of
    // `f` via `f.summary.param_to_sink`.
    //
    // Without this pass, `f.summary.param_to_sink` stays empty
    // because the sink lives in a separately-extracted child body
    // that the parent's pass-1 probe never sees. The
    // lexical-containment propagation in `analyse_multi_body`
    // carries seeded taint into child bodies for the production
    // analysis path, but the single-body summary extractor in
    // `extract_ssa_func_summary` does not. This pass reproduces that
    // propagation at summary-extraction time so cross-call
    // resolution sees the sink at every caller of `f`.
    //
    // Strict-additive: only ADDs `param_to_sink` entries — never
    // removes or modifies existing data — so it cannot regress
    // detection. Bounded: each parent-param probe runs each child
    // body's analysis exactly once.
    let _t_aug = std::time::Instant::now();
    augment_summaries_with_child_sinks(
        file_cfg,
        lang,
        namespace,
        local_summaries,
        global_summaries,
        &bodies,
        &mut summaries,
    );
    perf_lower_record(4, _t_aug.elapsed().as_micros());

    // ── Second extraction pass: transitive cross-function summary lift ───
    //
    // The augment pass populates direct sink-wrapper summaries
    // (`f(x) { Promise(() => sink(x)) }`). This second pass then
    // re-runs every body's per-parameter probe with the augmented
    // `summaries` map plumbed through to the probe transfer's
    // `ssa_summaries` field, so callers of those wrappers (e.g. an
    // `addFileDataIfNeeded` whose body calls a `downloadFileFromURI`
    // sink wrapper) see the augmented `param_to_sink` at step 0 of
    // `resolve_callee_full` and propagate it onto their own summary.
    //
    // OR-merge: only adds `param_to_sink` / `param_to_sink_param`
    // entries to existing summaries. Existing entries (return
    // transforms, source caps, augment-populated sinks, etc.) are
    // preserved. Strict-additive — cannot regress detection.
    let _t_rerun = std::time::Instant::now();
    rerun_extraction_with_augmented_summaries(
        file_cfg,
        lang,
        namespace,
        local_summaries,
        global_summaries,
        locator,
        &bodies,
        &mut summaries,
    );
    perf_lower_record(5, _t_rerun.elapsed().as_micros());

    if !summaries.is_empty() {
        tracing::debug!(
            count = summaries.len(),
            bodies = bodies.len(),
            "lower_all_functions_from_bodies: produced summaries + cached bodies"
        );
    }

    (summaries, bodies)
}

/// Second extraction pass: re-runs `extract_ssa_func_summary_full` with the
/// augmented `summaries` map plumbed through, for every body that can
/// actually benefit from it.
///
/// A body is re-extracted only when all of the following hold:
///   - it carries a `func_key` and has a cached SSA body (with its graph)
///     in `bodies`,
///   - it has at least one parameter,
///   - its SSA contains a `Call` whose callee name matches the name of some
///     key already present in `summaries`.
///
/// Only sink-related fields (`param_to_sink`, `param_to_sink_param`)
/// are merged into existing summaries; other fields stay as-produced
/// by the first pass.  Bounded: at most one re-extraction per body.
///
/// Every probe resolves callees against the map exactly as it was on entry:
/// re-extracted summaries are buffered and merged only after the last probe
/// has finished, so no probe observes another probe's additions.
#[allow(clippy::too_many_arguments)]
fn rerun_extraction_with_augmented_summaries(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
    bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
    summaries: &mut std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
) {
    use crate::ssa::ir::SsaOp;
    use crate::state::symbol::SymbolInterner;

    // Fast-out: rerun matters only when at least one body in the file has
    // an SSA summary entry that *another* body in the same file might
    // resolve a Call to.  If no SSA summaries were produced, nothing to
    // re-extract.  This is the dominant case for files of unrelated
    // functions or with all-cross-file callees.
    if summaries.is_empty() {
        return;
    }

    // Re-extracted summaries, in body order.  They are merged into
    // `summaries` only after the probe loop, which lets the probes borrow
    // the map immutably as their stable view instead of deep-cloning it.
    let mut reextracted: Vec<(FuncKey, crate::summary::ssa_summary::SsaFuncSummary)> = Vec::new();

    {
        // Immutable reborrow: the augmented map as it stands on entry.
        let stable_view: &std::collections::HashMap<
            FuncKey,
            crate::summary::ssa_summary::SsaFuncSummary,
        > = &*summaries;

        // Set of bare callee names known to have an in-file SsaFuncSummary.
        // `extract_ssa_func_summary_full` only consults `ssa_summaries` at
        // Call resolution time, so a body with no Call to any of these names
        // produces a summary identical to its first-pass output.
        //
        // SSA `Call::callee` carries the bare method name after lowering
        // decomposes chained-receiver calls, which matches `FuncKey::name`.
        let in_file_names: std::collections::HashSet<&str> =
            stable_view.keys().map(|k| k.name.as_str()).collect();

        for body in file_cfg.function_bodies() {
            let Some(parent_key) = body.meta.func_key.clone() else {
                continue;
            };
            // Same namespace normalisation the first pass applied when it
            // keyed `bodies` / `summaries`.
            let mut key = parent_key;
            key.namespace = namespace.to_string();

            let Some(callee) = bodies.get(&key) else {
                continue;
            };
            if callee.param_count == 0 {
                continue;
            }
            let Some(parent_cfg) = callee.body_graph.as_ref() else {
                continue;
            };

            // Narrow: rerun only bodies whose SSA references at least one
            // in-file summary by name.  Bodies with no in-file Call cannot
            // benefit from the augmented `ssa_summaries` view, so their
            // re-extraction is a strict no-op.
            let has_in_file_call = callee.ssa.blocks.iter().any(|b| {
                b.body.iter().any(|inst| {
                    if let SsaOp::Call { callee: name, .. } = &inst.op {
                        in_file_names.contains(name.as_str())
                    } else {
                        false
                    }
                })
            });
            if !has_in_file_call {
                continue;
            }

            let interner = SymbolInterner::from_cfg(parent_cfg);
            let mod_aliases = compute_module_aliases_for_summary(&callee.ssa, lang);
            let mod_aliases_ref = if mod_aliases.is_empty() {
                None
            } else {
                Some(&mod_aliases)
            };

            let new_summary = ssa_transfer::extract_ssa_func_summary_full(
                &callee.ssa,
                parent_cfg,
                local_summaries,
                global_summaries,
                lang,
                namespace,
                &interner,
                callee.param_count,
                mod_aliases_ref,
                locator,
                Some(&body.meta.params),
                Some(stable_view),
            );

            reextracted.push((key, new_summary));
        }
    }

    // OR-merge sink-only fields into the existing summaries, in the same
    // order the bodies were probed.
    for (key, new_summary) in reextracted {
        let entry = summaries.entry(key).or_default();
        merge_sink_fields(entry, &new_summary);
    }
}

/// Additively fold the sink-related fields of `src` into `dst`.
///
/// - `param_to_sink`: sites are grouped per parameter index.  A site from
///   `src` is appended to the matching group in `dst` unless a site with the
///   same `dedup_key()` is already there; a parameter index unknown to `dst`
///   gets the whole group copied over.
/// - `param_to_sink_param`: a triple from `src` is appended unless an
///   identical triple already exists in `dst`.
///
/// Nothing already present in `dst` is removed or rewritten.
fn merge_sink_fields(
    dst: &mut crate::summary::ssa_summary::SsaFuncSummary,
    src: &crate::summary::ssa_summary::SsaFuncSummary,
) {
    for (param_idx, incoming) in &src.param_to_sink {
        match dst.param_to_sink.iter().position(|(i, _)| i == param_idx) {
            Some(slot) => {
                let existing = &mut dst.param_to_sink[slot].1;
                for candidate in incoming {
                    let candidate_key = candidate.dedup_key();
                    let already_present =
                        existing.iter().any(|s| s.dedup_key() == candidate_key);
                    if !already_present {
                        existing.push(candidate.clone());
                    }
                }
            }
            None => dst.param_to_sink.push((*param_idx, incoming.clone())),
        }
    }

    for &triple in &src.param_to_sink_param {
        if !dst.param_to_sink_param.contains(&triple) {
            dst.param_to_sink_param.push(triple);
        }
    }
}

/// Walk lexical-containment children of every parent body and lift
/// their sinks into the parent's [`SsaFuncSummary::param_to_sink`].
///
/// For each parent body P with at least one lexically contained
/// child:
///   - For each formal parameter `p_i` of P:
///     - Seed a probe with `{ p_i → Cap::all() }`, run P's SSA
///       analysis, extract P's exit state.  An empty exit state skips
///       this parameter.
///     - For every descendant child body C of P, run C's SSA
///       analysis with the parent's exit state seeded as
///       `global_seed`. Collect sink events.
///     - Union the `sink_caps` of all events from that child run; when
///       the union is non-empty, append one cap-only [`SinkSite`] under
///       `p_i` on P's summary (deduplicated via `dedup_key()` so repeat
///       probes don't inflate the entry) and mirror it into
///       `param_to_sink_param` with placeholder position `0`.
///
/// Parents are skipped when they have no `func_key`, no cached SSA body
/// or graph in `bodies`, zero parameters, or no descendants.
///
/// The probes borrow the live `summaries` map (not a snapshot), so entries
/// added earlier in this pass are visible to later probes.
///
/// Strict-additive: only inserts new `param_to_sink` /
/// `param_to_sink_param` entries; never modifies `param_return_paths`,
/// `points_to`, `source_caps`, etc.
fn augment_summaries_with_child_sinks(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    bodies: &std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
    summaries: &mut std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>,
) {
    use crate::cfg::BodyId;
    use crate::labels::{Cap, SourceKind};
    use crate::summary::SinkSite;
    use crate::taint::domain::{TaintOrigin, VarTaint};
    use ssa_transfer::BindingKey;

    // ── Build lexical-containment relationships ──────────────────────
    // Map parent BodyId → list of descendant body indices.  Reverse-walk
    // each body's `parent_body_id` chain so a grand-child's sinks are
    // attributed to every ancestor in its containment chain.
    let body_id_to_idx: std::collections::HashMap<BodyId, usize> = file_cfg
        .bodies
        .iter()
        .enumerate()
        .map(|(i, b)| (b.meta.id, i))
        .collect();
    let mut descendants: std::collections::HashMap<BodyId, Vec<usize>> =
        std::collections::HashMap::new();
    for (idx, body) in file_cfg.bodies.iter().enumerate() {
        // Walk up the parent chain, registering this body as a descendant
        // of every ancestor.  The walk stops at the first ancestor id that
        // is missing from `body_id_to_idx` or has no parent of its own.
        let mut cur = body.meta.parent_body_id;
        while let Some(pid) = cur {
            descendants.entry(pid).or_default().push(idx);
            cur = body_id_to_idx
                .get(&pid)
                .and_then(|i| file_cfg.bodies[*i].meta.parent_body_id);
        }
    }

    // ── Map each parent body to its FuncKey and the SSA body cache ──
    // Skip bodies with no formal params (nothing to probe) and bodies
    // that have no entry in the `bodies` cache.
    for parent_body in &file_cfg.bodies {
        let Some(parent_key) = parent_body.meta.func_key.clone() else {
            continue;
        };
        // Normalise the namespace so the key matches how `bodies` and
        // `summaries` are keyed.
        let mut parent_key = parent_key;
        parent_key.namespace = namespace.to_string();

        let Some(parent_callee) = bodies.get(&parent_key) else {
            continue;
        };
        if parent_callee.param_count == 0 {
            continue;
        }
        let Some(child_indices) = descendants.get(&parent_body.meta.id) else {
            continue;
        };
        if child_indices.is_empty() {
            continue;
        }

        let parent_ssa = &parent_callee.ssa;
        let parent_cfg = match parent_callee.body_graph.as_ref() {
            Some(g) => g,
            None => continue,
        };
        let parent_interner = crate::state::symbol::SymbolInterner::from_cfg(parent_cfg);

        // Collect (formal_param_idx, var_name) for the parent's formal
        // params — mirrors `extract_ssa_func_summary`'s param scan.  `Param`
        // ops with an out-of-range index or no variable name are ignored.
        let mut parent_param_info: Vec<(usize, String)> = Vec::new();
        for block in &parent_ssa.blocks {
            for inst in block.phis.iter().chain(block.body.iter()) {
                if let crate::ssa::ir::SsaOp::Param { index } = &inst.op {
                    if *index < parent_callee.param_count {
                        if let Some(name) = inst.var_name.as_ref() {
                            parent_param_info.push((*index, name.clone()));
                        }
                    }
                }
            }
        }

        for (param_idx, param_name) in &parent_param_info {
            // Seed parent's probe with this single param tainted to all caps.
            // The origin is synthetic: node 0, `UserInput`, no source span.
            // NOTE(review): the seed is keyed under `BodyId(0)` and both probes
            // below run with `owner_body_id: BodyId(0)` — presumably so the
            // seeded bindings resolve inside the probe; confirm.
            let mut seed: std::collections::HashMap<BindingKey, VarTaint> =
                std::collections::HashMap::new();
            seed.insert(
                BindingKey::new(param_name.as_str(), BodyId(0)),
                VarTaint {
                    caps: Cap::all(),
                    origins: smallvec::SmallVec::from_elem(
                        TaintOrigin {
                            node: petgraph::graph::NodeIndex::new(0),
                            source_kind: SourceKind::UserInput,
                            source_span: None,
                        },
                        1,
                    ),
                    uses_summary: false,
                },
            );

            // Parent probe: only `global_seed` and `ssa_summaries` are
            // populated; every other optional input is left off.
            let parent_transfer = ssa_transfer::SsaTaintTransfer {
                lang,
                namespace,
                interner: &parent_interner,
                local_summaries,
                global_summaries,
                interop_edges: &[],
                owner_body_id: BodyId(0),
                parent_body_id: None,
                global_seed: Some(&seed),
                param_seed: None,
                receiver_seed: None,
                const_values: None,
                type_facts: None,
                ssa_summaries: Some(summaries),
                extra_labels: None,
                base_aliases: None,
                callee_bodies: None,
                inline_cache: None,
                context_depth: 0,
                callback_bindings: None,
                points_to: None,
                dynamic_pts: None,
                import_bindings: None,
                promisify_aliases: None,
                module_aliases: None,
                static_map: None,
                auto_seed_handler_params: false,
                cross_file_bodies: None,
                pointer_facts: None,
            };

            // The parent's own sink events are discarded; only its exit
            // state is needed to seed the children.
            let (_parent_events, parent_block_states) =
                ssa_transfer::run_ssa_taint_full(parent_ssa, parent_cfg, &parent_transfer);
            let parent_exit = ssa_transfer::extract_ssa_exit_state(
                &parent_block_states,
                parent_ssa,
                parent_cfg,
                &parent_transfer,
                BodyId(0),
            );
            if parent_exit.is_empty() {
                continue;
            }

            for &child_idx in child_indices {
                let child_body = &file_cfg.bodies[child_idx];
                let Some(child_key) = child_body.meta.func_key.clone() else {
                    continue;
                };
                let mut child_key = child_key;
                child_key.namespace = namespace.to_string();
                let Some(child_callee) = bodies.get(&child_key) else {
                    continue;
                };
                let child_ssa = &child_callee.ssa;
                let Some(child_cfg) = child_callee.body_graph.as_ref() else {
                    continue;
                };

                let child_interner = crate::state::symbol::SymbolInterner::from_cfg(child_cfg);

                // Child probe: identical configuration to the parent probe
                // except that the parent's exit state is the global seed.
                let child_transfer = ssa_transfer::SsaTaintTransfer {
                    lang,
                    namespace,
                    interner: &child_interner,
                    local_summaries,
                    global_summaries,
                    interop_edges: &[],
                    owner_body_id: BodyId(0),
                    parent_body_id: None,
                    global_seed: Some(&parent_exit),
                    param_seed: None,
                    receiver_seed: None,
                    const_values: None,
                    type_facts: None,
                    ssa_summaries: Some(summaries),
                    extra_labels: None,
                    base_aliases: None,
                    callee_bodies: None,
                    inline_cache: None,
                    context_depth: 0,
                    callback_bindings: None,
                    points_to: None,
                    dynamic_pts: None,
                    import_bindings: None,
                    promisify_aliases: None,
                    module_aliases: None,
                    static_map: None,
                    auto_seed_handler_params: false,
                    cross_file_bodies: None,
                    pointer_facts: None,
                };

                let (child_events, _child_block_states) =
                    ssa_transfer::run_ssa_taint_full(child_ssa, child_cfg, &child_transfer);

                if child_events.is_empty() {
                    continue;
                }

                // Aggregate sink caps across all child events into one
                // entry per parent param (cap-only SinkSite — the
                // exact location lives in the child body's CFG and is
                // not directly addressable from the parent's summary).
                let mut union_caps = Cap::empty();
                for ev in &child_events {
                    union_caps |= ev.sink_caps;
                }
                if union_caps.is_empty() {
                    continue;
                }

                // Both probe structs are no longer used past this point, so
                // the map can be mutated.  A parent without a summary yet
                // gets a default one created here.
                let entry = summaries.entry(parent_key.clone()).or_default();
                let new_site = SinkSite::cap_only(union_caps);
                let new_key = new_site.dedup_key();
                if let Some((_, sites)) = entry
                    .param_to_sink
                    .iter_mut()
                    .find(|(i, _)| *i == *param_idx)
                {
                    if !sites.iter().any(|s| s.dedup_key() == new_key) {
                        sites.push(new_site);
                    }
                } else {
                    entry
                        .param_to_sink
                        .push((*param_idx, smallvec::smallvec![new_site]));
                }

                // Mirror cap-only attribution into `param_to_sink_param`
                // so the call-site emission path that consults it (the
                // engine's primary sink-site picker uses
                // `param_to_sink_param` for arg-position filtering)
                // sees this captured-flow sink. Position 0 is a
                // best-effort placeholder — the actual filtering at
                // the caller is by SSRF cap, not arg position, when
                // the wrapper is itself non-gated.
                //
                // The duplicate check deliberately ignores the position
                // field: any existing triple with the same param index and
                // caps suppresses the placeholder.
                if !entry
                    .param_to_sink_param
                    .iter()
                    .any(|(i, _, c)| *i == *param_idx && *c == union_caps)
                {
                    entry.param_to_sink_param.push((*param_idx, 0, union_caps));
                }
            }
        }
    }
}

/// Gather `(call_ordinal, container_name)` pairs for the method calls in
/// `ssa` whose receiver has a usable static type.
///
/// A call contributes a pair only when every step below succeeds:
///   1. the instruction is an `SsaOp::Call` with a receiver (free-function
///      calls have none, so no devirtualisation is possible),
///   2. `type_facts` knows a [`crate::ssa::type_facts::TypeKind`] for the
///      receiver value,
///   3. that kind yields a
///      [`crate::ssa::type_facts::TypeKind::container_name`],
///   4. the instruction's CFG node still exists in `cfg`, from which the
///      [`crate::cfg::CallMeta::call_ordinal`] is read so the pair lines up
///      with [`crate::summary::CalleeSite::ordinal`] at consumer time.
///
/// Calls failing any step are dropped silently; the cross-file call-graph
/// builder then falls back to name-only resolution for them, preserving the
/// "subset of today's targets, never a superset" invariant from
/// `docs/typed-call-graph-prompt.md`.
///
/// Each ordinal is reported at most once (first occurrence in block order
/// wins) and the result is sorted by ordinal.
fn collect_typed_call_receivers(
    ssa: &crate::ssa::ir::SsaBody,
    cfg: &crate::cfg::Cfg,
    type_facts: &crate::ssa::type_facts::TypeFactResult,
) -> Vec<(u32, String)> {
    use crate::ssa::ir::SsaOp;

    // Ordinals already emitted.  One SSA call is expected to map to one CFG
    // call node, so this only matters if lowering ever produces a second
    // SSA Call sharing a `cfg_node`.
    let mut claimed: std::collections::HashSet<u32> = std::collections::HashSet::new();

    let mut receivers: Vec<(u32, String)> = ssa
        .blocks
        .iter()
        .flat_map(|block| block.body.iter())
        .filter_map(|inst| {
            let SsaOp::Call {
                receiver: Some(recv),
                ..
            } = &inst.op
            else {
                return None;
            };
            let container = type_facts.get_type(*recv)?.container_name()?;
            let ordinal = cfg.node_weight(inst.cfg_node)?.call.call_ordinal;
            claimed.insert(ordinal).then_some((ordinal, container))
        })
        .collect();

    receivers.sort_by_key(|(ordinal, _)| *ordinal);
    receivers
}

/// Maximum blocks for a callee body to be eligible for cross-file persistence.
///
/// [`build_eligible_bodies`] drops any lowered body whose SSA block count
/// exceeds this value; a body with exactly this many blocks is still kept.
const MAX_CROSS_FILE_BODY_BLOCKS: usize = 100;

/// SSA function summaries keyed by [`FuncKey`]; the first element of the
/// pair returned by [`extract_ssa_artifacts_from_file_cfg`].
type SsaArtifactSummaries =
    std::collections::HashMap<FuncKey, crate::summary::ssa_summary::SsaFuncSummary>;
/// Lowered callee bodies paired with their [`FuncKey`], as returned by
/// [`build_eligible_bodies`].
type EligibleCalleeBodies = Vec<(FuncKey, ssa_transfer::CalleeSsaBody)>;

/// Produce the per-file SSA artifacts straight from a [`FileCfg`].
///
/// Works body-by-body (not per function entry): every body's graph is
/// lowered with its recorded entry/params via
/// [`lower_all_functions_from_bodies`].  This path is equivalent to what
/// `analyse_file` uses at taint time, so the SSA summaries produced here
/// line up exactly with what pass 2 will consult.
///
/// Returns the summaries unchanged, together with the lowered bodies after
/// they have been narrowed by [`build_eligible_bodies`].
pub(crate) fn extract_ssa_artifacts_from_file_cfg(
    file_cfg: &FileCfg,
    lang: Lang,
    namespace: &str,
    local_summaries: &FuncSummaries,
    global_summaries: Option<&GlobalSummaries>,
    locator: Option<&crate::summary::SinkSiteLocator<'_>>,
) -> (SsaArtifactSummaries, EligibleCalleeBodies) {
    let (ssa_summaries, lowered_bodies) = lower_all_functions_from_bodies(
        file_cfg,
        lang,
        namespace,
        local_summaries,
        global_summaries,
        locator,
    );

    (
        ssa_summaries,
        build_eligible_bodies(file_cfg, lowered_bodies),
    )
}

/// Narrow pre-lowered SSA bodies to the subset eligible for cross-file use
/// and fill in their per-node metadata from the originating CFG.
///
/// Kept separate from [`extract_ssa_artifacts_from_file_cfg`] so a caller
/// that already owns a freshly-lowered `bodies` map (specifically
/// `analyse_file_fused`, which lowers once and feeds both the taint engine
/// and this filter) does not pay for a second lowering pass.
///
/// Returns an empty list when cross-file symex is disabled.  Otherwise a
/// body is kept only if it has at most [`MAX_CROSS_FILE_BODY_BLOCKS`] SSA
/// blocks, a matching body exists in `file_cfg`, and
/// `populate_node_meta` succeeds against that body's graph.
pub(crate) fn build_eligible_bodies(
    file_cfg: &FileCfg,
    bodies: std::collections::HashMap<FuncKey, ssa_transfer::CalleeSsaBody>,
) -> EligibleCalleeBodies {
    if !crate::symex::cross_file_symex_enabled() {
        return Vec::new();
    }

    bodies
        .into_iter()
        .filter(|(_, callee)| callee.ssa.blocks.len() <= MAX_CROSS_FILE_BODY_BLOCKS)
        .filter_map(|(key, mut callee)| {
            // Node metadata must be populated against the per-body graph
            // whose NodeIndex space the SSA was produced on — otherwise
            // cross-file replay can't find the original CFG nodes.
            //
            // `key.namespace` was already normalised against `scan_root` in
            // `lower_all_functions_from_bodies`, whereas the key stored on
            // the body still carries the raw `build_cfg` file path.  Match
            // on structural identity (every field *except* namespace) so
            // the two agree even when the namespace spellings differ.
            let origin = file_cfg.bodies.iter().find(|candidate| {
                candidate.meta.func_key.as_ref().is_some_and(|fk| {
                    fk.lang == key.lang
                        && fk.container == key.container
                        && fk.name == key.name
                        && fk.arity == key.arity
                        && fk.disambig == key.disambig
                        && fk.kind == key.kind
                })
            })?;

            ssa_transfer::populate_node_meta(&mut callee, &origin.graph).then_some((key, callee))
        })
        .collect()
}

// Unit tests for this module live in the out-of-line `tests` submodule,
// which is compiled only for test builds.
#[cfg(test)]
mod tests;