#![allow( clippy::collapsible_if, clippy::if_same_then_else, clippy::needless_range_loop, clippy::only_used_in_recursion, clippy::too_many_arguments, clippy::type_complexity, clippy::unnecessary_unwrap )] use crate::cfg::{Cfg, EdgeKind, StmtKind}; use petgraph::algo::dominators::{Dominators, simple_fast}; use petgraph::graph::NodeIndex; use petgraph::prelude::*; use petgraph::visit::EdgeRef; use smallvec::SmallVec; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}; use super::ir::*; /// Try to decompose a chained-receiver method call (e.g. `a.b.c.method`) /// into a `FieldProj` chain plus a bare-method `Call`. /// /// **Returns** `Some((final_receiver_value, bare_method_name))` on success, /// `None` to fall back to the existing single-Call lowering (current /// behaviour). /// /// On success, the caller should: /// - Construct the `Call` op with `callee = bare_method_name`, /// `callee_text = Some(original_callee.to_string())`, /// `receiver = Some(final_receiver_value)`. /// - Use the returned receiver as the implicit method receiver, do NOT /// add the chain root or any intermediate field name to `args`. /// /// **Decomposition rules**: /// - Skip when the callee contains zero `.` characters (no member access) /// or only one `.` (single-dot case is handled by the existing /// `info.call.receiver` channel without needing a `FieldProj` op). /// - Bail when any "complex" token appears in the callee, `(`, `)`, /// `[`, `]`, `::`, `->`, `?`, `<`, `>`, `*`, `&`, `:` (other than `::` /// already filtered), or whitespace, signaling the callee text isn't /// a clean `....` chain we can safely split on `.`. /// - The first segment must be a known SSA variable in `var_stacks`; /// otherwise the chain root is unresolvable and we bail. /// - Each intermediate segment becomes a `FieldProj { receiver, field }` /// instruction emitted onto `block.body` with a fresh `SsaValue`. /// - The last segment is the bare method name returned to the caller. 
/// /// FieldProj instructions are tagged with `var_name = Some("base.f1.f2")` /// so debug output and downstream consumers that key on `var_name` can /// recognise the projection chain provenance. #[allow(clippy::too_many_arguments)] fn try_lower_field_proj_chain( callee: &str, var_stacks: &HashMap>, field_interner: &mut crate::ssa::ir::FieldInterner, block_idx: usize, block_id: BlockId, next_value: &mut u32, ssa_blocks: &mut [SsaBlock], value_defs: &mut Vec, cfg_node: NodeIndex, span: (usize, usize), ) -> Option<(SsaValue, String)> { // Bail on any token that signals a complex callee expression. // `::` (Rust/C++ paths) is folded into the broader `:` check. for ch in callee.chars() { match ch { '(' | ')' | '[' | ']' | '<' | '>' | '?' | '*' | '&' | ':' | ' ' | '\t' | '\n' | '-' | '!' | ',' | ';' | '"' | '\'' | '\\' => return None, _ => {} } } let segments: Vec<&str> = callee.split('.').collect(); // Need at least 3 segments: `base.field.method` → 1 FieldProj, 1 Call. if segments.len() < 3 { return None; } // Reject empty segments (would happen on leading/trailing/double dots). if segments.iter().any(|s| s.is_empty()) { return None; } let base = segments[0]; let mut current = *var_stacks.get(base).and_then(|s| s.last())?; let mut chain_var = base.to_string(); // Each intermediate segment becomes a FieldProj op. segments[0] is the // base SSA variable, segments[len-1] is the bare method name. 
for field_name in &segments[1..segments.len() - 1] { let fid = field_interner.intern(field_name); let v = SsaValue(*next_value); *next_value += 1; chain_var.push('.'); chain_var.push_str(field_name); ssa_blocks[block_idx].body.push(SsaInst { value: v, op: SsaOp::FieldProj { receiver: current, field: fid, projected_type: None, }, cfg_node, var_name: Some(chain_var.clone()), span, }); value_defs.push(ValueDef { var_name: Some(chain_var.clone()), cfg_node, block: block_id, }); current = v; } let method = segments.last().unwrap().to_string(); Some((current, method)) } /// Lower a CFG to SSA form for a single function scope. /// /// `scope` filters nodes by `enclosing_func`: /// - `None` → top-level code only (`enclosing_func.is_none()`) /// - `Some(name)` → only nodes with `enclosing_func == Some(name)` /// /// If `scope_all` is true, all nodes reachable from `entry` are included /// regardless of `enclosing_func`. pub fn lower_to_ssa( cfg: &Cfg, entry: NodeIndex, scope: Option<&str>, scope_all: bool, ) -> Result { lower_to_ssa_inner(cfg, entry, scope, scope_all, false, &[], false) } /// Like `lower_to_ssa` but with formal parameter names supplied in declaration /// order. External variables that match these names are placed first (in /// declaration order) so that `Param { index }` indices 0..N correspond to /// call-site argument positions. pub fn lower_to_ssa_with_params( cfg: &Cfg, entry: NodeIndex, scope: Option<&str>, scope_all: bool, formal_params: &[String], ) -> Result { // `with_params=true` signals "callers supplied an explicit formal list, // even if empty" (e.g. arrow `() => {…}` has zero formals). This lets // the synthetic-externals classifier distinguish "no formals info" from // "explicit empty formals" — closure captures of an arrow with empty // formals are still synthetic, not formals. 
Bug surfaced on outline's // jest test files: free vars bubbled up from nested arrow callbacks // (`body`, `userId`, `server.post`) became Params at the outer arrow's // entry, and the JS/TS auto-seed treated `userId` as a real handler // formal, producing 934 phantom taint findings. See // `taint/ssa_transfer/mod.rs::auto_seed_handler_params`. lower_to_ssa_inner(cfg, entry, scope, scope_all, false, formal_params, true) } /// Like `lower_to_ssa` but with `scope_nop`: when true, all nodes are included /// in the SSA body for graph connectivity, but out-of-scope nodes become Nop /// (their defines/uses are ignored). This is used for the JS two-level solve /// where the CFG linearizes function bodies inline. pub fn lower_to_ssa_scoped_nop( cfg: &Cfg, entry: NodeIndex, scope: Option<&str>, ) -> Result { lower_to_ssa_inner(cfg, entry, scope, false, true, &[], false) } fn lower_to_ssa_inner( cfg: &Cfg, entry: NodeIndex, scope: Option<&str>, scope_all: bool, scope_nop: bool, formal_params: &[String], with_params: bool, ) -> Result { if cfg.node_count() == 0 { return Err(SsaError::EmptyCfg); } // When scope_nop is set, traverse all nodes (scope_all=true) for graph connectivity let traverse_all = scope_all || scope_nop; // Collect reachable nodes in scope, stripping exception edges. let (reachable, filtered_edges, raw_exception_edges) = collect_reachable(cfg, entry, scope, traverse_all); // Build the set of nodes that should be treated as Nop (out-of-scope but included) let nop_nodes: HashSet = if scope_nop { let in_scope = |node: NodeIndex| -> bool { let info = &cfg[node]; match scope { None => info.ast.enclosing_func.is_none(), Some(name) => info.ast.enclosing_func.as_deref() == Some(name), } }; reachable .iter() .filter(|&&n| !in_scope(n) && !matches!(cfg[n].kind, StmtKind::Entry | StmtKind::Exit)) .copied() .collect() } else { HashSet::new() }; if reachable.is_empty() { return Err(SsaError::EmptyCfg); } // 1. 
Form basic blocks let (blocks_nodes, block_of_node, block_succs, block_preds) = form_blocks(cfg, entry, &reachable, &filtered_edges); let num_blocks = blocks_nodes.len(); if num_blocks == 0 { return Err(SsaError::EmptyCfg); } // 2. Compute dominators on block-level graph let (block_graph, block_graph_entry) = build_block_graph(num_blocks, &block_succs, BlockId(0)); let doms = simple_fast(&block_graph, block_graph_entry); // 3. Compute dominance frontiers let dom_frontiers = compute_dominance_frontiers(num_blocks, &block_preds, &doms, &block_graph); // 4. Collect variable definitions per block (skip nop nodes) let mut var_defs = collect_var_defs(cfg, &blocks_nodes, &nop_nodes); // 4b. For per-function scope: identify external variables (used but not defined) // and inject synthetic Param defs at entry block so rename can find them. // When formal_params is supplied, reorder so formal params come first in // declaration order, this makes Param indices correspond to call-site positions. // let external_vars = if scope.is_some() && !scope_all && !scope_nop { let raw = identify_external_uses(cfg, &blocks_nodes, &var_defs); reorder_external_vars(raw, formal_params) } else { vec![] }; // Register external vars as defined in block 0 so phi insertion considers them for var in &external_vars { var_defs.entry(var.clone()).or_default().insert(0); } // 5. Phi insertion (Cytron algorithm) let phi_placements = insert_phis(&var_defs, &dom_frontiers, num_blocks); // 6. Rename variables (dominator tree preorder walk) let dom_tree_children = build_dom_tree_children(num_blocks, &doms, &block_graph); let ( mut ssa_blocks, mut value_defs, cfg_node_map, field_interner, field_writes, synthetic_externals, slot_scoped_assigns, ) = rename_variables( cfg, &blocks_nodes, &block_succs, &block_preds, &phi_placements, &dom_tree_children, &filtered_edges, &external_vars, formal_params, with_params, &nop_nodes, ); // 6b. 
Fill any missing phi operands with a shared Undef sentinel so // every phi has exactly one operand per predecessor. See // `fill_undef_phi_operands` for the invariant rationale. fill_undef_phi_operands( &mut ssa_blocks, &block_preds, &mut value_defs, &blocks_nodes, ); // 7. Fill in preds/succs on SsaBlocks for bid in 0..num_blocks { let id = BlockId(bid as u32); ssa_blocks[bid].id = id; ssa_blocks[bid].preds = block_preds[bid] .iter() .map(|&b| BlockId(b as u32)) .collect(); ssa_blocks[bid].succs = block_succs[bid] .iter() .map(|&b| BlockId(b as u32)) .collect(); } // 7b. Debug assertions: verify structural invariants. // The helper body is `debug_assert!` only, so it's a no-op in release , // call unconditionally to avoid a dead_code warning when the lib is // built without `--tests`. debug_assert_bfs_ordering(&block_preds); // Phi operand counts are a release-level invariant: every phi must // have exactly one operand per predecessor. Missing operands are // filled with an explicit Undef sentinel in // `fill_undef_phi_operands`; extra operands would reference // nonexistent predecessors and corrupt analysis silently. assert_phi_operand_counts(&ssa_blocks, &block_preds); // 8. Map exception edges from CFG node indices to SSA block IDs let exception_edges: Vec<(BlockId, BlockId)> = raw_exception_edges .iter() .filter_map(|(src_node, catch_node)| { let src_block = block_of_node.get(src_node)?; let catch_block = block_of_node.get(catch_node)?; Some((BlockId(*src_block as u32), BlockId(*catch_block as u32))) }) .collect(); let body = SsaBody { blocks: ssa_blocks, entry: BlockId(0), value_defs, cfg_node_map, exception_edges, field_interner, field_writes, synthetic_externals, slot_scoped_assigns, }; // 9. Catch-block reachability invariant. // // A CatchParam-carrying block that is neither reachable from entry nor // listed as an exception target indicates a CFG construction bug. 
Debug // builds panic loudly; release builds warn, record an engine note so // downstream findings carry "SSA lowering bailed" provenance, and fall // through to the existing orphan handling above (the "all definitions" // fallback) which remains sound for taint reachability. check_catch_block_reachability_gated(&body); Ok(body) } /// Runtime gate around [`check_catch_block_reachability`] that panics in /// debug builds and warns + records an engine note in release builds. /// /// The current lowering's orphan handling (`process_block` fallback in /// `rename_variables`) already widens to an "all definitions" conservative /// state for blocks without predecessors. That preserves soundness for /// taint reachability but masks CFG-builder bugs: this gate surfaces them. fn check_catch_block_reachability_gated(body: &SsaBody) { let result = super::invariants::check_catch_block_reachability(body); if let Err(err) = result { #[cfg(debug_assertions)] { if !catch_invariant_do_not_panic() { panic!( "SSA catch-block reachability invariant violated:\n{}", err.joined() ); } } tracing::warn!( violations = %err.joined(), "SSA catch-block reachability invariant violated; proceeding with \ conservative orphan fallback" ); crate::taint::ssa_transfer::record_engine_note( crate::engine_notes::EngineNote::SsaLoweringBailed { reason: format!("catch_block_orphan: {}", err.joined()), }, ); } } // Test-only escape hatch: when set, `check_catch_block_reachability_gated` // takes the release-build path (warn + engine note, no panic) even under // `debug_assertions`. Used by the invariant test that constructs a // synthetic orphan catch body. #[cfg(debug_assertions)] thread_local! 
{ static CATCH_INVARIANT_DO_NOT_PANIC: std::cell::Cell = const { std::cell::Cell::new(false) }; } #[cfg(debug_assertions)] #[allow(dead_code)] pub(crate) fn set_catch_invariant_do_not_panic(on: bool) { CATCH_INVARIANT_DO_NOT_PANIC.with(|c| c.set(on)); } #[cfg(debug_assertions)] fn catch_invariant_do_not_panic() -> bool { CATCH_INVARIANT_DO_NOT_PANIC.with(|c| c.get()) } /// Collect reachable nodes (BFS from entry), filtering by scope and stripping exception edges. /// Returns (reachable set, filtered edges, exception edges as (src_node, catch_node)). fn collect_reachable( cfg: &Cfg, entry: NodeIndex, scope: Option<&str>, scope_all: bool, ) -> ( HashSet, Vec<(NodeIndex, NodeIndex, EdgeKind)>, Vec<(NodeIndex, NodeIndex)>, ) { let mut reachable = HashSet::new(); let mut edges = Vec::new(); let mut exception_edges = Vec::new(); let mut queue = VecDeque::new(); // Check if a node is in scope let in_scope = |node: NodeIndex| -> bool { if scope_all { return true; } let info = &cfg[node]; match scope { None => info.ast.enclosing_func.is_none(), Some(name) => info.ast.enclosing_func.as_deref() == Some(name), } }; if !in_scope(entry) && !scope_all { // Entry must be in scope; for top-level, Entry node often has no enclosing_func // Accept Entry/Exit nodes regardless of scope if !matches!(cfg[entry].kind, StmtKind::Entry | StmtKind::Exit) { return (reachable, edges, exception_edges); } } reachable.insert(entry); queue.push_back(entry); while let Some(node) = queue.pop_front() { for edge in cfg.edges(node) { let kind = *edge.weight(); let target = edge.target(); // Strip exception edges from the graph, but still visit targets // so catch-block nodes are included in the SSA body. 
if matches!(kind, EdgeKind::Exception) { if (in_scope(target) || matches!(cfg[target].kind, StmtKind::Entry | StmtKind::Exit)) && reachable.insert(target) { queue.push_back(target); } // Record exception edge for taint seeding exception_edges.push((node, target)); continue; } // Allow Entry/Exit nodes and nodes in scope if !in_scope(target) && !matches!(cfg[target].kind, StmtKind::Entry | StmtKind::Exit) { continue; } edges.push((node, target, kind)); if reachable.insert(target) { queue.push_back(target); } } } (reachable, edges, exception_edges) } /// Form basic blocks from filtered CFG nodes. /// /// Returns: /// - blocks_nodes: Vec>, nodes per block (in order) /// - block_of_node: HashMap, node → block index /// - block_succs: Vec>, successors per block /// - block_preds: Vec>, predecessors per block fn form_blocks( cfg: &Cfg, entry: NodeIndex, reachable: &HashSet, filtered_edges: &[(NodeIndex, NodeIndex, EdgeKind)], ) -> ( Vec>, HashMap, Vec>, Vec>, ) { // Build adjacency from filtered edges let mut successors: HashMap> = HashMap::new(); let mut in_degree: HashMap = HashMap::new(); let mut has_branching_in: HashMap = HashMap::new(); for node in reachable { in_degree.entry(*node).or_insert(0); has_branching_in.entry(*node).or_insert(false); } // CFG construction wires every Return / Throw node to the synthetic // function-exit node via a `Seq` edge so the underlying graph is a single // connected component. Those edges are bookkeeping only: control flow // does not actually fall through a Return into the exit block. Treating // them as block successors causes an early-return block to share its // post-exit body with the function's fall-through tail, silently merging // two distinct paths into one (the "merged-return" defect). Strip them // here so block-level adjacency reflects real control flow; the SSA // terminator for the containing block becomes Return / Unreachable // instead of Goto(exit). 
let is_terminating = |n: NodeIndex| -> bool { matches!(cfg[n].kind, StmtKind::Return | StmtKind::Throw) }; for &(src, tgt, kind) in filtered_edges { if is_terminating(src) { continue; } successors.entry(src).or_default().push((tgt, kind)); *in_degree.entry(tgt).or_insert(0) += 1; if matches!(kind, EdgeKind::True | EdgeKind::False | EdgeKind::Back) { *has_branching_in.entry(tgt).or_insert(false) = true; } } // Determine block leaders let mut is_leader: HashSet = HashSet::new(); is_leader.insert(entry); // entry is always a leader for &node in reachable { let in_deg = in_degree.get(&node).copied().unwrap_or(0); if in_deg > 1 || has_branching_in.get(&node).copied().unwrap_or(false) { is_leader.insert(node); } // Orphan nodes (reachable via exception edges but no filtered predecessors) // must be leaders so they get their own block (e.g. catch block entries). if in_deg == 0 && node != entry { is_leader.insert(node); } // Node following a multi-exit node let succs = successors.get(&node).map(|s| s.len()).unwrap_or(0); if succs > 1 { for &(tgt, _) in successors.get(&node).unwrap_or(&vec![]) { is_leader.insert(tgt); } } } // Build blocks by following single-successor Seq edges from each leader let mut blocks_nodes: Vec> = Vec::new(); let mut block_of_node: HashMap = HashMap::new(); let mut visited: HashSet = HashSet::new(); // BFS order to assign blocks deterministically (entry first) let mut leader_queue: VecDeque = VecDeque::new(); leader_queue.push_back(entry); let mut leader_visited: HashSet = HashSet::new(); leader_visited.insert(entry); // Discover leaders in BFS order over `cfg`, but skip edges whose // source is a terminating (Return / Throw) node. 
Walking the raw // `cfg` directly here would re-introduce the bookkeeping // Return/Throw → fn_exit edges we just stripped, fn_exit (or any // post-return join) would be discovered through them and assigned a // block ID before its true block-level predecessors, breaking the // BFS-forward-pred invariant (`debug_assert_bfs_ordering`). // // We can't simply BFS our `successors` map because that excludes // exception edges entirely (collect_reachable strips them and records // them separately in `exception_edges`). Catch-block nodes are still // in `reachable` and must be discoverable as leaders via the // try-body → catch path, only the terminating-source bookkeeping // edges are bogus. { let mut bfs_queue: VecDeque = VecDeque::new(); let mut bfs_seen: HashSet = HashSet::new(); bfs_queue.push_back(entry); bfs_seen.insert(entry); while let Some(node) = bfs_queue.pop_front() { if reachable.contains(&node) && is_leader.contains(&node) && leader_visited.insert(node) { leader_queue.push_back(node); } if is_terminating(node) { continue; } for edge in cfg.edges(node) { let tgt = edge.target(); if reachable.contains(&tgt) && bfs_seen.insert(tgt) { bfs_queue.push_back(tgt); } } } // Belt-and-braces: any leader still unvisited gets appended in // CFG-node-index order so block-ID assignment remains // deterministic. We do NOT include the synthetic function-exit // node when it is unreachable through filtered edges, that // happens whenever every path in the body terminates explicitly // (e.g. a function whose only return is `return buf.toString()` // at the tail). Including it would emit an orphan SSA block // with no real predecessors and no semantic meaning, which the // structural reachability invariant correctly rejects. // Genuine orphan handlers (catch blocks reached via stripped // exception edges) keep their entries here. 
let mut orphan_leaders: Vec = is_leader .iter() .copied() .filter(|n| !leader_visited.contains(n)) .filter(|n| !matches!(cfg[*n].kind, StmtKind::Exit)) .collect(); orphan_leaders.sort_by_key(|n| n.index()); for n in orphan_leaders { if leader_visited.insert(n) { leader_queue.push_back(n); } } } for leader in leader_queue { if visited.contains(&leader) { continue; } let block_idx = blocks_nodes.len(); let mut block = vec![leader]; visited.insert(leader); block_of_node.insert(leader, block_idx); // Follow single-successor Seq edges let mut current = leader; loop { let succs = successors.get(¤t).cloned().unwrap_or_default(); if succs.len() == 1 && matches!(succs[0].1, EdgeKind::Seq) && !is_leader.contains(&succs[0].0) { let next = succs[0].0; if visited.insert(next) { block.push(next); block_of_node.insert(next, block_idx); current = next; } else { break; } } else { break; } } blocks_nodes.push(block); } // Build block-level successor/predecessor lists let num_blocks = blocks_nodes.len(); let mut block_succs: Vec> = vec![vec![]; num_blocks]; let mut block_preds: Vec> = vec![vec![]; num_blocks]; for &(src, tgt, _kind) in filtered_edges { // Mirror the adjacency-construction filter above: edges out of // Return/Throw CFG nodes are not real successors at the block level. if is_terminating(src) { continue; } if let (Some(&src_blk), Some(&tgt_blk)) = (block_of_node.get(&src), block_of_node.get(&tgt)) { if src_blk != tgt_blk && !block_succs[src_blk].contains(&tgt_blk) { block_succs[src_blk].push(tgt_blk); block_preds[tgt_blk].push(src_blk); } } } (blocks_nodes, block_of_node, block_succs, block_preds) } /// Build a block-level petgraph for dominator computation. 
fn build_block_graph( num_blocks: usize, block_succs: &[Vec], _entry: BlockId, ) -> (Graph, NodeIndex) { let mut g: Graph = Graph::new(); let mut block_nodes: Vec = Vec::with_capacity(num_blocks); for i in 0..num_blocks { block_nodes.push(g.add_node(BlockId(i as u32))); } for (i, succs) in block_succs.iter().enumerate() { for &s in succs { g.add_edge(block_nodes[i], block_nodes[s], ()); } } let entry_gnode = block_nodes[0]; // block 0 is always entry (g, entry_gnode) } /// Compute dominance frontiers for all blocks. fn compute_dominance_frontiers( num_blocks: usize, block_preds: &[Vec], doms: &Dominators, block_graph: &Graph, ) -> Vec> { let mut df: Vec> = vec![HashSet::new(); num_blocks]; // Map block index → graph NodeIndex let block_node: Vec = block_graph.node_indices().collect(); for n in 0..num_blocks { let preds = &block_preds[n]; if preds.len() >= 2 { for &p in preds { let mut runner = p; // idom(n) in the block graph let n_gnode = block_node[n]; let idom_n = doms.immediate_dominator(n_gnode); loop { let runner_gnode = block_node[runner]; if idom_n == Some(runner_gnode) { break; } df[runner].insert(n); // Move runner to its immediate dominator match doms.immediate_dominator(runner_gnode) { Some(idom_runner) if idom_runner != runner_gnode => { // Find block index from graph node runner = block_graph[idom_runner].0 as usize; } _ => break, // reached root } } } } } df } /// Identify variables used but not defined within the scoped blocks. /// These represent external (e.g. global/top-level) variables that need /// synthetic Param instructions so the SSA rename pass can reference them. 
fn identify_external_uses( cfg: &Cfg, blocks_nodes: &[Vec], var_defs: &BTreeMap>, ) -> Vec { let mut used: HashSet = HashSet::new(); for nodes in blocks_nodes { for &node in nodes { for u in &cfg[node].taint.uses { used.insert(u.clone()); } } } // External = used but never defined in any block let mut external: Vec = used .into_iter() .filter(|u| !var_defs.contains_key(u)) .collect(); external.sort(); // deterministic order external } /// True iff `name` is a language-reserved method receiver identifier /// (Rust/Python `self`, JS/TS/Java/PHP/C++ `this`). /// /// Receivers get their own IR node ([`SsaOp::SelfParam`]) and are therefore /// tracked as a distinct channel from positional parameters. Keeping the /// check localised to one helper ensures the set of receiver names stays /// consistent across lowering and summary extraction. pub(crate) fn is_receiver_name(name: &str) -> bool { matches!(name, "self" | "this") } /// Reorder external variables so the receiver (`self`/`this`) comes first, /// followed by formal positional parameters in declaration order, followed /// by remaining external vars in alphabetical order. /// /// This fixed order is what the synthetic-parameter injection step relies /// on to emit one [`SsaOp::SelfParam`] (for the leading receiver slot, when /// present) followed by a contiguous run of [`SsaOp::Param { index }`] values /// whose indices 0..N correspond exactly to positional call-site argument /// positions, no receiver offset required anywhere downstream. /// /// W1.b: every formal parameter gets a Param op even when the body never /// references it directly. Without this, the *first* `obj.f = rhs` on a /// formal `obj` whose body never reads `obj` produces no W1 /// `field_writes` entry, `var_stacks["obj"]` is empty when the synth /// Assign runs because no external-use path interned `obj`. Subsequent /// writes work because the synth Assign itself defines `obj`, so the /// gap is exactly the FIRST write. 
Always emitting a formal Param at /// block 0 closes that gap. fn reorder_external_vars(external: Vec, formal_params: &[String]) -> Vec { if formal_params.is_empty() { return external; // no reordering, preserve existing alphabetical sort } let ext_set: HashSet<&str> = external.iter().map(|s| s.as_str()).collect(); let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect(); let mut result = Vec::with_capacity(external.len()); // Receiver first (highest priority), regardless of whether it appears in // formal_params or was discovered purely as an external reference. // Languages with explicit self (Rust/Python) put it in formal_params; // languages with implicit this (JS/TS/Java/PHP) have it only as an // external reference. Either way, SelfParam should be emitted first. if ext_set.contains("self") || formal_set.contains("self") { result.push("self".to_string()); } else if ext_set.contains("this") || formal_set.contains("this") { result.push("this".to_string()); } // Formal positional params next (declaration order), skipping any // receiver that was already emitted above. W1.b: include EVERY // formal regardless of whether the body uses it externally, an // unused formal that gets field-written via `obj.cache = rhs` still // needs a Param op so the synth Assign loop sees its prior reaching // def in `var_stacks`. for p in formal_params { if is_receiver_name(p) { continue; } result.push(p.clone()); } // Remaining external vars alphabetically (external is already sorted), // excluding anything already placed. let placed: HashSet = result.iter().cloned().collect(); for v in external { if placed.contains(&v) { continue; } if !formal_set.contains(v.as_str()) && !is_receiver_name(&v) { result.push(v); } } result } /// Collect variable definitions per block: var_name → set of block indices. /// Nodes in `nop_nodes` are skipped (they won't define variables in SSA). 
fn collect_var_defs( cfg: &Cfg, blocks_nodes: &[Vec], nop_nodes: &HashSet, ) -> BTreeMap> { let mut defs: BTreeMap> = BTreeMap::new(); for (block_idx, nodes) in blocks_nodes.iter().enumerate() { for &node in nodes { if nop_nodes.contains(&node) { continue; } if let Some(ref d) = cfg[node].taint.defines { defs.entry(d.clone()).or_default().insert(block_idx); // Register parent prefixes for synthetic base updates on field writes. // E.g. `obj.data` also registers `obj` so phi insertion works correctly. let mut path = d.as_str(); while let Some(dot_pos) = path.rfind('.') { path = &path[..dot_pos]; defs.entry(path.to_string()).or_default().insert(block_idx); } } // Register extra defines from destructuring patterns. for ed in &cfg[node].taint.extra_defines { defs.entry(ed.clone()).or_default().insert(block_idx); } // Implicit definitions for uninitialized declarations (e.g., C/C++ // `char buf[256]`). The variable appears in uses but not defines // because def_use() doesn't treat declarations without initializers // as definitions. Registering here ensures phi insertion at join points. if cfg[node].taint.defines.is_none() && cfg[node].call.callee.is_none() && cfg[node].kind == StmtKind::Seq && cfg[node].taint.uses.len() == 1 { defs.entry(cfg[node].taint.uses[0].clone()) .or_default() .insert(block_idx); } } } defs } /// Cytron-style phi insertion: returns phi_placements[block] = set of var names needing phis. /// /// Returns a `BTreeSet` per block so downstream consumers that iterate /// the set (notably `rename_variables`) observe a deterministic, alphabetical /// order regardless of the underlying hasher state. The Cytron algorithm /// itself is order-independent, only its observers are. 
fn insert_phis( var_defs: &BTreeMap>, dom_frontiers: &[HashSet], _num_blocks: usize, ) -> Vec> { let num_blocks = dom_frontiers.len(); let mut phi_placements: Vec> = vec![BTreeSet::new(); num_blocks]; for (var, def_blocks) in var_defs { let mut worklist: VecDeque = def_blocks.iter().copied().collect(); let mut has_phi: HashSet = HashSet::new(); while let Some(b) = worklist.pop_front() { for &f in &dom_frontiers[b] { if has_phi.insert(f) { phi_placements[f].insert(var.clone()); // Phi is a new definition, add to worklist if !def_blocks.contains(&f) { worklist.push_back(f); } } } } } phi_placements } /// Build dominator tree children lists. fn build_dom_tree_children( num_blocks: usize, doms: &Dominators, block_graph: &Graph, ) -> Vec> { let mut children: Vec> = vec![vec![]; num_blocks]; let block_nodes: Vec = block_graph.node_indices().collect(); for i in 0..num_blocks { if let Some(idom) = doms.immediate_dominator(block_nodes[i]) { let idom_idx = block_graph[idom].0 as usize; if idom_idx != i { children[idom_idx].push(i); } } } children } /// Rename variables: dominator tree preorder walk with per-variable stacks. /// /// Returns (ssa_blocks, value_defs, cfg_node_map). 
fn rename_variables( cfg: &Cfg, blocks_nodes: &[Vec], block_succs: &[Vec], block_preds: &[Vec], phi_placements: &[BTreeSet], dom_tree_children: &[Vec], filtered_edges: &[(NodeIndex, NodeIndex, EdgeKind)], external_vars: &[String], formal_params: &[String], with_params: bool, nop_nodes: &HashSet, ) -> ( Vec, Vec, HashMap, crate::ssa::ir::FieldInterner, HashMap, HashSet, HashSet, ) { let num_blocks = blocks_nodes.len(); let mut next_value: u32 = 0; let mut value_defs: Vec = Vec::new(); let mut cfg_node_map: HashMap = HashMap::new(); // Per-body interner for FieldProj field names; populated when the // member-access decomposition (try_lower_field_proj_chain) emits a // chain for chained-receiver method calls (`a.b.c()`), and remains // empty otherwise so existing per-statement Call lowering is // bit-for-bit unchanged. let mut field_interner = crate::ssa::ir::FieldInterner::new(); //side-table mapping each synthetic base-update // [`SsaOp::Assign`]'s defined value to its `(receiver, field)` pair. // Populated below at the synthetic-Assign emission site. Read by // the taint engine to lift the assign into a structural field WRITE. let mut field_writes: HashMap = HashMap::new(); // SSA values whose `Assign` comes from a bare-array destructure // slot-scoped kill arm; the taint engine consults this set to skip // outer-node Source label pickup while still unioning operand taint. 
let mut slot_scoped_assigns: HashSet = HashSet::new(); // Per-variable rename stacks let mut var_stacks: HashMap> = HashMap::new(); // Pre-allocate SSA blocks let mut ssa_blocks: Vec = (0..num_blocks) .map(|i| SsaBlock { id: BlockId(i as u32), phis: Vec::new(), body: Vec::new(), terminator: Terminator::Unreachable, preds: SmallVec::new(), succs: SmallVec::new(), }) .collect(); // `BTreeMap` guarantees a deterministic (alphabetical) iteration order when // pushing phi values onto `var_stacks` and when filling operands on // successor phis, both sites are observable in SSA numbering if they // reordered between runs. let mut phi_values: Vec> = vec![BTreeMap::new(); num_blocks]; // Pre-create phi instructions for all blocks (operands filled during rename) for (block_idx, vars) in phi_placements.iter().enumerate() { let block_id = BlockId(block_idx as u32); let cfg_node = blocks_nodes[block_idx][0]; // anchor to first node for var in vars { let v = SsaValue(next_value); next_value += 1; value_defs.push(ValueDef { var_name: Some(var.clone()), cfg_node, block: block_id, }); phi_values[block_idx].insert(var.clone(), v); ssa_blocks[block_idx].phis.push(SsaInst { value: v, op: SsaOp::Phi(SmallVec::new()), cfg_node, var_name: Some(var.clone()), span: cfg[cfg_node].ast.span, }); } } // Process blocks in dominator tree preorder // We need to track stack depths to restore after processing subtrees // Use iterative approach: process block, then process children, restore // Simpler approach: preorder walk with explicit save/restore fn process_block( block_idx: usize, cfg: &Cfg, blocks_nodes: &[Vec], block_succs: &[Vec], block_preds: &[Vec], phi_placements: &[BTreeSet], dom_tree_children: &[Vec], filtered_edges: &[(NodeIndex, NodeIndex, EdgeKind)], var_stacks: &mut HashMap>, ssa_blocks: &mut [SsaBlock], phi_values: &mut [BTreeMap], value_defs: &mut Vec, cfg_node_map: &mut HashMap, next_value: &mut u32, nop_nodes: &HashSet, field_interner: &mut crate::ssa::ir::FieldInterner, 
field_writes: &mut HashMap, slot_scoped_assigns: &mut HashSet, ) { let block_id = BlockId(block_idx as u32); // Save stack depths for rollback let saved: Vec<(String, usize)> = var_stacks .iter() .map(|(k, v)| (k.clone(), v.len())) .collect(); // 1. Push pre-created phi values onto var stacks for (var, &v) in &phi_values[block_idx] { var_stacks.entry(var.clone()).or_default().push(v); } // 2. Process body nodes for &node in &blocks_nodes[block_idx] { let info = &cfg[node]; // Helper: build Call args from arg_uses, falling back to info.taint.uses let build_call_args = |info: &crate::cfg::NodeInfo, var_stacks: &HashMap>| -> (Vec>, Option) { let receiver = info .call .receiver .as_ref() .and_then(|r| var_stacks.get(r).and_then(|s| s.last().copied())); let args = if !info.call.arg_uses.is_empty() { let mut args: Vec> = info .call .arg_uses .iter() .map(|arg_idents| { arg_idents .iter() .filter_map(|ident| { var_stacks.get(ident).and_then(|s| s.last().copied()) }) .collect() }) .collect(); // For chained calls (e.g. fetch(url).then(fn)), arg_uses only // captures the final call's args. Variables used by intermediate // calls (like `url` in fetch) are in info.taint.uses but not arg_uses. // Add them as an extra group so sink detection can see them. // // Exclude the receiver ident: it's carried on its own typed // channel (`SsaOp::Call.receiver`). Callers that care about // positional arity must read it from `info.call.arg_uses.len()`, // not `args.len()`, since this implicit group inflates args. 
let arg_uses_flat: HashSet<&str> = info .call .arg_uses .iter() .flat_map(|g| g.iter().map(|s| s.as_str())) .collect(); let receiver_ident = info.call.receiver.as_deref(); let implicit: SmallVec<[SsaValue; 2]> = info .taint .uses .iter() .filter(|u| !arg_uses_flat.contains(u.as_str())) .filter(|u| Some(u.as_str()) != receiver_ident) .filter_map(|u| var_stacks.get(u).and_then(|s| s.last().copied())) .collect(); if !implicit.is_empty() { args.push(implicit); } args } else { // Fallback: treat all uses as a single argument group let all_uses: SmallVec<[SsaValue; 2]> = info .taint .uses .iter() .filter_map(|u| var_stacks.get(u).and_then(|s| s.last().copied())) .collect(); if all_uses.is_empty() { vec![] } else { vec![all_uses] } }; (args, receiver) }; // Determine operation and collect uses // Out-of-scope nodes (nop_nodes) become Nop: they preserve graph // connectivity but don't participate in taint flow. let op = if nop_nodes.contains(&node) { SsaOp::Nop } else if info.catch_param { SsaOp::CatchParam } else if info .taint .labels .iter() .any(|l| matches!(l, crate::labels::DataLabel::Source(_))) && info.call.callee.is_none() { // Pure source (e.g. $_GET, env var), no callee, so no args to track. // Source-labeled calls (e.g. file_get_contents) fall through to Call // so argument taint and sink detection still work. SsaOp::Source } else if info.call.callee.is_some() { let callee = info.call.callee.as_deref().unwrap_or("").to_string(); let (mut args, mut receiver) = build_call_args(info, var_stacks); // try decomposing chained-receiver method calls // (`a.b.c()`) into a FieldProj chain plus a bare-method Call // so downstream consumers can read the receiver structure // without re-parsing the callee text. Bails to None on any // non-chain receiver (current behaviour preserved). 
let (final_callee, callee_text) = match try_lower_field_proj_chain( &callee, var_stacks, field_interner, block_idx, block_id, next_value, ssa_blocks, value_defs, node, info.ast.span, ) { Some((recv_v, bare_method)) => { receiver = Some(recv_v); // Strip any positional arg group that exactly matches the // chain root identifier, it has been replaced by the // FieldProj chain receiver, and re-listing it as an // argument would inflate arity / double-taint. if let Some(base_ident) = callee.split('.').next() { if let Some(base_v) = var_stacks.get(base_ident).and_then(|s| s.last()) { args.retain(|grp| !(grp.len() == 1 && grp.first() == Some(base_v))); } } (bare_method, Some(callee.clone())) } None => (callee, None), }; SsaOp::Call { callee: final_callee, callee_text, args, receiver, } } else if info.taint.defines.is_some() && info.taint.uses.is_empty() && !info .taint .labels .iter() .any(|l| matches!(l, crate::labels::DataLabel::Source(_))) { // Reassignment kill: a node that defines a variable but has no // uses (operands) and is not a source is a constant/literal // assignment. SSA rename allocates a fresh SsaValue, so // downstream references see this new (untainted) value, the // prior tainted definition is implicitly dead. SsaOp::Const(info.taint.const_text.clone()) } else if info.taint.defines.is_some() { let mut uses: SmallVec<[SsaValue; 4]> = info .taint .uses .iter() .filter_map(|u| var_stacks.get(u).and_then(|s| s.last().copied())) .collect(); // Inject Const for binary expression literal operand. // When a binary expression has one identifier and one numeric literal // (e.g., `flags & 0x07`), the literal isn't in `uses`. Inject a // synthetic Const instruction so the Assign has 2 uses, preventing // copy propagation from eliminating the operation. 
if uses.len() == 1 && info.bin_op.is_some() && info.bin_op_const.is_some() { let const_val = info.bin_op_const.unwrap(); let const_v = SsaValue(*next_value); *next_value += 1; let const_inst = SsaInst { value: const_v, op: SsaOp::Const(Some(const_val.to_string())), cfg_node: node, var_name: None, span: info.ast.span, }; ssa_blocks[block_idx].body.push(const_inst); value_defs.push(ValueDef { var_name: None, cfg_node: node, block: block_id, }); uses.push(const_v); } SsaOp::Assign(uses) } else if matches!(info.kind, StmtKind::Return | StmtKind::Throw) && !info.taint.uses.is_empty() { // `return s` / `throw e` with identifier uses: emit an // `Assign(uses)` so the SSA carries an explicit pass-through // for the returned/thrown value. Without this, the Return // node was lowered as a `Nop` and the terminator-setup // "last non-Nop body inst" search returned None, producing // `Terminator::Return(None)` for a function that visibly // returns an identifier. That broke per-return-path // PathFact narrowing for non-Rust languages where the // returned identifier wasn't computed in the same block // (e.g. Python `def f(s): return s`, `s` is a Param in // block 0, the Return block itself has no body insts). let uses: SmallVec<[SsaValue; 4]> = info .taint .uses .iter() .filter_map(|u| var_stacks.get(u).and_then(|s| s.last().copied())) .collect(); if uses.is_empty() { SsaOp::Nop } else { SsaOp::Assign(uses) } } else if info.is_await_forward && info.call.callee.is_none() && !info.taint.uses.is_empty() { // `await x` resolves to the same value as `x` — model as a 1:1 // copy so taint, origins, and abstract-domain facts forward // unchanged. Gated on `callee.is_none()` so an await-wrapped // call still lowers as a Call op rather than being collapsed // to Assign (today CFG splits `await foo(x)` into two nodes, // but the guard keeps the invariant explicit). 
let uses: SmallVec<[SsaValue; 4]> = info .taint .uses .iter() .filter_map(|u| var_stacks.get(u).and_then(|s| s.last().copied())) .collect(); if uses.is_empty() { SsaOp::Nop } else { SsaOp::Assign(uses) } } else if matches!( info.kind, StmtKind::Entry | StmtKind::Exit | StmtKind::If | StmtKind::Loop | StmtKind::Break | StmtKind::Continue | StmtKind::Return | StmtKind::Throw ) { SsaOp::Nop } else if info.call.callee.is_some() { let callee = info.call.callee.as_deref().unwrap_or("").to_string(); let (mut args, mut receiver) = build_call_args(info, var_stacks); // same FieldProj-chain decomposition as the primary // Call branch above, kept in sync because this fallback // path also constructs SSA Call ops (used for control-flow // wrapper calls that landed past the earlier match arms). let (final_callee, callee_text) = match try_lower_field_proj_chain( &callee, var_stacks, field_interner, block_idx, block_id, next_value, ssa_blocks, value_defs, node, info.ast.span, ) { Some((recv_v, bare_method)) => { receiver = Some(recv_v); if let Some(base_ident) = callee.split('.').next() { if let Some(base_v) = var_stacks.get(base_ident).and_then(|s| s.last()) { args.retain(|grp| !(grp.len() == 1 && grp.first() == Some(base_v))); } } (bare_method, Some(callee.clone())) } None => (callee, None), }; SsaOp::Call { callee: final_callee, callee_text, args, receiver, } } else { SsaOp::Nop }; // Allocate SSA value let v = SsaValue(*next_value); *next_value += 1; let var_name_for_ssa = if nop_nodes.contains(&node) { None } else if info.taint.defines.is_some() { info.taint.defines.clone() } else if info.kind == StmtKind::Seq && info.call.callee.is_none() && info.taint.uses.len() == 1 && !var_stacks.contains_key(&info.taint.uses[0]) { // Implicit definition for uninitialized declarations (e.g., // C/C++ `char buf[256]`). Creates a reaching definition so // output-parameter sources like fgets() can taint the buffer // and subsequent uses (e.g., system(buf)) see the tainted value. 
Some(info.taint.uses[0].clone()) } else { None }; value_defs.push(ValueDef { var_name: var_name_for_ssa.clone(), cfg_node: node, block: block_id, }); // Push defined variable onto stack (skip nop nodes) if let Some(ref d) = var_name_for_ssa { var_stacks.entry(d.clone()).or_default().push(v); } cfg_node_map.insert(node, v); // Promise.all-style array-destructure precision: when a CallWrapper // node binds an array_pattern (`const [a, b] = await Promise.all( // [x, y])` or `let (a, b) = tokio::join!(x, y)`) and the value is a // promise combinator that produces an array/tuple of per-element // results (`Promise.all`, `Promise.allSettled`, `asyncio.gather`, // `tokio::join!` and friends), rewrite the per-binding SSA so each // binding sees only its own index's taint instead of the scalar // union that `try_apply_promise_combinator` would produce. // // Two argument shapes are supported: // (a) literal-array (JS/Python): one positional arg whose // collected idents represent the array elements in order, // e.g. `Promise.all([x, y])` → args = [[x, y]]. // (b) positional (Rust macros): N positional args, each one // ident, e.g. `tokio::join!(x, y)` → args = [[x], [y]]. // // `Promise.race` and `Promise.resolve` are excluded: the awaited // value of a race is whichever promise wins (a single value, not // an array), and destructuring that value index-by-index does not // correspond to the args. // The rewrite fires when: // - the call is a promise combinator that produces an array of // per-element results (`All` / `AllSettled`), AND // - the LHS destructures into >= 2 bindings (sequential case // where `extra_defines` is non-empty), OR // - the LHS is an array_pattern with at least one skip slot // (`array_pattern_indices` is non-empty, even if `extra_defines` // itself is empty — `const [, b]` is a single-binding pattern // whose index is 1, not 0). 
let is_combinator_rewrite_target = matches!( info.call .callee .as_deref() .and_then(crate::labels::is_any_promise_combinator), Some( crate::labels::PromiseCombinatorKind::All | crate::labels::PromiseCombinatorKind::AllSettled ) ); // Indices for each binding in source order: primary at index 0, // then extras. Falls back to sequential 0..N when the AST didn't // record explicit indices (non-array_pattern destructures and // tuple_pattern shapes that contain no wildcards). let binding_indices: SmallVec<[usize; 4]> = if !info.taint.array_pattern_indices.is_empty() { info.taint.array_pattern_indices.clone() } else if !info.taint.extra_defines.is_empty() { (0..=info.taint.extra_defines.len()).collect() } else { SmallVec::new() }; let promise_destruct_args: Option> = if is_combinator_rewrite_target && !binding_indices.is_empty() { let max_index = binding_indices.iter().copied().max().unwrap_or(0); let needed = max_index + 1; // Use `info.call.arg_uses` directly rather than the // build_call_args-derived `args`, which may include an // implicit "uses not in arg_uses" group appended for chain // bookkeeping that would inflate the apparent arity. let arg_uses = &info.call.arg_uses; let map_idents = |idents: &[String]| -> Option> { let mapped: SmallVec<[SsaValue; 4]> = idents .iter() .take(needed) .filter_map(|ident| { var_stacks.get(ident).and_then(|s| s.last().copied()) }) .collect(); if mapped.len() == needed { Some(mapped) } else { None } }; if arg_uses.len() == 1 && arg_uses[0].len() >= needed { // Shape (a): single positional arg whose idents are the // array elements in source order (`Promise.all([x, y])`, // `asyncio.gather([x, y])`). map_idents(&arg_uses[0]) } else if arg_uses.len() >= needed && arg_uses.iter().take(needed).all(|g| g.len() == 1) { // Shape (b): N positional args, each with one ident // (`tokio::join!(x, y)`). 
let names: Vec<&String> = arg_uses.iter().take(needed).map(|g| &g[0]).collect(); let mapped: SmallVec<[SsaValue; 4]> = names .iter() .filter_map(|ident| { var_stacks .get(ident.as_str()) .and_then(|s| s.last().copied()) }) .collect(); if mapped.len() == needed { Some(mapped) } else { None } } else { None } } else { None }; // Bare-array RHS destructure precision: when the LHS is an // array_pattern / tuple_pattern / pattern_list / left_assignment_list // AND the RHS is a bare array-literal, build per-source-position // ops so each binding sees only its index's element instead of // the scalar union of every RHS ident. // // Three slot shapes are recognised by `collect_rhs_array_literal_elements`: // // * `Ident(name)` — bare identifier. Emit `Assign(reaching_def)`. // * `Literal` — syntactic literal (string/number/etc.). Emit // `Const(None)` so the binding carries no taint. // * `Complex(uses)` — call / binary / subscript / member access / // interpolated string / nested array literal / etc. Emit // `Assign(union of inner ident reaching defs)` — slot-scoped // union, not the whole-RHS union the legacy path produced. // Falls back to `Const(None)` when no inner idents resolve // (pure literal subexpression like `1 + 2`). // // Closes FPs like `const [a, b] = [safe, tainted]; exec(b);` // (Ident shape) and `const [c, d] = [fn(req.x), 'lit']; exec(d);` // (Complex shape) where the legacy union painted the safe binding. // // The promise-combinator path above has already populated // `promise_destruct_args` when its preconditions held, so the // mutual exclusion is gated through `promise_destruct_args.is_none()` // rather than `info.call.callee.is_none()`. The earlier // callee-none gate was wrong because the outer // variable_declarator node picks up `info.call.callee` whenever // the RHS text matches a Source label — which is exactly the // case where we need the per-slot rewrite most. 
// The outer node may carry a `DataLabel::Source(_)` whose // classification matched somewhere in the RHS expression text // (`req.body.cmd`, `process.env.X`, etc.). For multi-slot // RHS we can't statically partition WHICH slot caused that // match, but it must originate from a Complex slot (Literal // and bare-Ident slots whose names resolve through // `var_stacks` carry their own SsaValue identity). Treat // Complex slots as Source-emitting when the outer label set // included Source — strict precision improvement over the // legacy union path which painted EVERY slot, including // Literal, with the outer Source. let outer_is_source = info .taint .labels .iter() .any(|l| matches!(l, crate::labels::DataLabel::Source(_))); // Per-slot Source classification (see `RhsArraySlot::Complex.source_cap`): // when at least one Complex slot's own subtree classified as // Source, we know which slot(s) carried the source pattern, so // sibling Complex slots without their own source_cap stay // slot-scoped (Assign / Const). Otherwise (the outer node // matched but no per-slot classifier fired — typical of subscript // chains and other shapes whose source flows via reaching-def // rather than static text), fall back to the conservative // "all-Complex-are-Source" emission for legacy preservation. use crate::cfg::RhsArraySlot; let any_slot_has_source_cap = info.taint.rhs_array_elements.iter().any(|s| { matches!( s, RhsArraySlot::Complex { source_cap, .. 
} if !source_cap.is_empty() ) }); let effective_outer_fallback = outer_is_source && !any_slot_has_source_cap; let bare_array_ops: Option<(SmallVec<[SsaOp; 4]>, SmallVec<[bool; 4]>)> = if !info.taint.rhs_array_elements.is_empty() && !binding_indices.is_empty() && promise_destruct_args.is_none() { let max_index = binding_indices.iter().copied().max().unwrap_or(0); let needed = max_index + 1; if info.taint.rhs_array_elements.len() < needed { None } else { let mut per_pos: SmallVec<[SsaOp; 4]> = SmallVec::new(); let mut slot_scoped_mask: SmallVec<[bool; 4]> = SmallVec::new(); let mut bail = false; for slot in info.taint.rhs_array_elements.iter().take(needed) { let mut is_slot_scoped = false; let slot_op = match slot { RhsArraySlot::Ident(ident) => { match var_stacks .get(ident.as_str()) .and_then(|s| s.last().copied()) { Some(sv) => SsaOp::Assign(SmallVec::from_elem(sv, 1)), None => { bail = true; break; } } } RhsArraySlot::Literal => SsaOp::Const(None), RhsArraySlot::Complex { uses: inner_uses, source_cap, } => { let mut mapped: SmallVec<[SsaValue; 4]> = SmallVec::new(); for ident in inner_uses.iter() { if let Some(sv) = var_stacks .get(ident.as_str()) .and_then(|s| s.last().copied()) { if !mapped.contains(&sv) { mapped.push(sv); } } } if !source_cap.is_empty() { // Per-slot classification found a Source // pattern (e.g. `req.body.cmd`) inside // THIS slot's subtree. Emit Source so the // binding inherits the outer-node Source // caps for this slot's index. SsaOp::Source } else if outer_is_source && any_slot_has_source_cap { // Some OTHER slot's subtree classified as // Source; this slot did NOT. Emit // Assign(mapped) and mark the slot as // slot-scoped so the taint transfer's // Assign arm skips outer-node Source // label pickup for this binding (without // losing transitive taint through inner // uses). When `mapped` is empty, fall // back to Const(None) — the binding // carries no taint anyway. 
if mapped.is_empty() { SsaOp::Const(None) } else { is_slot_scoped = true; SsaOp::Assign(mapped.clone()) } } else if effective_outer_fallback { // Outer-node Source label but no // per-slot classifier fired on any slot // (typical of subscript-on-tainted-local // shapes). Preserve legacy conservative // emission for unrecognised shapes. SsaOp::Source } else if mapped.is_empty() { SsaOp::Const(None) } else { SsaOp::Assign(mapped) } } }; per_pos.push(slot_op); slot_scoped_mask.push(is_slot_scoped); } if bail { None } else { Some((per_pos, slot_scoped_mask)) } } } else { None }; // Clone op for potential extra_defines before moving into SsaInst. // For the destructure-promise / bare-array rewrites, the // per-extra ops are built explicitly below, so the shared clone // path is bypassed. let primary_op_for_extras = if info.taint.extra_defines.is_empty() || promise_destruct_args.is_some() || bare_array_ops.is_some() { None } else { Some(op.clone()) }; // Override primary op to single-operand Assign when the // destructure-promise rewrite fires. The primary's source-order // index is `binding_indices[0]` — non-zero for skip-leading // patterns like `const [, b]` where `b` is the FIRST (and only) // binding but lives at pattern position 1. 
let primary_op = if let Some(ref args) = promise_destruct_args { let primary_idx = binding_indices.first().copied().unwrap_or(0); let pick = args.get(primary_idx).copied().unwrap_or(args[0]); SsaOp::Assign(SmallVec::from_elem(pick, 1)) } else if let Some((ref per_pos, ref slot_scoped_mask)) = bare_array_ops { let primary_idx = binding_indices.first().copied().unwrap_or(0); if slot_scoped_mask.get(primary_idx).copied().unwrap_or(false) { slot_scoped_assigns.insert(v); } per_pos .get(primary_idx) .cloned() .unwrap_or(SsaOp::Const(None)) } else { op }; ssa_blocks[block_idx].body.push(SsaInst { value: v, op: primary_op, cfg_node: node, var_name: var_name_for_ssa.clone(), span: info.ast.span, }); // Synthetic base update: when a dotted path is defined (e.g. `obj.data`), // create synthetic Assign instructions for parent prefixes (e.g. `obj`) // so that subsequent reads of the base variable see the field write. // Only includes the new field value (not the old base) so that field // overwrites properly kill taint: if obj.data is re-assigned to a // constant, the base `obj` no longer carries that field's taint. // //each synthetic Assign also records its // structural identity into `field_writes`, `(receiver_old_value, // FieldId(field_name))`, so the taint engine can recognise the // synthetic assign as a field WRITE and mirror the rhs taint // into the matching `(loc, field)` cell on `SsaTaintState`. // The "old" parent value is the reaching def of `parent` BEFORE // we push the new `synth_v`; when no prior def exists (the // parent is undefined at this point), we skip the side-table // entry so the consumer's `pt(receiver)` walk produces no work. if !nop_nodes.contains(&node) { if let Some(ref d) = info.taint.defines { let mut current = d.as_str(); let mut child_value = v; while let Some(dot_pos) = current.rfind('.') { let parent = ¤t[..dot_pos]; let field_name = ¤t[dot_pos + 1..]; // Snapshot prior reaching def of `parent` BEFORE we // push the new synth_v. 
Used by the field-write // side-table as the receiver SsaValue. let prior_parent_value: Option = var_stacks.get(parent).and_then(|s| s.last().copied()); let synth_v = SsaValue(*next_value); *next_value += 1; let synth_uses: SmallVec<[SsaValue; 4]> = SmallVec::from_elem(child_value, 1); value_defs.push(ValueDef { var_name: Some(parent.to_string()), cfg_node: node, block: block_id, }); var_stacks .entry(parent.to_string()) .or_default() .push(synth_v); ssa_blocks[block_idx].body.push(SsaInst { value: synth_v, op: SsaOp::Assign(synth_uses), cfg_node: node, var_name: Some(parent.to_string()), span: info.ast.span, }); // Record `(synth_v -> (prior_parent, field_id))` so // the taint engine can lift the synthetic assign // into a field-write hook. The field name is // interned through the per-body `FieldInterner` so // FieldProj reads downstream resolve to the same id. if let Some(rcv) = prior_parent_value { let fid = field_interner.intern(field_name); field_writes.insert(synth_v, (rcv, fid)); } child_value = synth_v; current = parent; } } } // Emit extra SSA instructions for destructuring bindings. // Each extra define inherits the same op (Source/Call/Assign) as the primary. // // For the destructure-promise rewrite, each extra emits an Assign // on its corresponding indexed argument so per-element taint is // preserved instead of the scalar union. The source-order index // for `extra_defines[i]` is `binding_indices[i + 1]` — accounts // for skip slots like `const [a, , b]` where `b` sits at index 2, // not at index 1. 
if let Some(ref pd_args) = promise_destruct_args { for (i, extra_def) in info.taint.extra_defines.iter().enumerate() { let ev = SsaValue(*next_value); *next_value += 1; value_defs.push(ValueDef { var_name: Some(extra_def.clone()), cfg_node: node, block: block_id, }); var_stacks.entry(extra_def.clone()).or_default().push(ev); let extra_idx = binding_indices.get(i + 1).copied().unwrap_or(i + 1); let arg = pd_args.get(extra_idx).copied().unwrap_or(pd_args[0]); ssa_blocks[block_idx].body.push(SsaInst { value: ev, op: SsaOp::Assign(SmallVec::from_elem(arg, 1)), cfg_node: node, var_name: Some(extra_def.clone()), span: info.ast.span, }); } } else if let Some((ref per_pos, ref slot_scoped_mask)) = bare_array_ops { // Bare-array RHS destructure: each extra emits the op for its // source-order RHS position. Ident slots emit Assign of the // ident's reaching SSA value; literal slots emit Const(None). // Slot-scoped Assigns are registered in // `slot_scoped_assigns` so the taint transfer skips // outer-node Source pickup for those bindings. 
for (i, extra_def) in info.taint.extra_defines.iter().enumerate() { let ev = SsaValue(*next_value); *next_value += 1; value_defs.push(ValueDef { var_name: Some(extra_def.clone()), cfg_node: node, block: block_id, }); var_stacks.entry(extra_def.clone()).or_default().push(ev); let extra_idx = binding_indices.get(i + 1).copied().unwrap_or(i + 1); let op_for_extra = per_pos .get(extra_idx) .cloned() .unwrap_or(SsaOp::Const(None)); if slot_scoped_mask.get(extra_idx).copied().unwrap_or(false) { slot_scoped_assigns.insert(ev); } ssa_blocks[block_idx].body.push(SsaInst { value: ev, op: op_for_extra, cfg_node: node, var_name: Some(extra_def.clone()), span: info.ast.span, }); } } else if let Some(ref primary_op) = primary_op_for_extras { for extra_def in &info.taint.extra_defines { let ev = SsaValue(*next_value); *next_value += 1; value_defs.push(ValueDef { var_name: Some(extra_def.clone()), cfg_node: node, block: block_id, }); var_stacks.entry(extra_def.clone()).or_default().push(ev); ssa_blocks[block_idx].body.push(SsaInst { value: ev, op: primary_op.clone(), cfg_node: node, var_name: Some(extra_def.clone()), span: info.ast.span, }); } } } // 3. Set terminator let succs = &block_succs[block_idx]; let last_node = *blocks_nodes[block_idx].last().unwrap(); ssa_blocks[block_idx].terminator = if succs.is_empty() { // A block with no successors at the block level is one of: // (1) a block containing a Throw, terminates with an // exception; no normal fall-through. // (2) a block containing a Return, terminates with a value // (or void). After form_blocks strips the bookkeeping // Seq edge from Return → fn_exit, every explicit-return // block lands here, including `if cond { return X; }` // early returns. // (3) the function-exit (fn_exit) block itself when the // function falls off the end (implicit return). // // Distinguish them by inspecting the block's CFG nodes. 
let return_node = blocks_nodes[block_idx] .iter() .copied() .find(|&n| cfg[n].kind == StmtKind::Return); let has_throw_node = blocks_nodes[block_idx] .iter() .any(|&n| cfg[n].kind == StmtKind::Throw); if has_throw_node && return_node.is_none() { // Throw terminates control flow with an exception. No // structured Throw terminator exists today; downstream // analyses rely on `exception_edges` (recorded separately) // for catch-block dispatch. Mark the normal-flow exit as // Unreachable so successor consumers do not invent a // synthetic fall-through edge. Terminator::Unreachable } else if let Some(rn) = return_node { let return_info = &cfg[rn]; // Return-value resolution. Mirror the legacy // `has_const_return` path so callers see exactly the same // SSA shape they did before the merged-return fix, only // the *terminator* changes (Goto(exit) → Return(_)), not // the value selection. // // (a) Literal return (`return 'x'`, `return None`, // `return []`, `return;`). Marked by // `taint.uses.is_empty()` on the Return CFG node. // Emit a synthetic Const inst so taint never leaks // from an unrelated inst earlier in the same block // (regression guard: C-1 inline-return precision). // (b) Computed / passthrough return, last non-Nop body // inst. Covers `return foo()` (Call sits before the // Return Nop), `return x + y` (Assign), and the // implicit tail expression collapsed into a single // block by the leader-following loop. When the // Return carries identifier uses (`return req`, // `return { req.session, ... }`), the SSA defs for // those identifiers are already on the body as // Param / Assign / Source insts, picking the last // one matches pre-fix behaviour exactly. // (c) Void / unresolved, `Return(None)`. 
if return_info.taint.uses.is_empty() { let const_text = return_info.taint.const_text.clone(); let const_v = SsaValue(*next_value); *next_value += 1; let block_id = BlockId(block_idx as u32); value_defs.push(ValueDef { var_name: None, cfg_node: rn, block: block_id, }); ssa_blocks[block_idx].body.push(SsaInst { value: const_v, op: SsaOp::Const(const_text), cfg_node: rn, var_name: None, span: return_info.ast.span, }); Terminator::Return(Some(const_v)) } else { let from_body = ssa_blocks[block_idx] .body .iter() .rev() .find(|inst| !matches!(inst.op, SsaOp::Nop)) .map(|inst| inst.value); Terminator::Return(from_body) } } else { // (3) fn_exit / true fall-off, no Return CFG node in this // block. Use the last non-Nop body instruction as the // implicit return value (e.g. the function's tail-position // expression in Rust). let ret_val = ssa_blocks[block_idx] .body .iter() .rev() .find(|inst| !matches!(inst.op, SsaOp::Nop)) .map(|inst| inst.value); Terminator::Return(ret_val) } } else if succs.len() == 1 { Terminator::Goto(BlockId(succs[0] as u32)) } else if succs.len() == 2 { // Find the If/Loop node that branches let cond_node = blocks_nodes[block_idx] .iter() .rev() .find(|&&n| matches!(cfg[n].kind, StmtKind::If | StmtKind::Loop)) .copied() .unwrap_or(last_node); // Determine which successor is true/false by looking at edge kinds let mut true_blk = succs[0]; let mut false_blk = succs[1]; // Check filtered edges from any node in this block to successors for &(src, tgt, kind) in filtered_edges { if blocks_nodes[block_idx].contains(&src) { let tgt_blk_opt = succs.iter().position(|&s| { blocks_nodes .get(s) .is_some_and(|nodes| nodes.contains(&tgt)) }); if let Some(tgt_blk_pos) = tgt_blk_opt { match kind { EdgeKind::True => true_blk = succs[tgt_blk_pos], EdgeKind::False => false_blk = succs[tgt_blk_pos], _ => {} } } } } // Lower structured condition from CFG metadata let cond_info = &cfg[cond_node]; let condition = if cond_info.condition_text.is_some() && 
!cond_info.condition_vars.is_empty() { let expr = crate::constraint::lower::lower_condition_with_stacks(cond_info, var_stacks); if matches!(expr, crate::constraint::lower::ConditionExpr::Unknown) { None } else { Some(Box::new(expr)) } } else { None }; Terminator::Branch { cond: cond_node, true_blk: BlockId(true_blk as u32), false_blk: BlockId(false_blk as u32), condition, } } else { // More than 2 successors, model as a multi-way Switch. // // This replaces the previous `Goto(first)` collapse: the // structured terminator now enumerates every target instead // of hiding N-1 of them behind `block.succs`. Flow consumers // (taint, const-prop, symex) still iterate `succs` as // authoritative, but downstream tooling that inspects the // terminator shape gets the full fanout. // // Note: today's switch-statement CFG construction decomposes // cases into a cascade of binary `Branch` headers (see // `build_switch` in src/cfg.rs), so real switch statements // never reach this arm. Folding the cascade back into a // single Switch node is a follow-up; in the meantime, this // arm fires only on genuine multi-way CFG fanouts (e.g. // future Go-switch / Java-arrow / Rust-match lowerings). // // Scrutinee: use the primary SSA value defined at the last // node in this block when one exists; fall back to // `SsaValue(0)` (a valid index, SSA numbering is 1-based // only conceptually, and value 0 is always present in a // non-empty body) when no value is defined. Downstream // consumers that care about the scrutinee (abstract interp, // symex per-case constraints) treat a missing/degenerate // scrutinee as "unknown" rather than panicking. 
let scrutinee = cfg_node_map.get(&last_node).copied().unwrap_or(SsaValue(0)); let targets: SmallVec<[BlockId; 4]> = succs.iter().skip(1).map(|&s| BlockId(s as u32)).collect(); let default = BlockId(succs[0] as u32); // Synthetic ≥3-way fanouts have no per-case literal metadata , // every entry is None (unknown), so the executor falls back to // first-reachable behavior on this terminator. let case_values: SmallVec<[Option; 4]> = std::iter::repeat_with(|| None) .take(targets.len()) .collect(); tracing::debug!( block = block_idx, num_succs = succs.len(), "emitting Terminator::Switch for ≥3-way fanout", ); Terminator::Switch { scrutinee, targets, default, case_values, } }; // 4. Fill phi operands in successor blocks for &succ in succs { for (var, &phi_val) in &phi_values[succ] { // The version of `var` reaching from this block let reaching_val = var_stacks.get(var).and_then(|s| s.last().copied()); if let Some(rv) = reaching_val { // Find the phi instruction and add this operand for phi in &mut ssa_blocks[succ].phis { if phi.value == phi_val { if let SsaOp::Phi(ref mut operands) = phi.op { operands.push((block_id, rv)); } } } } } } // 5. Recurse into dominator tree children for &child in &dom_tree_children[block_idx] { process_block( child, cfg, blocks_nodes, block_succs, block_preds, phi_placements, dom_tree_children, filtered_edges, var_stacks, ssa_blocks, phi_values, value_defs, cfg_node_map, next_value, nop_nodes, field_interner, field_writes, slot_scoped_assigns, ); } // 6. Restore stacks for (var, depth) in &saved { if let Some(stack) = var_stacks.get_mut(var) { stack.truncate(*depth); } } // Remove any new variables that weren't in saved let saved_vars: HashSet<&String> = saved.iter().map(|(k, _)| k).collect(); var_stacks.retain(|k, _| saved_vars.contains(k)); } // Inject synthetic Param instructions at START of block 0 for external variables. // These create SSA definitions so the rename pass can reference them. 
// Pre-seed var_stacks so process_block sees them. // // `external_vars` contains both real formal parameters and free / closure- // captured variables (variables read by the body but not declared as a // formal and not assigned anywhere). Both end up emitted as // [`SsaOp::Param`] in block 0; we record the SSA values that correspond // to free vars in `synthetic_externals` so downstream analyses (the JS/TS // handler-name auto-seed in particular) can avoid treating closure // captures as if they were parameters of the function under analysis. // // **Conservative behaviour when the caller didn't supply formal-param // info.** Several call sites (`lower_to_ssa`, `lower_to_ssa_scoped_nop`) // don't supply formal parameter names; in that case we cannot distinguish // formals from free vars structurally, so we leave `synthetic_externals` // empty and the auto-seed pass keeps its pre-fix behaviour of treating // every `Param` op as a candidate. Callers that opt in via // `lower_to_ssa_with_params` set `with_params=true`, signalling that // `formal_params` is the authoritative formal list — even when empty // (arrow `() => {…}`). In that case every external becomes synthetic // unless it appears in `formal_params`, so the auto-seed pass cannot // mistake a bubbled-up free var (like `userId` lifted from a nested // jest test callback) for a formal of the outer body. 
let mut synthetic_externals: HashSet = HashSet::new(); let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect(); let track_synthetic = with_params; if !external_vars.is_empty() { let entry_cfg_node = blocks_nodes[0][0]; let mut synthetic_body = Vec::with_capacity(external_vars.len()); let mut positional_idx: usize = 0; for var in external_vars.iter() { let v = SsaValue(next_value); next_value += 1; value_defs.push(ValueDef { var_name: Some(var.clone()), cfg_node: entry_cfg_node, block: BlockId(0), }); let is_receiver = is_receiver_name(var); let op = if is_receiver { SsaOp::SelfParam } else { let op = SsaOp::Param { index: positional_idx, }; positional_idx += 1; op }; // A non-receiver var is "synthetic" (a free / closure capture) // when it is *not* one of the function's declared formals AND // not a dotted access on a formal (`input.cmd` where `input` is // a formal — it represents a structural projection of the // formal, not a free variable; the auto-seed should still treat // it as part of the formal's own taint surface). Receivers are // intentionally excluded: `this` / `self` represent the implicit // receiver, which always belongs to the function. // // Only fire when the caller supplied formal-parameter names; see // the `track_synthetic` rationale above. 
let root_is_formal = var .split_once('.') .map(|(root, _)| formal_set.contains(root)) .unwrap_or(false); if track_synthetic && !is_receiver && !formal_set.contains(var.as_str()) && !root_is_formal { synthetic_externals.insert(v); } synthetic_body.push(SsaInst { value: v, op, cfg_node: entry_cfg_node, var_name: Some(var.clone()), span: (0, 0), }); var_stacks.entry(var.clone()).or_default().push(v); } // Prepend synthetic params before any existing body instructions synthetic_body.append(&mut ssa_blocks[0].body); ssa_blocks[0].body = synthetic_body; } process_block( 0, // entry block cfg, blocks_nodes, block_succs, block_preds, phi_placements, dom_tree_children, filtered_edges, &mut var_stacks, &mut ssa_blocks, &mut phi_values, &mut value_defs, &mut cfg_node_map, &mut next_value, nop_nodes, &mut field_interner, &mut field_writes, &mut slot_scoped_assigns, ); // Process orphan blocks (e.g. catch blocks disconnected after exception edge removal). // These blocks have no predecessors and weren't reached by the dominator tree walk. // // Rebuild var_stacks from already-processed instructions so that catch blocks // can reference variables defined before the try block (e.g. `userInput`). let has_orphans = (1..num_blocks).any(|bid| block_preds[bid].is_empty() && ssa_blocks[bid].body.is_empty()); if has_orphans { // Rebuild var_stacks from all SSA instructions created during the main walk. // This gives orphan blocks access to all variable definitions. 
/// Verify BFS block ordering: every non-entry, non-orphan block must have at
/// least one predecessor with a smaller block ID.
///
/// `block_preds[i]` lists the predecessor block indices of block `i`.
/// Block 0 (the entry block) and blocks with no predecessors (orphans,
/// e.g. a catch block reached only via an exception edge) are exempt.
///
/// The check is a `debug_assert!`, so it only fires in builds with debug
/// assertions enabled; in other builds the function returns immediately.
fn debug_assert_bfs_ordering(block_preds: &[Vec<usize>]) {
    // `debug_assert!` expands to nothing without debug assertions, so the
    // scan below would be pure wasted work there.
    if !cfg!(debug_assertions) {
        return;
    }

    // `skip(1)` exempts the entry block while keeping `i` equal to the
    // real block index.
    for (i, preds) in block_preds.iter().enumerate().skip(1) {
        if preds.is_empty() {
            continue; // orphan block (e.g. catch block reached via exception edge)
        }
        debug_assert!(
            preds.iter().any(|&p| p < i),
            "Block {} has no forward predecessor — BFS ordering violated. Preds: {:?}",
            i,
            preds
        );
    }
}
/// /// The invariant is strict equality. Predecessors that carry no /// reaching definition for the phi's variable are filled with the /// [`SsaOp::Undef`] sentinel in `fill_undef_phi_operands`, rather than /// being dropped, so consumers that look up by `(pred_blk, value)` /// see a real operand for every control-flow edge. fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec]) { use std::collections::HashSet; for (i, block) in ssa_blocks.iter().enumerate() { let pred_set: HashSet = block_preds[i].iter().map(|&p| p as u32).collect(); for phi in &block.phis { if let SsaOp::Phi(ref operands) = phi.op { assert_eq!( operands.len(), block_preds[i].len(), "SSA phi operand count does not match predecessor count: block {} phi v{} \ (var={:?}) has {} operands but block has {} predecessors. \ preds={:?}, operand_preds={:?}", i, phi.value.0, phi.var_name, operands.len(), block_preds[i].len(), block_preds[i], operands.iter().map(|(b, _)| b.0).collect::>(), ); // Each operand's pred block must be an actual predecessor, // and no predecessor may appear more than once. let mut seen: HashSet = HashSet::new(); for (pred_blk, _) in operands.iter() { assert!( pred_set.contains(&pred_blk.0), "SSA phi operand references nonexistent predecessor: block {} phi v{} \ references pred B{} but block predecessors are {:?}", i, phi.value.0, pred_blk.0, block_preds[i], ); assert!( seen.insert(pred_blk.0), "SSA phi operand duplicates predecessor: block {} phi v{} has two \ operands for pred B{}", i, phi.value.0, pred_blk.0, ); } } } } } /// Post-rename pass: ensure every phi has one operand per predecessor. /// /// During rename, phi operands are only pushed when the variable has a /// live reaching definition on that predecessor edge. Edges where the /// variable is not yet defined (e.g. 
a try-body rejoining after a /// catch-only binding, an early-return branch on a later-defined /// variable, an orphan catch block's implicit predecessors) leave the /// phi with fewer operands than the block has predecessors. /// /// This pass scans all phis, and for every missing `(pred_block, _)` /// slot, pushes `(pred_block, undef_val)` where `undef_val` is a /// single shared sentinel instruction ([`SsaOp::Undef`]) synthesized /// at the end of block 0's body. Consumers iterate phi operands by /// `(pred_blk, value)` and therefore see a real operand on every /// control-flow edge, no implicit "missing = empty" semantics. /// /// The Undef instruction is created lazily (only when at least one phi /// has a gap) so functions with fully-dominating definitions pay zero /// cost. All phis share the same Undef value: a phi operand is /// identified by its `(pred_block, value)` pair, so sharing the value /// across phis is safe and keeps the synthesized-instruction count at /// most one per function body. fn fill_undef_phi_operands( ssa_blocks: &mut [SsaBlock], block_preds: &[Vec], value_defs: &mut Vec, blocks_nodes: &[Vec], ) { // Fast path: detect whether any phi has a gap. Avoid allocating // the Undef value in the common case where every phi is saturated. let needs_undef = ssa_blocks.iter().enumerate().any(|(bi, block)| { block.phis.iter().any(|phi| { if let SsaOp::Phi(ref operands) = phi.op { operands.len() < block_preds[bi].len() } else { false } }) }); if !needs_undef { return; } // Anchor the synthetic Undef instruction to the entry block's first // CFG node so span lookups don't hit an invalid NodeIndex. 
let anchor_node = blocks_nodes .first() .and_then(|b| b.first()) .copied() .expect("entry block has at least one CFG node"); let undef_val = SsaValue(value_defs.len() as u32); value_defs.push(ValueDef { var_name: None, cfg_node: anchor_node, block: BlockId(0), }); // Place the Undef instruction at the end of block 0's body so it // appears after any synthetic Param / SelfParam emissions, its // only role is to anchor the SsaValue; ordering relative to other // body instructions is cosmetic (no consumer depends on its // position, only on the value lookup). ssa_blocks[0].body.push(SsaInst { value: undef_val, op: SsaOp::Undef, cfg_node: anchor_node, var_name: None, span: (0, 0), }); // Fill missing operand slots. Iterate `block_preds[bi]` in its // natural order so the resulting phi operand list is deterministic // across runs. for (bi, block) in ssa_blocks.iter_mut().enumerate() { for phi in block.phis.iter_mut() { if let SsaOp::Phi(ref mut operands) = phi.op { if operands.len() == block_preds[bi].len() { continue; } use std::collections::HashSet; let present: HashSet = operands.iter().map(|(b, _)| b.0).collect(); for &pred in &block_preds[bi] { let pid = pred as u32; if !present.contains(&pid) { operands.push((BlockId(pid), undef_val)); } } } } } } #[cfg(test)] mod tests { use super::*; use crate::cfg::{EdgeKind, NodeInfo, StmtKind, TaintMeta}; use petgraph::Graph; fn make_node(kind: StmtKind) -> NodeInfo { NodeInfo { kind, ..Default::default() } } #[test] fn linear_cfg_no_phis() { // Entry → x=1 → y=x → Exit let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let n1 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let n2 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("y".into()), uses: vec!["x".into()], ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, n1, 
EdgeKind::Seq); cfg.add_edge(n1, n2, EdgeKind::Seq); cfg.add_edge(n2, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Should be a single block (all Seq edges, no branches) assert_eq!(ssa.blocks.len(), 1); // No phis in a linear CFG assert!(ssa.blocks[0].phis.is_empty()); // 4 body instructions (entry, x=1, y=x, exit) assert_eq!(ssa.blocks[0].body.len(), 4); } #[test] fn diamond_cfg_produces_phi() { // Entry → x=1 → If → [True: x=2] [False: x=3] → Join → Exit let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let def_x = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let if_node = cfg.add_node(make_node(StmtKind::If)); let true_node = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let false_node = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let join = cfg.add_node(make_node(StmtKind::Seq)); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, def_x, EdgeKind::Seq); cfg.add_edge(def_x, if_node, EdgeKind::Seq); cfg.add_edge(if_node, true_node, EdgeKind::True); cfg.add_edge(if_node, false_node, EdgeKind::False); cfg.add_edge(true_node, join, EdgeKind::Seq); cfg.add_edge(false_node, join, EdgeKind::Seq); cfg.add_edge(join, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Should have multiple blocks assert!(ssa.blocks.len() >= 3); // The join block should have a phi for "x" let join_block = ssa .blocks .iter() .find(|b| !b.phis.is_empty()) .expect("should have a block with a phi"); assert_eq!(join_block.phis.len(), 1); assert_eq!(join_block.phis[0].var_name.as_deref(), Some("x")); // Phi should have 2 operands (from true and false branches) if let SsaOp::Phi(ref operands) = join_block.phis[0].op { 
/// A loop whose body redefines `x` must yield a phi for `x`.
#[test]
fn loop_cfg_produces_phi() {
    // Shape: Entry → x=0 → Loop header → body (defines and uses x) → back
    // to the header; the header's False edge leaves to Exit.
    let mut graph: Cfg = Graph::new();

    let n_entry = graph.add_node(make_node(StmtKind::Entry));
    let n_init = graph.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("x".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_header = graph.add_node(make_node(StmtKind::Loop));
    let n_body = graph.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("x".into()),
            uses: vec!["x".into()],
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_exit = graph.add_node(make_node(StmtKind::Exit));

    graph.add_edge(n_entry, n_init, EdgeKind::Seq);
    graph.add_edge(n_init, n_header, EdgeKind::Seq);
    graph.add_edge(n_header, n_body, EdgeKind::True);
    graph.add_edge(n_body, n_header, EdgeKind::Back);
    graph.add_edge(n_header, n_exit, EdgeKind::False);

    let lowered = lower_to_ssa(&graph, n_entry, None, true).unwrap();

    // Take the first block carrying any phi and require one of its phis
    // to be for `x`.
    let first_phi_block = lowered
        .blocks
        .iter()
        .find(|blk| !blk.phis.is_empty())
        .expect("loop header should have a phi for x");
    let has_x_phi = first_phi_block
        .phis
        .iter()
        .any(|phi| phi.var_name.as_deref() == Some("x"));
    assert!(has_x_phi, "should have phi for variable x");
}
/// Lowering a diamond must succeed and leave a join block whose phi for
/// `x` has exactly one operand per predecessor.
#[test]
fn bfs_ordering_holds_for_diamond_cfg() {
    // Entry → x=… → If → [True: x=…] [False: x=…] → Join → Exit
    let mut cfg: Cfg = Graph::new();
    let entry = cfg.add_node(make_node(StmtKind::Entry));
    let def_x = cfg.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("x".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let if_node = cfg.add_node(make_node(StmtKind::If));
    let true_node = cfg.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("x".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let false_node = cfg.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("x".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let join = cfg.add_node(make_node(StmtKind::Seq));
    let exit = cfg.add_node(make_node(StmtKind::Exit));

    cfg.add_edge(entry, def_x, EdgeKind::Seq);
    cfg.add_edge(def_x, if_node, EdgeKind::Seq);
    cfg.add_edge(if_node, true_node, EdgeKind::True);
    cfg.add_edge(if_node, false_node, EdgeKind::False);
    cfg.add_edge(true_node, join, EdgeKind::Seq);
    cfg.add_edge(false_node, join, EdgeKind::Seq);
    cfg.add_edge(join, exit, EdgeKind::Seq);

    // Lowering itself runs the lowering pipeline's own invariant checks.
    let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap();

    // The phi block must exist: `x` is redefined on both arms, so a
    // missing phi is a failure rather than something to skip over.
    let block = ssa
        .blocks
        .iter()
        .find(|b| !b.phis.is_empty())
        .expect("diamond join should carry a phi for x");
    assert_eq!(
        block.preds.len(),
        2,
        "join block should have 2 predecessors"
    );
    for phi in &block.phis {
        if let SsaOp::Phi(ref ops) = phi.op {
            // Strict equality, matching the phi operand invariant.
            assert_eq!(
                ops.len(),
                block.preds.len(),
                "phi operand count should equal predecessor count"
            );
        }
    }
}
body, EdgeKind::True); cfg.add_edge(body, loop_h, EdgeKind::Back); cfg.add_edge(loop_h, exit, EdgeKind::False); // Exercises BFS ordering with back edges and phi on loop header let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); assert!(!ssa.blocks.is_empty()); } #[test] fn orphan_catch_block_does_not_violate_bfs_ordering() { // Entry → body → Exit, with an exception edge body → catch → Exit // The catch block becomes an orphan (no normal-flow predecessors) let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let body = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let catch = cfg.add_node(NodeInfo { catch_param: true, taint: TaintMeta { defines: Some("e".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, body, EdgeKind::Seq); cfg.add_edge(body, exit, EdgeKind::Seq); cfg.add_edge(body, catch, EdgeKind::Exception); cfg.add_edge(catch, exit, EdgeKind::Seq); // The catch block is reached via exception edge (stripped from normal flow) // so it may appear as an orphan. The BFS assertion should skip it. 
/// In a diamond where both arms define `v`, every phi in the join block
/// must carry exactly as many operands as the block has predecessors.
#[test]
fn phi_operand_count_equals_pred_count_in_diamond() {
    // Entry → If → [True: v=…] [False: v=…] → Join (uses v) → Exit
    let mut graph: Cfg = Graph::new();

    let n_entry = graph.add_node(make_node(StmtKind::Entry));
    let n_if = graph.add_node(make_node(StmtKind::If));
    let n_then = graph.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("v".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_else = graph.add_node(NodeInfo {
        taint: TaintMeta {
            defines: Some("v".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_join = graph.add_node(NodeInfo {
        taint: TaintMeta {
            uses: vec!["v".into()],
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_exit = graph.add_node(make_node(StmtKind::Exit));

    graph.add_edge(n_entry, n_if, EdgeKind::Seq);
    graph.add_edge(n_if, n_then, EdgeKind::True);
    graph.add_edge(n_if, n_else, EdgeKind::False);
    graph.add_edge(n_then, n_join, EdgeKind::Seq);
    graph.add_edge(n_else, n_join, EdgeKind::Seq);
    graph.add_edge(n_join, n_exit, EdgeKind::Seq);

    let lowered = lower_to_ssa(&graph, n_entry, None, true).unwrap();

    let join_block = lowered
        .blocks
        .iter()
        .find(|blk| !blk.phis.is_empty())
        .expect("should have a phi block");
    let expected = join_block.preds.len();

    // Only genuine Phi ops are checked; anything else in `phis` is ignored.
    let operand_counts = join_block.phis.iter().filter_map(|phi| match &phi.op {
        SsaOp::Phi(ops) => Some(ops.len()),
        _ => None,
    });
    for count in operand_counts {
        assert_eq!(
            count, expected,
            "phi operand count should equal predecessor count in a clean diamond"
        );
    }
}
/// Regression guard: a block joined by both a normal-flow predecessor
/// and a catch-path predecessor must lower to a consistent phi. Here `x`
/// is defined before the join on every path, so the phi for `x` must have
/// exactly two operands, one per predecessor.
#[test]
fn catch_block_join_phi_has_operand_per_live_predecessor() {
    // Entry → define x → body (x) ──Seq──────────────→ Join (uses x) → Exit
    //                          └─Exception→ catch (x) ─Seq─┘
    let mut graph: Cfg = Graph::new();

    // Every defining node in this test defines `x` and uses nothing.
    let defines_x = || TaintMeta {
        defines: Some("x".into()),
        ..Default::default()
    };

    let n_entry = graph.add_node(make_node(StmtKind::Entry));
    let n_define = graph.add_node(NodeInfo {
        taint: defines_x(),
        ..make_node(StmtKind::Seq)
    });
    let n_body = graph.add_node(NodeInfo {
        taint: defines_x(),
        ..make_node(StmtKind::Seq)
    });
    let n_catch = graph.add_node(NodeInfo {
        catch_param: true,
        taint: defines_x(),
        ..make_node(StmtKind::Seq)
    });
    let n_join = graph.add_node(NodeInfo {
        taint: TaintMeta {
            uses: vec!["x".into()],
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let n_exit = graph.add_node(make_node(StmtKind::Exit));

    graph.add_edge(n_entry, n_define, EdgeKind::Seq);
    graph.add_edge(n_define, n_body, EdgeKind::Seq);
    graph.add_edge(n_body, n_join, EdgeKind::Seq);
    graph.add_edge(n_body, n_catch, EdgeKind::Exception);
    graph.add_edge(n_catch, n_join, EdgeKind::Seq);
    graph.add_edge(n_join, n_exit, EdgeKind::Seq);

    // Lowering must succeed for this shape.
    let lowered = lower_to_ssa(&graph, n_entry, None, true).unwrap();

    // First block that holds a real Phi op named `x`.
    let is_x = |inst: &SsaInst| inst.var_name.as_deref() == Some("x");
    let join_block = lowered
        .blocks
        .iter()
        .find(|blk| {
            blk.phis
                .iter()
                .any(|inst| is_x(inst) && matches!(inst.op, SsaOp::Phi(_)))
        })
        .expect("expected a phi for `x` at the catch/normal join");

    let pred_count = join_block.preds.len();
    assert_eq!(
        pred_count, 2,
        "catch/normal join block must have 2 predecessors, got {}",
        pred_count
    );

    let x_phi = join_block.phis.iter().find(|inst| is_x(inst)).unwrap();
    match x_phi.op {
        SsaOp::Phi(ref operands) => assert_eq!(
            operands.len(),
            2,
            "phi for `x` at the catch/normal join must have one operand per \
             predecessor, got {}",
            operands.len()
        ),
        _ => panic!("expected SsaOp::Phi for `x`"),
    }
}
/// Direct test of `assert_phi_operand_counts`: a phi whose operands match
/// the block's predecessors one-for-one must be accepted without a panic.
#[test]
fn phi_assertion_helper_accepts_exact_operand_count() {
    let anchor = NodeIndex::new(0);

    // B0: entry, jumps to B1.
    let entry_blk = SsaBlock {
        id: BlockId(0),
        phis: vec![],
        body: vec![],
        terminator: Terminator::Goto(BlockId(1)),
        preds: smallvec::smallvec![],
        succs: smallvec::smallvec![BlockId(1)],
    };

    // B1: join with one phi fed from B0 and B2.
    let x_phi = SsaInst {
        value: SsaValue(0),
        op: SsaOp::Phi(smallvec::smallvec![
            (BlockId(0), SsaValue(1)),
            (BlockId(2), SsaValue(2)),
        ]),
        cfg_node: anchor,
        var_name: Some("x".into()),
        span: (0, 0),
    };
    let join_blk = SsaBlock {
        id: BlockId(1),
        phis: vec![x_phi],
        body: vec![],
        terminator: Terminator::Unreachable,
        preds: smallvec::smallvec![BlockId(0), BlockId(2)],
        succs: smallvec::smallvec![],
    };

    // B2: side block, reached from B0, jumps to B1.
    let side_blk = SsaBlock {
        id: BlockId(2),
        phis: vec![],
        body: vec![],
        terminator: Terminator::Goto(BlockId(1)),
        preds: smallvec::smallvec![BlockId(0)],
        succs: smallvec::smallvec![BlockId(1)],
    };

    // Predecessor lists, indexed by block: B0 ← {}, B1 ← {0, 2}, B2 ← {0}.
    let pred_table = vec![vec![], vec![0, 2], vec![0]];

    assert_phi_operand_counts(&[entry_blk, join_blk, side_blk], &pred_table);
}
let dummy_node = NodeIndex::new(0); let block = SsaBlock { id: BlockId(1), phis: vec![SsaInst { value: SsaValue(0), op: SsaOp::Phi(smallvec::smallvec![ (BlockId(0), SsaValue(1)), (BlockId(2), SsaValue(2)), (BlockId(3), SsaValue(3)), ]), cfg_node: dummy_node, var_name: Some("x".into()), span: (0, 0), }], body: vec![], terminator: Terminator::Unreachable, preds: smallvec::smallvec![BlockId(0), BlockId(2)], succs: smallvec::smallvec![], }; let block_preds = vec![vec![], vec![0, 2]]; assert_phi_operand_counts( &[ SsaBlock { id: BlockId(0), phis: vec![], body: vec![], terminator: Terminator::Goto(BlockId(1)), preds: smallvec::smallvec![], succs: smallvec::smallvec![BlockId(1)], }, block, ], &block_preds, ); } #[test] #[should_panic(expected = "SSA phi operand count does not match predecessor count")] fn phi_assertion_helper_rejects_fewer_operands_than_preds() { // A phi with fewer operands than preds violates the strict-equality // invariant: `fill_undef_phi_operands` is responsible for filling // every missing slot with an Undef sentinel, so the final body // should never have gaps. This test guards the post-pass. 
let dummy_node = NodeIndex::new(0); let block = SsaBlock { id: BlockId(1), phis: vec![SsaInst { value: SsaValue(0), op: SsaOp::Phi(smallvec::smallvec![(BlockId(0), SsaValue(1))]), cfg_node: dummy_node, var_name: Some("e".into()), span: (0, 0), }], body: vec![], terminator: Terminator::Unreachable, preds: smallvec::smallvec![BlockId(0), BlockId(2)], succs: smallvec::smallvec![], }; let block_preds = vec![vec![], vec![0, 2]]; assert_phi_operand_counts( &[ SsaBlock { id: BlockId(0), phis: vec![], body: vec![], terminator: Terminator::Goto(BlockId(1)), preds: smallvec::smallvec![], succs: smallvec::smallvec![BlockId(1)], }, block, ], &block_preds, ); } #[test] #[should_panic(expected = "SSA phi operand references nonexistent predecessor")] fn phi_assertion_helper_rejects_wrong_pred_block() { // A phi with the correct operand count but referencing a block // that isn't actually a predecessor must also fail the invariant. let dummy_node = NodeIndex::new(0); let block = SsaBlock { id: BlockId(1), phis: vec![SsaInst { value: SsaValue(0), op: SsaOp::Phi(smallvec::smallvec![ (BlockId(0), SsaValue(1)), (BlockId(3), SsaValue(2)), ]), cfg_node: dummy_node, var_name: Some("x".into()), span: (0, 0), }], body: vec![], terminator: Terminator::Unreachable, preds: smallvec::smallvec![BlockId(0), BlockId(2)], succs: smallvec::smallvec![], }; let block_preds = vec![vec![], vec![0, 2]]; assert_phi_operand_counts( &[ SsaBlock { id: BlockId(0), phis: vec![], body: vec![], terminator: Terminator::Goto(BlockId(1)), preds: smallvec::smallvec![], succs: smallvec::smallvec![BlockId(1)], }, block, ], &block_preds, ); } #[test] fn three_successor_collapse_produces_switch() { // Build a CFG where a single node has 3 successors. The // structured `Terminator::Switch` replaced the old // `Goto(first)` collapse so every target is visible on the // terminator shape (not only on `block.succs`). 
let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let branch = cfg.add_node(make_node(StmtKind::If)); let s0 = cfg.add_node(make_node(StmtKind::Seq)); let s1 = cfg.add_node(make_node(StmtKind::Seq)); let s2 = cfg.add_node(make_node(StmtKind::Seq)); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, branch, EdgeKind::Seq); cfg.add_edge(branch, s0, EdgeKind::True); cfg.add_edge(branch, s1, EdgeKind::False); cfg.add_edge(branch, s2, EdgeKind::Seq); cfg.add_edge(s0, exit, EdgeKind::Seq); cfg.add_edge(s1, exit, EdgeKind::Seq); cfg.add_edge(s2, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); assert!(!ssa.blocks.is_empty()); let switch_block = ssa .blocks .iter() .find(|b| matches!(b.terminator, Terminator::Switch { .. }) && b.succs.len() >= 3) .expect("expected a block with a Switch terminator and ≥3 succs"); assert_eq!( switch_block.succs.len(), 3, "≥3-successor lowering must retain all succs on block.succs, got {:?}", switch_block.succs ); if let Terminator::Switch { targets, default, .. } = &switch_block.terminator { // Default is the first succ (deterministic ordering); the // remaining N-1 succs populate `targets` in order. 
assert_eq!( *default, switch_block.succs[0], "Switch default must match succs[0]" ); assert_eq!( targets.len(), switch_block.succs.len() - 1, "Switch targets must cover every succ except default" ); for (i, t) in targets.iter().enumerate() { assert_eq!( *t, switch_block.succs[i + 1], "Switch target[{i}] must match succs[{}]", i + 1 ); } } } #[test] fn normal_two_successor_produces_branch() { // Regression: normal 2-successor case should still produce Branch let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let if_node = cfg.add_node(make_node(StmtKind::If)); let t = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let f = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, if_node, EdgeKind::Seq); cfg.add_edge(if_node, t, EdgeKind::True); cfg.add_edge(if_node, f, EdgeKind::False); cfg.add_edge(t, exit, EdgeKind::Seq); cfg.add_edge(f, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); let has_branch = ssa .blocks .iter() .any(|b| matches!(b.terminator, Terminator::Branch { .. })); assert!( has_branch, "normal 2-successor case must produce Branch, not Goto" ); } /// Regression: a block containing an explicit Return CFG node must /// terminate with [`Terminator::Return`], never [`Terminator::Goto`] /// to a synthetic exit block. Previously, the bookkeeping /// `Return → fn_exit` `Seq` edge made early-return blocks fall into /// the single-successor `Goto` arm, and the fall-through tail /// expression's body got merged into the shared exit block, every /// early-return path therefore appeared to also execute the tail. /// Mirrors the `if cond { return X; } Y` shape that motivated the fix. 
#[test] fn early_return_block_terminates_with_return_not_goto_to_exit() { let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); // Param-style external use (x is read by the if condition). let if_node = cfg.add_node(NodeInfo { taint: TaintMeta { uses: vec!["x".into()], ..Default::default() }, ..make_node(StmtKind::If) }); // True branch: return constant. uses=[] + const_text=Some triggers // the literal-return path, ensuring the block emits a synthetic // Const + Return(Some(_)), the same shape `return None` / // `return String::new()` produces in real Rust code. let early_ret = cfg.add_node(NodeInfo { taint: TaintMeta { const_text: Some("\"\"".to_string()), ..Default::default() }, ..make_node(StmtKind::Return) }); // False branch: tail expression that defines `y` (the implicit // function return value). let tail = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("y".into()), uses: vec!["x".into()], ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, if_node, EdgeKind::Seq); cfg.add_edge(if_node, early_ret, EdgeKind::True); cfg.add_edge(if_node, tail, EdgeKind::False); // Bookkeeping wire-up the real CFG construction performs in // `build_cfg`, Return / Throw → fn_exit via Seq, so the SSA // lowering has to handle it. cfg.add_edge(early_ret, exit, EdgeKind::Seq); cfg.add_edge(tail, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Locate the block containing the early-return CFG node and // assert it terminates with Return, not Goto(_) into the // shared exit block. 
let early_block = ssa .blocks .iter() .find(|b| { b.body .iter() .chain(b.phis.iter()) .any(|inst| inst.cfg_node == early_ret) }) .expect("early-return CFG node must live in some SSA block"); assert!( matches!(early_block.terminator, Terminator::Return(_)), "early-return block must terminate with Return, got {:?}", early_block.terminator ); assert!( early_block.succs.is_empty(), "early-return block must have no successors at the block level, \ got succs = {:?}", early_block.succs ); // The fall-through (tail) block must NOT have the early-return // block as a predecessor. Pre-fix, both the early-return path // and the tail path merged into the shared fn_exit block, so the // tail's body was reachable from the early-return path, that's // the merged-return defect. let tail_block = ssa .blocks .iter() .find(|b| { b.body .iter() .chain(b.phis.iter()) .any(|inst| inst.cfg_node == tail) }) .expect("tail CFG node must live in some SSA block"); let early_block_id = early_block.id; assert!( !tail_block.preds.contains(&early_block_id), "tail block must not have early-return block as a predecessor; \ merged-return defect would re-emerge. tail.preds = {:?}, \ early_block_id = {:?}", tail_block.preds, early_block_id ); } /// Regression: an OR-chain rejection arm such as /// `if a || b || c { return X; } Y` must have its rejection body emit a /// `Terminator::Return(_)` and have `succs.is_empty()`. Pre-fix the /// rejection body's String::new() Call shared a block whose only /// successor was the merged tail, losing the early-return semantics /// entirely and diluting per-return-path PathFact narrowing. 
#[test] fn or_chain_rejection_block_terminates_with_return() { use crate::cfg::build_cfg; let src = br#" fn sanitize_path(s: &str) -> String { if s.contains("..") || s.starts_with('/') || s.starts_with('\\') { return String::new(); } s.to_string() } "#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&tree_sitter::Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src.as_slice(), None).unwrap(); let file_cfg = build_cfg(&tree, src.as_slice(), "rust", "test.rs", None); let body = if file_cfg.bodies.len() > 1 { &file_cfg.bodies[1] } else { file_cfg.first_body() }; let cfg = &body.graph; let entry = body.entry; // Locate the Return CFG node sourced from the if-body and the tail // expression's Call node so the assertions are meaningful even if // block ordering shifts. let mut rejection_call: Option = None; for idx in cfg.node_indices() { let info = &cfg[idx]; if info.kind == StmtKind::Call { if let Some(callee) = &info.call.callee { if callee == "String::new" || callee.ends_with("String::new") { rejection_call = Some(idx); } } } } let rejection_call = rejection_call .expect("CFG must contain a String::new() Call node for the rejection arm"); let ssa = lower_to_ssa(cfg, entry, None, true).expect("SSA lowering should succeed"); // Find the SSA block containing the String::new() Call. This is // the rejection-arm block. 
let rejection_block = ssa .blocks .iter() .find(|b| { b.body .iter() .chain(b.phis.iter()) .any(|inst| inst.cfg_node == rejection_call) }) .expect("rejection-arm Call must live in some SSA block"); assert!( rejection_block.succs.is_empty(), "rejection-arm block must have no block-level successors after \ return-frontier strip; got succs = {:?}", rejection_block.succs ); assert!( matches!(rejection_block.terminator, Terminator::Return(_)), "rejection-arm block must terminate with Terminator::Return; got {:?}", rejection_block.terminator ); } /// Cross-language regression: the same merged-return defect that the Rust /// fix closed must not appear in C. The C OR-chain shape from /// `tests/benchmark/corpus/c/safe/safe_direct_path_sanitizer.c` has both /// a rejection arm (`return ""`) and a tail return (`return s`). Both /// must produce blocks whose terminator is `Terminator::Return(_)`. #[test] fn c_or_chain_both_return_arms_terminate_with_return() { use crate::cfg::build_cfg; let src = br#" const char *sanitize_path(const char *s) { if (strstr(s, "..") != NULL || s[0] == '/' || s[0] == '\\') { return ""; } return s; } "#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&tree_sitter::Language::from(tree_sitter_c::LANGUAGE)) .unwrap(); let tree = parser.parse(src.as_slice(), None).unwrap(); let file_cfg = build_cfg(&tree, src.as_slice(), "c", "test.c", None); let body = file_cfg.first_body(); let cfg = &body.graph; let entry = body.entry; let ssa = lower_to_ssa(cfg, entry, None, true).expect("SSA lowering should succeed"); let return_blocks: Vec<&SsaBlock> = ssa .blocks .iter() .filter(|b| matches!(b.terminator, Terminator::Return(_))) .collect(); assert!( return_blocks.len() >= 2, "Expected ≥2 Return-terminated blocks (rejection arm + tail); got {}: {:?}", return_blocks.len(), ssa.blocks .iter() .map(|b| (b.id, &b.terminator)) .collect::>() ); // Each Return-terminated block must have an empty successor list // (no fall-through past Return). 
for b in &return_blocks { assert!( b.succs.is_empty(), "Return-terminated block id={:?} has succs={:?}", b.id, b.succs ); } } // ───────────────────────────────────────────────────────────────── // FieldProj chain lowering tests // ───────────────────────────────────────────────────────────────── // // These tests pin the contract that `try_lower_field_proj_chain` // emits a `FieldProj` chain for chained-receiver method calls // (`a.b.c.method()`) and bails (preserving the existing single-Call // lowering) for everything else. Per-language end-to-end coverage // lives below in `phase2_e2e_*` tests; the unit tests here pin the // helper's behaviour without going through tree-sitter. /// Build a freshly-allocated empty SSA scratch state suitable for /// invoking `try_lower_field_proj_chain` in isolation. Returns /// `(var_stacks, field_interner, ssa_blocks, value_defs, next_value)`. fn fresh_proj_scratch() -> ( std::collections::HashMap>, crate::ssa::ir::FieldInterner, Vec, Vec, u32, ) { let blocks = vec![SsaBlock { id: BlockId(0), phis: Vec::new(), body: Vec::new(), terminator: Terminator::Unreachable, preds: SmallVec::new(), succs: SmallVec::new(), }]; ( std::collections::HashMap::new(), crate::ssa::ir::FieldInterner::new(), blocks, Vec::new(), 0, ) } /// Seed a single SSA value `SsaValue(0)` for `name` so the chain /// helper's base lookup succeeds. fn seed_var( var_stacks: &mut std::collections::HashMap>, value_defs: &mut Vec, next_value: &mut u32, name: &str, ) -> SsaValue { let v = SsaValue(*next_value); *next_value += 1; value_defs.push(ValueDef { var_name: Some(name.into()), cfg_node: NodeIndex::new(0), block: BlockId(0), }); var_stacks.entry(name.into()).or_default().push(v); v } #[test] fn try_lower_field_proj_chain_too_few_segments_returns_none() { // 0 dots: bare callee → no chain. 
let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); seed_var(&mut vs, &mut defs, &mut nv, "obj"); assert!( try_lower_field_proj_chain( "foo", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .is_none() ); // 1 dot: simple receiver, NOT decomposed (existing receiver channel // already handles `obj.method()` calls). assert!( try_lower_field_proj_chain( "obj.method", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .is_none() ); // No FieldProj instructions emitted; interner stays empty. assert!(blocks[0].body.is_empty()); assert!(interner.is_empty()); } #[test] fn try_lower_field_proj_chain_complex_token_returns_none() { // Each of these contains a token signaling complexity that breaks // the simple `....` shape; helper must bail. let cases = [ "Foo::bar::baz", // Rust path "ptr->field.f", // C-style arrow "obj.f().g", // intermediate call "vec[0].field", // index expression "obj.f.", // template-ish "obj.f g", // whitespace "obj?.f.g", // optional chain ]; let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); seed_var(&mut vs, &mut defs, &mut nv, "obj"); for s in &cases { assert!( try_lower_field_proj_chain( s, &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .is_none(), "expected bail on complex callee {s}" ); } assert!(blocks[0].body.is_empty()); assert!(interner.is_empty()); } #[test] fn try_lower_field_proj_chain_unknown_base_returns_none() { // The chain root must be a known SSA variable; otherwise the chain // root SSA value is unrecoverable and we must fall back. let (vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); // "ghost" intentionally not seeded. 
assert!( try_lower_field_proj_chain( "ghost.f.method", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .is_none() ); assert!(blocks[0].body.is_empty()); assert!(interner.is_empty()); } #[test] fn try_lower_field_proj_chain_basic_two_dots_emits_one_proj() { // `c.mu.Lock()` → emit one FieldProj, return (v_mu, "Lock"). let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); let v_c = seed_var(&mut vs, &mut defs, &mut nv, "c"); let (recv, method) = try_lower_field_proj_chain( "c.mu.Lock", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (10, 20), ) .expect("chain decomposition should succeed"); // The returned receiver is a NEW SsaValue (one past v_c). assert_eq!(recv, SsaValue(1)); assert_eq!(method, "Lock"); // Exactly one FieldProj op was emitted. assert_eq!(blocks[0].body.len(), 1); let inst = &blocks[0].body[0]; match &inst.op { SsaOp::FieldProj { receiver, field, projected_type, } => { assert_eq!(*receiver, v_c); assert_eq!(interner.resolve(*field), "mu"); assert!(projected_type.is_none()); } other => panic!("expected FieldProj, got {other:?}"), } // Span propagated to the FieldProj instruction. assert_eq!(inst.span, (10, 20)); assert_eq!(inst.var_name.as_deref(), Some("c.mu")); // value_defs has an entry for the new SSA value. assert_eq!(defs.last().unwrap().var_name.as_deref(), Some("c.mu")); } #[test] fn try_lower_field_proj_chain_three_dots_emits_two_projs_chained() { // `c.writer.header.set` → 2 FieldProj ops, chained: v_writer reads c, // v_header reads v_writer. 
let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); let v_c = seed_var(&mut vs, &mut defs, &mut nv, "c"); let (recv, method) = try_lower_field_proj_chain( "c.writer.header.set", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .expect("chain decomposition should succeed"); assert_eq!(method, "set"); assert_eq!(recv, SsaValue(2)); // v_c=0, v_writer=1, v_header=2 assert_eq!(blocks[0].body.len(), 2, "expected 2 FieldProj ops"); match &blocks[0].body[0].op { SsaOp::FieldProj { receiver, field, .. } => { assert_eq!(*receiver, v_c); assert_eq!(interner.resolve(*field), "writer"); } other => panic!("expected FieldProj, got {other:?}"), } match &blocks[0].body[1].op { SsaOp::FieldProj { receiver, field, .. } => { assert_eq!(*receiver, SsaValue(1)); // chained on v_writer assert_eq!(interner.resolve(*field), "header"); } other => panic!("expected FieldProj, got {other:?}"), } // var_names form a readable chain assert_eq!(blocks[0].body[0].var_name.as_deref(), Some("c.writer")); assert_eq!( blocks[0].body[1].var_name.as_deref(), Some("c.writer.header") ); } #[test] fn try_lower_field_proj_chain_dedupes_field_names() { // Two separate chains that share a field name should reuse the // same FieldId via the per-body interner. let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); let v_a = seed_var(&mut vs, &mut defs, &mut nv, "a"); let v_b = seed_var(&mut vs, &mut defs, &mut nv, "b"); let _ = try_lower_field_proj_chain( "a.shared.f", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .unwrap(); let _ = try_lower_field_proj_chain( "b.shared.g", &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .unwrap(); // Two FieldProj insts emitted, both pointing at the same FieldId. assert_eq!(blocks[0].body.len(), 2); let f0 = match &blocks[0].body[0].op { SsaOp::FieldProj { field, .. 
} => *field, _ => panic!(), }; let f1 = match &blocks[0].body[1].op { SsaOp::FieldProj { field, .. } => *field, _ => panic!(), }; assert_eq!(f0, f1, "dedup should reuse FieldId"); assert_eq!(interner.len(), 1, "only one unique field name interned"); let _ = (v_a, v_b); } #[test] fn try_lower_field_proj_chain_rejects_empty_segments() { // Defensive: leading/trailing/double dots are not a member chain. let (mut vs, mut interner, mut blocks, mut defs, mut nv) = fresh_proj_scratch(); seed_var(&mut vs, &mut defs, &mut nv, "x"); for s in [".x.f", "x..f", "x.f."] { assert!( try_lower_field_proj_chain( s, &vs, &mut interner, 0, BlockId(0), &mut nv, &mut blocks, &mut defs, NodeIndex::new(0), (0, 0), ) .is_none(), "expected bail on {s}" ); } assert!(blocks[0].body.is_empty()); } // ── End-to-end SSA decomposition tests via real tree-sitter parsing ────────── // // These exercise the integration between CFG construction (which sets // `info.call.callee = "c.mu.Lock"`) and SSA lowering. We assert that // the resulting SsaBody contains a `FieldProj` op whose interned name // matches the source-level field name. fn parse_to_first_body( src: &[u8], lang: &str, ts_lang: tree_sitter::Language, path: &str, ) -> SsaBody { let mut parser = tree_sitter::Parser::new(); parser.set_language(&ts_lang).unwrap(); let tree = parser.parse(src, None).unwrap(); let file_cfg = crate::cfg::build_cfg(&tree, src, lang, path, None); // Prefer the first non-top-level body (a function), fall back to top. let body = if file_cfg.bodies.len() > 1 { &file_cfg.bodies[1] } else { &file_cfg.bodies[0] }; // Mirror the production lowering path: function bodies use // lower_to_ssa_with_params so formal parameters get synthetic // Param/SelfParam injections at block 0, without them, the // FieldProj chain helper has no SSA root to anchor to. 
if body.meta.name.is_some() { let func_name = body.meta.name.clone().unwrap_or_default(); lower_to_ssa_with_params( &body.graph, body.entry, Some(&func_name), false, &body.meta.params, ) .expect("SSA lowering should succeed") } else { lower_to_ssa(&body.graph, body.entry, None, true).expect("SSA lowering should succeed") } } /// Iterate every FieldProj instance in `body` along with its resolved /// field name. fn collect_field_projs(body: &SsaBody) -> Vec<(SsaValue, SsaValue, String)> { let mut out = Vec::new(); for blk in &body.blocks { for inst in blk.phis.iter().chain(blk.body.iter()) { if let SsaOp::FieldProj { receiver, field, .. } = &inst.op { out.push((inst.value, *receiver, body.field_name(*field).to_string())); } } } out } /// Iterate every Call instance in `body` along with its callee + callee_text. fn collect_calls(body: &SsaBody) -> Vec<(String, Option, Option)> { let mut out = Vec::new(); for blk in &body.blocks { for inst in blk.body.iter() { if let SsaOp::Call { callee, callee_text, receiver, .. } = &inst.op { out.push((callee.clone(), callee_text.clone(), *receiver)); } } } out } #[test] fn phase2_e2e_go_chained_receiver_emits_field_proj() { // Go: `c.writer.header.set(k, v)`, 3-segment receiver, 2 FieldProjs. // Chain root `c` is a function parameter so it is resolvable. let src = b"package p\nfunc f(c *T, k string, v string) { c.writer.header.set(k, v) }\n"; let body = parse_to_first_body( src, "go", tree_sitter::Language::from(tree_sitter_go::LANGUAGE), "test.go", ); let projs = collect_field_projs(&body); assert!( projs.len() >= 2, "expected ≥2 FieldProj ops for c.writer.header.; got {projs:?}" ); // Field names match the source-level field structure. 
let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect(); assert!( names.contains(&"writer"), "missing 'writer' projection in {names:?}" ); assert!( names.contains(&"header"), "missing 'header' projection in {names:?}" ); // The Call op carries the bare method name and callee_text retains the path. let calls = collect_calls(&body); let bare = calls.iter().find(|(c, _, _)| c == "set"); assert!( bare.is_some(), "expected a Call with bare callee 'set'; got {calls:?}" ); let (_, ctext, recv) = bare.unwrap(); assert!(recv.is_some(), "decomposed call must carry an SSA receiver"); assert_eq!( ctext.as_deref(), Some("c.writer.header.set"), "callee_text should preserve the original textual path" ); } #[test] fn phase2_e2e_python_chained_receiver_emits_field_proj() { // Python: `obj.client.session.send(p)`, 3-segment receiver. let src = b"def f(obj, p):\n obj.client.session.send(p)\n"; let body = parse_to_first_body( src, "python", tree_sitter::Language::from(tree_sitter_python::LANGUAGE), "test.py", ); let projs = collect_field_projs(&body); let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect(); assert!( names.contains(&"client") && names.contains(&"session"), "expected client + session projections, got {names:?}" ); let calls = collect_calls(&body); assert!( calls.iter().any(|(c, ct, r)| c == "send" && ct.as_deref() == Some("obj.client.session.send") && r.is_some()), "expected bare 'send' Call with callee_text retained; got {calls:?}" ); } #[test] fn phase2_e2e_javascript_chained_receiver_emits_field_proj() { // JS: `obj.foo.bar.baz()`, 3-segment receiver. 
let src = b"function f(obj) { obj.foo.bar.baz(); }"; let body = parse_to_first_body( src, "javascript", tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE), "test.js", ); let projs = collect_field_projs(&body); let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect(); assert!( names.contains(&"foo") && names.contains(&"bar"), "expected foo + bar projections, got {names:?}" ); } #[test] fn phase2_e2e_java_chained_receiver_emits_field_proj() { // Java: `obj.config.handler.run()`, 3-segment receiver chain through // a parameter `obj`. We avoid `this.…` because `this` is a Java // keyword (not an identifier_node) so it isn't extracted as an // external use, outside SSA decomposition.s scope. let src = b"class C { void f(Object obj) { obj.config.handler.run(); } }"; let body = parse_to_first_body( src, "java", tree_sitter::Language::from(tree_sitter_java::LANGUAGE), "test.java", ); let projs = collect_field_projs(&body); let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect(); assert!( names.contains(&"config") && names.contains(&"handler"), "expected config + handler projections, got {names:?}; full body:\n{body}" ); let calls = collect_calls(&body); assert!( calls.iter().any(|(c, ct, r)| c == "run" && ct.as_deref() == Some("obj.config.handler.run") && r.is_some()), "expected bare 'run' Call with callee_text retained; got {calls:?}" ); } #[test] fn phase2_e2e_simple_receiver_no_field_proj() { // REGRESSION: `obj.foo()`, single-dot receiver. SSA lowering must NOT // decompose this into a FieldProj chain (existing receiver channel // already covers it). Verify the body has zero FieldProj ops and // the Call's callee_text stays None. 
let src = b"function f(obj) { obj.foo(); }"; let body = parse_to_first_body( src, "javascript", tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE), "test.js", ); assert!( collect_field_projs(&body).is_empty(), "single-dot call should not generate FieldProj" ); let calls = collect_calls(&body); assert!( calls.iter().any(|(_, ct, _)| ct.is_none()), "single-dot Call should have callee_text=None; calls={calls:?}" ); } #[test] fn phase2_e2e_bare_call_no_field_proj() { // REGRESSION: a free-function call `foo()` must produce zero // FieldProj ops and an empty per-body interner. let src = b"function f() { foo(1, 2); }"; let body = parse_to_first_body( src, "javascript", tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE), "test.js", ); assert!(collect_field_projs(&body).is_empty()); assert!( body.field_interner.is_empty(), "no chain → interner stays empty" ); } #[test] fn phase2_e2e_global_root_chain_still_emits_field_proj() { // REGRESSION-NEGATIVE: when the chain root is a global identifier // (`Math.foo.bar()`), the lowerer's external-var synthesis makes // `Math` available as a synthetic Param, the chain still // decomposes, treating `Math` as the SSA receiver. This is the // semantically correct outcome even for global-rooted chains: the // FieldProj op precisely captures the field-access structure. let src = b"function f() { Math.foo.bar(); }"; let body = parse_to_first_body( src, "javascript", tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE), "test.js", ); let projs = collect_field_projs(&body); let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect(); assert!( names.contains(&"foo"), "expected 'foo' projection (chain root Math is a synthesized external var); got {names:?}" ); } #[test] fn phase2_e2e_rust_method_call_through_field_emits_field_proj() { // Rust: `c.mu.lock()`, `c` is a function parameter, `mu` is a field, // `lock` is the method. Verifies we generate FieldProj for `mu`. 
// (Rust paths like `std::env::var` use `::` and are excluded by
    // the helper's complex-token check.)
    let src = b"fn f(c: &T) { c.mu.lock(); }";
    let body = parse_to_first_body(
        src,
        "rust",
        tree_sitter::Language::from(tree_sitter_rust::LANGUAGE),
        "test.rs",
    );
    let projs = collect_field_projs(&body);
    let names: Vec<&str> = projs.iter().map(|(_, _, n)| n.as_str()).collect();
    assert!(
        names.contains(&"mu"),
        "expected 'mu' projection from c.mu.lock(); got {names:?}; body:\n{body}"
    );
    let calls = collect_calls(&body);
    assert!(
        calls
            .iter()
            .any(|(c, ct, r)| c == "lock" && ct.as_deref() == Some("c.mu.lock") && r.is_some()),
        "expected bare 'lock' Call with callee_text='c.mu.lock'; got {calls:?}"
    );
}

/// Regression test: lowering a Rust function whose only call is the path
/// expression `std::env::var("X")` must leave `collect_field_projs` empty.
#[test]
fn phase2_e2e_rust_path_call_does_not_emit_field_proj() {
    // REGRESSION: `std::env::var(...)` is a Rust path (uses `::`), NOT
    // a member-access chain. Helper must bail.
    let src = br#"fn f() { let _ = std::env::var("X"); }"#;
    let body = parse_to_first_body(
        src,
        "rust",
        tree_sitter::Language::from(tree_sitter_rust::LANGUAGE),
        "test.rs",
    );
    assert!(
        collect_field_projs(&body).is_empty(),
        "Rust path expression must not be decomposed into FieldProj"
    );
}

/// Lowers two JavaScript functions, one calling `o.a.b.c()` and one calling
/// `o.foo()`, and checks `field_interner` is non-empty for the first and
/// empty for the second.
#[test]
fn phase2_e2e_field_interner_populated_only_when_chain_emitted() {
    // Helper invariant: a body with a chained call has a non-empty
    // interner; a body with no chained calls has an empty interner.
    let src_chain = b"function f(o) { o.a.b.c(); }";
    let src_plain = b"function f(o) { o.foo(); }";
    let body_chain = parse_to_first_body(
        src_chain,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
        "test.js",
    );
    let body_plain = parse_to_first_body(
        src_plain,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
        "test.js",
    );
    assert!(
        !body_chain.field_interner.is_empty(),
        "interner should hold the chain field names"
    );
    assert!(
        body_plain.field_interner.is_empty(),
        "single-dot call should not populate interner"
    );
}

/// Lowers `c.a.b.m()` and checks that exactly two field projections are
/// collected and that the second one's receiver is the first one's value.
#[test]
fn phase2_e2e_field_proj_chain_preserves_receiver_dataflow() {
    // The FieldProj receiver chain must trace back to the chain root
    // (parameter `c` here) via `uses_iter()`. This is the contract
    // every downstream consumer relies on for taint propagation.
    let src = b"function f(c) { c.a.b.m(); }";
    let body = parse_to_first_body(
        src,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
        "test.js",
    );
    let projs = collect_field_projs(&body);
    assert_eq!(projs.len(), 2, "expected 2 FieldProj ops, got {projs:?}");
    // The first FieldProj's receiver should be a parameter or external
    // var; the second FieldProj's receiver should be the first
    // FieldProj's value. Only the second link is asserted below; the
    // root receiver of the first projection is not checked here.
    let v_first = projs[0].0;
    let r_second = projs[1].1;
    assert_eq!(
        r_second, v_first,
        "second FieldProj must chain off the first's value"
    );
}

/// End-to-end: lowering an `obj.f = rhs` statement populates
/// `SsaBody.field_writes` with the synthetic base-update Assign's
/// `(receiver, FieldId)` mapping. A single-write shape suffices:
/// every formal gets a Param op at block 0, so the first write
/// finds the formal in `var_stacks`.
#[test]
fn w1_end_to_end_field_write_records_side_table_when_parent_has_prior_def() {
    // Single write to `obj.cache`: the formal `obj` provides the
    // prior reaching def via the synthetic Param at block 0.
    let src = b"function f(obj) { obj.cache = 42; }";
    let body = parse_to_first_body(
        src,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
        "test.js",
    );
    assert!(
        !body.field_writes.is_empty(),
        "single `obj.cache = 42` on a JS formal must populate \
         field_writes via the formal's W1.b synthetic Param; got \
         body.field_writes={:?}\nbody:\n{body}",
        body.field_writes,
    );
    // Every recorded field name resolves to "cache".
    for (_rcv, fid) in body.field_writes.values() {
        assert_eq!(body.field_interner.resolve(*fid), "cache");
    }
}

/// W1.b: Python, single `obj.cache = 42` on a formal also
/// populates `field_writes` thanks to the formal Param op.
#[test]
fn w1b_single_write_records_field_write_python() {
    let src = b"def f(obj):\n obj.cache = 42\n";
    let body = parse_to_first_body(
        src,
        "python",
        tree_sitter::Language::from(tree_sitter_python::LANGUAGE),
        "test.py",
    );
    assert!(
        !body.field_writes.is_empty(),
        "Python single `obj.cache = 42` must populate field_writes; \
         got body.field_writes={:?}\nbody:\n{body}",
        body.field_writes,
    );
}

/// W1.b: Rust, single `obj.cache = 42` on a method-style formal
/// (`fn f(obj: &mut O)`) also populates `field_writes`.
#[test]
fn w1b_single_write_records_field_write_rust() {
    let src = b"struct O { cache: i32 } fn f(obj: &mut O) { obj.cache = 42; }";
    let body = parse_to_first_body(
        src,
        "rust",
        tree_sitter::Language::from(tree_sitter_rust::LANGUAGE),
        "test.rs",
    );
    assert!(
        !body.field_writes.is_empty(),
        "Rust single `obj.cache = 42` must populate field_writes; \
         got body.field_writes={:?}\nbody:\n{body}",
        body.field_writes,
    );
}

/// REGRESSION: when the body takes a real handler-named formal
/// (`userId`), that formal must NOT end up in
/// `synthetic_externals` — the JS/TS / Java auto-seed pass relies
/// on this distinction to seed only real formals as
/// `Source(UserInput)` and skip closure captures. Companion
/// integration coverage for the empty-formals shape (arrow
/// `() => {…}` lifting bubbled-up free vars as synthetic) lives
/// in `tests/fixtures/fp_guards/framework_jest_test_callback_arrow/`
/// — that fixture exercises the full CFG construction path which
/// this unit test cannot reproduce in isolation.
#[test]
fn arrow_with_handler_formal_keeps_param_non_synthetic() {
    // Hand-built CFG: Entry → (node that uses `userId`) → Exit, with every
    // node tagged as belonging to the function `lookup`.
    let mut cfg: Cfg = Graph::new();
    let entry = cfg.add_node(NodeInfo {
        ast: crate::cfg::AstMeta {
            enclosing_func: Some("lookup".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Entry)
    });
    let use_node = cfg.add_node(NodeInfo {
        taint: TaintMeta {
            uses: vec!["userId".into()],
            ..Default::default()
        },
        ast: crate::cfg::AstMeta {
            enclosing_func: Some("lookup".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Seq)
    });
    let exit = cfg.add_node(NodeInfo {
        ast: crate::cfg::AstMeta {
            enclosing_func: Some("lookup".into()),
            ..Default::default()
        },
        ..make_node(StmtKind::Exit)
    });
    cfg.add_edge(entry, use_node, EdgeKind::Seq);
    cfg.add_edge(use_node, exit, EdgeKind::Seq);
    // `userId` is passed as a declared formal, not discovered as a free var.
    let formals = vec!["userId".to_string()];
    let body = lower_to_ssa_with_params(&cfg, entry, Some("lookup"), false, &formals)
        .expect("SSA lowering should succeed");
    // Locate the Param instruction for `userId` in the first block.
    let user_id_param = body
        .blocks
        .first()
        .and_then(|b| {
            b.body.iter().find(|inst| {
                matches!(inst.op, SsaOp::Param { .. }) && inst.var_name.as_deref() == Some("userId")
            })
        })
        .expect("userId Param should be present");
    assert!(
        !body.synthetic_externals.contains(&user_id_param.value),
        "real formal `userId` must not be marked synthetic; \
         synthetic_externals={:?}",
        body.synthetic_externals,
    );
}

/// W1: a plain non-dotted assignment (`x = 1`) records nothing
/// in `field_writes`. Strict-additive: existing behaviour is
/// unchanged for non-field-write shapes.
#[test] fn w1_end_to_end_plain_assign_records_no_field_write() { let src = b"function f() { let x = 1; x = 2; }"; let body = parse_to_first_body( src, "javascript", tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE), "test.js", ); assert!( body.field_writes.is_empty(), "plain assign must not populate field_writes; got {:?}", body.field_writes, ); } // ───────────────────────────────────────────────────────────────── // SSA edge cases: loop induction, multi-variable phis, multiple // returns, switch-cases, and shadowing. These plug holes in the // dominator-frontier / variable-renaming coverage. // ───────────────────────────────────────────────────────────────── /// Loop induction variable: `x = x + 1` inside a loop is the /// canonical SSA challenge, the body uses `x` then redefines it, /// and the join with the entry definition must produce a phi that /// distinguishes the entry value from the body's redefinition. /// Induction-var pruning depends on this shape being lowered /// correctly. #[test] fn loop_self_assignment_induction_phi_is_distinct() { // Entry → x=0 → Loop header → [Body: use x; x = x_new] → Loop // The body both uses and defines x, modeling `x = x + 1`. 
let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let init_x = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let header = cfg.add_node(make_node(StmtKind::Loop)); let body = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), uses: vec!["x".into()], ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, init_x, EdgeKind::Seq); cfg.add_edge(init_x, header, EdgeKind::Seq); cfg.add_edge(header, body, EdgeKind::True); cfg.add_edge(body, header, EdgeKind::Back); cfg.add_edge(header, exit, EdgeKind::False); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // We expect THREE distinct SSA values for `x`: // - init_x (entry value) // - body's redefinition // - the loop-header phi let x_defs: Vec<_> = ssa .value_defs .iter() .filter(|vd| vd.var_name.as_deref() == Some("x")) .collect(); assert!( x_defs.len() >= 3, "expected ≥3 SSA values for x (init, phi, body-redef), got {}", x_defs.len() ); // The header's phi for x must have exactly two operands (entry // value + back-edge value) and they must NOT both be the same // SsaValue (otherwise the renaming collapsed the two arms). let phi_ops = ssa .blocks .iter() .flat_map(|b| b.phis.iter()) .find(|p| p.var_name.as_deref() == Some("x")) .and_then(|p| match &p.op { SsaOp::Phi(ops) => Some(ops.clone()), _ => None, }) .expect("expected a Phi op for x at the loop header"); assert_eq!( phi_ops.len(), 2, "loop header phi for x should have 2 operands, got {}", phi_ops.len() ); let unique: HashSet<_> = phi_ops.iter().map(|(_, v)| v).collect(); assert_eq!( unique.len(), 2, "phi operands must be distinct (entry vs back-edge), got {:?}", phi_ops ); } /// Diamond join with two distinct variables defined in both arms: /// the merge block must contain a phi for EACH of the variables, /// not just one. 
Guards against single-variable phi insertion. #[test] fn diamond_join_produces_phi_per_variable() { // Entry → cond → [True: x=1; y=10] → join // ↘ [False: x=2; y=20] ↗ let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let cond = cfg.add_node(make_node(StmtKind::If)); let true_def = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let true_def2 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("y".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let false_def = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let false_def2 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("y".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let join = cfg.add_node(NodeInfo { taint: TaintMeta { uses: vec!["x".into(), "y".into()], ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, cond, EdgeKind::Seq); cfg.add_edge(cond, true_def, EdgeKind::True); cfg.add_edge(true_def, true_def2, EdgeKind::Seq); cfg.add_edge(true_def2, join, EdgeKind::Seq); cfg.add_edge(cond, false_def, EdgeKind::False); cfg.add_edge(false_def, false_def2, EdgeKind::Seq); cfg.add_edge(false_def2, join, EdgeKind::Seq); cfg.add_edge(join, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); let phi_vars: HashSet<&str> = ssa .blocks .iter() .flat_map(|b| b.phis.iter()) .filter_map(|p| p.var_name.as_deref()) .collect(); assert!( phi_vars.contains("x"), "expected phi for x at diamond join, got {:?}", phi_vars ); assert!( phi_vars.contains("y"), "expected phi for y at diamond join, got {:?}", phi_vars ); } /// Two reachable Return nodes from different branches must each /// produce a `Terminator::Return`. 
Common before: only the last /// CFG-Return survived as a real return, others were Goto'd to /// Exit. Regression for the early-return check. #[test] fn two_branches_with_returns_each_terminates_with_return() { // Entry → cond → [True: r1=1; return r1] / [False: r2=2; return r2] let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let cond = cfg.add_node(make_node(StmtKind::If)); let r1 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("r1".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let ret1 = cfg.add_node(NodeInfo { taint: TaintMeta { uses: vec!["r1".into()], ..Default::default() }, ..make_node(StmtKind::Return) }); let r2 = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("r2".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let ret2 = cfg.add_node(NodeInfo { taint: TaintMeta { uses: vec!["r2".into()], ..Default::default() }, ..make_node(StmtKind::Return) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, cond, EdgeKind::Seq); cfg.add_edge(cond, r1, EdgeKind::True); cfg.add_edge(r1, ret1, EdgeKind::Seq); cfg.add_edge(ret1, exit, EdgeKind::Seq); cfg.add_edge(cond, r2, EdgeKind::False); cfg.add_edge(r2, ret2, EdgeKind::Seq); cfg.add_edge(ret2, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Count blocks ending with `Terminator::Return(_)`. let return_blocks = ssa .blocks .iter() .filter(|b| matches!(&b.terminator, Terminator::Return(_))) .count(); assert_eq!( return_blocks, 2, "expected 2 Return-terminated blocks, got {}", return_blocks ); } /// Variable defined ONLY in one branch of a conditional must be /// undef on the other path. The phi at the join should include an /// undef sentinel for the missing arm, guards against the /// renamer silently dropping the missing operand. 
#[test] fn conditional_define_only_one_arm_phi_has_undef_operand() { // Entry → cond → [True: x=1] → join (uses x) // ↘ [False: nop] ↗ let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let cond = cfg.add_node(make_node(StmtKind::If)); let true_def = cfg.add_node(NodeInfo { taint: TaintMeta { defines: Some("x".into()), ..Default::default() }, ..make_node(StmtKind::Seq) }); let false_nop = cfg.add_node(make_node(StmtKind::Seq)); let join = cfg.add_node(NodeInfo { taint: TaintMeta { uses: vec!["x".into()], ..Default::default() }, ..make_node(StmtKind::Seq) }); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, cond, EdgeKind::Seq); cfg.add_edge(cond, true_def, EdgeKind::True); cfg.add_edge(true_def, join, EdgeKind::Seq); cfg.add_edge(cond, false_nop, EdgeKind::False); cfg.add_edge(false_nop, join, EdgeKind::Seq); cfg.add_edge(join, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Find a phi for x and verify it has 2 operands. The "undef" // operand can manifest as a Nop-defined SsaValue or a sentinel //, both are acceptable; the invariant is that arity == preds. let x_phi_ops = ssa .blocks .iter() .flat_map(|b| b.phis.iter()) .find(|p| p.var_name.as_deref() == Some("x")) .and_then(|p| match &p.op { SsaOp::Phi(ops) => Some(ops.clone()), _ => None, }); if let Some(ops) = x_phi_ops { assert_eq!( ops.len(), 2, "phi for x at the join must have 2 operands (one per pred), got {}", ops.len() ); } // Acceptable alternative: SSA may skip phi insertion when one // arm is undef. The invariant we care about is that lowering // doesn't panic, which `lower_to_ssa(...).unwrap()` already // exercises. } /// `lower_to_ssa` on a CFG with NO definitions of any variable /// must still succeed and produce a body with at least entry/exit /// blocks. Regression for trivial-function lowering. 
#[test] fn empty_function_body_only_entry_and_exit_lowers_cleanly() { let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let exit = cfg.add_node(make_node(StmtKind::Exit)); cfg.add_edge(entry, exit, EdgeKind::Seq); let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); assert!( !ssa.blocks.is_empty(), "even an empty body should produce at least one block" ); // No phis (nothing converged), no value_defs except possibly // entry sentinels. We just assert it lowered without panic. } }