nyx/src/ssa/param_points_to.rs
Eli Peter 1f2bfe76c1
docs: Enhance module documentation across various files for clarity a… (#62)
* docs: Enhance module documentation across various files for clarity and completeness

* fix: Remove unnecessary blank line in build.rs for cleaner code

* docs: Update documentation to improve clarity and consistency in code comments
2026-05-02 17:46:45 -04:00

654 lines
25 KiB
Rust

//! Parameter-granularity points-to analysis.
//!
//! Produces a [`PointsToSummary`] for a function body by walking the SSA
//! once and recording two classes of aliasing:
//!
//! 1. **Param → Param field writes.** An `obj.field = val` where `obj`
//! traces back to parameter `b` and `val` traces back to parameter `a`
//! emits a `Param(a) → Param(b)` `MayAlias` edge. This captures the
//! `mutating_helper` pattern, the callee mutates a shared heap cell
//! through one parameter and the caller observes the mutation through
//! its argument for that parameter.
//!
//! 2. **Param → Return aliases.** `Terminator::Return(v)` where `v`
//! traces back to a parameter emits a `Param(i) → Return` edge. This
//! captures the `returned_alias` pattern, the callee returns its
//! argument unchanged and the caller treats the result as aliasing the
//! input.
//!
//! Field-write detection uses the existing SSA lowering convention: a
//! source-level `obj.x = val` is lowered to an `Assign` whose `var_name`
//! is the dotted path `"obj.x"`, plus synthetic parent-path Assigns that
//! propagate the write up to the base (`"obj"`). See
//! [`crate::ssa::lower`]'s "Synthetic base update" block for the
//! canonical source.
//!
//! The analysis is **flow-insensitive** and **bounded**: it does not
//! reason about path feasibility, and it stops adding edges once the
//! summary's `MAX_ALIAS_EDGES` cap is reached, the overflow flag is
//! the conservative fallback that callers honour.
use std::collections::{HashMap, HashSet};
use smallvec::SmallVec;
use crate::summary::points_to::{AliasKind, AliasPosition, PointsToSummary};
use crate::symbol::Lang;
use super::ir::{SsaBody, SsaOp, SsaValue, Terminator};
/// Map an SSA value back to its defining instruction's op.
///
/// Local to this module, the taint engine has its own `build_inst_map`
/// that also carries receiver info we do not need, and duplicating it
/// keeps this analysis independent of that private helper's shape.
fn build_op_map(ssa: &SsaBody) -> HashMap<SsaValue, SsaOp> {
let mut map = HashMap::with_capacity(ssa.num_values());
for block in &ssa.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
map.insert(inst.value, inst.op.clone());
}
}
map
}
/// Sibling of [`build_op_map`] that captures the optional `var_name`
/// recorded on each SSA instruction. Used alongside the op map so a
/// [`ParamHit`] can surface the underlying variable name for
/// formal-index resolution.
fn build_var_name_map(ssa: &SsaBody) -> HashMap<SsaValue, Option<String>> {
let mut map = HashMap::with_capacity(ssa.num_values());
for block in &ssa.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
map.insert(inst.value, inst.var_name.clone());
}
}
map
}
/// Information about an SSA `Param { index }` node needed to resolve
/// back to a caller-side positional index via formal-params lookup.
#[derive(Clone, Debug)]
struct ParamHit {
/// The `SsaOp::Param` index as lowered.
ssa_index: usize,
/// The parameter's variable name (from [`SsaInst::var_name`]). Used
/// to map back to the formal-declaration position, the caller's
/// `args[i]` slot is keyed by declaration position, not by SSA
/// index, and the two can disagree when a formal parameter is
/// skipped from SSA lowering (e.g., pure-output params).
var_name: Option<String>,
}
/// Walk Assign/Phi chains to find a backing `Param { index }` SSA op.
///
/// Returns the `SsaOp::Param`'s index *and* its var_name so callers can
/// resolve the formal-positional index via the name lookup table, the
/// two indices can disagree when SSA lowering skips a formal parameter
/// (never used as a read), shifting subsequent param indices down.
fn trace_to_param_hit(
v: SsaValue,
op_map: &HashMap<SsaValue, SsaOp>,
var_names: &HashMap<SsaValue, Option<String>>,
visited: &mut HashSet<SsaValue>,
) -> Option<ParamHit> {
if !visited.insert(v) {
return None;
}
match op_map.get(&v)? {
SsaOp::Param { index } => Some(ParamHit {
ssa_index: *index,
var_name: var_names.get(&v).cloned().flatten(),
}),
SsaOp::Assign(uses) => {
for u in uses {
if let Some(hit) = trace_to_param_hit(*u, op_map, var_names, visited) {
return Some(hit);
}
}
None
}
SsaOp::Phi(operands) => {
for (_, pv) in operands {
if let Some(hit) = trace_to_param_hit(*pv, op_map, var_names, visited) {
return Some(hit);
}
}
None
}
// Call produces a fresh identity; Const / Source / CatchParam /
// SelfParam / Nop are not param-derived.
_ => None,
}
}
/// Resolve a [`ParamHit`] to a caller-side positional index using the
/// formal-params name lookup. Falls back to the SSA `index` when no
/// name-based match exists (e.g., extractor called without
/// `formal_param_names`).
fn param_hit_to_formal_index(hit: &ParamHit, params_by_name: &HashMap<String, usize>) -> usize {
if let Some(name) = &hit.var_name
&& let Some(&idx) = params_by_name.get(name)
{
return idx;
}
hit.ssa_index
}
/// Parse the base of a dotted / indexed path into its root name.
///
/// * `"obj"` → `"obj"`
/// * `"obj.field"` → `"obj"`
/// * `"obj.field.sub"` → `"obj"`
/// * `"obj[0]"` → `"obj"`
/// * `"obj.list[2].name"` → `"obj"`
///
/// Used to decide whether a field-style Assign's LHS base names a
/// parameter variable, we strip everything after the first separator
/// and compare the remainder to the recorded param names.
fn base_of_path(name: &str) -> &str {
let dot = name.find('.');
let bracket = name.find('[');
let end = match (dot, bracket) {
(Some(d), Some(b)) => d.min(b),
(Some(d), None) => d,
(None, Some(b)) => b,
(None, None) => return name,
};
&name[..end]
}
/// Local receiver check duplicated to avoid depending on private
/// `lower::is_receiver_name`. Must stay in sync with that helper.
fn is_receiver_name_local(name: &str) -> bool {
matches!(name, "self" | "this")
}
/// Walk Assign/Phi chains from a return value to decide whether the path
/// ends at a fresh container allocation (literal or constructor call).
///
/// Returns `true` the first time a qualifying allocation is found.
/// Parameter-terminated paths, `Call` ops that are not container
/// constructors, and constants that are not container literals all
/// return `false`, soundly under-approximating, since the caller will
/// simply fall back to the existing `Param(i) → Return` / store-into-
/// heap channels when the flag is absent.
fn trace_to_fresh_alloc(
v: SsaValue,
op_map: &HashMap<SsaValue, SsaOp>,
lang: Option<Lang>,
visited: &mut HashSet<SsaValue>,
) -> bool {
if !visited.insert(v) {
return false;
}
let Some(op) = op_map.get(&v) else {
return false;
};
match op {
SsaOp::Const(Some(text)) => crate::ssa::heap::is_container_literal_public(text),
SsaOp::Call { callee, .. } => lang
.map(|l| crate::ssa::heap::is_container_constructor(callee, l))
.unwrap_or(false),
SsaOp::Assign(uses) => uses
.iter()
.any(|u| trace_to_fresh_alloc(*u, op_map, lang, visited)),
SsaOp::Phi(operands) => operands
.iter()
.any(|(_, pv)| trace_to_fresh_alloc(*pv, op_map, lang, visited)),
_ => false,
}
}
/// Whether any `Terminator::Return(Some(v))` in the body traces back to a
/// fresh container allocation. Invoked once per function; the visited
/// set is fresh per return block so distinct returns do not poison each
/// other's searches.
fn returns_fresh_allocation(
ssa: &SsaBody,
op_map: &HashMap<SsaValue, SsaOp>,
lang: Option<Lang>,
) -> bool {
for block in &ssa.blocks {
let Terminator::Return(Some(v)) = block.terminator else {
continue;
};
let mut visited = HashSet::new();
if trace_to_fresh_alloc(v, op_map, lang, &mut visited) {
return true;
}
}
false
}
/// Compute the parameter-granularity points-to summary for a function.
///
/// `param_info` carries one `(param_index, param_name, param_ssa_value)`
/// tuple per formal parameter that was emitted as [`SsaOp::Param`] in the
/// lowered body. The receiver is intentionally excluded, this table
/// captures positional parameters only.
///
/// `formal_param_names`, when supplied, is the authoritative list of
/// declared parameter names in declaration order. It matters for
/// **pure-output parameters**: a param like `target` in
/// `fn set(target, val): target.data = val` is never *used* in the body
/// (only assigned into), so SSA lowering does not emit a `Param` node
/// for it and `param_info` will not contain it. Falling back to
/// `formal_param_names` lets the base-name lookup still find its index.
///
/// `formal_param_count` bounds the parameter indices written to the
/// summary: scoped lowering synthesises `Param` ops for module-level
/// captures at indices beyond the formal arity, and those must not leak
/// into the summary (they would trip `ssa_summary_fits_arity`).
pub fn analyse_param_points_to(
ssa: &SsaBody,
param_info: &[(usize, String, SsaValue)],
formal_param_count: usize,
formal_param_names: Option<&[String]>,
lang: Option<Lang>,
) -> PointsToSummary {
let mut summary = PointsToSummary::empty();
let op_map = build_op_map(ssa);
let var_names = build_var_name_map(ssa);
// ── 0. Fresh-container return detection ─────────────────────────────
//
// A return path traces back to either:
// * `SsaOp::Const(text)` where `text` is a container literal
// (`[]`, `{}`, `new Map()`, …), OR
// * `SsaOp::Call { callee, … }` where `callee` matches a known
// container constructor for `lang` (`ArrayList`, `dict`, …).
//
// When at least one return path matches, the callee produces a
// caller-visible fresh heap identity on that path, callers
// synthesise a `HeapObjectId` keyed on the call result so later
// container operations have a stable heap cell. Traces that reach a
// parameter are handled by the edge-based `Param(i) → Return` channel
// below and do not contribute here; a mixed function emits both.
//
// Runs before the early-out on `formal_param_count == 0` so pure
// factories (zero-param container constructors) still record the
// fresh-alloc signal.
if returns_fresh_allocation(ssa, &op_map, lang) {
summary.returns_fresh_alloc = true;
}
if formal_param_count == 0 {
return summary;
}
// Build the name→positional-index map. Summary param indices are
// *positional*, they match the call-site `args[i]` position, which
// excludes the receiver (`self`/`this`). When `formal_param_names`
// contains a leading receiver, skip it so the remaining names align
// with the SSA `SsaOp::Param { index }` convention.
let mut params_by_name: HashMap<String, usize> = HashMap::new();
if let Some(names) = formal_param_names {
let mut pos: usize = 0;
for name in names {
if is_receiver_name_local(name) {
continue;
}
if pos >= formal_param_count {
break;
}
params_by_name.insert(name.clone(), pos);
pos += 1;
}
}
// Overlay `param_info` ONLY when formal_param_names was absent.
// When formal_param_names is supplied it is the authoritative
// declaration-order mapping; SSA param indices can legitimately
// diverge (a pure-output param is never emitted, shifting later
// indices down), so trusting SSA here would mis-map the caller's
// `args[i]` positional slot.
if formal_param_names.is_none() {
for (idx, name, _) in param_info {
params_by_name.insert(name.clone(), *idx);
}
}
// ── 1. Field-store alias edges (Param(a) → Param(b)) ────────────────
//
// SSA lowering encodes `obj.field = val` as one or more Assigns whose
// `var_name` is the dotted / indexed path. For every such Assign we
// look up the root name, check it matches a parameter variable, and
// trace each use back to a param for the `Param(a) → Param(b)` edge.
for block in &ssa.blocks {
for inst in block.body.iter() {
let SsaOp::Assign(uses) = &inst.op else {
continue;
};
let Some(name) = inst.var_name.as_ref() else {
continue;
};
// Only field/index-style writes encode the base in var_name;
// a plain `x = ...` doesn't imply aliasing with `x`'s param.
if !name.contains('.') && !name.contains('[') {
continue;
}
let base = base_of_path(name);
let Some(&target_idx) = params_by_name.get(base) else {
continue;
};
if target_idx >= formal_param_count {
continue;
}
for u in uses {
let mut visited = HashSet::new();
let Some(hit) = trace_to_param_hit(*u, &op_map, &var_names, &mut visited) else {
continue;
};
let src_idx = param_hit_to_formal_index(&hit, &params_by_name);
if src_idx >= formal_param_count {
continue;
}
if src_idx == target_idx {
// Self-alias is uninformative, the caller's
// arg-to-itself propagation is already covered by
// `param_to_return`/`param_to_sink`.
continue;
}
summary.insert(
AliasPosition::Param(src_idx as u32),
AliasPosition::Param(target_idx as u32),
AliasKind::MayAlias,
);
if summary.overflow {
return summary;
}
}
}
}
// ── 2. Return-alias edges (Param(i) → Return) ───────────────────────
//
// `Terminator::Return(v)` with `v` tracing back to a parameter means
// the call site's result aliases the corresponding argument's heap
// identity. Joining across all return blocks is a plain set union.
let mut return_param_indices: SmallVec<[usize; 4]> = SmallVec::new();
for block in &ssa.blocks {
let Terminator::Return(Some(v)) = block.terminator else {
continue;
};
let mut visited = HashSet::new();
if let Some(hit) = trace_to_param_hit(v, &op_map, &var_names, &mut visited) {
let idx = param_hit_to_formal_index(&hit, &params_by_name);
if idx < formal_param_count && !return_param_indices.contains(&idx) {
return_param_indices.push(idx);
}
}
}
for idx in return_param_indices {
summary.insert(
AliasPosition::Param(idx as u32),
AliasPosition::Return,
AliasKind::MayAlias,
);
if summary.overflow {
return summary;
}
}
summary
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ssa::ir::{BlockId, SsaBlock, SsaInst};
use petgraph::graph::NodeIndex;
use smallvec::smallvec;
fn mk_body(blocks: Vec<SsaBlock>, num_values: u32) -> SsaBody {
use crate::ssa::ir::ValueDef;
let value_defs = (0..num_values)
.map(|_| ValueDef {
var_name: None,
cfg_node: NodeIndex::new(0),
block: BlockId(0),
})
.collect();
SsaBody {
blocks,
entry: BlockId(0),
value_defs,
cfg_node_map: HashMap::new(),
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
fn inst(v: u32, op: SsaOp, var_name: Option<&str>) -> SsaInst {
SsaInst {
value: SsaValue(v),
op,
cfg_node: NodeIndex::new(0),
var_name: var_name.map(String::from),
span: (0, 0),
}
}
#[test]
fn field_write_param_to_param_emits_edge() {
// Simulate:
// fn f(a, b):
// b.data = a # Assign var_name="b.data" uses=[a_ssa]
// synthetic: b = b.data # Assign var_name="b" uses=[assign0]
// return
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![
inst(0, SsaOp::Param { index: 0 }, Some("a")),
inst(1, SsaOp::Param { index: 1 }, Some("b")),
inst(2, SsaOp::Assign(smallvec![SsaValue(0)]), Some("b.data")),
inst(3, SsaOp::Assign(smallvec![SsaValue(2)]), Some("b")),
],
terminator: Terminator::Return(None),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 4);
let pinfo = vec![
(0usize, "a".to_string(), SsaValue(0)),
(1usize, "b".to_string(), SsaValue(1)),
];
let s = analyse_param_points_to(&body, &pinfo, 2, None, None);
assert!(!s.overflow, "unexpected overflow: {s:?}");
assert!(
s.edges.iter().any(|e| e.source == AliasPosition::Param(0)
&& e.target == AliasPosition::Param(1)
&& e.kind == AliasKind::MayAlias),
"expected Param(0) → Param(1) edge, got {s:?}"
);
}
#[test]
fn return_alias_emits_edge() {
// fn f(a): return a
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![inst(0, SsaOp::Param { index: 0 }, Some("a"))],
terminator: Terminator::Return(Some(SsaValue(0))),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 1);
let pinfo = vec![(0usize, "a".to_string(), SsaValue(0))];
let s = analyse_param_points_to(&body, &pinfo, 1, None, None);
assert!(!s.overflow);
assert_eq!(s.edges.len(), 1);
assert_eq!(s.edges[0].source, AliasPosition::Param(0));
assert_eq!(s.edges[0].target, AliasPosition::Return);
}
#[test]
fn self_alias_is_dropped() {
// fn f(b): b.data = b_other_field (reading b.x and writing b.y)
// Both uses trace back to Param(0) and base is Param(0) →
// self-alias is uninformative, no edge emitted.
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![
inst(0, SsaOp::Param { index: 0 }, Some("b")),
inst(1, SsaOp::Assign(smallvec![SsaValue(0)]), Some("b.x")),
inst(2, SsaOp::Assign(smallvec![SsaValue(1)]), Some("b.data")),
],
terminator: Terminator::Return(None),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 3);
let pinfo = vec![(0usize, "b".to_string(), SsaValue(0))];
let s = analyse_param_points_to(&body, &pinfo, 1, None, None);
assert!(
s.is_empty(),
"self-alias edges should not be emitted: {s:?}"
);
}
#[test]
fn out_of_range_param_rejected() {
// Synthetic Param with index >= formal_param_count must not leak
// into the summary (it would trip ssa_summary_fits_arity).
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![
inst(0, SsaOp::Param { index: 5 }, Some("capture")),
inst(1, SsaOp::Param { index: 1 }, Some("b")),
inst(2, SsaOp::Assign(smallvec![SsaValue(0)]), Some("b.data")),
],
terminator: Terminator::Return(None),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 3);
let pinfo = vec![
(5usize, "capture".to_string(), SsaValue(0)),
(1usize, "b".to_string(), SsaValue(1)),
];
// formal_param_count = 2, index 5 is out of range.
let s = analyse_param_points_to(&body, &pinfo, 2, None, None);
assert!(
s.is_empty(),
"synthetic captures past formal arity must not emit edges: {s:?}"
);
}
#[test]
fn bounded_graph_overflows_at_cap() {
// Build MAX_ALIAS_EDGES+2 param→return edges by returning a Phi
// of every param. This exercises the overflow fallback.
let n = (crate::summary::points_to::MAX_ALIAS_EDGES + 2) as u32;
let mut insts = Vec::new();
let mut phi_operands: SmallVec<[(BlockId, SsaValue); 2]> = SmallVec::new();
for i in 0..n {
insts.push(inst(
i,
SsaOp::Param { index: i as usize },
Some(&format!("p{i}")),
));
phi_operands.push((BlockId(0), SsaValue(i)));
}
let phi_v = n;
insts.push(inst(phi_v, SsaOp::Phi(phi_operands), Some("ret")));
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: insts,
terminator: Terminator::Return(Some(SsaValue(phi_v))),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], n + 1);
let pinfo: Vec<(usize, String, SsaValue)> = (0..n as usize)
.map(|i| (i, format!("p{i}"), SsaValue(i as u32)))
.collect();
// Only the first traced param is emitted (trace_to_param short-
// circuits on first match), so overflow is not expected, we
// instead verify the bounded behaviour: a single edge.
let s = analyse_param_points_to(&body, &pinfo, n as usize, None, None);
assert!(!s.overflow);
assert_eq!(s.edges.len(), 1);
}
#[test]
fn fresh_container_literal_return_sets_flag() {
// fn makeBag() { return []; }
// v0 = Const("[]")
// terminator: Return(v0)
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![inst(0, SsaOp::Const(Some("[]".to_string())), None)],
terminator: Terminator::Return(Some(SsaValue(0))),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 1);
let s = analyse_param_points_to(&body, &[], 0, None, Some(Lang::JavaScript));
assert!(s.returns_fresh_alloc);
assert!(s.edges.is_empty());
}
#[test]
fn constructor_return_sets_flag() {
// fn makeList() { return list(); }
// v0 = Call("list", [])
// terminator: Return(v0)
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![inst(
0,
SsaOp::Call {
callee: "list".to_string(),
callee_text: None,
args: vec![],
receiver: None,
},
None,
)],
terminator: Terminator::Return(Some(SsaValue(0))),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 1);
let s = analyse_param_points_to(&body, &[], 0, None, Some(Lang::Python));
assert!(s.returns_fresh_alloc);
}
#[test]
fn return_of_param_does_not_set_fresh_flag() {
// fn identity(a) { return a; }
let block = SsaBlock {
id: BlockId(0),
phis: vec![],
body: vec![inst(0, SsaOp::Param { index: 0 }, Some("a"))],
terminator: Terminator::Return(Some(SsaValue(0))),
preds: smallvec![],
succs: smallvec![],
};
let body = mk_body(vec![block], 1);
let pinfo = vec![(0usize, "a".to_string(), SsaValue(0))];
let s = analyse_param_points_to(&body, &pinfo, 1, None, Some(Lang::JavaScript));
assert!(
!s.returns_fresh_alloc,
"param-only return must not set fresh-alloc flag"
);
// But the Param(0) → Return edge must still be emitted.
assert!(
s.edges
.iter()
.any(|e| e.source == AliasPosition::Param(0) && e.target == AliasPosition::Return),
"expected Param(0) → Return edge, got {s:?}"
);
}
}