nyx/src/summary/points_to.rs
Eli Peter 1f2bfe76c1
docs: Enhance module documentation across various files for clarity a… (#62)
* docs: Enhance module documentation across various files for clarity and completeness

* fix: Remove unnecessary blank line in build.rs for cleaner code

* docs: Update documentation to improve clarity and consistency in code comments
2026-05-02 17:46:45 -04:00

543 lines
20 KiB
Rust

//! Parameter-granularity points-to summaries.
//!
//! Captures the subset of intra-procedural alias behaviour that matters
//! at cross-file call sites: which parameters' heap/field writes are
//! observable to the caller through *another* parameter's alias, and
//! which parameters flow identity to the return value.
//!
//! ## Scope
//!
//! This is **intentionally not** a whole-program points-to analysis.
//! Nyx already has bounded intra-procedural heap tracking
//! ([`crate::ssa::heap`]); this module bridges the cross-file cliff by recording
//! a small, bounded alias graph between parameter positions and the return
//! value, then replaying it at summary-resolution time.
//!
//! ## Edge model
//!
//! Edges are directed `AliasEdge { source, target, kind }`:
//!
//! * `Source(Param(i)) → Target(Param(j))`: the callee stores data
//!   derived from parameter `i` into a field/element of parameter `j`.
//!   The mutation is observable to the caller through its argument for `j`.
//! * `Source(Param(i)) → Target(Return)`: the return value aliases
//!   parameter `i`'s heap identity. This adds heap-level precision on top of
//!   the coarser [`crate::summary::ssa_summary::TaintTransform::Identity`] view already carried in
//!   [`crate::summary::ssa_summary::SsaFuncSummary::param_to_return`].
//!
//! `MustAlias` is intentionally omitted: the ROI on
//! must-alias inference for cross-file summaries is low, and the soundness
//! story for `MayAlias`-only application is straightforward ("take the
//! union").
//!
//! ## Bound and overflow policy
//!
//! Edge count is capped at [`MAX_ALIAS_EDGES`]. When a callee's alias
//! graph exceeds the cap, the summary records `overflow = true` and
//! callers treat the function as "any tainted parameter may spread to
//! every other parameter and to the return": the conservative
//! greatest-lower-bound over the alias lattice.
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
/// One endpoint of an alias edge: a positional parameter or the return value.
///
/// A parameter is named by its 0-based positional index, as reported by
/// [`crate::ssa::ir::SsaOp::Param`]. The implicit receiver (`self`/`this`)
/// is tracked outside this table and intentionally has no variant here.
/// A function has a single return SSA value, so `Return` needs no extra
/// qualifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AliasPosition {
    /// The positional parameter at this 0-based index; never the receiver.
    Param(u32),
    /// The function's return value, i.e. the union over every
    /// `Terminator::Return`.
    Return,
}
/// How strongly two positions are known to alias.
///
/// [`AliasKind::MayAlias`] is the only kind that is emitted: the analysis
/// over-approximates identity-level aliasing and does not try to prove
/// must-alias facts. The type stays an enum so that a later extension
/// distinguishing the two can be added without migrating on-disk data.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub enum AliasKind {
    /// On at least one execution the two positions may refer to the same
    /// heap object. A caller applying the edge folds the points-to / taint
    /// of the source into the target by *union*.
    MayAlias,
}
/// One directed edge of the alias graph.
///
/// Direction matters: inside the callee, data flows from `source` into
/// `target`. A caller applies the edge by reading its own argument / return
/// abstraction for `source` and propagating that into `target`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct AliasEdge {
    /// Position the data flows from.
    pub source: AliasPosition,
    /// Position the data flows into.
    pub target: AliasPosition,
    /// Strength of the alias relation.
    pub kind: AliasKind,
}
/// Upper bound on the number of edges a [`PointsToSummary`] retains.
///
/// Sized so that typical callees (≤ 4 parameters, one return, a handful of
/// field writes) fit without approximation, while pathological graphs
/// still let the analysis finish in bounded time. Exceeding the cap sets
/// the [`PointsToSummary::overflow`] fallback rather than silently dropping
/// edges, so callers can still reason about soundness.
pub const MAX_ALIAS_EDGES: usize = 8;
/// Alias summary at parameter granularity, stored inside
/// [`crate::summary::ssa_summary::SsaFuncSummary`].
///
/// The default value is empty. A function with no parameter / return
/// aliasing (a pure transformer, or a sink that consumes its arguments
/// without mutating them) has no edges, and every field of an empty
/// summary is skipped during serialisation.
///
/// If the callee's alias graph grows past [`MAX_ALIAS_EDGES`], extraction
/// sets [`overflow = true`](Self::overflow). Callers must then assume that
/// every parameter reaches every other parameter and the return value: the
/// conservative fallback of a bounded alias analysis.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PointsToSummary {
    /// Bounded list of edges with no `(source, target, kind)` duplicates.
    ///
    /// `#[serde(default)]` allows summaries written before points-to
    /// tracking existed to deserialise cleanly, yielding no edges.
    #[serde(default, skip_serializing_if = "SmallVec::is_empty")]
    pub edges: SmallVec<[AliasEdge; 4]>,

    /// Conservative-fallback flag: set once extraction reached
    /// [`MAX_ALIAS_EDGES`] and declined to drop an edge silently.
    ///
    /// While `true`, callers treat the callee as "every parameter may alias
    /// every other parameter and the return value".
    #[serde(default, skip_serializing_if = "core::ops::Not::not")]
    pub overflow: bool,

    /// Set when at least one return path yields a *fresh* container
    /// allocation: a container literal (`[]`, `{}`) or a known container
    /// constructor call (`new Map()`, `list()`, …) that does not trace back
    /// to any parameter.
    ///
    /// When `true`, the caller synthesises a fresh
    /// [`crate::ssa::heap::HeapObjectId`] keyed on the call's SSA value and
    /// seeds it into `dynamic_pts`, so that later container operations on
    /// the call result (e.g. `bag[0]`, `fillBag(bag, …)`) have a heap cell
    /// to read from or store into.
    ///
    /// This closes the factory-pattern cross-file gap (`const bag =
    /// makeBag()`, then `fillBag(bag, env)` and `exec(bag[0])`) by giving
    /// the caller's heap analysis a stable identity to attach stores to.
    /// The flag combines freely with `Param(i) → Return` edges: a
    /// mixed-return function (one branch returns a parameter, another a
    /// fresh allocation) emits both, and the caller joins the two points-to
    /// sets.
    #[serde(default, skip_serializing_if = "core::ops::Not::not")]
    pub returns_fresh_alloc: bool,
}
impl PointsToSummary {
    /// Creates a summary with no edges and both flags cleared.
    ///
    /// Equivalent to [`Self::default`]; exists so call sites can state the
    /// intent explicitly.
    pub fn empty() -> Self {
        Self::default()
    }

    /// Returns `true` when this summary adds nothing over the default "no
    /// aliasing" interpretation: no edges, no overflow, and no fresh
    /// allocation on return.
    ///
    /// Used by extraction to decide whether the field should be persisted
    /// or left empty.
    pub fn is_empty(&self) -> bool {
        self.edges.is_empty() && !self.overflow && !self.returns_fresh_alloc
    }

    /// Inserts an edge while preserving deduplication and the
    /// [`MAX_ALIAS_EDGES`] bound.
    ///
    /// Returns `true` when the edge was added. Returns `false` when the
    /// summary had already overflowed, when the edge was a duplicate, or
    /// when adding it would exceed the cap (in which case `overflow` is set
    /// and the edge is not stored). The result may be ignored: the summary
    /// is left in a valid state on every path.
    pub fn insert(
        &mut self,
        source: AliasPosition,
        target: AliasPosition,
        kind: AliasKind,
    ) -> bool {
        // Once overflowed, the flag alone carries the meaning; stop growing.
        if self.overflow {
            return false;
        }
        let edge = AliasEdge {
            source,
            target,
            kind,
        };
        if self.edges.contains(&edge) {
            return false;
        }
        if self.edges.len() >= MAX_ALIAS_EDGES {
            // The edges already stored are kept. A consumer that still reads
            // the list sees a strict subset of the over-approximation that
            // `overflow` conveys; correctness rests on the flag, and the
            // residual edges are diagnostic only.
            self.overflow = true;
            return false;
        }
        self.edges.push(edge);
        true
    }

    /// Unions `other` into `self`.
    ///
    /// The `returns_fresh_alloc` and `overflow` flags are OR-ed. If either
    /// side has overflowed, no edges are copied, because [`Self::insert`]
    /// would reject them anyway. Otherwise each edge of `other` goes through
    /// [`Self::insert`], so the [`MAX_ALIAS_EDGES`] cap and its overflow
    /// promotion apply here as well.
    pub fn merge(&mut self, other: &Self) {
        self.returns_fresh_alloc |= other.returns_fresh_alloc;
        self.overflow |= other.overflow;
        if self.overflow {
            return;
        }
        for edge in &other.edges {
            self.insert(edge.source, edge.target, edge.kind);
        }
    }

    /// Returns the largest parameter index referenced by any edge, looking
    /// at both the source and the target endpoint, or `None` when no edge
    /// mentions a parameter.
    ///
    /// Used by `ssa_summary_fits_arity` to confirm the summary does not
    /// reference a parameter beyond the key's declared arity (which would
    /// indicate a synthetic-param mis-attribution in extraction).
    pub fn max_param_index(&self) -> Option<u32> {
        self.edges
            .iter()
            .flat_map(|edge| [edge.source, edge.target])
            .filter_map(|position| match position {
                AliasPosition::Param(index) => Some(index),
                AliasPosition::Return => None,
            })
            .max()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: record a `MayAlias` edge from `source` to `target`.
    fn link(summary: &mut PointsToSummary, source: AliasPosition, target: AliasPosition) {
        summary.insert(source, target, AliasKind::MayAlias);
    }

    /// Serialises `summary` to JSON text and parses that text back.
    fn json_round_trip(summary: &PointsToSummary) -> PointsToSummary {
        let text = serde_json::to_string(summary).unwrap();
        serde_json::from_str(&text).unwrap()
    }

    #[test]
    fn empty_summary_is_noop() {
        let summary = PointsToSummary::empty();
        assert!(summary.is_empty());
        assert!(!summary.overflow);
        assert_eq!(summary.edges.len(), 0);
    }

    #[test]
    fn insert_dedups() {
        let mut summary = PointsToSummary::empty();
        // The same edge twice must be stored once.
        for _ in 0..2 {
            link(
                &mut summary,
                AliasPosition::Param(0),
                AliasPosition::Param(1),
            );
        }
        assert_eq!(summary.edges.len(), 1);
    }

    #[test]
    fn insert_overflows_at_cap() {
        let mut summary = PointsToSummary::empty();
        // Fill the summary exactly to the cap with distinct edges.
        for index in 0..(MAX_ALIAS_EDGES as u32) {
            link(
                &mut summary,
                AliasPosition::Param(index),
                AliasPosition::Return,
            );
        }
        assert_eq!(summary.edges.len(), MAX_ALIAS_EDGES);
        assert!(!summary.overflow);

        // One more distinct edge trips the flag without growing the list.
        link(
            &mut summary,
            AliasPosition::Param(99),
            AliasPosition::Return,
        );
        assert!(summary.overflow);
        assert_eq!(summary.edges.len(), MAX_ALIAS_EDGES);
    }

    #[test]
    fn merge_propagates_overflow() {
        let mut target = PointsToSummary::empty();
        let mut incoming = PointsToSummary::empty();
        incoming.overflow = true;
        target.merge(&incoming);
        assert!(target.overflow);
    }

    #[test]
    fn max_param_index_tracks_both_endpoints() {
        let mut summary = PointsToSummary::empty();
        link(
            &mut summary,
            AliasPosition::Param(0),
            AliasPosition::Param(3),
        );
        link(&mut summary, AliasPosition::Param(1), AliasPosition::Return);
        assert_eq!(summary.max_param_index(), Some(3));
    }

    #[test]
    fn serde_round_trip_is_stable() {
        let mut summary = PointsToSummary::empty();
        link(
            &mut summary,
            AliasPosition::Param(0),
            AliasPosition::Param(1),
        );
        link(&mut summary, AliasPosition::Param(2), AliasPosition::Return);
        let restored = json_round_trip(&summary);
        assert_eq!(summary, restored);
    }

    #[test]
    fn serde_default_decodes_empty_object() {
        let decoded: PointsToSummary = serde_json::from_str("{}").unwrap();
        assert!(decoded.is_empty());
    }

    #[test]
    fn returns_fresh_alloc_is_not_empty() {
        let mut summary = PointsToSummary::empty();
        assert!(summary.is_empty());
        summary.returns_fresh_alloc = true;
        assert!(!summary.is_empty());
    }

    #[test]
    fn merge_propagates_fresh_alloc_flag() {
        let mut target = PointsToSummary::empty();
        let mut incoming = PointsToSummary::empty();
        incoming.returns_fresh_alloc = true;
        target.merge(&incoming);
        assert!(target.returns_fresh_alloc);
    }

    #[test]
    fn returns_fresh_alloc_roundtrips() {
        let mut summary = PointsToSummary::empty();
        summary.returns_fresh_alloc = true;
        let restored = json_round_trip(&summary);
        assert!(restored.returns_fresh_alloc);
        assert_eq!(summary, restored);
    }
}
// ── field-granularity points-to summary ──────────────
/// Maximum number of field names retained per parameter in
/// [`FieldPointsToSummary`].
///
/// Mirrors [`MAX_ALIAS_EDGES`]: it bounds on-disk size and cross-file work
/// while leaving room for typical helpers (a handful of fields each).
pub const MAX_FIELDS_PER_PARAM: usize = 8;
/// Field-granularity, per-parameter points-to summary.
///
/// For each positional parameter index this records the set of field
/// **names** the callee body reads from and writes to. Names are persisted
/// instead of [`crate::ssa::ir::FieldId`] values because field IDs are
/// body-local: the per-body [`crate::ssa::ir::FieldInterner`] reassigns IDs
/// across files. Callers re-intern the names through their own body's
/// interner before consulting `field_taint` cells.
///
/// The receiver (`self` / `this`) is recorded under a sentinel index in the
/// outer `Vec`, so positional parameters and the receiver share one
/// indexing scheme, alongside the separate `SsaFuncSummary::receiver_to_*`
/// channel.
/// NOTE(review): this doc previously named `usize::MAX` as the sentinel, but
/// the index stored here is a `u32`; presumably the value is `u32::MAX`.
/// Confirm against the extraction code.
///
/// The default value is empty: a function that neither reads nor writes a
/// field on its parameters has no entries, and every field of an empty
/// summary is skipped during serialisation.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct FieldPointsToSummary {
    /// `(param_index, field_names_read)` pairs: somewhere in its body the
    /// callee projected each listed field on a value derived from
    /// `param_index`. Each name list is sorted and free of duplicates.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub param_field_reads: Vec<(u32, SmallVec<[String; 2]>)>,

    /// `(param_index, field_names_written)` pairs: the callee assigned to
    /// each listed field on a value derived from `param_index`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub param_field_writes: Vec<(u32, SmallVec<[String; 2]>)>,

    /// Set once the read or write list of any parameter hit
    /// [`MAX_FIELDS_PER_PARAM`].
    ///
    /// Callers that see `overflow = true` treat each parameter as reading
    /// and writing every field on every other parameter: the conservative
    /// greatest lower bound that preserves soundness.
    #[serde(default, skip_serializing_if = "core::ops::Not::not")]
    pub overflow: bool,
}
impl FieldPointsToSummary {
    /// Creates a summary with no recorded reads or writes and `overflow`
    /// cleared; same value as [`Self::default`].
    pub fn empty() -> Self {
        Self::default()
    }

    /// Returns `true` when no read or write is recorded and `overflow` is
    /// not set.
    pub fn is_empty(&self) -> bool {
        !self.overflow && self.param_field_reads.is_empty() && self.param_field_writes.is_empty()
    }

    /// Adds `field` to the name list kept for `param` inside `list`.
    ///
    /// An entry for `param` is created when none exists yet. A name that is
    /// already present is left alone. If the list for `param` already holds
    /// [`MAX_FIELDS_PER_PARAM`] names, nothing is stored and `*overflow` is
    /// set to `true`; `*overflow` is never reset to `false` here. After a
    /// successful insert the name list is re-sorted.
    fn insert_into(
        list: &mut Vec<(u32, SmallVec<[String; 2]>)>,
        param: u32,
        field: &str,
        overflow: &mut bool,
    ) {
        // Locate the entry for `param`, appending a blank one on a miss.
        let slot = match list.iter().position(|(index, _)| *index == param) {
            Some(found) => found,
            None => {
                list.push((param, SmallVec::new()));
                list.len() - 1
            }
        };
        let names = &mut list[slot].1;

        let already_known = names.iter().any(|existing| existing == field);
        if already_known {
            return;
        }
        if names.len() >= MAX_FIELDS_PER_PARAM {
            *overflow = true;
            return;
        }
        names.push(field.to_string());
        names.sort();
    }

    /// Records that the callee READS `field` on parameter `param`.
    ///
    /// Each parameter keeps at most [`MAX_FIELDS_PER_PARAM`] read names; an
    /// insert beyond that sets `overflow`. Once `overflow` is set this is a
    /// no-op.
    pub fn add_read(&mut self, param: u32, field: &str) {
        if !self.overflow {
            Self::insert_into(&mut self.param_field_reads, param, field, &mut self.overflow);
        }
    }

    /// Records that the callee WRITES `field` on parameter `param`.
    ///
    /// Counterpart of [`Self::add_read`], with the same cap and overflow
    /// behaviour applied to the write list.
    pub fn add_write(&mut self, param: u32, field: &str) {
        if !self.overflow {
            Self::insert_into(&mut self.param_field_writes, param, field, &mut self.overflow);
        }
    }

    /// Unions `other` into `self`.
    ///
    /// Overflow propagates with the same semantics as
    /// [`PointsToSummary::merge`]: once a callee means "any field on any
    /// parameter", merging cannot recover precision, so an overflowed
    /// `other` only sets the flag. Otherwise every read and then every
    /// write of `other` is replayed through [`Self::add_read`] /
    /// [`Self::add_write`].
    pub fn merge(&mut self, other: &Self) {
        if other.overflow {
            self.overflow = true;
        } else {
            for (param, names) in &other.param_field_reads {
                names.iter().for_each(|name| self.add_read(*param, name));
            }
            for (param, names) in &other.param_field_writes {
                names.iter().for_each(|name| self.add_write(*param, name));
            }
        }
    }
}
#[cfg(test)]
mod field_summary_tests {
    use super::*;

    /// Serialises `summary` to JSON text and parses that text back.
    fn json_round_trip(summary: &FieldPointsToSummary) -> FieldPointsToSummary {
        let text = serde_json::to_string(summary).unwrap();
        serde_json::from_str(&text).unwrap()
    }

    /// Whether the read list of `summary` has an entry for `param`.
    fn has_read_entry(summary: &FieldPointsToSummary, param: u32) -> bool {
        summary
            .param_field_reads
            .iter()
            .any(|(index, _)| *index == param)
    }

    #[test]
    fn empty_summary_round_trips() {
        let summary = FieldPointsToSummary::empty();
        assert!(summary.is_empty());
        let restored = json_round_trip(&summary);
        assert_eq!(summary, restored);
    }

    #[test]
    fn add_read_dedupes_and_sorts() {
        let mut summary = FieldPointsToSummary::empty();
        // "name" is added twice; it must be stored once, after "id".
        for field in ["name", "id", "name"] {
            summary.add_read(0, field);
        }
        let (_, names) = summary
            .param_field_reads
            .iter()
            .find(|(index, _)| *index == 0)
            .unwrap();
        assert_eq!(names.as_slice(), &["id".to_string(), "name".to_string()]);
    }

    #[test]
    fn distinct_params_get_distinct_entries() {
        let mut summary = FieldPointsToSummary::empty();
        summary.add_write(0, "cache");
        summary.add_write(1, "log");
        assert_eq!(summary.param_field_writes.len(), 2);
    }

    #[test]
    fn overflow_trips_at_cap() {
        let mut summary = FieldPointsToSummary::empty();
        (0..(MAX_FIELDS_PER_PARAM + 4))
            .map(|i| format!("field{i}"))
            .for_each(|name| summary.add_read(0, &name));
        assert!(summary.overflow);
    }

    #[test]
    fn merge_unions_disjoint_keys() {
        let mut target = FieldPointsToSummary::empty();
        let mut incoming = FieldPointsToSummary::empty();
        target.add_read(0, "alpha");
        incoming.add_read(1, "beta");
        target.merge(&incoming);
        assert!(has_read_entry(&target, 0));
        assert!(has_read_entry(&target, 1));
    }

    #[test]
    fn merge_propagates_overflow() {
        let mut target = FieldPointsToSummary::empty();
        let mut incoming = FieldPointsToSummary::empty();
        incoming.overflow = true;
        target.merge(&incoming);
        assert!(target.overflow);
    }

    #[test]
    fn round_trip_preserves_entries() {
        let mut summary = FieldPointsToSummary::empty();
        summary.add_read(0, "name");
        summary.add_write(1, "cache");
        summary.add_write(1, "log");
        let restored = json_round_trip(&summary);
        assert_eq!(summary, restored);
    }

    #[test]
    fn empty_serializes_as_empty_object() {
        let summary = FieldPointsToSummary::empty();
        let text = serde_json::to_string(&summary).unwrap();
        assert_eq!(text, "{}");
        let decoded: FieldPointsToSummary = serde_json::from_str("{}").unwrap();
        assert!(decoded.is_empty());
    }
}