nyx/src/abstract_interp/mod.rs
Eli Peter a438886217
Python fp and docs updtes (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
2026-04-29 19:53:34 -04:00

712 lines
23 KiB
Rust

//! Abstract interpretation framework.
//!
//! Provides a product abstract domain ([`AbstractValue`]) composing independent
//! subdomains:
//! - [`IntervalFact`]: numeric interval `[lo, hi]` with arithmetic transfer
//! - [`StringFact`]: string prefix + suffix with concatenation transfer
//! - [`BitFact`]: known-zero/known-one bit masks for bitwise transfer
//!
//! Abstract values are stored per-SSA-value in [`AbstractState`], which is
//! carried through the taint analysis worklist in `SsaTaintState`. The framework
//! propagates abstract values forward through SSA operations, joins at CFG
//! merges, and widens at loop heads to ensure termination.
//!
//! ## Feature gate
//!
//! Enabled by default. Disable via `analysis.engine.abstract_interpretation
//! = false` in `nyx.conf` or the `--no-abstract-interp` CLI flag.
pub mod bit_domain;
pub mod interval;
pub mod path_domain;
pub mod string_domain;
pub use bit_domain::BitFact;
pub use interval::IntervalFact;
pub use path_domain::{PathFact, Tri};
pub use string_domain::StringFact;
use crate::ssa::ir::SsaValue;
use crate::state::lattice::{AbstractDomain, Lattice};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
/// Feature gate: check if abstract interpretation is enabled.
///
/// Controlled by `analysis.engine.abstract_interpretation` in `nyx.conf`
/// (default `true`) or the `--abstract-interp / --no-abstract-interp` CLI
/// flag. The legacy `NYX_ABSTRACT_INTERP` env var is consulted only when no
/// runtime has been installed (library use / legacy tests).
pub fn is_enabled() -> bool {
crate::utils::analysis_options::current().abstract_interpretation
}
// ── AbstractValue ───────────────────────────────────────────────────────
/// Per-SSA-value abstract element: product of all subdomains.
///
/// Each subdomain is independent, join, meet, widen, and leq are applied
/// component-wise. Adding a new subdomain requires adding a field here
/// and updating the component-wise implementations.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct AbstractValue {
pub interval: IntervalFact,
pub string: StringFact,
pub bits: BitFact,
#[serde(default, skip_serializing_if = "path_fact_is_top")]
pub path: PathFact,
}
fn path_fact_is_top(p: &PathFact) -> bool {
p.is_top()
}
impl AbstractValue {
pub fn top() -> Self {
Self {
interval: IntervalFact::top(),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
}
}
pub fn bottom() -> Self {
Self {
interval: IntervalFact::bottom(),
string: StringFact::bottom(),
bits: BitFact::bottom(),
path: PathFact::bottom(),
}
}
/// Construct a value with a specific [`PathFact`] and every other
/// subdomain at Top. Used by the Rust path-primitive transfer rules.
pub fn with_path_fact(path: PathFact) -> Self {
Self {
interval: IntervalFact::top(),
string: StringFact::top(),
bits: BitFact::top(),
path,
}
}
pub fn is_top(&self) -> bool {
self.interval.is_top() && self.string.is_top() && self.bits.is_top() && self.path.is_top()
}
pub fn is_bottom(&self) -> bool {
self.interval.is_bottom()
&& self.string.is_bottom()
&& self.bits.is_bottom()
&& self.path.is_bottom()
}
pub fn join(&self, other: &Self) -> Self {
Self {
interval: self.interval.join(&other.interval),
string: self.string.join(&other.string),
bits: self.bits.join(&other.bits),
path: self.path.join(&other.path),
}
}
pub fn meet(&self, other: &Self) -> Self {
Self {
interval: self.interval.meet(&other.interval),
string: self.string.meet(&other.string),
bits: <BitFact as AbstractDomain>::meet(&self.bits, &other.bits),
path: <PathFact as AbstractDomain>::meet(&self.path, &other.path),
}
}
pub fn widen(&self, other: &Self) -> Self {
Self {
interval: self.interval.widen(&other.interval),
string: self.string.widen(&other.string),
bits: self.bits.widen(&other.bits),
path: self.path.widen(&other.path),
}
}
pub fn leq(&self, other: &Self) -> bool {
self.interval.leq(&other.interval)
&& self.string.leq(&other.string)
&& self.bits.leq(&other.bits)
&& self.path.leq(&other.path)
}
}
impl Lattice for AbstractValue {
fn bot() -> Self {
Self::bottom()
}
fn join(&self, other: &Self) -> Self {
self.join(other)
}
fn leq(&self, other: &Self) -> bool {
self.leq(other)
}
}
impl AbstractDomain for AbstractValue {
fn top() -> Self {
Self::top()
}
fn meet(&self, other: &Self) -> Self {
self.meet(other)
}
fn widen(&self, other: &Self) -> Self {
self.widen(other)
}
}
// ── AbstractTransfer ────────────────────────────────────────────────────
/// Maximum length of a literal prefix tracked by [`StringTransfer::LiteralPrefix`].
///
/// Caps the on-disk summary size when a callee produces a long known prefix.
/// The interval domain already has a natural bound (two `i64`s); the string
/// side needs an explicit cap so a callee that returns a 10KB constant does
/// not balloon every cross-file summary that references it.
pub const MAX_LITERAL_PREFIX_LEN: usize = 64;
/// Per-parameter interval-to-return transform.
///
/// This is a **bounded** description of how a caller-known interval on one
/// parameter maps to the callee's return interval. The forms are intentionally
/// restricted so the summary size stays constant regardless of callee body
/// complexity:
///
/// * [`IntervalTransfer::Top`], no interval knowledge crosses (default).
/// * [`IntervalTransfer::Identity`], return = param (pass-through).
/// * [`IntervalTransfer::Affine`], return = param * `mul` + `add` with
/// `i64` constants; overflow defaults to Top at apply time.
/// * [`IntervalTransfer::Clamped`], return is always in `[lo, hi]` regardless
/// of input. Captures callee-intrinsic bounds (e.g. `saturating` ops).
///
/// No unbounded expression trees, no nesting. A callee whose behaviour does
/// not fit one of these forms falls back to `Top`, we never try to encode
/// richer algebra in the summary.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum IntervalTransfer {
#[default]
Top,
Identity,
Affine {
add: i64,
mul: i64,
},
Clamped {
lo: i64,
hi: i64,
},
}
impl IntervalTransfer {
/// Apply the transform to a caller-known input interval.
pub fn apply(&self, input: &IntervalFact) -> IntervalFact {
match self {
Self::Top => IntervalFact::top(),
Self::Identity => input.clone(),
Self::Affine { add, mul } => input
.mul(&IntervalFact::exact(*mul))
.add(&IntervalFact::exact(*add)),
Self::Clamped { lo, hi } if lo <= hi => IntervalFact {
lo: Some(*lo),
hi: Some(*hi),
},
Self::Clamped { .. } => IntervalFact::top(),
}
}
/// Join two transforms. Used when multiple return paths produce
/// differing transforms for the same parameter: the aggregate is the
/// widest safe form.
pub fn join(&self, other: &Self) -> Self {
use IntervalTransfer::*;
match (self, other) {
(Top, _) | (_, Top) => Top,
(a, b) if a == b => a.clone(),
(Clamped { lo: a, hi: b }, Clamped { lo: c, hi: d }) => Clamped {
lo: (*a).min(*c),
hi: (*b).max(*d),
},
// Identity ⊔ anything else = Top (different flow shapes).
_ => Top,
}
}
}
/// Per-parameter string-to-return transform.
///
/// Mirrors [`IntervalTransfer`] for the string subdomain. Bounded by
/// [`MAX_LITERAL_PREFIX_LEN`] to keep summary size constant.
///
/// * [`StringTransfer::Unknown`], default.
/// * [`StringTransfer::Identity`], return = param.
/// * [`StringTransfer::LiteralPrefix`], return has this literal prefix
/// regardless of input (callee-intrinsic).
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum StringTransfer {
#[default]
Unknown,
Identity,
LiteralPrefix(String),
}
impl StringTransfer {
/// Construct a `LiteralPrefix`, truncating to [`MAX_LITERAL_PREFIX_LEN`]
/// and degrading to `Unknown` on empty input.
pub fn literal_prefix(s: &str) -> Self {
if s.is_empty() {
return Self::Unknown;
}
if s.len() <= MAX_LITERAL_PREFIX_LEN {
Self::LiteralPrefix(s.to_string())
} else {
// Truncate on a char boundary to stay valid UTF-8.
let mut cut = MAX_LITERAL_PREFIX_LEN;
while cut > 0 && !s.is_char_boundary(cut) {
cut -= 1;
}
if cut == 0 {
Self::Unknown
} else {
Self::LiteralPrefix(s[..cut].to_string())
}
}
}
/// Apply the transform to a caller-known input string fact.
pub fn apply(&self, input: &StringFact) -> StringFact {
match self {
Self::Unknown => StringFact::top(),
Self::Identity => input.clone(),
Self::LiteralPrefix(p) => StringFact::from_prefix(p),
}
}
/// Join two transforms.
pub fn join(&self, other: &Self) -> Self {
use StringTransfer::*;
match (self, other) {
(Unknown, _) | (_, Unknown) => Unknown,
(a, b) if a == b => a.clone(),
(LiteralPrefix(a), LiteralPrefix(b)) => {
// Longest common prefix.
let lcp: String = a
.chars()
.zip(b.chars())
.take_while(|(x, y)| x == y)
.map(|(x, _)| x)
.collect();
if lcp.is_empty() {
Unknown
} else {
Self::literal_prefix(&lcp)
}
}
// Identity vs LiteralPrefix → Unknown (different flow shapes).
_ => Unknown,
}
}
}
/// Per-parameter abstract-domain transfer channel.
///
/// Combines the per-subdomain transforms into one record attached to each
/// parameter in [`crate::summary::ssa_summary::SsaFuncSummary`]. Used at
/// cross-file call sites to synthesise a return abstract value from the
/// caller's knowledge of each argument, without having to re-run the callee.
///
/// Composition rule: `apply(input) = (interval.apply, string.apply,
/// bits=top)`. The bit domain is always Top, we do not track cross-file
/// bit transfers.
#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct AbstractTransfer {
#[serde(default, skip_serializing_if = "is_interval_top")]
pub interval: IntervalTransfer,
#[serde(default, skip_serializing_if = "is_string_unknown")]
pub string: StringTransfer,
}
fn is_interval_top(t: &IntervalTransfer) -> bool {
matches!(t, IntervalTransfer::Top)
}
fn is_string_unknown(t: &StringTransfer) -> bool {
matches!(t, StringTransfer::Unknown)
}
impl AbstractTransfer {
/// Fully-imprecise transfer: no information crosses. Used as the
/// conservative default when a parameter's flow does not fit any of the
/// bounded forms.
pub fn top() -> Self {
Self::default()
}
/// True when neither subdomain carries any information, equivalent to
/// "omit this entry entirely".
pub fn is_top(&self) -> bool {
is_interval_top(&self.interval) && is_string_unknown(&self.string)
}
/// Apply the transform to a caller-known input abstract value.
pub fn apply(&self, input: &AbstractValue) -> AbstractValue {
AbstractValue {
interval: self.interval.apply(&input.interval),
string: self.string.apply(&input.string),
bits: BitFact::top(),
path: PathFact::top(),
}
}
/// Join two transfers component-wise.
pub fn join(&self, other: &Self) -> Self {
Self {
interval: self.interval.join(&other.interval),
string: self.string.join(&other.string),
}
}
}
// ── AbstractState ───────────────────────────────────────────────────────
/// Maximum abstract values tracked per block (performance bound).
const MAX_ABSTRACT_VALUES: usize = 64;
/// Per-block abstract state: sorted map from SsaValue → AbstractValue.
///
/// Values not in the map are implicitly Top (no knowledge). Sorted by
/// SsaValue for O(n) merge-join, matching the pattern used by
/// `SsaTaintState.values`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct AbstractState {
values: SmallVec<[(SsaValue, AbstractValue); 8]>,
}
impl AbstractState {
pub fn empty() -> Self {
Self {
values: SmallVec::new(),
}
}
/// Get abstract value for an SSA value. Returns Top if absent.
pub fn get(&self, v: SsaValue) -> AbstractValue {
self.values
.binary_search_by_key(&v, |(id, _)| *id)
.ok()
.map(|idx| self.values[idx].1.clone())
.unwrap_or_else(AbstractValue::top)
}
/// Set abstract value for an SSA value. Drops Top values to save space.
pub fn set(&mut self, v: SsaValue, val: AbstractValue) {
if val.is_top() {
// Don't store Top, it's the default
if let Ok(idx) = self.values.binary_search_by_key(&v, |(id, _)| *id) {
self.values.remove(idx);
}
return;
}
match self.values.binary_search_by_key(&v, |(id, _)| *id) {
Ok(idx) => self.values[idx].1 = val,
Err(idx) => {
if self.values.len() < MAX_ABSTRACT_VALUES {
self.values.insert(idx, (v, val));
}
// Over budget: silently drop (conservative, defaults to Top)
}
}
}
/// Merge-join two abstract states. Values present in both are joined;
/// values present in only one side are dropped (absent = Top, join with
/// Top = Top).
pub fn join(&self, other: &Self) -> Self {
let mut result = SmallVec::with_capacity(self.values.len().min(other.values.len()));
let (mut i, mut j) = (0, 0);
while i < self.values.len() && j < other.values.len() {
match self.values[i].0.cmp(&other.values[j].0) {
std::cmp::Ordering::Less => {
// Only in self → join with Top = Top → drop
i += 1;
}
std::cmp::Ordering::Greater => {
// Only in other → drop
j += 1;
}
std::cmp::Ordering::Equal => {
let joined = self.values[i].1.join(&other.values[j].1);
if !joined.is_top() {
result.push((self.values[i].0, joined));
}
i += 1;
j += 1;
}
}
}
Self { values: result }
}
/// Merge-widen: for values present in both states, apply widening.
/// Values present in only one side are dropped (Top).
pub fn widen(&self, other: &Self) -> Self {
let mut result = SmallVec::with_capacity(self.values.len().min(other.values.len()));
let (mut i, mut j) = (0, 0);
while i < self.values.len() && j < other.values.len() {
match self.values[i].0.cmp(&other.values[j].0) {
std::cmp::Ordering::Less => {
i += 1;
}
std::cmp::Ordering::Greater => {
j += 1;
}
std::cmp::Ordering::Equal => {
let widened = self.values[i].1.widen(&other.values[j].1);
if !widened.is_top() {
result.push((self.values[i].0, widened));
}
i += 1;
j += 1;
}
}
}
Self { values: result }
}
/// Partial order: self ⊑ other.
pub fn leq(&self, other: &Self) -> bool {
// Every non-Top entry in self must have a corresponding entry in other
// with self[v] ⊑ other[v]. Entries only in other are fine (Top ⊑ anything
// is false, but absent self entries are Top which is handled).
for (v, val) in &self.values {
let other_val = other.get(*v);
if !val.leq(&other_val) {
return false;
}
}
true
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn abstract_value_top_bottom() {
assert!(AbstractValue::top().is_top());
assert!(AbstractValue::bottom().is_bottom());
assert!(!AbstractValue::top().is_bottom());
assert!(!AbstractValue::bottom().is_top());
}
#[test]
fn abstract_value_join_componentwise() {
let a = AbstractValue {
interval: IntervalFact::exact(1),
string: StringFact::from_prefix("https://a.com/"),
bits: BitFact::top(),
path: PathFact::top(),
};
let b = AbstractValue {
interval: IntervalFact::exact(5),
string: StringFact::from_prefix("https://b.com/"),
bits: BitFact::top(),
path: PathFact::top(),
};
let j = a.join(&b);
assert_eq!(j.interval.lo, Some(1));
assert_eq!(j.interval.hi, Some(5));
assert_eq!(j.string.prefix.as_deref(), Some("https://"));
}
#[test]
fn abstract_value_widen_componentwise() {
let old = AbstractValue {
interval: IntervalFact {
lo: Some(0),
hi: Some(5),
},
string: StringFact::from_prefix("hello"),
bits: BitFact::top(),
path: PathFact::top(),
};
let new = AbstractValue {
interval: IntervalFact {
lo: Some(0),
hi: Some(10),
},
string: StringFact::from_prefix("hello"),
bits: BitFact::top(),
path: PathFact::top(),
};
let w = old.widen(&new);
assert_eq!(w.interval.lo, Some(0)); // stable
assert_eq!(w.interval.hi, None); // grew → widened
assert_eq!(w.string.prefix.as_deref(), Some("hello")); // stable
}
#[test]
fn abstract_state_get_default_top() {
let state = AbstractState::empty();
assert!(state.get(SsaValue(42)).is_top());
}
#[test]
fn abstract_state_set_get() {
let mut state = AbstractState::empty();
let val = AbstractValue {
interval: IntervalFact::exact(10),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
};
state.set(SsaValue(1), val.clone());
assert_eq!(state.get(SsaValue(1)), val);
}
#[test]
fn abstract_state_set_top_removes() {
let mut state = AbstractState::empty();
state.set(
SsaValue(1),
AbstractValue {
interval: IntervalFact::exact(5),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
assert!(!state.get(SsaValue(1)).is_top());
state.set(SsaValue(1), AbstractValue::top());
assert!(state.get(SsaValue(1)).is_top());
assert!(state.values.is_empty());
}
#[test]
fn abstract_state_join() {
let mut a = AbstractState::empty();
a.set(
SsaValue(1),
AbstractValue {
interval: IntervalFact::exact(3),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
a.set(
SsaValue(2),
AbstractValue {
interval: IntervalFact::exact(10),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
let mut b = AbstractState::empty();
b.set(
SsaValue(1),
AbstractValue {
interval: IntervalFact::exact(7),
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
// SsaValue(2) not in b → join drops it (Top)
let j = a.join(&b);
// SsaValue(1): join [3,3] and [7,7] = [3,7]
let v1 = j.get(SsaValue(1));
assert_eq!(v1.interval.lo, Some(3));
assert_eq!(v1.interval.hi, Some(7));
// SsaValue(2): only in a → dropped to Top
assert!(j.get(SsaValue(2)).is_top());
}
#[test]
fn abstract_state_widen() {
let mut old = AbstractState::empty();
old.set(
SsaValue(1),
AbstractValue {
interval: IntervalFact {
lo: Some(0),
hi: Some(5),
},
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
let mut new = AbstractState::empty();
new.set(
SsaValue(1),
AbstractValue {
interval: IntervalFact {
lo: Some(0),
hi: Some(10),
},
string: StringFact::top(),
bits: BitFact::top(),
path: PathFact::top(),
},
);
let w = old.widen(&new);
let v1 = w.get(SsaValue(1));
assert_eq!(v1.interval.lo, Some(0)); // stable
assert_eq!(v1.interval.hi, None); // grew → widened
}
#[test]
fn loop_carried_phi_join_and_widen() {
// Simulate: x = 0; loop { x = phi(0, x+1) }
// Iteration 1: join([0,0], [1,1]) = [0,1]
let init = IntervalFact::exact(0);
let inc1 = IntervalFact::exact(1);
let phi1 = init.join(&inc1);
assert_eq!(phi1.lo, Some(0));
assert_eq!(phi1.hi, Some(1));
// Iteration 2: join([0,1], [1,2]) = [0,2]
let inc2 = IntervalFact {
lo: Some(1),
hi: Some(2),
};
let phi2 = phi1.join(&inc2);
assert_eq!(phi2.lo, Some(0));
assert_eq!(phi2.hi, Some(2));
// Widen: [0,1] vs [0,2] → upper bound grew → [0, None]
let widened = phi1.widen(&phi2);
assert_eq!(widened.lo, Some(0));
assert_eq!(widened.hi, None);
// Iteration 3: join([0,None], [1,None]) = [0,None] (stable!)
let inc3 = IntervalFact {
lo: Some(1),
hi: None,
};
let phi3 = widened.join(&inc3);
assert_eq!(phi3.lo, Some(0));
assert_eq!(phi3.hi, None);
assert_eq!(phi3, widened); // converged
}
}