mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
Phase 1 (#33)
* chore: Exclude CLAUDE.md from Cargo.toml * feat: add callgraph module and integrate into main analysis flow * feat: enhance CLI with new severity filtering and analysis modes * feat: update CHANGELOG with recent enhancements and fixes to severity filtering and output handling * feat: implement state-model dataflow analysis for resource lifecycle and auth state * feat: enhance diagnostic output formatting and add evidence structure * feat: implement attack surface ranking for diagnostics with scoring and sorting * feat: add comprehensive documentation for installation, usage, and rules reference * feat: add multiple language support for command execution and evaluation endpoints * feat: implement inline suppression for findings using `nyx:ignore` comments * feat: add confidence levels to AST patterns and update output structure * feat: implement low-noise prioritization system with category filtering, rollup grouping, and configurable budgets * feat: bump version to 0.4.0 and update changelog with new features and improvements * feat: add dead code allowances to various functions in mod.rs and real_world_tests.rs
This commit is contained in:
parent
19b578c5c4
commit
1bbe4b1cfb
456 changed files with 25628 additions and 1228 deletions
620
src/taint/domain.rs
Normal file
620
src/taint/domain.rs
Normal file
|
|
@ -0,0 +1,620 @@
|
|||
use crate::labels::{Cap, SourceKind};
|
||||
use crate::state::lattice::Lattice;
|
||||
use crate::state::symbol::SymbolId;
|
||||
use crate::taint::path_state::PredicateKind;
|
||||
use petgraph::graph::NodeIndex;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
/// Maximum origins tracked per variable (bounded to prevent growth).
|
||||
const MAX_ORIGINS_PER_VAR: usize = 4;
|
||||
|
||||
/// Per-variable taint information.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct VarTaint {
|
||||
pub caps: Cap,
|
||||
/// Up to N origins that contributed taint (bounded).
|
||||
pub origins: SmallVec<[TaintOrigin; 2]>,
|
||||
}
|
||||
|
||||
/// A single taint origin — the node and classification of where taint came from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
pub struct TaintOrigin {
|
||||
pub node: NodeIndex,
|
||||
pub source_kind: SourceKind,
|
||||
}
|
||||
|
||||
/// Set of variables packed into one `u64`.
///
/// Bit `i` stands for the symbol whose `SymbolId` ordinal is `i`, so only
/// ordinals `0..64` can be represented.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SmallBitSet(u64);
|
||||
|
||||
impl SmallBitSet {
|
||||
pub fn empty() -> Self {
|
||||
Self(0)
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, id: SymbolId) {
|
||||
let idx = id.0;
|
||||
if idx < 64 {
|
||||
self.0 |= 1u64 << idx;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn contains(&self, id: SymbolId) -> bool {
|
||||
let idx = id.0;
|
||||
if idx < 64 {
|
||||
self.0 & (1u64 << idx) != 0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Union: self | other
|
||||
pub fn union(self, other: Self) -> Self {
|
||||
Self(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Intersection: self & other
|
||||
pub fn intersection(self, other: Self) -> Self {
|
||||
Self(self.0 & other.0)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// Whether self is a subset of other.
|
||||
#[allow(dead_code)] // used by Lattice::leq
|
||||
pub fn is_subset_of(self, other: Self) -> bool {
|
||||
self.0 & other.0 == self.0
|
||||
}
|
||||
|
||||
/// Whether self is a superset of other.
|
||||
#[allow(dead_code)] // used by Lattice::leq
|
||||
pub fn is_superset_of(self, other: Self) -> bool {
|
||||
other.is_subset_of(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-variable summary of whitelisted predicate outcomes.
///
/// Each mask records the predicate kinds known to have that outcome on ALL
/// paths reaching the program point; joining two summaries therefore keeps
/// only the bits common to both (must-hold semantics).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PredicateSummary {
    /// Kinds known to be true. Bit 0 = NullCheck, 1 = EmptyCheck, 2 = ErrorCheck.
    pub known_true: u8,
    /// Kinds known to be false, using the same bit layout as `known_true`.
    pub known_false: u8,
}

impl PredicateSummary {
    /// Summary that knows nothing: both masks are zero.
    pub fn empty() -> Self {
        Self {
            known_true: 0,
            known_false: 0,
        }
    }

    /// Join two summaries by intersecting each mask, so only facts present in
    /// both inputs survive.
    pub fn join(self, other: Self) -> Self {
        let Self {
            known_true: lhs_true,
            known_false: lhs_false,
        } = self;
        Self {
            known_true: lhs_true & other.known_true,
            known_false: lhs_false & other.known_false,
        }
    }

    /// Whether some predicate kind is recorded as both true and false.
    pub fn has_contradiction(self) -> bool {
        (self.known_true & self.known_false) != 0
    }

    /// Whether the summary carries no facts at all.
    pub fn is_empty(self) -> bool {
        (self.known_true | self.known_false) == 0
    }
}
|
||||
|
||||
/// Map a whitelisted PredicateKind to its bit index (0-2).
|
||||
/// Returns None for non-whitelisted kinds.
|
||||
pub fn predicate_kind_bit(kind: PredicateKind) -> Option<u8> {
|
||||
match kind {
|
||||
PredicateKind::NullCheck => Some(0),
|
||||
PredicateKind::EmptyCheck => Some(1),
|
||||
PredicateKind::ErrorCheck => Some(2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// The abstract taint state at a program point.
|
||||
///
|
||||
/// Uses sorted SmallVec keyed by SymbolId for O(n) merge-join.
|
||||
/// Variables beyond the interner's capacity are naturally excluded.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct TaintState {
|
||||
/// Per-variable taint, sorted by SymbolId.
|
||||
pub vars: SmallVec<[(SymbolId, VarTaint); 16]>,
|
||||
|
||||
/// Variables validated on ALL paths (intersection on join).
|
||||
pub validated_must: SmallBitSet,
|
||||
|
||||
/// Variables validated on ANY path (union on join).
|
||||
pub validated_may: SmallBitSet,
|
||||
|
||||
/// Per-variable predicate summary (sorted by SymbolId).
|
||||
pub predicates: SmallVec<[(SymbolId, PredicateSummary); 4]>,
|
||||
}
|
||||
|
||||
impl TaintState {
|
||||
/// Create the initial state (no taint, no validation, no predicates).
|
||||
pub fn initial() -> Self {
|
||||
Self {
|
||||
vars: SmallVec::new(),
|
||||
validated_must: SmallBitSet::empty(),
|
||||
validated_may: SmallBitSet::empty(),
|
||||
predicates: SmallVec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up taint for a variable.
|
||||
pub fn get(&self, sym: SymbolId) -> Option<&VarTaint> {
|
||||
self.vars
|
||||
.binary_search_by_key(&sym, |(id, _)| *id)
|
||||
.ok()
|
||||
.map(|idx| &self.vars[idx].1)
|
||||
}
|
||||
|
||||
/// Insert or update taint for a variable.
|
||||
pub fn set(&mut self, sym: SymbolId, taint: VarTaint) {
|
||||
match self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
|
||||
Ok(idx) => self.vars[idx].1 = taint,
|
||||
Err(idx) => self.vars.insert(idx, (sym, taint)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove taint for a variable.
|
||||
pub fn remove(&mut self, sym: SymbolId) {
|
||||
if let Ok(idx) = self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
|
||||
self.vars.remove(idx);
|
||||
}
|
||||
}
|
||||
|
||||
/// Set a predicate summary for a variable.
|
||||
pub fn set_predicate(&mut self, sym: SymbolId, summary: PredicateSummary) {
|
||||
match self.predicates.binary_search_by_key(&sym, |(id, _)| *id) {
|
||||
Ok(idx) => self.predicates[idx].1 = summary,
|
||||
Err(idx) => self.predicates.insert(idx, (sym, summary)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get predicate summary for a variable.
|
||||
pub fn get_predicate(&self, sym: SymbolId) -> PredicateSummary {
|
||||
self.predicates
|
||||
.binary_search_by_key(&sym, |(id, _)| *id)
|
||||
.ok()
|
||||
.map(|idx| self.predicates[idx].1)
|
||||
.unwrap_or_else(PredicateSummary::empty)
|
||||
}
|
||||
|
||||
/// Check if any variable has contradictory predicates.
|
||||
pub fn has_contradiction(&self) -> bool {
|
||||
self.predicates.iter().any(|(_, s)| s.has_contradiction())
|
||||
}
|
||||
}
|
||||
|
||||
impl Lattice for TaintState {
|
||||
fn bot() -> Self {
|
||||
Self::initial()
|
||||
}
|
||||
|
||||
fn join(&self, other: &Self) -> Self {
|
||||
// Merge-join vars (sorted by SymbolId)
|
||||
let vars = merge_join_vars(&self.vars, &other.vars);
|
||||
|
||||
// validated_must = intersection (must hold on ALL paths)
|
||||
let validated_must = self.validated_must.intersection(other.validated_must);
|
||||
|
||||
// validated_may = union (holds on ANY path)
|
||||
let validated_may = self.validated_may.union(other.validated_may);
|
||||
|
||||
// predicates = per-key intersection of known_true/known_false bits
|
||||
let predicates = merge_join_predicates(&self.predicates, &other.predicates);
|
||||
|
||||
TaintState {
|
||||
vars,
|
||||
validated_must,
|
||||
validated_may,
|
||||
predicates,
|
||||
}
|
||||
}
|
||||
|
||||
fn leq(&self, other: &Self) -> bool {
|
||||
// Per-key Cap subset + origins subset
|
||||
if !vars_leq(&self.vars, &other.vars) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// validated_must: self ⊇ other (superset = less info = lower)
|
||||
if !self.validated_must.is_superset_of(other.validated_must) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// validated_may: self ⊆ other
|
||||
if !self.validated_may.is_subset_of(other.validated_may) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// predicates: self.known_true ⊇ other.known_true (more precise = lower)
|
||||
predicates_leq(&self.predicates, &other.predicates)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge-join two sorted var lists: per-key Cap OR + origins merge (bounded).
|
||||
fn merge_join_vars(
|
||||
a: &[(SymbolId, VarTaint)],
|
||||
b: &[(SymbolId, VarTaint)],
|
||||
) -> SmallVec<[(SymbolId, VarTaint); 16]> {
|
||||
let mut result = SmallVec::with_capacity(a.len().max(b.len()));
|
||||
let (mut i, mut j) = (0, 0);
|
||||
|
||||
while i < a.len() && j < b.len() {
|
||||
match a[i].0.cmp(&b[j].0) {
|
||||
std::cmp::Ordering::Less => {
|
||||
result.push(a[i].clone());
|
||||
i += 1;
|
||||
}
|
||||
std::cmp::Ordering::Greater => {
|
||||
result.push(b[j].clone());
|
||||
j += 1;
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
let caps = a[i].1.caps | b[j].1.caps;
|
||||
let origins = merge_origins(&a[i].1.origins, &b[j].1.origins);
|
||||
result.push((a[i].0, VarTaint { caps, origins }));
|
||||
i += 1;
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remaining from either side
|
||||
while i < a.len() {
|
||||
result.push(a[i].clone());
|
||||
i += 1;
|
||||
}
|
||||
while j < b.len() {
|
||||
result.push(b[j].clone());
|
||||
j += 1;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Merge two origin lists, deduplicating by node and bounding at MAX_ORIGINS_PER_VAR.
|
||||
fn merge_origins(
|
||||
a: &SmallVec<[TaintOrigin; 2]>,
|
||||
b: &SmallVec<[TaintOrigin; 2]>,
|
||||
) -> SmallVec<[TaintOrigin; 2]> {
|
||||
let mut merged = a.clone();
|
||||
for origin in b {
|
||||
if merged.len() >= MAX_ORIGINS_PER_VAR {
|
||||
break;
|
||||
}
|
||||
if !merged.iter().any(|o| o.node == origin.node) {
|
||||
merged.push(*origin);
|
||||
}
|
||||
}
|
||||
merged
|
||||
}
|
||||
|
||||
/// Check if a.vars ⊑ b.vars (per-key Cap subset + origins subset).
|
||||
#[allow(dead_code)] // called by Lattice::leq
|
||||
fn vars_leq(a: &[(SymbolId, VarTaint)], b: &[(SymbolId, VarTaint)]) -> bool {
|
||||
let (mut i, mut j) = (0, 0);
|
||||
|
||||
while i < a.len() {
|
||||
if j >= b.len() {
|
||||
return false; // a has keys not in b → not ⊑
|
||||
}
|
||||
match a[i].0.cmp(&b[j].0) {
|
||||
std::cmp::Ordering::Less => return false, // key in a but not b
|
||||
std::cmp::Ordering::Greater => {
|
||||
j += 1; // key only in b, skip
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
// Cap subset check
|
||||
if a[i].1.caps & b[j].1.caps != a[i].1.caps {
|
||||
return false;
|
||||
}
|
||||
// Origins subset check (by node)
|
||||
for orig in &a[i].1.origins {
|
||||
if !b[j].1.origins.iter().any(|o| o.node == orig.node) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Merge-join predicate summaries with intersection semantics.
|
||||
fn merge_join_predicates(
|
||||
a: &[(SymbolId, PredicateSummary)],
|
||||
b: &[(SymbolId, PredicateSummary)],
|
||||
) -> SmallVec<[(SymbolId, PredicateSummary); 4]> {
|
||||
let mut result = SmallVec::new();
|
||||
let (mut i, mut j) = (0, 0);
|
||||
|
||||
while i < a.len() && j < b.len() {
|
||||
match a[i].0.cmp(&b[j].0) {
|
||||
std::cmp::Ordering::Less => {
|
||||
// Key only in a — intersection with empty = empty → drop
|
||||
i += 1;
|
||||
}
|
||||
std::cmp::Ordering::Greater => {
|
||||
j += 1;
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
let joined = a[i].1.join(b[j].1);
|
||||
if !joined.is_empty() {
|
||||
result.push((a[i].0, joined));
|
||||
}
|
||||
i += 1;
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Keys only in one side → intersection with empty = drop
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Check if a.predicates ⊑ b.predicates.
|
||||
/// More precise (more known_true bits) = lower in the lattice.
|
||||
/// So a ⊑ b means a.known_true ⊇ b.known_true for each key.
|
||||
#[allow(dead_code)] // called by Lattice::leq
|
||||
fn predicates_leq(a: &[(SymbolId, PredicateSummary)], b: &[(SymbolId, PredicateSummary)]) -> bool {
|
||||
let (mut i, mut j) = (0, 0);
|
||||
|
||||
// For each key in b, a must have at least as many bits
|
||||
while j < b.len() {
|
||||
if i >= a.len() {
|
||||
// b has keys that a doesn't — a is missing info = not lower
|
||||
return false;
|
||||
}
|
||||
match a[i].0.cmp(&b[j].0) {
|
||||
std::cmp::Ordering::Less => {
|
||||
// a has extra keys (more info) — OK for leq
|
||||
i += 1;
|
||||
}
|
||||
std::cmp::Ordering::Greater => {
|
||||
// b has a key that a doesn't → a has fewer bits → not ⊑
|
||||
return false;
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
// a.known_true must be a superset of b.known_true
|
||||
if a[i].1.known_true & b[j].1.known_true != b[j].1.known_true {
|
||||
return false;
|
||||
}
|
||||
if a[i].1.known_false & b[j].1.known_false != b[j].1.known_false {
|
||||
return false;
|
||||
}
|
||||
i += 1;
|
||||
j += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Table entry for symbol `sym` carrying `caps` and no origins.
    fn entry(sym: u32, caps: Cap) -> (SymbolId, VarTaint) {
        let taint = VarTaint {
            caps,
            origins: SmallVec::new(),
        };
        (SymbolId(sym), taint)
    }

    /// Like `entry`, with one origin at graph node `node`.
    fn entry_from(sym: u32, caps: Cap, node: usize) -> (SymbolId, VarTaint) {
        let mut built = entry(sym, caps);
        built.1.origins.push(TaintOrigin {
            node: NodeIndex::new(node),
            source_kind: SourceKind::Unknown,
        });
        built
    }

    /// Initial state whose variable table is replaced by `vars`.
    fn state_of(vars: Vec<(SymbolId, VarTaint)>) -> TaintState {
        TaintState {
            vars: SmallVec::from_vec(vars),
            ..TaintState::initial()
        }
    }

    /// Initial state with a single predicate summary recorded for symbol 0.
    fn state_with_summary(known_true: u8, known_false: u8) -> TaintState {
        let mut state = TaintState::initial();
        state.set_predicate(
            SymbolId(0),
            PredicateSummary {
                known_true,
                known_false,
            },
        );
        state
    }

    // ── Lattice property tests ──────────────────────────────────────────

    #[test]
    fn bot_identity() {
        let state = state_of(vec![entry(0, Cap::ENV_VAR)]);
        let bottom = TaintState::bot();
        assert_eq!(state.join(&bottom), state);
        assert_eq!(bottom.join(&state), state);
    }

    #[test]
    fn join_commutativity() {
        let left = state_of(vec![entry(0, Cap::ENV_VAR)]);
        let right = state_of(vec![entry(1, Cap::SHELL_ESCAPE)]);
        assert_eq!(left.join(&right), right.join(&left));
    }

    #[test]
    fn join_associativity() {
        let x = state_of(vec![entry(0, Cap::ENV_VAR)]);
        let y = state_of(vec![entry(0, Cap::SHELL_ESCAPE)]);
        let z = state_of(vec![entry(1, Cap::HTML_ESCAPE)]);
        let left_first = x.join(&y).join(&z);
        let right_first = x.join(&y.join(&z));
        assert_eq!(left_first, right_first);
    }

    #[test]
    fn join_idempotency() {
        let state = state_of(vec![entry(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
        assert_eq!(state.join(&state), state);
    }

    #[test]
    fn leq_reflexive() {
        let state = state_of(vec![entry(0, Cap::ENV_VAR)]);
        assert!(state.leq(&state));
    }

    #[test]
    fn leq_consistent_with_join() {
        let lower = state_of(vec![entry(0, Cap::ENV_VAR)]);
        let upper = state_of(vec![entry(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
        assert!(lower.leq(&upper));
        assert_eq!(lower.join(&upper), upper);
    }

    #[test]
    fn join_merges_caps() {
        let left = state_of(vec![entry(0, Cap::ENV_VAR)]);
        let right = state_of(vec![entry(0, Cap::SHELL_ESCAPE)]);
        let merged = left.join(&right);
        let caps = merged.get(SymbolId(0)).unwrap().caps;
        assert_eq!(caps, Cap::ENV_VAR | Cap::SHELL_ESCAPE);
    }

    #[test]
    fn join_merges_origins() {
        let left = state_of(vec![entry_from(0, Cap::ENV_VAR, 1)]);
        let right = state_of(vec![entry_from(0, Cap::ENV_VAR, 2)]);
        let merged = left.join(&right);
        assert_eq!(merged.get(SymbolId(0)).unwrap().origins.len(), 2);
    }

    #[test]
    fn validated_must_intersection() {
        let mut left = TaintState::initial();
        for sym in [0, 1] {
            left.validated_must.insert(SymbolId(sym));
        }

        let mut right = TaintState::initial();
        for sym in [1, 2] {
            right.validated_must.insert(SymbolId(sym));
        }

        let must = left.join(&right).validated_must;
        assert!(!must.contains(SymbolId(0)));
        assert!(must.contains(SymbolId(1)));
        assert!(!must.contains(SymbolId(2)));
    }

    #[test]
    fn validated_may_union() {
        let mut left = TaintState::initial();
        left.validated_may.insert(SymbolId(0));

        let mut right = TaintState::initial();
        right.validated_may.insert(SymbolId(1));

        let may = left.join(&right).validated_may;
        assert!(may.contains(SymbolId(0)));
        assert!(may.contains(SymbolId(1)));
    }

    #[test]
    fn predicate_contradiction() {
        // NullCheck recorded as both true and false.
        let state = state_with_summary(0b001, 0b001);
        assert!(state.has_contradiction());
    }

    #[test]
    fn predicate_no_contradiction() {
        // NullCheck true, EmptyCheck false: different kinds, no clash.
        let state = state_with_summary(0b001, 0b010);
        assert!(!state.has_contradiction());
    }

    #[test]
    fn predicate_join_intersection() {
        // NullCheck + EmptyCheck on one path, EmptyCheck only on the other.
        let left = state_with_summary(0b011, 0);
        let right = state_with_summary(0b010, 0);

        let merged = left.join(&right);
        let summary = merged.get_predicate(SymbolId(0));
        assert_eq!(summary.known_true, 0b010); // only EmptyCheck on both paths
    }

    // ── SmallBitSet tests ───────────────────────────────────────────────

    #[test]
    fn small_bitset_basic() {
        let mut set = SmallBitSet::empty();
        assert!(set.is_empty());

        set.insert(SymbolId(0));
        assert!(set.contains(SymbolId(0)));
        assert!(!set.contains(SymbolId(1)));
        assert!(!set.is_empty());
    }

    #[test]
    fn small_bitset_union_intersection() {
        let mut left = SmallBitSet::empty();
        left.insert(SymbolId(0));
        left.insert(SymbolId(2));

        let mut right = SmallBitSet::empty();
        right.insert(SymbolId(1));
        right.insert(SymbolId(2));

        let either = left.union(right);
        for sym in [0, 1, 2] {
            assert!(either.contains(SymbolId(sym)));
        }

        let both = left.intersection(right);
        assert!(!both.contains(SymbolId(0)));
        assert!(!both.contains(SymbolId(1)));
        assert!(both.contains(SymbolId(2)));
    }
}
|
||||
563
src/taint/mod.rs
563
src/taint/mod.rs
|
|
@ -1,11 +1,21 @@
|
|||
use crate::cfg::{Cfg, FuncSummaries, NodeInfo, StmtKind};
|
||||
pub mod domain;
|
||||
pub mod path_state;
|
||||
pub mod transfer;
|
||||
|
||||
use crate::cfg::{Cfg, FuncSummaries};
|
||||
use crate::interop::InteropEdge;
|
||||
use crate::labels::{Cap, DataLabel, SourceKind};
|
||||
use crate::labels::SourceKind;
|
||||
use crate::state::engine::{self, MAX_TRACKED_VARS};
|
||||
use crate::state::lattice::Lattice;
|
||||
use crate::state::symbol::SymbolInterner;
|
||||
use crate::summary::GlobalSummaries;
|
||||
use crate::symbol::Lang;
|
||||
use domain::TaintState;
|
||||
use path_state::PredicateKind;
|
||||
use petgraph::graph::NodeIndex;
|
||||
use std::collections::HashMap;
|
||||
use tracing::debug;
|
||||
use petgraph::visit::IntoNodeReferences;
|
||||
use std::collections::HashSet;
|
||||
use transfer::{TaintEvent, TaintTransfer};
|
||||
|
||||
/// A detected taint finding with both source and sink locations.
|
||||
#[derive(Debug, Clone)]
|
||||
|
|
@ -20,269 +30,23 @@ pub struct Finding {
|
|||
pub path: Vec<NodeIndex>,
|
||||
/// The kind of source that originated the taint.
|
||||
pub source_kind: SourceKind,
|
||||
}
|
||||
|
||||
/// Order-independent hash of a taint map.
|
||||
///
|
||||
/// Uses XOR of per-entry hashes so the result is the same regardless of
|
||||
/// iteration order — no allocation or sorting required.
|
||||
fn taint_hash(taint: &HashMap<String, Cap>) -> u64 {
|
||||
let mut h: u64 = 0;
|
||||
for (k, bits) in taint {
|
||||
// Per-entry hash: FNV-1a-style mixing of key bytes + cap bits.
|
||||
let mut entry_h: u64 = 0xcbf2_9ce4_8422_2325; // FNV offset basis
|
||||
for b in k.as_bytes() {
|
||||
entry_h ^= *b as u64;
|
||||
entry_h = entry_h.wrapping_mul(0x0100_0000_01b3); // FNV prime
|
||||
}
|
||||
entry_h ^= bits.bits() as u64;
|
||||
entry_h = entry_h.wrapping_mul(0x0100_0000_01b3);
|
||||
h ^= entry_h;
|
||||
}
|
||||
h
|
||||
}
|
||||
|
||||
/// Resolved summary for a callee — a uniform view regardless of whether the
|
||||
/// summary came from a local (same‑file) or global (cross‑file) source.
|
||||
struct ResolvedSummary {
|
||||
source_caps: Cap,
|
||||
sanitizer_caps: Cap,
|
||||
sink_caps: Cap,
|
||||
propagates_taint: bool,
|
||||
}
|
||||
|
||||
/// Try to resolve a callee name using conservative same-language resolution.
|
||||
///
|
||||
/// Resolution order:
|
||||
/// 1. Local (same-file): exact name + same lang + same namespace
|
||||
/// 2. Global same-language: via `lookup_same_lang`; must be unambiguous
|
||||
/// 3. Interop edges: explicit cross-language bridges
|
||||
/// 4. No cross-language fallback
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn resolve_callee(
|
||||
callee: &str,
|
||||
caller_lang: Lang,
|
||||
caller_namespace: &str,
|
||||
caller_func: &str,
|
||||
call_ordinal: u32,
|
||||
local: &FuncSummaries,
|
||||
global: Option<&GlobalSummaries>,
|
||||
interop_edges: &[InteropEdge],
|
||||
) -> Option<ResolvedSummary> {
|
||||
// 1) Local (same-file): scan local summaries for matching name + lang + namespace
|
||||
let local_matches: Vec<_> = local
|
||||
.iter()
|
||||
.filter(|(k, _)| {
|
||||
k.name == callee && k.lang == caller_lang && k.namespace == caller_namespace
|
||||
})
|
||||
.collect();
|
||||
|
||||
if local_matches.len() == 1 {
|
||||
let (_, ls) = local_matches[0];
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: ls.source_caps,
|
||||
sanitizer_caps: ls.sanitizer_caps,
|
||||
sink_caps: ls.sink_caps,
|
||||
propagates_taint: ls.propagates_taint,
|
||||
});
|
||||
}
|
||||
|
||||
// Multiple local matches — try arity disambiguation (future), for now return None
|
||||
if local_matches.len() > 1 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// 2) Global same-language
|
||||
if let Some(gs) = global {
|
||||
let matches = gs.lookup_same_lang(caller_lang, callee);
|
||||
if matches.len() == 1 {
|
||||
let (_, fs) = matches[0];
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: fs.source_caps(),
|
||||
sanitizer_caps: fs.sanitizer_caps(),
|
||||
sink_caps: fs.sink_caps(),
|
||||
propagates_taint: fs.propagates_taint,
|
||||
});
|
||||
}
|
||||
// Multiple matches — try namespace match first
|
||||
if matches.len() > 1 {
|
||||
let same_ns: Vec<_> = matches
|
||||
.iter()
|
||||
.filter(|(k, _)| k.namespace == caller_namespace)
|
||||
.collect();
|
||||
if same_ns.len() == 1 {
|
||||
let (_, fs) = same_ns[0];
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: fs.source_caps(),
|
||||
sanitizer_caps: fs.sanitizer_caps(),
|
||||
sink_caps: fs.sink_caps(),
|
||||
propagates_taint: fs.propagates_taint,
|
||||
});
|
||||
}
|
||||
// Still ambiguous — return None (conservative)
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Interop edges: explicit cross-language bridges
|
||||
for edge in interop_edges {
|
||||
if edge.from.caller_lang == caller_lang
|
||||
&& edge.from.caller_namespace == caller_namespace
|
||||
&& edge.from.callee_symbol == callee
|
||||
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_func)
|
||||
&& (edge.from.ordinal == 0 || edge.from.ordinal == call_ordinal)
|
||||
{
|
||||
// Look up the target in global summaries by exact FuncKey
|
||||
if let Some(gs) = global
|
||||
&& let Some(fs) = gs.get(&edge.to)
|
||||
{
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: fs.source_caps(),
|
||||
sanitizer_caps: fs.sanitizer_caps(),
|
||||
sink_caps: fs.sink_caps(),
|
||||
propagates_taint: fs.propagates_taint,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4) No cross-language fallback
|
||||
None
|
||||
}
|
||||
|
||||
/// Apply taint transfer for a single node, mutating `out` in place.
|
||||
///
|
||||
/// Callers should clone the taint map before calling if they need
|
||||
/// the original state preserved.
|
||||
fn apply_taint(
|
||||
node: &NodeInfo,
|
||||
out: &mut HashMap<String, Cap>,
|
||||
local_summaries: &FuncSummaries,
|
||||
global_summaries: Option<&GlobalSummaries>,
|
||||
caller_lang: Lang,
|
||||
caller_namespace: &str,
|
||||
interop_edges: &[InteropEdge],
|
||||
) {
|
||||
debug!(target: "taint", "Applying taint to node: {:?}", node);
|
||||
debug!(target: "taint", "Taint: {:?}", out);
|
||||
|
||||
let caller_func = node.enclosing_func.as_deref().unwrap_or("");
|
||||
|
||||
match node.label {
|
||||
// A new untrusted value enters the program
|
||||
Some(DataLabel::Source(bits)) => {
|
||||
if let Some(v) = &node.defines {
|
||||
out.insert(v.clone(), bits);
|
||||
}
|
||||
}
|
||||
// Sanitizer: propagate input taint through the assignment FIRST,
|
||||
// then strip the sanitizer's capability bits. This ensures that
|
||||
// `let y = sanitize_html(&x)` gives y the taint of x minus the
|
||||
// HTML_ESCAPE bit — rather than leaving y completely clean (which
|
||||
// would hide "wrong sanitiser for this sink" bugs).
|
||||
Some(DataLabel::Sanitizer(bits)) => {
|
||||
if let Some(v) = &node.defines {
|
||||
// 1. Propagate: union taint from all read variables
|
||||
let mut combined = Cap::empty();
|
||||
for u in &node.uses {
|
||||
if let Some(b) = out.get(u) {
|
||||
combined |= *b;
|
||||
}
|
||||
}
|
||||
// 2. Strip the sanitiser's bits
|
||||
let new = combined & !bits;
|
||||
if new.is_empty() {
|
||||
out.remove(v);
|
||||
} else {
|
||||
out.insert(v.clone(), new);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A function call — resolve against local + global summaries
|
||||
_ if node.kind == StmtKind::Call => {
|
||||
if let Some(callee) = &node.callee
|
||||
&& let Some(resolved) = resolve_callee(
|
||||
callee,
|
||||
caller_lang,
|
||||
caller_namespace,
|
||||
caller_func,
|
||||
node.call_ordinal,
|
||||
local_summaries,
|
||||
global_summaries,
|
||||
interop_edges,
|
||||
)
|
||||
{
|
||||
// Build the return value's taint bits in stages, then
|
||||
// write once at the end. Order matters:
|
||||
//
|
||||
// 1. Start with fresh source taint (if the callee is a source)
|
||||
// 2. Union with propagated arg taint (if the callee propagates)
|
||||
// 3. Strip sanitizer bits last (so sanitization always wins)
|
||||
|
||||
let mut return_bits = Cap::empty();
|
||||
|
||||
// ── 1. Source behaviour ──
|
||||
return_bits |= resolved.source_caps;
|
||||
|
||||
// ── 2. Propagation ──
|
||||
if resolved.propagates_taint {
|
||||
for u in &node.uses {
|
||||
if let Some(bits) = out.get(u) {
|
||||
return_bits |= *bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── 3. Sanitizer behaviour (applied last so it always wins) ──
|
||||
return_bits &= !resolved.sanitizer_caps;
|
||||
|
||||
// ── Write the result ──
|
||||
if let Some(v) = &node.defines {
|
||||
if return_bits.is_empty() {
|
||||
out.remove(v);
|
||||
} else {
|
||||
out.insert(v.clone(), return_bits);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Sink behaviour: handled in the main analysis loop
|
||||
// (checked via node.label or resolved summary) ──
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Unresolved call — fall through to default gen/kill below
|
||||
}
|
||||
|
||||
// All other statements: classic gen/kill for assignments
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Default gen/kill: propagate taint through variable assignments
|
||||
if !matches!(
|
||||
node.label,
|
||||
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
|
||||
) && let Some(d) = &node.defines
|
||||
{
|
||||
let mut combined = Cap::empty();
|
||||
for u in &node.uses {
|
||||
if let Some(bits) = out.get(u) {
|
||||
combined |= *bits;
|
||||
}
|
||||
}
|
||||
if combined.is_empty() {
|
||||
out.remove(d);
|
||||
} else {
|
||||
out.insert(d.clone(), combined);
|
||||
}
|
||||
}
|
||||
/// Whether all tainted sink variables are guarded by a validation
|
||||
/// predicate on this path (metadata only — does not change severity).
|
||||
#[allow(dead_code)] // surfaced in Diag output (task 4)
|
||||
pub path_validated: bool,
|
||||
/// The kind of validation guard protecting this path, if any.
|
||||
#[allow(dead_code)] // surfaced in Diag output (task 4)
|
||||
pub guard_kind: Option<PredicateKind>,
|
||||
}
|
||||
|
||||
/// Run taint analysis on a single file's CFG.
|
||||
///
|
||||
/// `global_summaries` is `None` for pass‑1 / single‑file mode and
|
||||
/// `Some(&map)` for pass‑2 cross‑file analysis.
|
||||
/// Uses a monotone forward dataflow analysis via `state::engine::run_forward`
|
||||
/// with the `TaintTransfer` function. Termination is guaranteed by lattice
|
||||
/// finiteness (bounded `Cap` bits × bounded variable count).
|
||||
///
|
||||
/// For JS/TS files: uses a two-level solve to prevent cross-function taint
|
||||
/// leakage while preserving global-to-function flows.
|
||||
pub fn analyse_file(
|
||||
cfg: &Cfg,
|
||||
entry: NodeIndex,
|
||||
|
|
@ -292,162 +56,155 @@ pub fn analyse_file(
|
|||
caller_namespace: &str,
|
||||
interop_edges: &[InteropEdge],
|
||||
) -> Vec<Finding> {
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
let _span = tracing::debug_span!("taint_analyse_file").entered();
|
||||
|
||||
/// Queue item: current CFG node + taint map that holds here
|
||||
#[derive(Clone)]
|
||||
struct Item {
|
||||
node: NodeIndex,
|
||||
taint: HashMap<String, Cap>,
|
||||
// 1. Build symbol interner from CFG
|
||||
let interner = SymbolInterner::from_cfg(cfg);
|
||||
|
||||
if interner.len() > MAX_TRACKED_VARS {
|
||||
tracing::warn!(
|
||||
symbols = interner.len(),
|
||||
max = MAX_TRACKED_VARS,
|
||||
"taint analysis: too many variables, some will be ignored"
|
||||
);
|
||||
}
|
||||
|
||||
// (node, taint_hash) → predecessor key (for path rebuild)
|
||||
type Key = (NodeIndex, u64);
|
||||
let mut pred: HashMap<Key, Key> = HashMap::new();
|
||||
// 2. Build base transfer function
|
||||
let base_transfer = TaintTransfer {
|
||||
lang: caller_lang,
|
||||
namespace: caller_namespace,
|
||||
interner: &interner, // also used for events_to_findings below
|
||||
local_summaries,
|
||||
global_summaries,
|
||||
interop_edges,
|
||||
global_seed: None,
|
||||
scope_filter: None,
|
||||
};
|
||||
|
||||
// Seen states so we do not revisit them infinitely
|
||||
let mut seen: HashSet<Key> = HashSet::new();
|
||||
// 3. Run analysis (two-level for JS/TS, single-pass otherwise)
|
||||
let events = if matches!(caller_lang, Lang::JavaScript | Lang::TypeScript) {
|
||||
analyse_js_two_level(cfg, entry, &interner, &base_transfer)
|
||||
} else {
|
||||
let result = engine::run_forward(cfg, entry, &base_transfer, TaintState::initial());
|
||||
result.events
|
||||
};
|
||||
|
||||
// Resulting findings: (sink_node, source_node, full_path)
|
||||
let mut findings: Vec<Finding> = Vec::new();
|
||||
// 4. Convert events to findings
|
||||
let mut findings = events_to_findings(&events, &interner);
|
||||
|
||||
let mut q = VecDeque::new();
|
||||
q.push_back(Item {
|
||||
node: entry,
|
||||
taint: HashMap::new(),
|
||||
});
|
||||
seen.insert((entry, 0));
|
||||
// 5. Deduplicate findings by (sink, source), prefer path_validated=true
|
||||
findings.sort_by_key(|f| (f.sink.index(), f.source.index(), !f.path_validated));
|
||||
findings.dedup_by_key(|f| (f.sink, f.source));
|
||||
|
||||
while let Some(Item { node, taint }) = q.pop_front() {
|
||||
let caller_func = cfg[node].enclosing_func.as_deref().unwrap_or("");
|
||||
let mut out = taint.clone();
|
||||
apply_taint(
|
||||
&cfg[node],
|
||||
&mut out,
|
||||
local_summaries,
|
||||
global_summaries,
|
||||
caller_lang,
|
||||
caller_namespace,
|
||||
interop_edges,
|
||||
);
|
||||
findings
|
||||
}
|
||||
|
||||
// ── Sink check ──────────────────────────────────────────────────
|
||||
// Two ways a node can be a sink:
|
||||
// 1. Its AST label says Sink (existing inline labels)
|
||||
// 2. Its callee resolves to a function with sink_caps (cross-file)
|
||||
let sink_caps = match cfg[node].label {
|
||||
Some(DataLabel::Sink(caps)) => caps,
|
||||
_ => {
|
||||
// check if callee resolves to a sink
|
||||
cfg[node]
|
||||
.callee
|
||||
.as_ref()
|
||||
.and_then(|c| {
|
||||
resolve_callee(
|
||||
c,
|
||||
caller_lang,
|
||||
caller_namespace,
|
||||
caller_func,
|
||||
cfg[node].call_ordinal,
|
||||
local_summaries,
|
||||
global_summaries,
|
||||
interop_edges,
|
||||
)
|
||||
})
|
||||
.filter(|r| !r.sink_caps.is_empty())
|
||||
.map(|r| r.sink_caps)
|
||||
.unwrap_or(Cap::empty())
|
||||
}
|
||||
/// JS/TS two-level solve to prevent cross-function taint leakage.
|
||||
///
|
||||
/// Level 1: Solve top-level code (nodes where `enclosing_func.is_none()`).
|
||||
/// Level 2: For each function, solve seeded with top-level taint.
|
||||
fn analyse_js_two_level(
|
||||
cfg: &Cfg,
|
||||
entry: NodeIndex,
|
||||
_interner: &SymbolInterner,
|
||||
base_transfer: &TaintTransfer,
|
||||
) -> Vec<TaintEvent> {
|
||||
// Level 1: solve top-level only
|
||||
let toplevel_transfer = TaintTransfer {
|
||||
lang: base_transfer.lang,
|
||||
namespace: base_transfer.namespace,
|
||||
interner: base_transfer.interner,
|
||||
local_summaries: base_transfer.local_summaries,
|
||||
global_summaries: base_transfer.global_summaries,
|
||||
interop_edges: base_transfer.interop_edges,
|
||||
global_seed: None,
|
||||
scope_filter: Some(None), // top-level only (enclosing_func == None)
|
||||
};
|
||||
|
||||
let toplevel_result =
|
||||
engine::run_forward(cfg, entry, &toplevel_transfer, TaintState::initial());
|
||||
|
||||
// Extract top-level taint state at the last converged point
|
||||
let toplevel_state = extract_exit_state(&toplevel_result.states);
|
||||
|
||||
// Level 2: solve each function seeded with top-level state
|
||||
let mut all_events = toplevel_result.events;
|
||||
|
||||
let func_entries = find_function_entries(cfg);
|
||||
for (func_name, func_entry) in &func_entries {
|
||||
let func_transfer = TaintTransfer {
|
||||
lang: base_transfer.lang,
|
||||
namespace: base_transfer.namespace,
|
||||
interner: base_transfer.interner,
|
||||
local_summaries: base_transfer.local_summaries,
|
||||
global_summaries: base_transfer.global_summaries,
|
||||
interop_edges: base_transfer.interop_edges,
|
||||
global_seed: Some(&toplevel_state),
|
||||
scope_filter: Some(Some(func_name.as_str())),
|
||||
};
|
||||
|
||||
if !sink_caps.is_empty() {
|
||||
let bad = cfg[node]
|
||||
.uses
|
||||
.iter()
|
||||
.any(|u| out.get(u).is_some_and(|b| (*b & sink_caps) != Cap::empty()));
|
||||
if bad {
|
||||
// Reconstruct path backwards from sink to source.
|
||||
//
|
||||
// A node is considered a "source" if:
|
||||
// 1. It has an inline DataLabel::Source (same-file), OR
|
||||
// 2. It is a Call whose callee resolves to a source via
|
||||
// local or global summaries (cross-file).
|
||||
let sink_node = node;
|
||||
let mut path = vec![node];
|
||||
let mut source_node = node; // fallback: sink itself
|
||||
let mut key = (node, taint_hash(&taint));
|
||||
let func_result =
|
||||
engine::run_forward(cfg, *func_entry, &func_transfer, TaintState::initial());
|
||||
all_events.extend(func_result.events);
|
||||
}
|
||||
|
||||
while let Some(&(prev, prev_hash)) = pred.get(&key) {
|
||||
path.push(prev);
|
||||
all_events
|
||||
}
|
||||
|
||||
// Check inline source label
|
||||
if matches!(cfg[prev].label, Some(DataLabel::Source(_))) {
|
||||
source_node = prev;
|
||||
break;
|
||||
}
|
||||
/// Extract the "best" taint state from converged states (join all exit/reachable states).
|
||||
fn extract_exit_state(states: &std::collections::HashMap<NodeIndex, TaintState>) -> TaintState {
|
||||
let mut result = TaintState::initial();
|
||||
for state in states.values() {
|
||||
result = result.join(state);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
// Check cross-file source via resolved callee summary
|
||||
let prev_caller_func = cfg[prev].enclosing_func.as_deref().unwrap_or("");
|
||||
if cfg[prev].kind == StmtKind::Call
|
||||
&& let Some(callee) = &cfg[prev].callee
|
||||
&& let Some(resolved) = resolve_callee(
|
||||
callee,
|
||||
caller_lang,
|
||||
caller_namespace,
|
||||
prev_caller_func,
|
||||
cfg[prev].call_ordinal,
|
||||
local_summaries,
|
||||
global_summaries,
|
||||
interop_edges,
|
||||
)
|
||||
&& !resolved.source_caps.is_empty()
|
||||
{
|
||||
source_node = prev;
|
||||
break;
|
||||
}
|
||||
/// Find function entry nodes: (func_name, entry_node) pairs.
|
||||
///
|
||||
/// A function entry is the first node with a given `enclosing_func` value.
|
||||
fn find_function_entries(cfg: &Cfg) -> Vec<(String, NodeIndex)> {
|
||||
let mut seen = HashSet::new();
|
||||
let mut entries = Vec::new();
|
||||
|
||||
key = (prev, prev_hash);
|
||||
}
|
||||
|
||||
path.reverse();
|
||||
|
||||
// Infer the source kind from the source node's label and callee
|
||||
let source_kind = match cfg[source_node].label {
|
||||
Some(DataLabel::Source(caps)) => {
|
||||
let callee = cfg[source_node].callee.as_deref().unwrap_or("");
|
||||
crate::labels::infer_source_kind(caps, callee)
|
||||
}
|
||||
_ => SourceKind::Unknown,
|
||||
};
|
||||
|
||||
findings.push(Finding {
|
||||
sink: sink_node,
|
||||
source: source_node,
|
||||
path,
|
||||
source_kind,
|
||||
});
|
||||
}
|
||||
for (idx, info) in cfg.node_references() {
|
||||
if let Some(ref func_name) = info.enclosing_func
|
||||
&& seen.insert(func_name.clone())
|
||||
{
|
||||
entries.push((func_name.clone(), idx));
|
||||
}
|
||||
}
|
||||
|
||||
// enqueue successors — cache hashes to avoid recomputation
|
||||
let out_h = taint_hash(&out);
|
||||
let in_h = taint_hash(&taint);
|
||||
let succs: Vec<_> = cfg.neighbors(node).collect();
|
||||
for (i, succ) in succs.iter().enumerate() {
|
||||
let key = (*succ, out_h);
|
||||
if !seen.contains(&key) {
|
||||
seen.insert(key);
|
||||
pred.insert(key, (node, in_h));
|
||||
// Move the map into the last successor to avoid a clone
|
||||
let taint_for_succ = if i + 1 == succs.len() {
|
||||
std::mem::take(&mut out)
|
||||
} else {
|
||||
out.clone()
|
||||
};
|
||||
q.push_back(Item {
|
||||
node: *succ,
|
||||
taint: taint_for_succ,
|
||||
});
|
||||
entries
|
||||
}
|
||||
|
||||
/// Convert TaintEvents into Findings.
|
||||
fn events_to_findings(events: &[TaintEvent], _interner: &SymbolInterner) -> Vec<Finding> {
|
||||
let mut findings = Vec::new();
|
||||
|
||||
for event in events {
|
||||
let TaintEvent::SinkReached {
|
||||
sink_node,
|
||||
tainted_vars,
|
||||
all_validated,
|
||||
guard_kind,
|
||||
..
|
||||
} = event;
|
||||
|
||||
// Collect unique origins across all tainted vars at this sink
|
||||
let mut seen_origins: HashSet<(usize, usize)> = HashSet::new();
|
||||
for (_sym, _caps, origins) in tainted_vars {
|
||||
for origin in origins {
|
||||
if seen_origins.insert((origin.node.index(), sink_node.index())) {
|
||||
findings.push(Finding {
|
||||
sink: *sink_node,
|
||||
source: origin.node,
|
||||
path: vec![origin.node, *sink_node],
|
||||
source_kind: origin.source_kind,
|
||||
path_validated: *all_validated,
|
||||
guard_kind: *guard_kind,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
234
src/taint/path_state.rs
Normal file
234
src/taint/path_state.rs
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
// ─── PredicateKind ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Classification of what an if-condition tests.
///
/// Determined by heuristic analysis of the raw condition text.
/// Classification is conservative: prefer [`Unknown`](PredicateKind::Unknown)
/// over a wrong guess.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PredicateKind {
    /// `x.is_none()`, `x == null`, `x == nil`, `x is None`
    NullCheck,
    /// `x.is_empty()`, `x.len() == 0`, `x == ""`
    EmptyCheck,
    /// `x.is_err()`, `x.is_ok()`, `err != nil`
    ErrorCheck,
    /// Call to a validation/guard function: `validate(x)`, `is_safe(x)`
    ValidationCall,
    /// Call to a sanitizer function: `sanitize(x)`, `escape(x)`
    SanitizerCall,
    /// Comparison operators: `x == 5`, `x > threshold`
    Comparison,
    /// Generic boolean test — cannot classify further.
    Unknown,
}

/// Classify a raw condition text into a [`PredicateKind`].
///
/// Matching is done on an ASCII-lowercased copy of `text`, so it is
/// case-insensitive for ASCII input.
///
/// # Rules
///
/// Checks run in this order; the first match wins:
///
/// 1. Empty text → [`Unknown`](PredicateKind::Unknown).
/// 2. Error checks (before null checks, so `err != nil` is an error check).
/// 3. Null checks.
/// 4. Empty checks.
/// 5. `ValidationCall` / `SanitizerCall`: require a `(` in the text **and** a
///    matching callee name. Only the identifier immediately before the first
///    `(` is inspected, so neither a leading `!` nor unrelated identifiers to
///    the left of the call (`x_valid && ready()`) influence the result.
/// 6. Comparison operators.
/// 7. Otherwise [`Unknown`](PredicateKind::Unknown).
///
/// NOTE(review): negated spellings (`is_some`, `!= null`, `.len() != 0`) map
/// to the same kind as their positive counterparts; the returned kind carries
/// no polarity of its own.
pub fn classify_condition(text: &str) -> PredicateKind {
    if text.is_empty() {
        return PredicateKind::Unknown;
    }

    let lower = text.to_ascii_lowercase();

    // Error checks come first: `err != nil` is an error check, not a null
    // check, even though it contains `!= nil`.
    if contains_any(
        &lower,
        &[
            "is_err",
            "is_ok",
            "err != nil",
            "err == nil",
            "error != nil",
            "error == nil",
        ],
    ) {
        return PredicateKind::ErrorCheck;
    }

    if contains_any(
        &lower,
        &[
            "is_none",
            "is_some",
            "== none",
            "!= none",
            "is none",
            "is not none",
            "== null",
            "!= null",
            "=== null",
            "!== null",
            "== nil",
            "!= nil",
        ],
    ) {
        return PredicateKind::NullCheck;
    }

    if contains_any(
        &lower,
        &[
            "is_empty",
            ".len() == 0",
            ".len() != 0",
            ".length == 0",
            ".length === 0",
            ".length != 0",
            ".length !== 0",
            "== \"\"",
            "== ''",
        ],
    ) {
        return PredicateKind::EmptyCheck;
    }

    // Call-based kinds: only considered when call syntax is present.
    if let Some((before_paren, _)) = lower.split_once('(') {
        // Bare callee name: the identifier directly in front of the first `(`.
        // Receivers/paths (`input.`, `html_escape::`) and prefix operators
        // (`!`) are not identifier characters and are therefore cut off.
        let callee = trailing_identifier(before_paren);

        if callee.contains("valid")
            || callee.contains("check")
            || callee.contains("verify")
            || callee.starts_with("is_safe")
            || callee.starts_with("is_authorized")
            || callee.starts_with("is_authenticated")
        {
            return PredicateKind::ValidationCall;
        }

        if callee.contains("sanitiz") || callee.contains("escape") || callee.contains("encode") {
            return PredicateKind::SanitizerCall;
        }
    }

    if contains_any(&lower, &["==", "!=", ">=", "<=", " > ", " < "]) {
        return PredicateKind::Comparison;
    }

    PredicateKind::Unknown
}

/// Return `true` if `haystack` contains at least one of `needles`.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    needles.iter().any(|needle| haystack.contains(*needle))
}

/// Return the run of ASCII identifier characters (`[A-Za-z0-9_]`) at the end
/// of `s`, ignoring trailing whitespace; empty if `s` does not end in one.
fn trailing_identifier(s: &str) -> &str {
    s.trim_end()
        .rsplit(|c: char| !(c.is_ascii_alphanumeric() || c == '_'))
        .next()
        .unwrap_or("")
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Check that every `(condition, expected kind)` pair classifies as expected.
    fn assert_classified(cases: &[(&str, PredicateKind)]) {
        for &(condition, expected) in cases {
            assert_eq!(
                classify_condition(condition),
                expected,
                "condition: {condition:?}"
            );
        }
    }

    // ── classify_condition ────────────────────────────────────────────────

    #[test]
    fn classify_empty_is_unknown() {
        assert_classified(&[("", PredicateKind::Unknown)]);
    }

    #[test]
    fn classify_null_checks() {
        assert_classified(&[
            ("x.is_none()", PredicateKind::NullCheck),
            ("x == null", PredicateKind::NullCheck),
            ("x != nil", PredicateKind::NullCheck),
            ("x is None", PredicateKind::NullCheck),
            ("x === null", PredicateKind::NullCheck),
        ]);
    }

    #[test]
    fn classify_error_checks() {
        assert_classified(&[
            ("x.is_err()", PredicateKind::ErrorCheck),
            ("err != nil", PredicateKind::ErrorCheck),
            ("x.is_ok()", PredicateKind::ErrorCheck),
        ]);
    }

    #[test]
    fn classify_empty_checks() {
        assert_classified(&[
            ("x.is_empty()", PredicateKind::EmptyCheck),
            ("x.len() == 0", PredicateKind::EmptyCheck),
            ("x.length === 0", PredicateKind::EmptyCheck),
        ]);
    }

    #[test]
    fn classify_validation_call() {
        assert_classified(&[
            ("validate(x)", PredicateKind::ValidationCall),
            ("is_safe(input)", PredicateKind::ValidationCall),
            ("check_auth(req)", PredicateKind::ValidationCall),
            ("input.verify(sig)", PredicateKind::ValidationCall),
        ]);
    }

    #[test]
    fn classify_validation_requires_paren() {
        // Without `(` call syntax a "valid"-looking name is never a
        // ValidationCall: `x_valid == true` is a plain comparison.
        assert_classified(&[
            ("x_valid == true", PredicateKind::Comparison),
            ("is_valid && ready", PredicateKind::Unknown),
        ]);
    }

    #[test]
    fn classify_sanitizer_call() {
        assert_classified(&[
            ("sanitize(x)", PredicateKind::SanitizerCall),
            ("html_escape(s)", PredicateKind::SanitizerCall),
            ("url_encode(path)", PredicateKind::SanitizerCall),
        ]);
    }

    #[test]
    fn classify_comparison() {
        assert_classified(&[
            ("x == 5", PredicateKind::Comparison),
            ("x != y", PredicateKind::Comparison),
            ("a >= b", PredicateKind::Comparison),
        ]);
    }

    #[test]
    fn classify_unknown_fallback() {
        assert_classified(&[
            ("flag", PredicateKind::Unknown),
            ("a && b", PredicateKind::Unknown),
        ]);
    }
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
use super::*;
|
||||
use crate::cfg::FuncSummaries;
|
||||
use crate::interop::InteropEdge;
|
||||
use crate::labels::Cap;
|
||||
use crate::symbol::FuncKey;
|
||||
|
||||
#[test]
|
||||
|
|
@ -52,8 +53,10 @@ fn taint_through_if_else() {
|
|||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// exactly one path (via the True branch) should be flagged
|
||||
assert_eq!(findings.len(), 1);
|
||||
// Both branches have findings: the true branch uses unsanitized `x`,
|
||||
// the else branch uses `safe` which was sanitized with HTML_ESCAPE
|
||||
// but the sink requires SHELL_ESCAPE (wrong sanitizer → still tainted).
|
||||
assert_eq!(findings.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2218,3 +2221,318 @@ fn return_call_recognized_as_source() {
|
|||
"foo() should have source_caps set because env::var is called inside return"
|
||||
);
|
||||
}
|
||||
|
||||
// ─── Path-sensitive analysis tests ───────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn validate_and_early_return() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Validate before use: if validation fails, early return.
|
||||
// The sink after the guard is on the "validated" path.
|
||||
//
|
||||
// The CFG creates a synthetic pass-through node for the false path
|
||||
// with an explicit False edge from the If node. BFS reaches the
|
||||
// sink via: cond → (False) → pass-through → (Seq) → sink.
|
||||
// The predicate on the False edge records that `!validate(&x)` was
|
||||
// false (i.e. validation passed), so the sink is path-guarded.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
if !validate(&x) { return; }
|
||||
Command::new("sh").arg(x).status().unwrap();
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// Taint still flows (validate doesn't kill taint), but the finding
|
||||
// should be annotated as path_validated because the false path
|
||||
// (validation passed) has a ValidationCall predicate with polarity=true.
|
||||
assert_eq!(findings.len(), 1, "should still detect the taint flow");
|
||||
assert!(
|
||||
findings[0].path_validated,
|
||||
"finding should be marked as path_validated (early-return guard detected)"
|
||||
);
|
||||
assert_eq!(
|
||||
findings[0].guard_kind,
|
||||
Some(PredicateKind::ValidationCall),
|
||||
"guard_kind should be ValidationCall"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn validate_in_if_else_path_validated() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// If/else where the True branch (validation passed) contains the sink.
|
||||
// This IS detectable because the If node has genuine True/False branches.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
if validate(&x) {
|
||||
Command::new("sh").arg(&x).status().unwrap();
|
||||
} else {
|
||||
println!("invalid input");
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
assert_eq!(findings.len(), 1, "should detect the taint flow");
|
||||
assert!(
|
||||
findings[0].path_validated,
|
||||
"finding should be path_validated (sink in validated branch)"
|
||||
);
|
||||
assert_eq!(
|
||||
findings[0].guard_kind,
|
||||
Some(PredicateKind::ValidationCall),
|
||||
"guard_kind should be ValidationCall"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sink_on_failed_validation_branch() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Sink is in the failed-validation branch (negated condition, false edge).
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
if !validate(&x) {
|
||||
Command::new("sh").arg(&x).status().unwrap();
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
assert_eq!(findings.len(), 1, "should detect taint flow to sink");
|
||||
assert!(
|
||||
!findings[0].path_validated,
|
||||
"finding should NOT be path_validated (sink is in failed-validation branch)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn contradictory_null_check_pruned() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Inner branch is infeasible: x.is_none() already triggered the early return, so it cannot hold again here.
|
||||
// After early return on is_none(), the fall-through path has polarity=false
|
||||
// for NullCheck. The inner `if x.is_none()` True branch has polarity=true —
|
||||
// contradiction.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").ok();
|
||||
if x.is_none() { return; }
|
||||
if x.is_none() {
|
||||
Command::new("sh").arg("dangerous").status().unwrap();
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// The inner branch is infeasible, and the arg "dangerous" is a string
|
||||
// literal (not tainted), so there should be no findings.
|
||||
assert!(
|
||||
findings.is_empty(),
|
||||
"inner branch is infeasible — should produce no findings (got {})",
|
||||
findings.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sanitize_one_branch_no_regression() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Same as existing taint_through_if_else: sanitized in one branch, not in the other.
|
||||
// Verify the finding count matches that test: 2 (path sensitivity must not drop either finding).
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("DANGEROUS").unwrap();
|
||||
let safe = html_escape::encode_safe(&x);
|
||||
|
||||
if x.len() > 5 {
|
||||
Command::new("sh").arg(&x).status().unwrap(); // UNSAFE
|
||||
} else {
|
||||
Command::new("sh").arg(&safe).status().unwrap(); // SAFE
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// Both branches produce findings: the true branch uses unsanitized `x`,
|
||||
// the else branch uses `safe` (HTML_ESCAPE sanitizer vs SHELL_ESCAPE sink).
|
||||
// Previously only 1 finding because else_clause was silently dropped from CFG.
|
||||
assert_eq!(
|
||||
findings.len(),
|
||||
2,
|
||||
"two findings expected (both branches reach sink with wrong/no sanitizer)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn path_state_budget_graceful() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Deeply nested ifs with a sink at the innermost level.
|
||||
// PathState should truncate gracefully after MAX_PATH_PREDICATES.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
if x.len() > 1 {
|
||||
if x.len() > 2 {
|
||||
if x.len() > 3 {
|
||||
if x.len() > 4 {
|
||||
if x.len() > 5 {
|
||||
if x.len() > 6 {
|
||||
if x.len() > 7 {
|
||||
if x.len() > 8 {
|
||||
if x.len() > 9 {
|
||||
Command::new("sh").arg(&x).status().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// Should still detect the flow — truncation shouldn't cause false negatives.
|
||||
assert_eq!(
|
||||
findings.len(),
|
||||
1,
|
||||
"should detect taint flow even with truncated PathState"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_predicate_not_pruned() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Comparison predicates are NOT in the contradiction whitelist, so even
|
||||
// seemingly contradictory comparisons should not be pruned.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
if x.len() > 5 { return; }
|
||||
if x.len() > 5 {
|
||||
Command::new("sh").arg(&x).status().unwrap();
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// Comparison is not in the whitelist — the path should NOT be pruned.
|
||||
assert_eq!(
|
||||
findings.len(),
|
||||
1,
|
||||
"Comparison predicate should not cause contradiction pruning"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_var_predicate_not_pruned() {
|
||||
use crate::cfg::build_cfg;
|
||||
use tree_sitter::Language;
|
||||
|
||||
// Multi-variable conditions should never be pruned for contradiction,
|
||||
// even if the kind is in the whitelist.
|
||||
let src = br#"
|
||||
use std::env; use std::process::Command;
|
||||
fn main() {
|
||||
let x = env::var("INPUT").unwrap();
|
||||
let y = env::var("OTHER").ok();
|
||||
if y.is_none() { return; }
|
||||
if y.is_none() {
|
||||
Command::new("sh").arg(&x).status().unwrap();
|
||||
}
|
||||
}"#;
|
||||
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
|
||||
.unwrap();
|
||||
let tree = parser.parse(src as &[u8], None).unwrap();
|
||||
|
||||
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
|
||||
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
|
||||
|
||||
// The condition `y.is_none()` is expected to yield two identifiers from
// collect_idents (`y` and the method name `is_none`). If so, the predicate
// counts as multi-var and is exempt from contradiction pruning, so the
// finding must still be reported. The exact identifier list depends on
// collect_idents, hence the assertion only requires a non-empty result.
|
||||
assert!(
|
||||
!findings.is_empty(),
|
||||
"multi-var predicate should not be pruned; flow should be detected"
|
||||
);
|
||||
}
|
||||
|
|
|
|||
458
src/taint/transfer.rs
Normal file
458
src/taint/transfer.rs
Normal file
|
|
@ -0,0 +1,458 @@
|
|||
use crate::callgraph::normalize_callee_name;
|
||||
use crate::cfg::{EdgeKind, FuncSummaries, NodeInfo, StmtKind};
|
||||
use crate::interop::InteropEdge;
|
||||
use crate::labels::{Cap, DataLabel};
|
||||
use crate::state::engine::Transfer;
|
||||
use crate::state::lattice::Lattice;
|
||||
use crate::state::symbol::{SymbolId, SymbolInterner};
|
||||
use crate::summary::{CalleeResolution, GlobalSummaries};
|
||||
use crate::symbol::Lang;
|
||||
use crate::taint::domain::{TaintOrigin, TaintState, VarTaint, predicate_kind_bit};
|
||||
use crate::taint::path_state::{PredicateKind, classify_condition};
|
||||
use petgraph::graph::NodeIndex;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
/// Events emitted by the taint transfer function during Phase 2.
///
/// `events_to_findings` turns each event into `Finding`s, one per distinct
/// origin node of the tainted variables.
#[derive(Clone, Debug)]
pub enum TaintEvent {
    /// Tainted data was observed at a sink node.
    SinkReached {
        /// CFG node of the sink; becomes `Finding::sink`.
        sink_node: NodeIndex,
        /// One entry per tainted variable: its symbol, its capability bits and
        /// the origins that contributed the taint. Each origin's node becomes
        /// a `Finding::source`.
        tainted_vars: Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)>,
        /// NOTE(review): presumably the capability mask of the sink; it is not
        /// read by `events_to_findings`, hence the `dead_code` allowance.
        #[allow(dead_code)]
        sink_caps: Cap,
        /// Copied into `Finding::path_validated`.
        all_validated: bool,
        /// Copied into `Finding::guard_kind`.
        guard_kind: Option<PredicateKind>,
    },
}
|
||||
|
||||
/// Taint transfer function for forward dataflow analysis.
///
/// Borrows everything it needs for the duration of one solve; `analyse_file`
/// builds one instance per solve and hands it to `engine::run_forward`.
pub struct TaintTransfer<'a> {
    /// Language of the file being analysed (the caller's language).
    pub lang: Lang,
    /// Namespace of the file being analysed (the caller's namespace).
    pub namespace: &'a str,
    /// Symbol interner built from the CFG under analysis.
    pub interner: &'a SymbolInterner,
    /// Function summaries for the current file.
    pub local_summaries: &'a FuncSummaries,
    /// Cross-file summaries; `None` in pass-1 / single-file mode.
    pub global_summaries: Option<&'a GlobalSummaries>,
    /// Interop edges passed through to callee resolution.
    pub interop_edges: &'a [InteropEdge],
    /// For JS two-level solve: top-level taint state seeded into function solves.
    pub global_seed: Option<&'a TaintState>,
    /// Optional scope filter: if set, only process nodes whose enclosing_func matches;
    /// nodes outside the scope leave the state unchanged and emit no events.
    /// None = process all nodes. Some(None) = top-level only. Some(Some(name)) = function only.
    pub scope_filter: Option<Option<&'a str>>,
}
|
||||
|
||||
impl Transfer<TaintState> for TaintTransfer<'_> {
|
||||
type Event = TaintEvent;
|
||||
|
||||
fn apply(
|
||||
&self,
|
||||
node: NodeIndex,
|
||||
info: &NodeInfo,
|
||||
edge: Option<EdgeKind>,
|
||||
mut state: TaintState,
|
||||
) -> (TaintState, Vec<TaintEvent>) {
|
||||
let mut events = Vec::new();
|
||||
|
||||
// Scope filter: skip nodes outside our scope (return state unchanged)
|
||||
if let Some(ref filter) = self.scope_filter {
|
||||
let node_func = info.enclosing_func.as_deref();
|
||||
if node_func != *filter {
|
||||
return (state, events);
|
||||
}
|
||||
}
|
||||
|
||||
let caller_func = info.enclosing_func.as_deref().unwrap_or("");
|
||||
|
||||
// ── Apply taint transfer ────────────────────────────────────────
|
||||
match info.label {
|
||||
Some(DataLabel::Source(bits)) => {
|
||||
self.apply_source(node, info, bits, &mut state);
|
||||
}
|
||||
Some(DataLabel::Sanitizer(bits)) => {
|
||||
self.apply_sanitizer(info, bits, &mut state);
|
||||
}
|
||||
_ if info.kind == StmtKind::Call => {
|
||||
self.apply_call(node, info, caller_func, &mut state);
|
||||
}
|
||||
_ => {
|
||||
self.apply_assignment(info, &mut state);
|
||||
}
|
||||
}
|
||||
|
||||
// ── If-node predicate handling (edge-aware) ─────────────────────
|
||||
if info.kind == StmtKind::If
|
||||
&& !info.condition_vars.is_empty()
|
||||
&& matches!(edge, Some(EdgeKind::True) | Some(EdgeKind::False))
|
||||
{
|
||||
let cond_text = info.condition_text.as_deref().unwrap_or("");
|
||||
let kind = classify_condition(cond_text);
|
||||
let polarity = matches!(edge, Some(EdgeKind::True)) ^ info.condition_negated;
|
||||
|
||||
// ValidationCall handling
|
||||
if kind == PredicateKind::ValidationCall && polarity {
|
||||
for var in &info.condition_vars {
|
||||
if let Some(sym) = self.interner.get(var) {
|
||||
state.validated_may.insert(sym);
|
||||
state.validated_must.insert(sym);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Predicate summary for whitelisted kinds (contradiction pruning)
|
||||
if let Some(bit_idx) = predicate_kind_bit(kind) {
|
||||
for var in &info.condition_vars {
|
||||
if let Some(sym) = self.interner.get(var) {
|
||||
let mut summary = state.get_predicate(sym);
|
||||
if polarity {
|
||||
summary.known_true |= 1 << bit_idx;
|
||||
} else {
|
||||
summary.known_false |= 1 << bit_idx;
|
||||
}
|
||||
state.set_predicate(sym, summary);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Contradiction pruning: if any variable has contradictory predicates,
|
||||
// this is an infeasible path → return bot (monotonically kills branch).
|
||||
if state.has_contradiction() {
|
||||
return (TaintState::bot(), events);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Sink check ──────────────────────────────────────────────────
|
||||
let sink_caps = self.resolve_sink_caps(info, caller_func);
|
||||
if !sink_caps.is_empty() {
|
||||
let tainted_vars = self.collect_tainted_sink_vars(info, &state, sink_caps);
|
||||
if !tainted_vars.is_empty() {
|
||||
let all_validated = tainted_vars
|
||||
.iter()
|
||||
.all(|(sym, _, _)| state.validated_may.contains(*sym));
|
||||
|
||||
let guard_kind = if all_validated {
|
||||
Some(PredicateKind::ValidationCall)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
events.push(TaintEvent::SinkReached {
|
||||
sink_node: node,
|
||||
tainted_vars,
|
||||
sink_caps,
|
||||
all_validated,
|
||||
guard_kind,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
(state, events)
|
||||
}
|
||||
|
||||
fn iteration_budget(&self) -> usize {
|
||||
100_000
|
||||
}
|
||||
|
||||
fn on_budget_exceeded(&self) -> bool {
|
||||
tracing::warn!("taint analysis: worklist budget exceeded, returning partial results");
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl TaintTransfer<'_> {
|
||||
/// Apply a Source label: insert taint for the defined variable.
|
||||
fn apply_source(&self, node: NodeIndex, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
|
||||
if let Some(ref v) = info.defines
|
||||
&& let Some(sym) = self.interner.get(v)
|
||||
{
|
||||
let callee = info.callee.as_deref().unwrap_or("");
|
||||
let source_kind = crate::labels::infer_source_kind(bits, callee);
|
||||
let origin = TaintOrigin { node, source_kind };
|
||||
|
||||
match state.get(sym) {
|
||||
Some(existing) => {
|
||||
let mut new_taint = existing.clone();
|
||||
new_taint.caps |= bits;
|
||||
if new_taint.origins.len() < 4
|
||||
&& !new_taint.origins.iter().any(|o| o.node == node)
|
||||
{
|
||||
new_taint.origins.push(origin);
|
||||
}
|
||||
state.set(sym, new_taint);
|
||||
}
|
||||
None => {
|
||||
state.set(
|
||||
sym,
|
||||
VarTaint {
|
||||
caps: bits,
|
||||
origins: SmallVec::from_elem(origin, 1),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply a Sanitizer label: propagate input taint, then strip sanitizer bits.
|
||||
fn apply_sanitizer(&self, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
|
||||
if let Some(ref v) = info.defines
|
||||
&& let Some(sym) = self.interner.get(v)
|
||||
{
|
||||
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
|
||||
let new_caps = combined_caps & !bits;
|
||||
if new_caps.is_empty() {
|
||||
state.remove(sym);
|
||||
} else {
|
||||
state.set(
|
||||
sym,
|
||||
VarTaint {
|
||||
caps: new_caps,
|
||||
origins: combined_origins,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply a function call: resolve callee and compute return taint.
|
||||
fn apply_call(
|
||||
&self,
|
||||
node: NodeIndex,
|
||||
info: &NodeInfo,
|
||||
caller_func: &str,
|
||||
state: &mut TaintState,
|
||||
) {
|
||||
if let Some(ref callee) = info.callee
|
||||
&& let Some(resolved) = self.resolve_callee(callee, caller_func, info.call_ordinal)
|
||||
{
|
||||
let mut return_bits = Cap::empty();
|
||||
let mut return_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
|
||||
|
||||
// 1. Source behaviour
|
||||
if !resolved.source_caps.is_empty() {
|
||||
return_bits |= resolved.source_caps;
|
||||
let callee_str = info.callee.as_deref().unwrap_or("");
|
||||
let source_kind =
|
||||
crate::labels::infer_source_kind(resolved.source_caps, callee_str);
|
||||
let origin = TaintOrigin { node, source_kind };
|
||||
if !return_origins.iter().any(|o| o.node == node) {
|
||||
return_origins.push(origin);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Propagation
|
||||
if resolved.propagates_taint {
|
||||
let (use_caps, use_origins) = self.collect_uses_taint(info, state);
|
||||
return_bits |= use_caps;
|
||||
for orig in &use_origins {
|
||||
if return_origins.len() < 4
|
||||
&& !return_origins.iter().any(|o| o.node == orig.node)
|
||||
{
|
||||
return_origins.push(*orig);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Sanitizer behaviour (applied last so it always wins)
|
||||
return_bits &= !resolved.sanitizer_caps;
|
||||
|
||||
// Write result
|
||||
if let Some(ref v) = info.defines
|
||||
&& let Some(sym) = self.interner.get(v)
|
||||
{
|
||||
if return_bits.is_empty() {
|
||||
state.remove(sym);
|
||||
} else {
|
||||
state.set(
|
||||
sym,
|
||||
VarTaint {
|
||||
caps: return_bits,
|
||||
origins: return_origins,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Unresolved call — fall through to default gen/kill
|
||||
self.apply_assignment(info, state);
|
||||
}
|
||||
|
||||
/// Default gen/kill: propagate taint through variable assignments.
|
||||
fn apply_assignment(&self, info: &NodeInfo, state: &mut TaintState) {
|
||||
if matches!(
|
||||
info.label,
|
||||
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(ref d) = info.defines
|
||||
&& let Some(sym) = self.interner.get(d)
|
||||
{
|
||||
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
|
||||
if combined_caps.is_empty() {
|
||||
state.remove(sym);
|
||||
} else {
|
||||
state.set(
|
||||
sym,
|
||||
VarTaint {
|
||||
caps: combined_caps,
|
||||
origins: combined_origins,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect taint from all `uses` variables (union of caps + merge origins).
|
||||
fn collect_uses_taint(
|
||||
&self,
|
||||
info: &NodeInfo,
|
||||
state: &TaintState,
|
||||
) -> (Cap, SmallVec<[TaintOrigin; 2]>) {
|
||||
let mut combined_caps = Cap::empty();
|
||||
let mut combined_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
|
||||
|
||||
for u in &info.uses {
|
||||
let taint = self.lookup_var(u, state);
|
||||
if let Some(t) = taint {
|
||||
combined_caps |= t.caps;
|
||||
for orig in &t.origins {
|
||||
if combined_origins.len() < 4
|
||||
&& !combined_origins.iter().any(|o| o.node == orig.node)
|
||||
{
|
||||
combined_origins.push(*orig);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(combined_caps, combined_origins)
|
||||
}
|
||||
|
||||
/// Look up a variable's taint, falling back to global_seed for JS two-level solve.
|
||||
fn lookup_var<'a>(&'a self, name: &str, state: &'a TaintState) -> Option<&'a VarTaint> {
|
||||
if let Some(sym) = self.interner.get(name) {
|
||||
if let Some(taint) = state.get(sym) {
|
||||
return Some(taint);
|
||||
}
|
||||
// Fall back to global seed (JS two-level solve)
|
||||
if let Some(seed) = self.global_seed {
|
||||
return seed.get(sym);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Resolve sink caps from label or callee summary.
|
||||
fn resolve_sink_caps(&self, info: &NodeInfo, caller_func: &str) -> Cap {
|
||||
match info.label {
|
||||
Some(DataLabel::Sink(caps)) => caps,
|
||||
_ => info
|
||||
.callee
|
||||
.as_ref()
|
||||
.and_then(|c| self.resolve_callee(c, caller_func, info.call_ordinal))
|
||||
.filter(|r| !r.sink_caps.is_empty())
|
||||
.map(|r| r.sink_caps)
|
||||
.unwrap_or(Cap::empty()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect tainted variables at a sink node.
|
||||
fn collect_tainted_sink_vars(
|
||||
&self,
|
||||
info: &NodeInfo,
|
||||
state: &TaintState,
|
||||
sink_caps: Cap,
|
||||
) -> Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)> {
|
||||
let mut result = Vec::new();
|
||||
for u in &info.uses {
|
||||
if let Some(taint) = self.lookup_var(u, state)
|
||||
&& (taint.caps & sink_caps) != Cap::empty()
|
||||
&& let Some(sym) = self.interner.get(u)
|
||||
{
|
||||
result.push((sym, taint.caps, taint.origins.clone()));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Resolve a callee name to its summary (local → global → interop).
|
||||
fn resolve_callee(
|
||||
&self,
|
||||
callee: &str,
|
||||
caller_func: &str,
|
||||
call_ordinal: u32,
|
||||
) -> Option<ResolvedSummary> {
|
||||
let normalized = normalize_callee_name(callee);
|
||||
|
||||
// 1) Local (same-file)
|
||||
let local_matches: Vec<_> = self
|
||||
.local_summaries
|
||||
.iter()
|
||||
.filter(|(k, _)| {
|
||||
k.name == normalized && k.lang == self.lang && k.namespace == self.namespace
|
||||
})
|
||||
.collect();
|
||||
|
||||
if local_matches.len() == 1 {
|
||||
let (_, ls) = local_matches[0];
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: ls.source_caps,
|
||||
sanitizer_caps: ls.sanitizer_caps,
|
||||
sink_caps: ls.sink_caps,
|
||||
propagates_taint: ls.propagates_taint,
|
||||
});
|
||||
}
|
||||
if local_matches.len() > 1 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// 2) Global same-language
|
||||
if let Some(gs) = self.global_summaries {
|
||||
match gs.resolve_callee_key(normalized, self.lang, self.namespace, None) {
|
||||
CalleeResolution::Resolved(target_key) => {
|
||||
if let Some(fs) = gs.get(&target_key) {
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: fs.source_caps(),
|
||||
sanitizer_caps: fs.sanitizer_caps(),
|
||||
sink_caps: fs.sink_caps(),
|
||||
propagates_taint: fs.propagates_taint,
|
||||
});
|
||||
}
|
||||
}
|
||||
CalleeResolution::NotFound | CalleeResolution::Ambiguous(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Interop edges
|
||||
for edge in self.interop_edges {
|
||||
if edge.from.caller_lang == self.lang
|
||||
&& edge.from.caller_namespace == self.namespace
|
||||
&& edge.from.callee_symbol == callee
|
||||
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_func)
|
||||
&& (edge.from.ordinal == 0 || edge.from.ordinal == call_ordinal)
|
||||
&& let Some(gs) = self.global_summaries
|
||||
&& let Some(fs) = gs.get(&edge.to)
|
||||
{
|
||||
return Some(ResolvedSummary {
|
||||
source_caps: fs.source_caps(),
|
||||
sanitizer_caps: fs.sanitizer_caps(),
|
||||
sink_caps: fs.sink_caps(),
|
||||
propagates_taint: fs.propagates_taint,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolved summary for a callee.
|
||||
struct ResolvedSummary {
|
||||
source_caps: Cap,
|
||||
sanitizer_caps: Cap,
|
||||
sink_caps: Cap,
|
||||
propagates_taint: bool,
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue