* chore: Exclude CLAUDE.md from Cargo.toml

* feat: add callgraph module and integrate into main analysis flow

* feat: enhance CLI with new severity filtering and analysis modes

* feat: update CHANGELOG with recent enhancements and fixes to severity filtering and output handling

* feat: implement state-model dataflow analysis for resource lifecycle and auth state

* feat: enhance diagnostic output formatting and add evidence structure

* feat: implement attack surface ranking for diagnostics with scoring and sorting

* feat: add comprehensive documentation for installation, usage, and rules reference

* feat: add multiple language support for command execution and evaluation endpoints

* feat: implement inline suppression for findings using `nyx:ignore` comments

* feat: add confidence levels to AST patterns and update output structure

* feat: implement low-noise prioritization system with category filtering, rollup grouping, and configurable budgets

* feat: bump version to 0.4.0 and update changelog with new features and improvements

* feat: add dead code allowances to various functions in mod.rs and real_world_tests.rs
This commit is contained in:
Eli Peter 2026-02-25 21:16:36 -05:00 committed by GitHub
parent 19b578c5c4
commit 1bbe4b1cfb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
456 changed files with 25628 additions and 1228 deletions

620
src/taint/domain.rs Normal file
View file

@ -0,0 +1,620 @@
use crate::labels::{Cap, SourceKind};
use crate::state::lattice::Lattice;
use crate::state::symbol::SymbolId;
use crate::taint::path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
/// Maximum origins tracked per variable (bounded to prevent growth).
///
/// `merge_origins` stops appending once the merged list has reached this
/// length, so joining states cannot grow an origin list past this bound.
const MAX_ORIGINS_PER_VAR: usize = 4;
/// Per-variable taint information.
///
/// Stored in `TaintState::vars`, keyed by the variable's `SymbolId`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct VarTaint {
/// Taint capability flags for the variable; combined with bitwise OR when
/// two states are joined.
pub caps: Cap,
/// Up to N origins that contributed taint (bounded).
///
/// Joins merge the lists through `merge_origins`, which deduplicates by
/// node and stops at `MAX_ORIGINS_PER_VAR`.
pub origins: SmallVec<[TaintOrigin; 2]>,
}
/// A single taint origin — the node and classification of where taint came from.
///
/// Within this module origins are compared by `node` only (deduplication in
/// `merge_origins`, subset test in `vars_leq`); `source_kind` is carried along.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct TaintOrigin {
/// Graph node the taint was introduced at.
pub node: NodeIndex,
/// Classification of the source that produced the taint.
pub source_kind: SourceKind,
}
/// Compact set of variables, stored as the bits of a single `u64`.
///
/// A variable is identified by the ordinal of its `SymbolId`; only ordinals
/// below 64 fit. `insert` silently ignores larger ordinals and `contains`
/// reports them as absent.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SmallBitSet(u64);

impl SmallBitSet {
    /// Returns the set with no members.
    pub fn empty() -> Self {
        SmallBitSet(0)
    }

    /// Adds `id` to the set. Ordinals of 64 or more do not fit and are ignored.
    pub fn insert(&mut self, id: SymbolId) {
        let ordinal = id.0;
        if ordinal >= 64 {
            return;
        }
        self.0 |= 1u64 << ordinal;
    }

    /// Returns `true` if `id` is a member; always `false` for ordinals >= 64.
    pub fn contains(&self, id: SymbolId) -> bool {
        let ordinal = id.0;
        // The range check short-circuits, so the shift below never overflows.
        ordinal < 64 && (self.0 & (1u64 << ordinal)) != 0
    }

    /// Union: every member of `self` or `other`.
    pub fn union(self, other: Self) -> Self {
        let bits = self.0 | other.0;
        SmallBitSet(bits)
    }

    /// Intersection: only members present in both `self` and `other`.
    pub fn intersection(self, other: Self) -> Self {
        let bits = self.0 & other.0;
        SmallBitSet(bits)
    }

    /// Returns `true` when the set has no members.
    #[allow(dead_code)]
    pub fn is_empty(self) -> bool {
        self.0 == 0
    }

    /// Whether every member of `self` is also a member of `other`.
    #[allow(dead_code)] // used by Lattice::leq
    pub fn is_subset_of(self, other: Self) -> bool {
        // No bit of `self` may fall outside `other`.
        self.0 & !other.0 == 0
    }

    /// Whether every member of `other` is also a member of `self`.
    #[allow(dead_code)] // used by Lattice::leq
    pub fn is_superset_of(self, other: Self) -> bool {
        other.0 & !self.0 == 0
    }
}
/// Per-variable record of predicate outcomes that hold on every path.
///
/// Each bit position stands for one whitelisted predicate kind. Joining two
/// summaries keeps only the bits set in both (must-hold semantics), so a join
/// can only lose information.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PredicateSummary {
    /// Kinds known to be true. Bitmask: bit 0=NullCheck, 1=EmptyCheck, 2=ErrorCheck
    pub known_true: u8,
    /// Kinds known to be false, using the same bit layout as `known_true`.
    pub known_false: u8,
}

impl PredicateSummary {
    /// Returns the summary that knows nothing (both masks zero).
    pub fn empty() -> Self {
        PredicateSummary {
            known_true: 0,
            known_false: 0,
        }
    }

    /// Joins two summaries by intersecting each mask, keeping only facts
    /// present in both.
    pub fn join(self, other: Self) -> Self {
        let known_true = self.known_true & other.known_true;
        let known_false = self.known_false & other.known_false;
        PredicateSummary {
            known_true,
            known_false,
        }
    }

    /// Returns `true` when some kind is recorded as both true and false.
    pub fn has_contradiction(self) -> bool {
        let clashing = self.known_true & self.known_false;
        clashing != 0
    }

    /// Returns `true` when neither mask has any bit set.
    pub fn is_empty(self) -> bool {
        (self.known_true | self.known_false) == 0
    }
}
/// Returns the bit index used for `kind` in a `PredicateSummary` mask.
///
/// Only three kinds are whitelisted: `NullCheck` (bit 0), `EmptyCheck`
/// (bit 1) and `ErrorCheck` (bit 2). Every other kind yields `None`.
pub fn predicate_kind_bit(kind: PredicateKind) -> Option<u8> {
    let bit = match kind {
        PredicateKind::NullCheck => 0,
        PredicateKind::EmptyCheck => 1,
        PredicateKind::ErrorCheck => 2,
        // Not tracked in the summary bitmask.
        _ => return None,
    };
    Some(bit)
}
/// The abstract taint state at a program point.
///
/// Uses sorted SmallVec keyed by SymbolId for O(n) merge-join.
/// Variables beyond the interner's capacity are naturally excluded.
///
/// `get`/`set`/`remove` and the predicate accessors locate entries with a
/// binary search, so both vectors must stay sorted by `SymbolId`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TaintState {
/// Per-variable taint, sorted by SymbolId.
pub vars: SmallVec<[(SymbolId, VarTaint); 16]>,
/// Variables validated on ALL paths (intersection on join).
pub validated_must: SmallBitSet,
/// Variables validated on ANY path (union on join).
pub validated_may: SmallBitSet,
/// Per-variable predicate summary (sorted by SymbolId).
///
/// A variable with no entry is treated as having the empty summary
/// (see `get_predicate`).
pub predicates: SmallVec<[(SymbolId, PredicateSummary); 4]>,
}
impl TaintState {
/// Create the initial state (no taint, no validation, no predicates).
pub fn initial() -> Self {
Self {
vars: SmallVec::new(),
validated_must: SmallBitSet::empty(),
validated_may: SmallBitSet::empty(),
predicates: SmallVec::new(),
}
}
/// Look up taint for a variable.
pub fn get(&self, sym: SymbolId) -> Option<&VarTaint> {
self.vars
.binary_search_by_key(&sym, |(id, _)| *id)
.ok()
.map(|idx| &self.vars[idx].1)
}
/// Insert or update taint for a variable.
pub fn set(&mut self, sym: SymbolId, taint: VarTaint) {
match self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
Ok(idx) => self.vars[idx].1 = taint,
Err(idx) => self.vars.insert(idx, (sym, taint)),
}
}
/// Remove taint for a variable.
pub fn remove(&mut self, sym: SymbolId) {
if let Ok(idx) = self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
self.vars.remove(idx);
}
}
/// Set a predicate summary for a variable.
pub fn set_predicate(&mut self, sym: SymbolId, summary: PredicateSummary) {
match self.predicates.binary_search_by_key(&sym, |(id, _)| *id) {
Ok(idx) => self.predicates[idx].1 = summary,
Err(idx) => self.predicates.insert(idx, (sym, summary)),
}
}
/// Get predicate summary for a variable.
pub fn get_predicate(&self, sym: SymbolId) -> PredicateSummary {
self.predicates
.binary_search_by_key(&sym, |(id, _)| *id)
.ok()
.map(|idx| self.predicates[idx].1)
.unwrap_or_else(PredicateSummary::empty)
}
/// Check if any variable has contradictory predicates.
pub fn has_contradiction(&self) -> bool {
self.predicates.iter().any(|(_, s)| s.has_contradiction())
}
}
impl Lattice for TaintState {
fn bot() -> Self {
Self::initial()
}
fn join(&self, other: &Self) -> Self {
// Merge-join vars (sorted by SymbolId)
let vars = merge_join_vars(&self.vars, &other.vars);
// validated_must = intersection (must hold on ALL paths)
let validated_must = self.validated_must.intersection(other.validated_must);
// validated_may = union (holds on ANY path)
let validated_may = self.validated_may.union(other.validated_may);
// predicates = per-key intersection of known_true/known_false bits
let predicates = merge_join_predicates(&self.predicates, &other.predicates);
TaintState {
vars,
validated_must,
validated_may,
predicates,
}
}
fn leq(&self, other: &Self) -> bool {
// Per-key Cap subset + origins subset
if !vars_leq(&self.vars, &other.vars) {
return false;
}
// validated_must: self ⊇ other (superset = less info = lower)
if !self.validated_must.is_superset_of(other.validated_must) {
return false;
}
// validated_may: self ⊆ other
if !self.validated_may.is_subset_of(other.validated_may) {
return false;
}
// predicates: self.known_true ⊇ other.known_true (more precise = lower)
predicates_leq(&self.predicates, &other.predicates)
}
}
/// Merges two var lists that are each sorted by `SymbolId`.
///
/// A key present on only one side is copied through unchanged. For a key
/// present on both sides the caps are OR-ed and the origin lists are merged
/// with `merge_origins` (deduplicated and bounded). The result stays sorted.
fn merge_join_vars(
    a: &[(SymbolId, VarTaint)],
    b: &[(SymbolId, VarTaint)],
) -> SmallVec<[(SymbolId, VarTaint); 16]> {
    use std::cmp::Ordering;

    let mut merged = SmallVec::with_capacity(a.len().max(b.len()));
    let (mut left, mut right) = (a, b);

    // Walk both lists in lockstep while each still has a head element.
    while let (Some((l, l_rest)), Some((r, r_rest))) = (left.split_first(), right.split_first()) {
        match l.0.cmp(&r.0) {
            Ordering::Less => {
                merged.push(l.clone());
                left = l_rest;
            }
            Ordering::Greater => {
                merged.push(r.clone());
                right = r_rest;
            }
            Ordering::Equal => {
                let taint = VarTaint {
                    caps: l.1.caps | r.1.caps,
                    origins: merge_origins(&l.1.origins, &r.1.origins),
                };
                merged.push((l.0, taint));
                left = l_rest;
                right = r_rest;
            }
        }
    }

    // At most one side still has entries; copy them through as-is.
    merged.extend(left.iter().cloned());
    merged.extend(right.iter().cloned());
    merged
}
/// Combines two origin lists into one, bounded at `MAX_ORIGINS_PER_VAR`.
///
/// Starts from a copy of `a`, then appends origins from `b` whose node is not
/// already present. Appending stops as soon as the list has reached the
/// bound, so origins from `a` take priority over those from `b`.
fn merge_origins(
    a: &SmallVec<[TaintOrigin; 2]>,
    b: &SmallVec<[TaintOrigin; 2]>,
) -> SmallVec<[TaintOrigin; 2]> {
    let mut combined = a.clone();
    let mut candidates = b.iter();
    while combined.len() < MAX_ORIGINS_PER_VAR {
        let Some(candidate) = candidates.next() else {
            break;
        };
        let already_present = combined.iter().any(|known| known.node == candidate.node);
        if !already_present {
            combined.push(*candidate);
        }
    }
    combined
}
/// Returns `true` when `a` ⊑ `b` for two var lists sorted by `SymbolId`.
///
/// Every key in `a` must also be in `b`, and for each such key `a`'s caps
/// must be a subset of `b`'s caps and every origin node of `a` must appear
/// among `b`'s origins. Keys that only `b` has are ignored.
#[allow(dead_code)] // called by Lattice::leq
fn vars_leq(a: &[(SymbolId, VarTaint)], b: &[(SymbolId, VarTaint)]) -> bool {
    use std::cmp::Ordering;

    let mut candidates = b.iter();
    for (sym, taint) in a {
        // Advance through `b` until we reach this key; running out of `b`
        // or stepping past the key means `b` does not contain it.
        let counterpart = loop {
            let Some((other_sym, other_taint)) = candidates.next() else {
                return false;
            };
            match sym.cmp(other_sym) {
                Ordering::Greater => continue, // key only in b, skip
                Ordering::Less => return false, // key in a but not b
                Ordering::Equal => break other_taint,
            }
        };

        // Caps subset: masking with b's caps must not remove anything.
        if (taint.caps & counterpart.caps) != taint.caps {
            return false;
        }

        // Origins subset, compared by node only.
        let origins_covered = taint
            .origins
            .iter()
            .all(|mine| counterpart.origins.iter().any(|theirs| theirs.node == mine.node));
        if !origins_covered {
            return false;
        }
    }
    true
}
/// Joins two predicate lists that are each sorted by `SymbolId`.
///
/// Only keys present on both sides can survive: their summaries are joined
/// (bitwise intersection) and kept if the result is non-empty. A key found on
/// one side only is dropped, since intersecting with "nothing known" leaves
/// nothing known.
fn merge_join_predicates(
    a: &[(SymbolId, PredicateSummary)],
    b: &[(SymbolId, PredicateSummary)],
) -> SmallVec<[(SymbolId, PredicateSummary); 4]> {
    use std::cmp::Ordering;

    let mut kept = SmallVec::new();
    let (mut left, mut right) = (a, b);

    while let (Some((l, l_rest)), Some((r, r_rest))) = (left.split_first(), right.split_first()) {
        match l.0.cmp(&r.0) {
            // Key on one side only: skip it.
            Ordering::Less => left = l_rest,
            Ordering::Greater => right = r_rest,
            Ordering::Equal => {
                let joined = l.1.join(r.1);
                if !joined.is_empty() {
                    kept.push((l.0, joined));
                }
                left = l_rest;
                right = r_rest;
            }
        }
    }

    // Whatever remains on either side has no counterpart and is dropped.
    kept
}
/// Returns `true` when `a` ⊑ `b` for two predicate lists sorted by `SymbolId`.
///
/// More precise (more known bits) = lower in the lattice, so `a` ⊑ `b` means
/// that for every key, `a.known_true` ⊇ `b.known_true` and
/// `a.known_false` ⊇ `b.known_false`.
///
/// A key with no entry is treated as the empty summary, matching
/// `TaintState::get_predicate`. So a key that `b` stores with an all-zero
/// summary does not need an entry in `a`. This keeps `leq` consistent with
/// `merge_join_predicates`, which drops empty summaries instead of storing
/// them. Keys that only `a` has are extra information and never violate ⊑.
#[allow(dead_code)] // called by Lattice::leq
fn predicates_leq(a: &[(SymbolId, PredicateSummary)], b: &[(SymbolId, PredicateSummary)]) -> bool {
    let mut i = 0;
    for (sym, required) in b {
        // Skip keys that only `a` has (more info is fine for ⊑).
        while i < a.len() && a[i].0 < *sym {
            i += 1;
        }

        // Bits `a` knows for this key; nothing when `a` has no entry.
        let (have_true, have_false) = if i < a.len() && a[i].0 == *sym {
            (a[i].1.known_true, a[i].1.known_false)
        } else {
            (0, 0)
        };

        // `a` must know every bit that `b` knows.
        if have_true & required.known_true != required.known_true {
            return false;
        }
        if have_false & required.known_false != required.known_false {
            return false;
        }
    }
    true
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `(symbol, taint)` entry carrying `caps` and no origins.
    fn make_taint(sym: u32, caps: Cap) -> (SymbolId, VarTaint) {
        let taint = VarTaint {
            caps,
            origins: SmallVec::new(),
        };
        (SymbolId(sym), taint)
    }

    /// Builds a `(symbol, taint)` entry with a single origin at graph node
    /// `node` and an unknown source kind.
    fn make_taint_with_origin(sym: u32, caps: Cap, node: usize) -> (SymbolId, VarTaint) {
        let origin = TaintOrigin {
            node: NodeIndex::new(node),
            source_kind: SourceKind::Unknown,
        };
        let taint = VarTaint {
            caps,
            origins: smallvec::smallvec![origin],
        };
        (SymbolId(sym), taint)
    }

    /// Builds a state whose only content is the given (already sorted) vars.
    fn state_with_vars(vars: Vec<(SymbolId, VarTaint)>) -> TaintState {
        TaintState {
            vars: SmallVec::from_vec(vars),
            ..TaintState::initial()
        }
    }

    // ── Lattice property tests ──────────────────────────────────────────

    #[test]
    fn bot_identity() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        let bottom = TaintState::bot();
        assert_eq!(a.join(&bottom), a);
        assert_eq!(bottom.join(&a), a);
    }

    #[test]
    fn join_commutativity() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        let b = state_with_vars(vec![make_taint(1, Cap::SHELL_ESCAPE)]);
        let forward = a.join(&b);
        let backward = b.join(&a);
        assert_eq!(forward, backward);
    }

    #[test]
    fn join_associativity() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        let b = state_with_vars(vec![make_taint(0, Cap::SHELL_ESCAPE)]);
        let c = state_with_vars(vec![make_taint(1, Cap::HTML_ESCAPE)]);
        let left_first = a.join(&b).join(&c);
        let right_first = a.join(&b.join(&c));
        assert_eq!(left_first, right_first);
    }

    #[test]
    fn join_idempotency() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
        assert_eq!(a.join(&a), a);
    }

    #[test]
    fn leq_reflexive() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        assert!(a.leq(&a));
    }

    #[test]
    fn leq_consistent_with_join() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        let b = state_with_vars(vec![make_taint(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
        assert!(a.leq(&b));
        assert_eq!(a.join(&b), b);
    }

    #[test]
    fn join_merges_caps() {
        let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
        let b = state_with_vars(vec![make_taint(0, Cap::SHELL_ESCAPE)]);
        let joined = a.join(&b);
        let merged = joined.get(SymbolId(0)).unwrap();
        assert_eq!(merged.caps, Cap::ENV_VAR | Cap::SHELL_ESCAPE);
    }

    #[test]
    fn join_merges_origins() {
        let a = state_with_vars(vec![make_taint_with_origin(0, Cap::ENV_VAR, 1)]);
        let b = state_with_vars(vec![make_taint_with_origin(0, Cap::ENV_VAR, 2)]);
        let joined = a.join(&b);
        let merged = joined.get(SymbolId(0)).unwrap();
        assert_eq!(merged.origins.len(), 2);
    }

    #[test]
    fn validated_must_intersection() {
        let mut a = TaintState::initial();
        a.validated_must.insert(SymbolId(0));
        a.validated_must.insert(SymbolId(1));

        let mut b = TaintState::initial();
        b.validated_must.insert(SymbolId(1));
        b.validated_must.insert(SymbolId(2));

        // Only symbol 1 is validated on both sides.
        let must = a.join(&b).validated_must;
        assert!(!must.contains(SymbolId(0)));
        assert!(must.contains(SymbolId(1)));
        assert!(!must.contains(SymbolId(2)));
    }

    #[test]
    fn validated_may_union() {
        let mut a = TaintState::initial();
        a.validated_may.insert(SymbolId(0));

        let mut b = TaintState::initial();
        b.validated_may.insert(SymbolId(1));

        let may = a.join(&b).validated_may;
        assert!(may.contains(SymbolId(0)));
        assert!(may.contains(SymbolId(1)));
    }

    #[test]
    fn predicate_contradiction() {
        let both_ways = PredicateSummary {
            known_true: 1,  // NullCheck true
            known_false: 1, // NullCheck false
        };
        let mut state = TaintState::initial();
        state.set_predicate(SymbolId(0), both_ways);
        assert!(state.has_contradiction());
    }

    #[test]
    fn predicate_no_contradiction() {
        let different_kinds = PredicateSummary {
            known_true: 1,  // NullCheck true
            known_false: 2, // EmptyCheck false (different kind)
        };
        let mut state = TaintState::initial();
        state.set_predicate(SymbolId(0), different_kinds);
        assert!(!state.has_contradiction());
    }

    #[test]
    fn predicate_join_intersection() {
        let mut a = TaintState::initial();
        a.set_predicate(
            SymbolId(0),
            PredicateSummary {
                known_true: 0b011, // NullCheck + EmptyCheck
                known_false: 0,
            },
        );

        let mut b = TaintState::initial();
        b.set_predicate(
            SymbolId(0),
            PredicateSummary {
                known_true: 0b010, // EmptyCheck only
                known_false: 0,
            },
        );

        let pred = a.join(&b).get_predicate(SymbolId(0));
        assert_eq!(pred.known_true, 0b010); // only EmptyCheck on both paths
    }

    // ── SmallBitSet tests ───────────────────────────────────────────────

    #[test]
    fn small_bitset_basic() {
        let mut bs = SmallBitSet::empty();
        assert!(bs.is_empty());

        bs.insert(SymbolId(0));
        assert!(bs.contains(SymbolId(0)));
        assert!(!bs.contains(SymbolId(1)));
        assert!(!bs.is_empty());
    }

    #[test]
    fn small_bitset_union_intersection() {
        let mut a = SmallBitSet::empty();
        a.insert(SymbolId(0));
        a.insert(SymbolId(2));

        let mut b = SmallBitSet::empty();
        b.insert(SymbolId(1));
        b.insert(SymbolId(2));

        let either = a.union(b);
        assert!(either.contains(SymbolId(0)));
        assert!(either.contains(SymbolId(1)));
        assert!(either.contains(SymbolId(2)));

        let both = a.intersection(b);
        assert!(!both.contains(SymbolId(0)));
        assert!(!both.contains(SymbolId(1)));
        assert!(both.contains(SymbolId(2)));
    }
}

View file

@ -1,11 +1,21 @@
use crate::cfg::{Cfg, FuncSummaries, NodeInfo, StmtKind};
pub mod domain;
pub mod path_state;
pub mod transfer;
use crate::cfg::{Cfg, FuncSummaries};
use crate::interop::InteropEdge;
use crate::labels::{Cap, DataLabel, SourceKind};
use crate::labels::SourceKind;
use crate::state::engine::{self, MAX_TRACKED_VARS};
use crate::state::lattice::Lattice;
use crate::state::symbol::SymbolInterner;
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
use domain::TaintState;
use path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use std::collections::HashMap;
use tracing::debug;
use petgraph::visit::IntoNodeReferences;
use std::collections::HashSet;
use transfer::{TaintEvent, TaintTransfer};
/// A detected taint finding with both source and sink locations.
#[derive(Debug, Clone)]
@ -20,269 +30,23 @@ pub struct Finding {
pub path: Vec<NodeIndex>,
/// The kind of source that originated the taint.
pub source_kind: SourceKind,
}
/// Hashes a taint map so that the result does not depend on iteration order.
///
/// Each entry is hashed on its own (FNV-1a-style mixing of the key bytes,
/// then of the cap bits) and the per-entry hashes are XOR-ed together. XOR is
/// commutative, so no sorting or allocation is needed. An empty map hashes
/// to 0.
fn taint_hash(taint: &HashMap<String, Cap>) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    taint
        .iter()
        .map(|(name, caps)| {
            // Mix in every byte of the variable name.
            let name_hash = name.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
                (acc ^ byte as u64).wrapping_mul(FNV_PRIME)
            });
            // Then mix in the capability bits.
            (name_hash ^ caps.bits() as u64).wrapping_mul(FNV_PRIME)
        })
        .fold(0u64, |combined, entry_hash| combined ^ entry_hash)
}
/// Resolved summary for a callee — a uniform view regardless of whether the
/// summary came from a local (samefile) or global (crossfile) source.
///
/// Built only by `resolve_callee`, which copies the four values from the
/// matching local or global function summary.
struct ResolvedSummary {
/// Caps the callee introduces as a source.
source_caps: Cap,
/// Caps the callee strips as a sanitizer.
sanitizer_caps: Cap,
/// Caps the callee consumes as a sink.
sink_caps: Cap,
/// Whether the callee's summary is flagged as propagating taint.
propagates_taint: bool,
}
/// Try to resolve a callee name using conservative same-language resolution.
///
/// Resolution order:
/// 1. Local (same-file): exact name + same lang + same namespace
/// 2. Global same-language: via `lookup_same_lang`; must be unambiguous
/// 3. Interop edges: explicit cross-language bridges
/// 4. No cross-language fallback
///
/// Returns `None` whenever a step finds more than one candidate it cannot
/// narrow down to exactly one, and when no step produces a match.
///
/// `caller_func` and `call_ordinal` are only consulted when matching interop
/// edges (step 3).
#[allow(clippy::too_many_arguments)]
fn resolve_callee(
callee: &str,
caller_lang: Lang,
caller_namespace: &str,
caller_func: &str,
call_ordinal: u32,
local: &FuncSummaries,
global: Option<&GlobalSummaries>,
interop_edges: &[InteropEdge],
) -> Option<ResolvedSummary> {
// 1) Local (same-file): scan local summaries for matching name + lang + namespace
let local_matches: Vec<_> = local
.iter()
.filter(|(k, _)| {
k.name == callee && k.lang == caller_lang && k.namespace == caller_namespace
})
.collect();
// Exactly one local candidate: use it directly.
if local_matches.len() == 1 {
let (_, ls) = local_matches[0];
return Some(ResolvedSummary {
source_caps: ls.source_caps,
sanitizer_caps: ls.sanitizer_caps,
sink_caps: ls.sink_caps,
propagates_taint: ls.propagates_taint,
});
}
// Multiple local matches — try arity disambiguation (future), for now return None
if local_matches.len() > 1 {
return None;
}
// 2) Global same-language
if let Some(gs) = global {
let matches = gs.lookup_same_lang(caller_lang, callee);
if matches.len() == 1 {
let (_, fs) = matches[0];
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
// Multiple matches — try namespace match first
if matches.len() > 1 {
let same_ns: Vec<_> = matches
.iter()
.filter(|(k, _)| k.namespace == caller_namespace)
.collect();
if same_ns.len() == 1 {
let (_, fs) = same_ns[0];
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
// Still ambiguous — return None (conservative)
return None;
}
}
// 3) Interop edges: explicit cross-language bridges
for edge in interop_edges {
// An empty `caller_func` or an `ordinal` of 0 on the edge acts as a
// wildcard: that part of the call site is not checked.
if edge.from.caller_lang == caller_lang
&& edge.from.caller_namespace == caller_namespace
&& edge.from.callee_symbol == callee
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_func)
&& (edge.from.ordinal == 0 || edge.from.ordinal == call_ordinal)
{
// Look up the target in global summaries by exact FuncKey
// If the target has no global summary, keep scanning later edges.
if let Some(gs) = global
&& let Some(fs) = gs.get(&edge.to)
{
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
}
}
// 4) No cross-language fallback
None
}
/// Apply taint transfer for a single node, mutating `out` in place.
///
/// Callers should clone the taint map before calling if they need
/// the original state preserved.
fn apply_taint(
node: &NodeInfo,
out: &mut HashMap<String, Cap>,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
caller_lang: Lang,
caller_namespace: &str,
interop_edges: &[InteropEdge],
) {
debug!(target: "taint", "Applying taint to node: {:?}", node);
debug!(target: "taint", "Taint: {:?}", out);
let caller_func = node.enclosing_func.as_deref().unwrap_or("");
match node.label {
// A new untrusted value enters the program
Some(DataLabel::Source(bits)) => {
if let Some(v) = &node.defines {
out.insert(v.clone(), bits);
}
}
// Sanitizer: propagate input taint through the assignment FIRST,
// then strip the sanitizer's capability bits. This ensures that
// `let y = sanitize_html(&x)` gives y the taint of x minus the
// HTML_ESCAPE bit — rather than leaving y completely clean (which
// would hide "wrong sanitiser for this sink" bugs).
Some(DataLabel::Sanitizer(bits)) => {
if let Some(v) = &node.defines {
// 1. Propagate: union taint from all read variables
let mut combined = Cap::empty();
for u in &node.uses {
if let Some(b) = out.get(u) {
combined |= *b;
}
}
// 2. Strip the sanitiser's bits
let new = combined & !bits;
if new.is_empty() {
out.remove(v);
} else {
out.insert(v.clone(), new);
}
}
}
// A function call — resolve against local + global summaries
_ if node.kind == StmtKind::Call => {
if let Some(callee) = &node.callee
&& let Some(resolved) = resolve_callee(
callee,
caller_lang,
caller_namespace,
caller_func,
node.call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
{
// Build the return value's taint bits in stages, then
// write once at the end. Order matters:
//
// 1. Start with fresh source taint (if the callee is a source)
// 2. Union with propagated arg taint (if the callee propagates)
// 3. Strip sanitizer bits last (so sanitization always wins)
let mut return_bits = Cap::empty();
// ── 1. Source behaviour ──
return_bits |= resolved.source_caps;
// ── 2. Propagation ──
if resolved.propagates_taint {
for u in &node.uses {
if let Some(bits) = out.get(u) {
return_bits |= *bits;
}
}
}
// ── 3. Sanitizer behaviour (applied last so it always wins) ──
return_bits &= !resolved.sanitizer_caps;
// ── Write the result ──
if let Some(v) = &node.defines {
if return_bits.is_empty() {
out.remove(v);
} else {
out.insert(v.clone(), return_bits);
}
}
// ── Sink behaviour: handled in the main analysis loop
// (checked via node.label or resolved summary) ──
return;
}
// Unresolved call — fall through to default gen/kill below
}
// All other statements: classic gen/kill for assignments
_ => {}
}
// Default gen/kill: propagate taint through variable assignments
if !matches!(
node.label,
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
) && let Some(d) = &node.defines
{
let mut combined = Cap::empty();
for u in &node.uses {
if let Some(bits) = out.get(u) {
combined |= *bits;
}
}
if combined.is_empty() {
out.remove(d);
} else {
out.insert(d.clone(), combined);
}
}
/// Whether all tainted sink variables are guarded by a validation
/// predicate on this path (metadata only — does not change severity).
#[allow(dead_code)] // surfaced in Diag output (task 4)
pub path_validated: bool,
/// The kind of validation guard protecting this path, if any.
#[allow(dead_code)] // surfaced in Diag output (task 4)
pub guard_kind: Option<PredicateKind>,
}
/// Run taint analysis on a single file's CFG.
///
/// `global_summaries` is `None` for pass1 / singlefile mode and
/// `Some(&map)` for pass2 crossfile analysis.
/// Uses a monotone forward dataflow analysis via `state::engine::run_forward`
/// with the `TaintTransfer` function. Termination is guaranteed by lattice
/// finiteness (bounded `Cap` bits × bounded variable count).
///
/// For JS/TS files: uses a two-level solve to prevent cross-function taint
/// leakage while preserving global-to-function flows.
pub fn analyse_file(
cfg: &Cfg,
entry: NodeIndex,
@ -292,162 +56,155 @@ pub fn analyse_file(
caller_namespace: &str,
interop_edges: &[InteropEdge],
) -> Vec<Finding> {
use std::collections::{HashMap, HashSet, VecDeque};
let _span = tracing::debug_span!("taint_analyse_file").entered();
/// Queue item: current CFG node + taint map that holds here
#[derive(Clone)]
struct Item {
node: NodeIndex,
taint: HashMap<String, Cap>,
// 1. Build symbol interner from CFG
let interner = SymbolInterner::from_cfg(cfg);
if interner.len() > MAX_TRACKED_VARS {
tracing::warn!(
symbols = interner.len(),
max = MAX_TRACKED_VARS,
"taint analysis: too many variables, some will be ignored"
);
}
// (node, taint_hash) → predecessor key (for path rebuild)
type Key = (NodeIndex, u64);
let mut pred: HashMap<Key, Key> = HashMap::new();
// 2. Build base transfer function
let base_transfer = TaintTransfer {
lang: caller_lang,
namespace: caller_namespace,
interner: &interner, // also used for events_to_findings below
local_summaries,
global_summaries,
interop_edges,
global_seed: None,
scope_filter: None,
};
// Seen states so we do not revisit them infinitely
let mut seen: HashSet<Key> = HashSet::new();
// 3. Run analysis (two-level for JS/TS, single-pass otherwise)
let events = if matches!(caller_lang, Lang::JavaScript | Lang::TypeScript) {
analyse_js_two_level(cfg, entry, &interner, &base_transfer)
} else {
let result = engine::run_forward(cfg, entry, &base_transfer, TaintState::initial());
result.events
};
// Resulting findings: (sink_node, source_node, full_path)
let mut findings: Vec<Finding> = Vec::new();
// 4. Convert events to findings
let mut findings = events_to_findings(&events, &interner);
let mut q = VecDeque::new();
q.push_back(Item {
node: entry,
taint: HashMap::new(),
});
seen.insert((entry, 0));
// 5. Deduplicate findings by (sink, source), prefer path_validated=true
findings.sort_by_key(|f| (f.sink.index(), f.source.index(), !f.path_validated));
findings.dedup_by_key(|f| (f.sink, f.source));
while let Some(Item { node, taint }) = q.pop_front() {
let caller_func = cfg[node].enclosing_func.as_deref().unwrap_or("");
let mut out = taint.clone();
apply_taint(
&cfg[node],
&mut out,
local_summaries,
global_summaries,
caller_lang,
caller_namespace,
interop_edges,
);
findings
}
// ── Sink check ──────────────────────────────────────────────────
// Two ways a node can be a sink:
// 1. Its AST label says Sink (existing inline labels)
// 2. Its callee resolves to a function with sink_caps (cross-file)
let sink_caps = match cfg[node].label {
Some(DataLabel::Sink(caps)) => caps,
_ => {
// check if callee resolves to a sink
cfg[node]
.callee
.as_ref()
.and_then(|c| {
resolve_callee(
c,
caller_lang,
caller_namespace,
caller_func,
cfg[node].call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
})
.filter(|r| !r.sink_caps.is_empty())
.map(|r| r.sink_caps)
.unwrap_or(Cap::empty())
}
/// JS/TS two-level solve to prevent cross-function taint leakage.
///
/// Level 1: Solve top-level code (nodes where `enclosing_func.is_none()`).
/// Level 2: For each function, solve seeded with top-level taint.
fn analyse_js_two_level(
cfg: &Cfg,
entry: NodeIndex,
_interner: &SymbolInterner,
base_transfer: &TaintTransfer,
) -> Vec<TaintEvent> {
// Level 1: solve top-level only
let toplevel_transfer = TaintTransfer {
lang: base_transfer.lang,
namespace: base_transfer.namespace,
interner: base_transfer.interner,
local_summaries: base_transfer.local_summaries,
global_summaries: base_transfer.global_summaries,
interop_edges: base_transfer.interop_edges,
global_seed: None,
scope_filter: Some(None), // top-level only (enclosing_func == None)
};
let toplevel_result =
engine::run_forward(cfg, entry, &toplevel_transfer, TaintState::initial());
// Extract top-level taint state at the last converged point
let toplevel_state = extract_exit_state(&toplevel_result.states);
// Level 2: solve each function seeded with top-level state
let mut all_events = toplevel_result.events;
let func_entries = find_function_entries(cfg);
for (func_name, func_entry) in &func_entries {
let func_transfer = TaintTransfer {
lang: base_transfer.lang,
namespace: base_transfer.namespace,
interner: base_transfer.interner,
local_summaries: base_transfer.local_summaries,
global_summaries: base_transfer.global_summaries,
interop_edges: base_transfer.interop_edges,
global_seed: Some(&toplevel_state),
scope_filter: Some(Some(func_name.as_str())),
};
if !sink_caps.is_empty() {
let bad = cfg[node]
.uses
.iter()
.any(|u| out.get(u).is_some_and(|b| (*b & sink_caps) != Cap::empty()));
if bad {
// Reconstruct path backwards from sink to source.
//
// A node is considered a "source" if:
// 1. It has an inline DataLabel::Source (same-file), OR
// 2. It is a Call whose callee resolves to a source via
// local or global summaries (cross-file).
let sink_node = node;
let mut path = vec![node];
let mut source_node = node; // fallback: sink itself
let mut key = (node, taint_hash(&taint));
let func_result =
engine::run_forward(cfg, *func_entry, &func_transfer, TaintState::initial());
all_events.extend(func_result.events);
}
while let Some(&(prev, prev_hash)) = pred.get(&key) {
path.push(prev);
all_events
}
// Check inline source label
if matches!(cfg[prev].label, Some(DataLabel::Source(_))) {
source_node = prev;
break;
}
/// Joins every converged per-node state into a single `TaintState`.
///
/// The states are folded in ascending `NodeIndex` order rather than in
/// `HashMap` iteration order. `TaintState::join` is order-sensitive where it
/// merges origin lists (earlier origins are kept first and the list is
/// truncated at a fixed bound), so folding in hash order could produce a
/// different result from one run to the next. Sorting the keys first makes
/// the result deterministic.
///
/// Returns `TaintState::initial()` when `states` is empty.
fn extract_exit_state(states: &std::collections::HashMap<NodeIndex, TaintState>) -> TaintState {
    let mut ordered: Vec<(&NodeIndex, &TaintState)> = states.iter().collect();
    // Keys are unique, so an unstable sort is enough for a total order.
    ordered.sort_unstable_by_key(|&(node, _)| *node);
    ordered
        .into_iter()
        .fold(TaintState::initial(), |joined, (_, state)| joined.join(state))
}
// Check cross-file source via resolved callee summary
let prev_caller_func = cfg[prev].enclosing_func.as_deref().unwrap_or("");
if cfg[prev].kind == StmtKind::Call
&& let Some(callee) = &cfg[prev].callee
&& let Some(resolved) = resolve_callee(
callee,
caller_lang,
caller_namespace,
prev_caller_func,
cfg[prev].call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
&& !resolved.source_caps.is_empty()
{
source_node = prev;
break;
}
/// Find function entry nodes: (func_name, entry_node) pairs.
///
/// A function entry is the first node with a given `enclosing_func` value.
fn find_function_entries(cfg: &Cfg) -> Vec<(String, NodeIndex)> {
let mut seen = HashSet::new();
let mut entries = Vec::new();
key = (prev, prev_hash);
}
path.reverse();
// Infer the source kind from the source node's label and callee
let source_kind = match cfg[source_node].label {
Some(DataLabel::Source(caps)) => {
let callee = cfg[source_node].callee.as_deref().unwrap_or("");
crate::labels::infer_source_kind(caps, callee)
}
_ => SourceKind::Unknown,
};
findings.push(Finding {
sink: sink_node,
source: source_node,
path,
source_kind,
});
}
for (idx, info) in cfg.node_references() {
if let Some(ref func_name) = info.enclosing_func
&& seen.insert(func_name.clone())
{
entries.push((func_name.clone(), idx));
}
}
// enqueue successors — cache hashes to avoid recomputation
let out_h = taint_hash(&out);
let in_h = taint_hash(&taint);
let succs: Vec<_> = cfg.neighbors(node).collect();
for (i, succ) in succs.iter().enumerate() {
let key = (*succ, out_h);
if !seen.contains(&key) {
seen.insert(key);
pred.insert(key, (node, in_h));
// Move the map into the last successor to avoid a clone
let taint_for_succ = if i + 1 == succs.len() {
std::mem::take(&mut out)
} else {
out.clone()
};
q.push_back(Item {
node: *succ,
taint: taint_for_succ,
});
entries
}
/// Convert TaintEvents into Findings.
fn events_to_findings(events: &[TaintEvent], _interner: &SymbolInterner) -> Vec<Finding> {
let mut findings = Vec::new();
for event in events {
let TaintEvent::SinkReached {
sink_node,
tainted_vars,
all_validated,
guard_kind,
..
} = event;
// Collect unique origins across all tainted vars at this sink
let mut seen_origins: HashSet<(usize, usize)> = HashSet::new();
for (_sym, _caps, origins) in tainted_vars {
for origin in origins {
if seen_origins.insert((origin.node.index(), sink_node.index())) {
findings.push(Finding {
sink: *sink_node,
source: origin.node,
path: vec![origin.node, *sink_node],
source_kind: origin.source_kind,
path_validated: *all_validated,
guard_kind: *guard_kind,
});
}
}
}
}

234
src/taint/path_state.rs Normal file
View file

@ -0,0 +1,234 @@
// ─── PredicateKind ───────────────────────────────────────────────────────────
/// Classification of what an if-condition tests.
///
/// Determined by heuristic analysis of the raw condition text (see
/// [`classify_condition`]). Classification is conservative: prefer
/// [`Unknown`](PredicateKind::Unknown) over a wrong guess.
///
/// The kind describes *what* is being tested, not the outcome: both the
/// positive and the negated spelling of a test (e.g. `x == null` and
/// `x != null`) map to the same variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PredicateKind {
    /// `x.is_none()`, `x.is_some()`, `x == null`, `x == nil`, `x is None`
    NullCheck,
    /// `x.is_empty()`, `x.len() == 0`, `x.length == 0`, `x == ""`
    EmptyCheck,
    /// `x.is_err()`, `x.is_ok()`, `err != nil`
    ErrorCheck,
    /// Call to a validation/guard function: `validate(x)`, `is_safe(x)`
    ValidationCall,
    /// Call to a sanitizer function: `sanitize(x)`, `escape(x)`
    SanitizerCall,
    /// Comparison operators: `x == 5`, `x > threshold`
    Comparison,
    /// Generic boolean test — cannot classify further.
    Unknown,
}
/// Classify a raw condition text into a [`PredicateKind`].
///
/// Matching is case-insensitive (ASCII) and purely textual. The checks run in
/// a fixed order and the first hit wins:
///
/// 1. error checks (before null checks, so `err != nil` is an error check
///    even though it contains `!= nil`),
/// 2. null checks,
/// 3. empty checks,
/// 4. call-based kinds (validation, then sanitizer),
/// 5. comparison operators.
///
/// # Rules
///
/// - Empty text → [`Unknown`](PredicateKind::Unknown).
/// - `ValidationCall` / `SanitizerCall` require a `(` in the text **and** a
///   matching callee token. The callee token is the identifier immediately
///   preceding the first `(`; prefix operators, receivers and earlier
///   operands (`!`, `not `, `obj.`, `ns::`, `a && `) are not part of it.
///   This avoids misclassifying comparisons like `x_valid == true` or
///   conditions such as `x_valid && foo(y)`, and lets negated guards such as
///   `!is_safe(x)` be recognised.
/// - Prefers [`Unknown`](PredicateKind::Unknown) over false positives.
///
/// The returned kind does not encode polarity: `x == null` and `x != null`
/// are both `NullCheck`.
pub fn classify_condition(text: &str) -> PredicateKind {
    const ERROR_MARKERS: [&str; 6] = [
        "is_err",
        "is_ok",
        "err != nil",
        "err == nil",
        "error != nil",
        "error == nil",
    ];
    const NULL_MARKERS: [&str; 12] = [
        "is_none",
        "is_some",
        "== none",
        "!= none",
        "is none",
        "is not none",
        "== null",
        "!= null",
        "=== null",
        "!== null",
        "== nil",
        "!= nil",
    ];
    const EMPTY_MARKERS: [&str; 9] = [
        "is_empty",
        ".len() == 0",
        ".len() != 0",
        ".length == 0",
        ".length === 0",
        ".length != 0",
        ".length !== 0",
        "== \"\"",
        "== ''",
    ];
    const COMPARISON_MARKERS: [&str; 6] = ["==", "!=", ">=", "<=", " > ", " < "];

    if text.is_empty() {
        return PredicateKind::Unknown;
    }

    let lower = text.to_ascii_lowercase();
    let contains_any = |markers: &[&str]| markers.iter().any(|marker| lower.contains(*marker));

    // Error checks must come first: `err != nil` also matches a null marker.
    if contains_any(&ERROR_MARKERS) {
        return PredicateKind::ErrorCheck;
    }
    if contains_any(&NULL_MARKERS) {
        return PredicateKind::NullCheck;
    }
    if contains_any(&EMPTY_MARKERS) {
        return PredicateKind::EmptyCheck;
    }

    // Call-based kinds: only the first call in the condition is inspected.
    if let Some((before_paren, _)) = lower.split_once('(') {
        // The callee is the trailing identifier run directly before `(`.
        // Splitting on every non-identifier character drops receivers
        // (`obj.`), paths (`ns::`), negation (`!`, `not `) and any earlier
        // operands, so only the called name itself is matched.
        let callee = before_paren
            .trim_end()
            .rsplit(|c: char| !(c.is_alphanumeric() || c == '_'))
            .next()
            .unwrap_or("");

        let is_validation = callee.contains("valid")
            || callee.contains("check")
            || callee.contains("verify")
            || callee.starts_with("is_safe")
            || callee.starts_with("is_authorized")
            || callee.starts_with("is_authenticated");
        if is_validation {
            return PredicateKind::ValidationCall;
        }

        let is_sanitizer =
            callee.contains("sanitiz") || callee.contains("escape") || callee.contains("encode");
        if is_sanitizer {
            return PredicateKind::SanitizerCall;
        }
    }

    if contains_any(&COMPARISON_MARKERS) {
        return PredicateKind::Comparison;
    }

    PredicateKind::Unknown
}
// ─── Tests ───────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every condition in `conditions` classifies as `expected`.
    fn expect_kind(expected: PredicateKind, conditions: &[&str]) {
        for &condition in conditions {
            assert_eq!(
                classify_condition(condition),
                expected,
                "condition: {condition:?}"
            );
        }
    }

    // ── classify_condition ────────────────────────────────────────────────

    #[test]
    fn classify_empty_is_unknown() {
        expect_kind(PredicateKind::Unknown, &[""]);
    }

    #[test]
    fn classify_null_checks() {
        expect_kind(
            PredicateKind::NullCheck,
            &["x.is_none()", "x == null", "x != nil", "x is None", "x === null"],
        );
    }

    #[test]
    fn classify_error_checks() {
        expect_kind(
            PredicateKind::ErrorCheck,
            &["x.is_err()", "err != nil", "x.is_ok()"],
        );
    }

    #[test]
    fn classify_empty_checks() {
        expect_kind(
            PredicateKind::EmptyCheck,
            &["x.is_empty()", "x.len() == 0", "x.length === 0"],
        );
    }

    #[test]
    fn classify_validation_call() {
        expect_kind(
            PredicateKind::ValidationCall,
            &[
                "validate(x)",
                "is_safe(input)",
                "check_auth(req)",
                "input.verify(sig)",
            ],
        );
    }

    #[test]
    fn classify_validation_requires_paren() {
        // Without call syntax (`(`) a "valid"-looking name must not be
        // treated as a validation call.
        expect_kind(PredicateKind::Comparison, &["x_valid == true"]);
        expect_kind(PredicateKind::Unknown, &["is_valid && ready"]);
    }

    #[test]
    fn classify_sanitizer_call() {
        expect_kind(
            PredicateKind::SanitizerCall,
            &["sanitize(x)", "html_escape(s)", "url_encode(path)"],
        );
    }

    #[test]
    fn classify_comparison() {
        expect_kind(PredicateKind::Comparison, &["x == 5", "x != y", "a >= b"]);
    }

    #[test]
    fn classify_unknown_fallback() {
        expect_kind(PredicateKind::Unknown, &["flag", "a && b"]);
    }
}

View file

@ -1,6 +1,7 @@
use super::*;
use crate::cfg::FuncSummaries;
use crate::interop::InteropEdge;
use crate::labels::Cap;
use crate::symbol::FuncKey;
#[test]
@ -52,8 +53,10 @@ fn taint_through_if_else() {
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
// exactly one path (via the True branch) should be flagged
assert_eq!(findings.len(), 1);
// Both branches have findings: the true branch uses unsanitized `x`,
// the else branch uses `safe` which was sanitized with HTML_ESCAPE
// but the sink requires SHELL_ESCAPE (wrong sanitizer → still tainted).
assert_eq!(findings.len(), 2);
}
#[test]
@ -2218,3 +2221,318 @@ fn return_call_recognized_as_source() {
"foo() should have source_caps set because env::var is called inside return"
);
}
// ─── Path-sensitive analysis tests ───────────────────────────────────────────
#[test]
fn validate_and_early_return() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Guard-then-use: a failed validation returns early, so the sink that
    // follows the guard is only reachable on the "validation passed" path.
    //
    // The CFG is expected to give the If node an explicit False edge into a
    // synthetic pass-through node, so the sink is reached via
    // cond → (False) → pass-through → (Seq) → sink. Taking the False edge of
    // the negated condition `!validate(&x)` means validation succeeded, which
    // is what marks the sink as path-guarded.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if !validate(&x) { return; }
Command::new("sh").arg(x).status().unwrap();
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // `validate` does not remove taint, so the flow is still reported; it is
    // only annotated as validated, with the guard classified as a
    // ValidationCall.
    assert_eq!(reported.len(), 1, "should still detect the taint flow");
    let finding = &reported[0];
    assert!(
        finding.path_validated,
        "finding should be marked as path_validated (early-return guard detected)"
    );
    assert_eq!(
        finding.guard_kind,
        Some(PredicateKind::ValidationCall),
        "guard_kind should be ValidationCall"
    );
}
#[test]
fn validate_in_if_else_path_validated() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // The sink sits in the True branch of `if validate(&x)`, i.e. on the
    // branch where validation passed; the else branch has no sink.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if validate(&x) {
Command::new("sh").arg(&x).status().unwrap();
} else {
println!("invalid input");
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    assert_eq!(reported.len(), 1, "should detect the taint flow");
    let finding = &reported[0];
    assert!(
        finding.path_validated,
        "finding should be path_validated (sink in validated branch)"
    );
    assert_eq!(
        finding.guard_kind,
        Some(PredicateKind::ValidationCall),
        "guard_kind should be ValidationCall"
    );
}
#[test]
fn sink_on_failed_validation_branch() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // The sink is inside `if !validate(&x) { ... }`: it runs only when
    // validation FAILED, so the finding must not be treated as validated.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if !validate(&x) {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    assert_eq!(reported.len(), 1, "should detect taint flow to sink");
    let finding = &reported[0];
    assert!(
        !finding.path_validated,
        "finding should NOT be path_validated (sink is in failed-validation branch)"
    );
}
#[test]
fn contradictory_null_check_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // The first `if x.is_none() { return; }` means the fall-through path
    // carries "NullCheck was false" for `x`. Entering the True branch of the
    // second `if x.is_none()` would require "NullCheck is true" for the same
    // variable — a contradiction, so that branch is infeasible.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").ok();
if x.is_none() { return; }
if x.is_none() {
Command::new("sh").arg("dangerous").status().unwrap();
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // Besides the branch being infeasible, the sink argument "dangerous" is
    // an untainted string literal, so no finding is expected either way.
    assert!(
        reported.is_empty(),
        "inner branch is infeasible — should produce no findings (got {})",
        reported.len()
    );
}
#[test]
fn sanitize_one_branch_no_regression() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Same program shape as `taint_through_if_else`: the True branch passes
    // the raw `x` to the sink, the else branch passes `safe`, which was
    // produced by an HTML escaper. Checks that path sensitivity leaves the
    // finding count for this shape at two.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("DANGEROUS").unwrap();
let safe = html_escape::encode_safe(&x);
if x.len() > 5 {
Command::new("sh").arg(&x).status().unwrap(); // UNSAFE
} else {
Command::new("sh").arg(&safe).status().unwrap(); // SAFE
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // Both branches are reported: the True branch uses unsanitized `x`, and
    // the else branch uses `safe`, whose HTML_ESCAPE sanitizer does not
    // satisfy the SHELL_ESCAPE requirement of the sink. (An earlier version
    // reported only one finding because the else clause was silently dropped
    // from the CFG.)
    assert_eq!(
        reported.len(),
        2,
        "two findings expected (both branches reach sink with wrong/no sanitizer)"
    );
}
#[test]
fn path_state_budget_graceful() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Nine nested `if`s with the sink at the innermost level. The path
    // predicate tracking is expected to truncate gracefully once it exceeds
    // its budget (MAX_PATH_PREDICATES) rather than lose the flow.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if x.len() > 1 {
if x.len() > 2 {
if x.len() > 3 {
if x.len() > 4 {
if x.len() > 5 {
if x.len() > 6 {
if x.len() > 7 {
if x.len() > 8 {
if x.len() > 9 {
Command::new("sh").arg(&x).status().unwrap();
}
}
}
}
}
}
}
}
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // Truncating the tracked predicates must not turn into a false negative.
    assert_eq!(
        reported.len(),
        1,
        "should detect taint flow even with truncated PathState"
    );
}
#[test]
fn unknown_predicate_not_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // `x.len() > 5` is a Comparison predicate. Comparison is not one of the
    // kinds eligible for contradiction pruning, so repeating the same
    // comparison after an early return must not make the second branch
    // infeasible.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if x.len() > 5 { return; }
if x.len() > 5 {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // The path survives, so the flow into the sink is reported.
    assert_eq!(
        reported.len(),
        1,
        "Comparison predicate should not cause contradiction pruning"
    );
}
#[test]
fn multi_var_predicate_not_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Conditions that reference more than one variable should never be
    // pruned for contradiction, even when their kind is eligible for pruning.
    let source = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
let y = env::var("OTHER").ok();
if y.is_none() { return; }
if y.is_none() {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust_grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut ts_parser = tree_sitter::Parser::new();
    ts_parser.set_language(&rust_grammar).unwrap();
    let syntax_tree = ts_parser.parse(source as &[u8], None).unwrap();

    let (graph, entry_node, func_summaries) =
        build_cfg(&syntax_tree, source, "rust", "test.rs", None);
    let reported = analyse_file(
        &graph,
        entry_node,
        &func_summaries,
        None,
        Lang::Rust,
        "test.rs",
        &[],
    );

    // NOTE(review): this relies on `y.is_none()` being treated as a
    // multi-variable condition, which presumably happens because the
    // identifier collection yields both `y` and the method name `is_none`.
    // If only `y` were extracted, the predicate would be single-variable and
    // the second branch could be pruned — confirm against the CFG builder.
    assert!(
        !reported.is_empty(),
        "multi-var predicate should not be pruned; flow should be detected"
    );
}

458
src/taint/transfer.rs Normal file
View file

@ -0,0 +1,458 @@
use crate::callgraph::normalize_callee_name;
use crate::cfg::{EdgeKind, FuncSummaries, NodeInfo, StmtKind};
use crate::interop::InteropEdge;
use crate::labels::{Cap, DataLabel};
use crate::state::engine::Transfer;
use crate::state::lattice::Lattice;
use crate::state::symbol::{SymbolId, SymbolInterner};
use crate::summary::{CalleeResolution, GlobalSummaries};
use crate::symbol::Lang;
use crate::taint::domain::{TaintOrigin, TaintState, VarTaint, predicate_kind_bit};
use crate::taint::path_state::{PredicateKind, classify_condition};
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
/// Events emitted by the taint transfer function during Phase 2.
///
/// `TaintTransfer::apply` pushes one event per sink node at which at least
/// one used variable carries taint relevant to the sink.
#[derive(Clone, Debug)]
pub enum TaintEvent {
    /// A tainted value reached a sink.
    SinkReached {
        /// The CFG node that acts as the sink.
        sink_node: NodeIndex,
        /// One entry per used variable whose caps intersect the sink's caps:
        /// the variable's symbol, its full cap set, and its recorded origins.
        tainted_vars: Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)>,
        /// The caps the sink is sensitive to. Recorded with the event but
        /// not read by the code in this file, hence the lint allowance.
        #[allow(dead_code)]
        sink_caps: Cap,
        /// `true` when every variable in `tainted_vars` is in the state's
        /// `validated_may` set at the sink.
        all_validated: bool,
        /// `Some(PredicateKind::ValidationCall)` when `all_validated` is
        /// true, otherwise `None`.
        guard_kind: Option<PredicateKind>,
    },
}
/// Taint transfer function for forward dataflow analysis.
///
/// Holds the read-only context needed to evaluate a single CFG node: the
/// identity of the file being analysed (`lang`, `namespace`), the symbol
/// interner used to map variable names to [`SymbolId`]s, and the summary
/// tables consulted when resolving callees.
pub struct TaintTransfer<'a> {
    /// Language of the file being analysed; callee resolution only considers
    /// local summaries and interop edges with this language.
    pub lang: Lang,
    /// Namespace of the file being analysed; used together with `lang` when
    /// matching local summaries and interop edges.
    pub namespace: &'a str,
    /// Maps variable names to symbols. Names the interner does not know are
    /// ignored by the transfer function.
    pub interner: &'a SymbolInterner,
    /// Same-file function summaries (first resolution step).
    pub local_summaries: &'a FuncSummaries,
    /// Cross-file summaries (second resolution step, and the target table
    /// for interop edges). `None` disables both.
    pub global_summaries: Option<&'a GlobalSummaries>,
    /// Explicit cross-language call edges (last resolution step).
    pub interop_edges: &'a [InteropEdge],
    /// For JS two-level solve: top-level taint state seeded into function solves.
    /// Consulted only when a variable has no entry in the current state.
    pub global_seed: Option<&'a TaintState>,
    /// Optional scope filter: if set, only process nodes whose enclosing_func matches.
    /// None = process all nodes. Some(None) = top-level only. Some(Some(name)) = function only.
    pub scope_filter: Option<Option<&'a str>>,
}
impl Transfer<TaintState> for TaintTransfer<'_> {
type Event = TaintEvent;
fn apply(
&self,
node: NodeIndex,
info: &NodeInfo,
edge: Option<EdgeKind>,
mut state: TaintState,
) -> (TaintState, Vec<TaintEvent>) {
let mut events = Vec::new();
// Scope filter: skip nodes outside our scope (return state unchanged)
if let Some(ref filter) = self.scope_filter {
let node_func = info.enclosing_func.as_deref();
if node_func != *filter {
return (state, events);
}
}
let caller_func = info.enclosing_func.as_deref().unwrap_or("");
// ── Apply taint transfer ────────────────────────────────────────
match info.label {
Some(DataLabel::Source(bits)) => {
self.apply_source(node, info, bits, &mut state);
}
Some(DataLabel::Sanitizer(bits)) => {
self.apply_sanitizer(info, bits, &mut state);
}
_ if info.kind == StmtKind::Call => {
self.apply_call(node, info, caller_func, &mut state);
}
_ => {
self.apply_assignment(info, &mut state);
}
}
// ── If-node predicate handling (edge-aware) ─────────────────────
if info.kind == StmtKind::If
&& !info.condition_vars.is_empty()
&& matches!(edge, Some(EdgeKind::True) | Some(EdgeKind::False))
{
let cond_text = info.condition_text.as_deref().unwrap_or("");
let kind = classify_condition(cond_text);
let polarity = matches!(edge, Some(EdgeKind::True)) ^ info.condition_negated;
// ValidationCall handling
if kind == PredicateKind::ValidationCall && polarity {
for var in &info.condition_vars {
if let Some(sym) = self.interner.get(var) {
state.validated_may.insert(sym);
state.validated_must.insert(sym);
}
}
}
// Predicate summary for whitelisted kinds (contradiction pruning)
if let Some(bit_idx) = predicate_kind_bit(kind) {
for var in &info.condition_vars {
if let Some(sym) = self.interner.get(var) {
let mut summary = state.get_predicate(sym);
if polarity {
summary.known_true |= 1 << bit_idx;
} else {
summary.known_false |= 1 << bit_idx;
}
state.set_predicate(sym, summary);
}
}
}
// Contradiction pruning: if any variable has contradictory predicates,
// this is an infeasible path → return bot (monotonically kills branch).
if state.has_contradiction() {
return (TaintState::bot(), events);
}
}
// ── Sink check ──────────────────────────────────────────────────
let sink_caps = self.resolve_sink_caps(info, caller_func);
if !sink_caps.is_empty() {
let tainted_vars = self.collect_tainted_sink_vars(info, &state, sink_caps);
if !tainted_vars.is_empty() {
let all_validated = tainted_vars
.iter()
.all(|(sym, _, _)| state.validated_may.contains(*sym));
let guard_kind = if all_validated {
Some(PredicateKind::ValidationCall)
} else {
None
};
events.push(TaintEvent::SinkReached {
sink_node: node,
tainted_vars,
sink_caps,
all_validated,
guard_kind,
});
}
}
(state, events)
}
fn iteration_budget(&self) -> usize {
100_000
}
fn on_budget_exceeded(&self) -> bool {
tracing::warn!("taint analysis: worklist budget exceeded, returning partial results");
false
}
}
impl TaintTransfer<'_> {
/// Apply a Source label: insert taint for the defined variable.
fn apply_source(&self, node: NodeIndex, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
let callee = info.callee.as_deref().unwrap_or("");
let source_kind = crate::labels::infer_source_kind(bits, callee);
let origin = TaintOrigin { node, source_kind };
match state.get(sym) {
Some(existing) => {
let mut new_taint = existing.clone();
new_taint.caps |= bits;
if new_taint.origins.len() < 4
&& !new_taint.origins.iter().any(|o| o.node == node)
{
new_taint.origins.push(origin);
}
state.set(sym, new_taint);
}
None => {
state.set(
sym,
VarTaint {
caps: bits,
origins: SmallVec::from_elem(origin, 1),
},
);
}
}
}
}
/// Apply a Sanitizer label: propagate input taint, then strip sanitizer bits.
fn apply_sanitizer(&self, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
let new_caps = combined_caps & !bits;
if new_caps.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: new_caps,
origins: combined_origins,
},
);
}
}
}
/// Apply a function call: resolve callee and compute return taint.
fn apply_call(
&self,
node: NodeIndex,
info: &NodeInfo,
caller_func: &str,
state: &mut TaintState,
) {
if let Some(ref callee) = info.callee
&& let Some(resolved) = self.resolve_callee(callee, caller_func, info.call_ordinal)
{
let mut return_bits = Cap::empty();
let mut return_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
// 1. Source behaviour
if !resolved.source_caps.is_empty() {
return_bits |= resolved.source_caps;
let callee_str = info.callee.as_deref().unwrap_or("");
let source_kind =
crate::labels::infer_source_kind(resolved.source_caps, callee_str);
let origin = TaintOrigin { node, source_kind };
if !return_origins.iter().any(|o| o.node == node) {
return_origins.push(origin);
}
}
// 2. Propagation
if resolved.propagates_taint {
let (use_caps, use_origins) = self.collect_uses_taint(info, state);
return_bits |= use_caps;
for orig in &use_origins {
if return_origins.len() < 4
&& !return_origins.iter().any(|o| o.node == orig.node)
{
return_origins.push(*orig);
}
}
}
// 3. Sanitizer behaviour (applied last so it always wins)
return_bits &= !resolved.sanitizer_caps;
// Write result
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
if return_bits.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: return_bits,
origins: return_origins,
},
);
}
}
return;
}
// Unresolved call — fall through to default gen/kill
self.apply_assignment(info, state);
}
/// Default gen/kill: propagate taint through variable assignments.
fn apply_assignment(&self, info: &NodeInfo, state: &mut TaintState) {
if matches!(
info.label,
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
) {
return;
}
if let Some(ref d) = info.defines
&& let Some(sym) = self.interner.get(d)
{
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
if combined_caps.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: combined_caps,
origins: combined_origins,
},
);
}
}
}
/// Collect taint from all `uses` variables (union of caps + merge origins).
fn collect_uses_taint(
&self,
info: &NodeInfo,
state: &TaintState,
) -> (Cap, SmallVec<[TaintOrigin; 2]>) {
let mut combined_caps = Cap::empty();
let mut combined_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
for u in &info.uses {
let taint = self.lookup_var(u, state);
if let Some(t) = taint {
combined_caps |= t.caps;
for orig in &t.origins {
if combined_origins.len() < 4
&& !combined_origins.iter().any(|o| o.node == orig.node)
{
combined_origins.push(*orig);
}
}
}
}
(combined_caps, combined_origins)
}
/// Look up a variable's taint, falling back to global_seed for JS two-level solve.
fn lookup_var<'a>(&'a self, name: &str, state: &'a TaintState) -> Option<&'a VarTaint> {
if let Some(sym) = self.interner.get(name) {
if let Some(taint) = state.get(sym) {
return Some(taint);
}
// Fall back to global seed (JS two-level solve)
if let Some(seed) = self.global_seed {
return seed.get(sym);
}
}
None
}
/// Resolve sink caps from label or callee summary.
fn resolve_sink_caps(&self, info: &NodeInfo, caller_func: &str) -> Cap {
match info.label {
Some(DataLabel::Sink(caps)) => caps,
_ => info
.callee
.as_ref()
.and_then(|c| self.resolve_callee(c, caller_func, info.call_ordinal))
.filter(|r| !r.sink_caps.is_empty())
.map(|r| r.sink_caps)
.unwrap_or(Cap::empty()),
}
}
/// Collect tainted variables at a sink node.
fn collect_tainted_sink_vars(
&self,
info: &NodeInfo,
state: &TaintState,
sink_caps: Cap,
) -> Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)> {
let mut result = Vec::new();
for u in &info.uses {
if let Some(taint) = self.lookup_var(u, state)
&& (taint.caps & sink_caps) != Cap::empty()
&& let Some(sym) = self.interner.get(u)
{
result.push((sym, taint.caps, taint.origins.clone()));
}
}
result
}
/// Resolve a callee name to its summary (local → global → interop).
fn resolve_callee(
&self,
callee: &str,
caller_func: &str,
call_ordinal: u32,
) -> Option<ResolvedSummary> {
let normalized = normalize_callee_name(callee);
// 1) Local (same-file)
let local_matches: Vec<_> = self
.local_summaries
.iter()
.filter(|(k, _)| {
k.name == normalized && k.lang == self.lang && k.namespace == self.namespace
})
.collect();
if local_matches.len() == 1 {
let (_, ls) = local_matches[0];
return Some(ResolvedSummary {
source_caps: ls.source_caps,
sanitizer_caps: ls.sanitizer_caps,
sink_caps: ls.sink_caps,
propagates_taint: ls.propagates_taint,
});
}
if local_matches.len() > 1 {
return None;
}
// 2) Global same-language
if let Some(gs) = self.global_summaries {
match gs.resolve_callee_key(normalized, self.lang, self.namespace, None) {
CalleeResolution::Resolved(target_key) => {
if let Some(fs) = gs.get(&target_key) {
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
}
CalleeResolution::NotFound | CalleeResolution::Ambiguous(_) => {}
}
}
// 3) Interop edges
for edge in self.interop_edges {
if edge.from.caller_lang == self.lang
&& edge.from.caller_namespace == self.namespace
&& edge.from.callee_symbol == callee
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_func)
&& (edge.from.ordinal == 0 || edge.from.ordinal == call_ordinal)
&& let Some(gs) = self.global_summaries
&& let Some(fs) = gs.get(&edge.to)
{
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
}
None
}
}
/// Resolved summary for a callee.
///
/// A uniform view over local and global function summaries, produced by
/// `TaintTransfer::resolve_callee`.
struct ResolvedSummary {
    /// Caps the callee introduces itself; added to the call's return taint.
    source_caps: Cap,
    /// Caps the callee strips from the call's return taint.
    sanitizer_caps: Cap,
    /// Caps the callee is sensitive to; non-empty makes the call a sink.
    sink_caps: Cap,
    /// Whether taint on the call's used variables flows into its result.
    propagates_taint: bool,
}