nyx/src/cfg_analysis/mod.rs
Eli Peter 58f1794a4e
Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)
* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
2026-05-01 10:59:52 -04:00

293 lines
10 KiB
Rust

#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))]
pub mod auth;
pub mod dominators;
pub mod error_handling;
pub mod guards;
pub mod resources;
pub mod rules;
pub mod scoring;
#[cfg(test)]
mod tests;
pub mod unreachable;
use crate::cfg::{FuncSummaries, NodeInfo, StmtKind};
use crate::labels::{DataLabel, LangAnalysisRules};
use crate::patterns::Severity;
use crate::ssa::const_prop::ConstLattice;
use crate::ssa::type_facts::TypeFactResult;
use crate::ssa::{SsaBody, SsaValue};
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
use crate::taint;
use petgraph::graph::NodeIndex;
use std::collections::{HashMap, HashSet};
/// Per-body SSA facts used by structural analyses for finer-grained
/// constancy checks. Produced once per body in `run_cfg_analyses` and
/// passed via `AnalysisContext::body_const_facts`.
///
/// [`build_body_const_facts`] fills every field from a single SSA lowering
/// followed by one optimisation pass over that SSA.
pub struct BodyConstFacts {
/// The body in SSA form, after the optimisation pass has been run on it.
pub ssa: SsaBody,
/// Constant-propagation result: the lattice value recorded for each SSA
/// value, taken from the optimisation pass output.
pub const_values: HashMap<SsaValue, ConstLattice>,
/// Type facts taken from the same optimisation pass output as
/// `const_values`.
pub type_facts: TypeFactResult,
/// Field-sensitive Steensgaard points-to facts.
///
/// Computed only when [`crate::pointer::is_enabled()`] returns `true`;
/// otherwise this is `None`.
/// `state::transfer.rs` consumes this to suppress proxy-acquire
/// mis-attribution on field-aliased locals like `m := c.mu`. When
/// `None`, consumers fall back to pointer-unaware behaviour.
pub pointer_facts: Option<crate::pointer::PointsToFacts>,
}
/// Produce the [`BodyConstFacts`] bundle for a single body.
///
/// Steps, in order:
/// 1. Lower the body's CFG to SSA.
/// 2. Run the SSA optimisation pass, which yields the constant values and
///    type facts.
/// 3. If `crate::pointer::is_enabled()` reports `true`, compute points-to
///    facts from the optimised SSA; otherwise leave them as `None`.
///
/// Returns `None` when lowering fails (empty CFG, invalid entry). Callers
/// treat absence as "no SSA facts available" and fall back to the
/// syntactic path.
pub fn build_body_const_facts(body: &crate::cfg::BodyCfg, lang: Lang) -> Option<BodyConstFacts> {
    let meta = &body.meta;
    // The lowering is told whether this body has no parent body.
    let has_no_parent = meta.parent_body_id.is_none();

    let lowered = crate::ssa::lower_to_ssa_with_params(
        &body.graph,
        body.entry,
        meta.name.as_deref(),
        has_no_parent,
        &meta.params,
    );
    let mut ssa = match lowered {
        Ok(ssa) => ssa,
        // The error detail is deliberately dropped: absence is the signal.
        Err(_) => return None,
    };

    let optimized = crate::ssa::optimize_ssa_with_param_types(
        &mut ssa,
        &body.graph,
        Some(lang),
        &meta.param_types,
    );

    // Points-to analysis is opt-in; skip the work entirely when disabled.
    let pointer_facts =
        crate::pointer::is_enabled().then(|| crate::pointer::analyse_body(&ssa, meta.id));

    Some(BodyConstFacts {
        ssa,
        const_values: optimized.const_values,
        type_facts: optimized.type_facts,
        pointer_facts,
    })
}
/// Confidence level attached to a [`CfgFinding`].
///
/// Variants are declared in ascending order, so the derived `PartialOrd` /
/// `Ord` implementations give `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Confidence {
Low,
Medium,
High,
}
/// A single finding produced by a [`CfgAnalysis`] and returned from
/// [`run_all`].
#[derive(Debug, Clone)]
pub struct CfgFinding {
/// Rule identifier, e.g. `cfg-unguarded-sink` or `cfg-unreachable-sink`.
/// `run_all` matches on this string when de-duplicating findings.
pub rule_id: String,
/// Title of the finding.
/// NOTE(review): `dead_code` is allowed here, which suggests nothing reads
/// this field yet; confirm before relying on it.
#[allow(dead_code)]
pub title: String,
/// Severity assigned to the finding.
pub severity: Severity,
/// Confidence assigned to the finding.
pub confidence: Confidence,
/// Source span as a pair of `usize` values (presumably start/end byte
/// offsets; verify against the producer). `run_all` compares it against
/// the span of a taint finding's sink node.
pub span: (usize, usize),
/// Message describing the finding.
pub message: String,
/// Graph node indices attached as evidence (presumably indices into
/// `AnalysisContext::cfg`; confirm against the producing analysis).
pub evidence: Vec<NodeIndex>,
/// Ranking score. `run_all` calls `scoring::score_findings` and then sorts
/// findings by this value in descending order.
/// NOTE(review): presumably `None` until scoring has run; confirm.
pub score: Option<f64>,
}
/// Borrowed inputs handed to every [`CfgAnalysis`] by [`run_all`].
///
/// All references share the lifetime `'a`; the context owns nothing.
pub struct AnalysisContext<'a> {
/// Control-flow graph under analysis. `run_all` indexes it with the sink
/// node of each taint finding to obtain that sink's span.
pub cfg: &'a crate::cfg::Cfg,
/// Node index designated as the entry of `cfg`.
pub entry: NodeIndex,
/// Language of the code being analysed.
pub lang: Lang,
/// Path of the file being analysed.
/// NOTE(review): `dead_code` is allowed, so this may currently be unread.
#[allow(dead_code)]
pub file_path: &'a str,
/// Raw bytes of the source being analysed.
/// NOTE(review): `dead_code` is allowed, so this may currently be unread.
#[allow(dead_code)]
pub source_bytes: &'a [u8],
/// Function summaries (presumably for the current file; verify against
/// the caller that builds this context).
pub func_summaries: &'a FuncSummaries,
/// Optional cross-file summaries; see `taint_active` for how their
/// presence relates to taint analysis.
/// NOTE(review): `dead_code` is allowed, so this may currently be unread.
#[allow(dead_code)]
pub global_summaries: Option<&'a GlobalSummaries>,
/// Findings already produced by the taint engine. `run_all` uses the span
/// of each finding's sink node to drop duplicate `cfg-unguarded-sink`
/// findings.
pub taint_findings: &'a [taint::Finding],
/// Optional additional per-language rules (the same type that
/// `is_guard_call` accepts to recognise configured sanitizers).
pub analysis_rules: Option<&'a LangAnalysisRules>,
/// Whether full taint analysis was active for this file (global summaries
/// existed and taint engine ran). When false, structural findings without
/// taint confirmation should be treated with lower confidence.
pub taint_active: bool,
/// Optional per-body SSA + constant-propagation facts. When present,
/// structural analyses can use SSA const-prop to prove that all argument
/// flows into a sink resolve to literal constants, suppressing false
/// positives that the one-hop CFG trace alone cannot.
pub body_const_facts: Option<&'a BodyConstFacts>,
/// Optional per-body type-fact result produced by `optimize_ssa`.
/// Structural analyses use it to suppress findings when a sink's argument
/// SSA values are proven to carry non-injectable types (e.g. integers
/// parsed from a raw source can't form SHELL/SQL/path payloads). Sourced
/// from `body_const_facts` when present; keep both pointers coherent.
pub type_facts: Option<&'a TypeFactResult>,
/// Decorators / annotations / attributes attached to the body's
/// declaration (e.g. Python `@login_required`, Java `@PreAuthorize`,
/// Symfony `#[IsGranted(...)]`). Consumed by the AuthGap analysis to
/// suppress `cfg-auth-gap` when the framework already enforces auth at
/// the function-declaration level; the gap only matters when the
/// auth call has to live inside the body.
pub auth_decorators: &'a [String],
/// Names of variables whose `.close()` / release calls live in a
/// nested closure body somewhere else in the file (e.g.
/// `socket.on("close", () => ws.close())`). ResourceMisuse uses this
/// to suppress `cfg-resource-leak` for handles whose cleanup happens
/// in a callback the per-body CFG can't observe. When `None`, no
/// closure-based suppression is applied.
pub closure_released_var_names: Option<&'a std::collections::HashSet<String>>,
}
/// A structural analysis that inspects an [`AnalysisContext`] and reports
/// [`CfgFinding`]s. `run_all` stores implementations as
/// `Box<dyn CfgAnalysis>` and invokes each one in turn.
pub trait CfgAnalysis {
/// Static name of the analysis.
/// NOTE(review): `dead_code` is allowed, so this may currently be unread.
#[allow(dead_code)]
fn name(&self) -> &'static str;
/// Run the analysis against `ctx` and return every finding it produces
/// (an empty vector when there is nothing to report).
fn run(&self, ctx: &AnalysisContext) -> Vec<CfgFinding>;
}
/// Run all registered analyses and return merged findings.
///
/// Pipeline:
/// 1. Each registered analysis runs against `ctx`; results are concatenated
///    in registration order.
/// 2. `cfg-unguarded-sink` findings are dropped when their span is already
///    covered, either by the sink of a taint finding (taint is the primary
///    signal) or by a `cfg-unreachable-sink` finding on the same span.
/// 3. The survivors are scored by `scoring::score_findings` and sorted by
///    descending score, with unscored (`None`) findings last. The sort is
///    stable, so equal scores keep their relative order.
///
/// The sort comparator is a total order even if a score is NaN. The
/// previous comparator mapped every NaN comparison to `Equal`, which is not
/// transitive; `slice::sort_by` documents unspecified ordering (and a
/// possible panic) for such comparators. Non-NaN scores are ordered exactly
/// as before.
pub fn run_all(ctx: &AnalysisContext) -> Vec<CfgFinding> {
    use std::cmp::Ordering;

    const UNGUARDED_SINK: &str = "cfg-unguarded-sink";
    const UNREACHABLE_SINK: &str = "cfg-unreachable-sink";

    /// Descending order on optional scores: higher first, `None` last.
    /// Falls back to IEEE total ordering only when a NaN makes the ordinary
    /// comparison undefined.
    fn by_score_desc(a: Option<f64>, b: Option<f64>) -> Ordering {
        match (a, b) {
            (Some(x), Some(y)) => y.partial_cmp(&x).unwrap_or_else(|| y.total_cmp(&x)),
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
            (None, None) => Ordering::Equal,
        }
    }

    let analyses: Vec<Box<dyn CfgAnalysis>> = vec![
        Box::new(unreachable::UnreachableCode),
        Box::new(guards::UnguardedSink),
        Box::new(auth::AuthGap),
        Box::new(error_handling::IncompleteErrorHandling),
        Box::new(resources::ResourceMisuse),
    ];
    let mut findings: Vec<CfgFinding> = analyses.iter().flat_map(|a| a.run(ctx)).collect();

    // ── Dedup: spans on which a cfg-unguarded-sink finding is redundant ──
    // (a) spans of taint-finding sinks;
    let mut covered_spans: HashSet<(usize, usize)> = ctx
        .taint_findings
        .iter()
        .map(|f| ctx.cfg[f.sink].ast.span)
        .collect();
    // (b) spans already reported as cfg-unreachable-sink. The retain below
    // never removes those findings, so collecting them up front is
    // equivalent to running two separate passes.
    covered_spans.extend(
        findings
            .iter()
            .filter(|f| f.rule_id == UNREACHABLE_SINK)
            .map(|f| f.span),
    );
    findings.retain(|f| !(f.rule_id == UNGUARDED_SINK && covered_spans.contains(&f.span)));

    scoring::score_findings(&mut findings, ctx);
    findings.sort_by(|a, b| by_score_desc(a.score, b.score));
    findings
}
/// Helper: check whether a node is a guard call (validate, sanitize, check, etc.).
///
/// Only `StmtKind::Call` nodes with a known callee can match. The callee is
/// compared ASCII-case-insensitively against two matcher sources, in order:
/// 1. configured rules in `analysis_rules` whose label is a
///    `DataLabel::Sanitizer`;
/// 2. the built-in guard rules for `lang`.
///
/// A matcher ending in `_` is a prefix of the callee; any other matcher is
/// a suffix.
pub(crate) fn is_guard_call(
    info: &NodeInfo,
    lang: Lang,
    analysis_rules: Option<&LangAnalysisRules>,
) -> bool {
    /// Apply the prefix-or-suffix convention to one matcher.
    fn matcher_hits(callee_lower: &str, matcher: &str) -> bool {
        let pattern = matcher.to_ascii_lowercase();
        if pattern.ends_with('_') {
            callee_lower.starts_with(pattern.as_str())
        } else {
            callee_lower.ends_with(pattern.as_str())
        }
    }

    if info.kind != StmtKind::Call {
        return false;
    }
    let callee = match &info.call.callee {
        Some(name) => name,
        None => return false,
    };
    let callee_lower = callee.to_ascii_lowercase();

    // Configured sanitizer rules take the first look.
    let configured_hit = match analysis_rules {
        Some(extras) => extras
            .extra_labels
            .iter()
            .filter(|rule| matches!(rule.label, DataLabel::Sanitizer(_)))
            .any(|rule| rule.matchers.iter().any(|m| matcher_hits(&callee_lower, m))),
        None => false,
    };
    if configured_hit {
        return true;
    }

    // Then the built-in guard rules for this language.
    rules::guard_rules(lang)
        .iter()
        .any(|rule| rule.matchers.iter().any(|&m| matcher_hits(&callee_lower, m)))
}
/// Helper: check whether a node is an auth check call.
///
/// Only `StmtKind::Call` nodes with a known callee can match. The callee is
/// compared ASCII-case-insensitively against the auth rules for `lang`: a
/// matcher ending in `_` is a prefix of the callee, any other matcher is a
/// suffix.
pub(crate) fn is_auth_call(info: &NodeInfo, lang: Lang) -> bool {
    if info.kind != StmtKind::Call {
        return false;
    }
    let callee = match &info.call.callee {
        Some(name) => name,
        None => return false,
    };
    let callee_lower = callee.to_ascii_lowercase();

    // Prefix-or-suffix test for a single matcher.
    let hits = |matcher: &str| -> bool {
        let pattern = matcher.to_ascii_lowercase();
        if pattern.ends_with('_') {
            callee_lower.starts_with(pattern.as_str())
        } else {
            callee_lower.ends_with(pattern.as_str())
        }
    };

    rules::auth_rules(lang)
        .iter()
        .any(|rule| rule.matchers.iter().any(|&m| hits(m)))
}
/// Helper: check if a function name looks like an entry point (HTTP handler, main, etc.).
///
/// The name is compared ASCII-case-insensitively against the entry-point
/// rules for `lang`. A matcher ending in `*` matches any name starting with
/// the text before the `*`; any other matcher must equal the whole name.
pub(crate) fn is_entry_point_func(func_name: &str, lang: Lang) -> bool {
    let wanted = func_name.to_ascii_lowercase();
    rules::entry_point_rules(lang).iter().any(|rule| {
        rule.matchers.iter().any(|&m| {
            let pattern = m.to_ascii_lowercase();
            match pattern.strip_suffix('*') {
                // Trailing wildcard: prefix comparison on what precedes it.
                Some(prefix) => wanted.starts_with(prefix),
                // No wildcard: exact comparison.
                None => wanted == pattern,
            }
        })
    })
}
/// Helper: check if a node is a sink, i.e. at least one of its taint
/// labels is a `DataLabel::Sink`.
pub(crate) fn is_sink(info: &NodeInfo) -> bool {
    for label in info.taint.labels.iter() {
        if let DataLabel::Sink(_) = label {
            return true;
        }
    }
    false
}