mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers * feat: Implement cross-file data exfiltration detection with parameter-specific gate filters * feat: Add calibration tests and refine DATA_EXFIL severity scoring logic * feat: Introduce per-detector configuration for data exfiltration suppression * feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output * feat: Add tainted body and URL handling for data exfiltration detection * feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go * feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients * feat: Add synthetic externals handling for closure-captured variables in SSA * feat: Implement closure-based suppression for resource leak findings * feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns * feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders * feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt * feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests * feat: Add data exfiltration sinks for various languages and enhance documentation * refactor: Simplify formatting and improve readability in various files * refactor: Improve readability by simplifying conditional statements and adding clippy linting * docs: Update CHANGELOG and comments for data exfiltration features and configuration * docs: Clarify configuration instructions for data exfiltration trusted destinations * docs: Enhance comments for evidence routing logic in data exfiltration
451 lines
17 KiB
Rust
451 lines
17 KiB
Rust
#![allow(clippy::collapsible_if, clippy::redundant_closure)]
|
|
|
|
//! Static hash-map lookup abstract analysis.
|
|
//!
|
|
//! Recognises the idiom
|
|
//! ```ignore
|
|
//! let mut table = HashMap::new();
|
|
//! table.insert(K1, V1);
|
|
//! table.insert(K2, V2);
|
|
//! let cmd = table.get(k).copied().unwrap_or("safe");
|
|
//! ```
|
|
//! where every insert's *value* slot is a syntactic string literal and the
|
|
//! final lookup is dereffed via a literal fallback (`.unwrap_or(LIT)`). The
|
|
//! result `cmd` is then provably bounded to the finite set
|
|
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries, taint-flavour or
|
|
//! otherwise. Downstream sink suppression consumes this finite set to
|
|
//! clear SHELL/FILE/SQL injection findings whose payload is proved to be
|
|
//! metacharacter-free.
|
|
//!
|
|
//! ## SSA shape assumption
|
|
//!
|
|
//! The taint CFG collapses each method chain into **one** SSA `Call`
|
|
//! instruction whose `callee` text is the entire chain's "function" expression
|
|
//! (e.g. `"table.get(key).copied().unwrap_or"` for `table.get(key).copied()
|
|
//! .unwrap_or("safe")`) and whose `receiver` is the root identifier's SSA
|
|
//! value. We therefore do not need to walk SSA `.copied()` / `.unwrap_or`
//! instructions as separate hops; pattern-matching on the callee text is
//! the source of truth. String-literal arguments that the callee text
|
|
//! elides (e.g. the fallback `"safe"`) are read from the CFG node's
|
|
//! `arg_string_literals`, populated during CFG construction.
|
|
//!
|
|
//! Scope is deliberately narrow: only same-function static maps, only
|
|
//! literal-valued inserts, no escape beyond recognised mutate/read methods.
|
|
//! Any deviation (dynamic insert, callee not in the allow-list, map used as
|
|
//! a plain argument, map returned, map joined across a phi) invalidates the
|
|
//! candidate. Missed detection is safe: it just falls through to existing
//! behaviour.
|
|
|
|
use std::collections::{HashMap, HashSet};
|
|
|
|
use super::const_prop::ConstLattice;
|
|
use super::ir::*;
|
|
use crate::cfg::Cfg;
|
|
use crate::symbol::Lang;
|
|
|
|
/// Output of the static-map analysis: SSA values whose concrete string value
|
|
/// is provably in a finite set, plus the set itself (sorted + deduped).
|
|
#[derive(Clone, Debug, Default)]
|
|
pub struct StaticMapResult {
|
|
pub finite_string_values: HashMap<SsaValue, Vec<String>>,
|
|
}
|
|
|
|
impl StaticMapResult {
|
|
pub fn empty() -> Self {
|
|
Self::default()
|
|
}
|
|
|
|
pub fn is_empty(&self) -> bool {
|
|
self.finite_string_values.is_empty()
|
|
}
|
|
}
|
|
|
|
/// Report whether `callee` names a Rust constructor that yields an empty map.
///
/// The last `::`-separated segment must be `new` and the segment directly
/// before it must be `HashMap` or `BTreeMap`; any leading module path is
/// ignored. A bare `new` with no type qualifier is rejected.
fn is_rust_map_constructor(callee: &str) -> bool {
    match callee.rsplit_once("::") {
        Some((qualifier, "new")) => {
            // Only the innermost path segment (the type name) matters.
            let type_name = qualifier.rsplit("::").next().unwrap_or(qualifier);
            matches!(type_name, "HashMap" | "BTreeMap")
        }
        _ => false,
    }
}
|
|
|
|
/// How a `Call` whose receiver is a candidate map uses that map.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MapUse {
    /// `{var}.insert(K, V)`: the value argument feeds the finite domain.
    Insert,
    /// `{var}.get(K)` followed by any number of `.copied()` / `.cloned()` /
    /// `.as_deref()` / `.as_ref()` hops and a final `.unwrap_or`: the result
    /// is bounded by the inserted values plus the fallback literal recorded
    /// on the CFG node.
    StaticLookup,
    /// Allow-listed method that hands out no reference into the map
    /// (`contains_key`, `len`, `is_empty`, `clear`).
    ReadOnly,
    /// Any other use; the map candidate must be invalidated.
    Escape,
}
|
|
|
|
/// Classify the callee of a Call whose `receiver` SSA value points to a
|
|
/// candidate map bound to `map_var`. Returns [`MapUse::Escape`] when the
|
|
/// callee doesn't match any recognised pattern so the caller invalidates
|
|
/// the map rather than trusting an unknown mutation.
|
|
fn classify_map_use(callee: &str, map_var: &str) -> MapUse {
|
|
// Fast-path: exact single-method calls on the receiver.
|
|
let method = callee
|
|
.strip_prefix(map_var)
|
|
.and_then(|rest| rest.strip_prefix('.'));
|
|
if let Some(method) = method {
|
|
// Single identifier method with no trailing chain.
|
|
match method {
|
|
"insert" => return MapUse::Insert,
|
|
"contains_key" | "len" | "is_empty" | "clear" => return MapUse::ReadOnly,
|
|
_ => {}
|
|
}
|
|
// Chained lookup: must start with `get(…)` and end with `.unwrap_or`.
|
|
if let Some(rest) = method.strip_prefix("get(") {
|
|
if let Some(after_args) = scan_past_balanced_parens(rest) {
|
|
if is_identity_chain_ending_in_unwrap_or(after_args) {
|
|
return MapUse::StaticLookup;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
MapUse::Escape
|
|
}
|
|
|
|
/// Skip the remainder of a parenthesised argument list.
///
/// `s` is the text immediately after an opening `(`. Returns the text that
/// follows the matching `)`, or `None` if that `)` never appears. Nested
/// parentheses are counted; nothing else (string literals included) gets
/// special treatment.
fn scan_past_balanced_parens(s: &str) -> Option<&str> {
    // One `(` has already been consumed by the caller.
    let mut open: i32 = 1;
    for (idx, byte) in s.bytes().enumerate() {
        match byte {
            b'(' => open += 1,
            b')' => {
                open -= 1;
                if open == 0 {
                    // `)` is a single ASCII byte, so `idx + 1` is a char boundary.
                    return Some(&s[idx + 1..]);
                }
            }
            _ => {}
        }
    }
    None
}
|
|
|
|
/// Check that `s` consists of zero or more identity hops (`.copied()`,
/// `.cloned()`, `.as_deref()`, `.as_ref()`) followed by exactly `.unwrap_or`.
///
/// No argument list is expected after `.unwrap_or`: the callee text stops at
/// the method name, and the fallback literal is read from the CFG node's
/// `arg_string_literals` instead.
fn is_identity_chain_ending_in_unwrap_or(mut s: &str) -> bool {
    const IDENTITY_HOPS: [&str; 4] = [".copied()", ".cloned()", ".as_deref()", ".as_ref()"];

    while s != ".unwrap_or" {
        let current = s;
        // Peel one identity hop off the front; anything else ends the chain.
        let Some(rest) = IDENTITY_HOPS
            .iter()
            .find_map(|hop| current.strip_prefix(*hop))
        else {
            return false;
        };
        s = rest;
    }
    true
}
|
|
|
|
/// Follow `aliases` starting at `v` until reaching a value that has no
/// entry or maps to itself, and return that value.
///
/// The walk is capped at 64 hops so a cyclic alias table cannot loop
/// forever; when the cap is hit, the value reached so far is returned.
fn resolve_alias(v: SsaValue, aliases: &HashMap<SsaValue, SsaValue>) -> SsaValue {
    const MAX_HOPS: usize = 64;

    let mut current = v;
    let mut hops = 0;
    while hops < MAX_HOPS {
        let Some(&target) = aliases.get(&current) else {
            break;
        };
        if target == current {
            break;
        }
        current = target;
        hops += 1;
    }
    current
}
|
|
|
|
/// Run the analysis. Bails out immediately for non-Rust bodies, the current
|
|
/// pattern set only models Rust `std::collections::HashMap`.
|
|
pub fn analyze(
|
|
body: &SsaBody,
|
|
cfg: &Cfg,
|
|
lang: Option<Lang>,
|
|
_const_values: &HashMap<SsaValue, ConstLattice>,
|
|
) -> StaticMapResult {
|
|
if lang != Some(Lang::Rust) {
|
|
return StaticMapResult::empty();
|
|
}
|
|
|
|
// ── 1. Discover candidate map allocations + their bound var name ──────
|
|
// The var_name is the identifier the CFG builder attaches to the define
|
|
// site of the let-binding. Without a var_name we can't pattern-match
|
|
// receiver uses in callee text, so such allocations are skipped.
|
|
let mut candidates: HashMap<SsaValue, String> = HashMap::new();
|
|
for block in &body.blocks {
|
|
for inst in block.phis.iter().chain(block.body.iter()) {
|
|
if let SsaOp::Call { callee, .. } = &inst.op {
|
|
if is_rust_map_constructor(callee) {
|
|
if let Some(name) = inst.var_name.as_deref() {
|
|
if !name.is_empty() {
|
|
candidates.insert(inst.value, name.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if candidates.is_empty() {
|
|
return StaticMapResult::empty();
|
|
}
|
|
|
|
// ── 2. Build trivial alias chain: single-use Assign `v = w` where w is
|
|
// a known (or aliased) candidate value. Keeps us robust to wrapper
|
|
// copies SSA lowering occasionally introduces.
|
|
let mut aliases: HashMap<SsaValue, SsaValue> = HashMap::new();
|
|
for block in &body.blocks {
|
|
for inst in &block.body {
|
|
if let SsaOp::Assign(uses) = &inst.op {
|
|
if uses.len() == 1 {
|
|
let src = resolve_alias(uses[0], &aliases);
|
|
if candidates.contains_key(&src) {
|
|
aliases.insert(inst.value, src);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
let canonicalise = |v: SsaValue| -> Option<SsaValue> {
|
|
let c = resolve_alias(v, &aliases);
|
|
if candidates.contains_key(&c) {
|
|
Some(c)
|
|
} else {
|
|
None
|
|
}
|
|
};
|
|
|
|
// ── 3. Walk every instruction, classifying references to any candidate.
|
|
// Collect per-candidate inserted literal values and mark invalidating
|
|
// escapes (phi operand, non-whitelisted method, plain argument use,
|
|
// non-copy Assign, Return).
|
|
let mut inserted: HashMap<SsaValue, HashSet<String>> = HashMap::new();
|
|
let mut invalid: HashSet<SsaValue> = HashSet::new();
|
|
// Each lookup site: (map, result SSA value, fallback literal).
|
|
let mut lookups: Vec<(SsaValue, SsaValue, String)> = Vec::new();
|
|
for c in candidates.keys() {
|
|
inserted.insert(*c, HashSet::new());
|
|
}
|
|
|
|
for block in &body.blocks {
|
|
for inst in block.phis.iter().chain(block.body.iter()) {
|
|
match &inst.op {
|
|
SsaOp::Phi(operands) => {
|
|
for (_, v) in operands {
|
|
if let Some(canon) = canonicalise(*v) {
|
|
invalid.insert(canon);
|
|
}
|
|
}
|
|
}
|
|
SsaOp::Call {
|
|
callee,
|
|
args,
|
|
receiver,
|
|
..
|
|
} => {
|
|
if candidates.contains_key(&inst.value) && is_rust_map_constructor(callee) {
|
|
continue;
|
|
}
|
|
if let Some(map) = receiver.and_then(|r| canonicalise(r)) {
|
|
let map_var = candidates.get(&map).cloned().unwrap_or_default();
|
|
match classify_map_use(callee, &map_var) {
|
|
MapUse::Insert => {
|
|
let node_info = &cfg[inst.cfg_node];
|
|
let value_lit =
|
|
node_info.call.arg_string_literals.get(1).cloned().flatten();
|
|
match value_lit {
|
|
Some(lit) => {
|
|
inserted.entry(map).or_default().insert(lit);
|
|
}
|
|
None => {
|
|
invalid.insert(map);
|
|
}
|
|
}
|
|
}
|
|
MapUse::StaticLookup => {
|
|
let node_info = &cfg[inst.cfg_node];
|
|
if let Some(Some(fallback)) =
|
|
node_info.call.arg_string_literals.first().cloned()
|
|
{
|
|
lookups.push((map, inst.value, fallback));
|
|
}
|
|
// A non-literal fallback silently falls
|
|
// through: the map stays valid, we just
|
|
// don't emit a finite domain for this site.
|
|
}
|
|
MapUse::ReadOnly => {}
|
|
MapUse::Escape => {
|
|
invalid.insert(map);
|
|
}
|
|
}
|
|
}
|
|
for group in args {
|
|
for &v in group {
|
|
if let Some(canon) = canonicalise(v) {
|
|
invalid.insert(canon);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
SsaOp::Assign(uses) if uses.len() != 1 => {
|
|
for &u in uses {
|
|
if let Some(canon) = canonicalise(u) {
|
|
invalid.insert(canon);
|
|
}
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
if let Terminator::Return(Some(v)) = &block.terminator {
|
|
if let Some(canon) = canonicalise(*v) {
|
|
invalid.insert(canon);
|
|
}
|
|
}
|
|
}
|
|
|
|
// ── 4. Emit results for still-valid candidates with at least one insert.
|
|
let mut result = StaticMapResult::default();
|
|
for (map, lookup_val, fallback) in lookups {
|
|
if invalid.contains(&map) {
|
|
continue;
|
|
}
|
|
let lits = match inserted.get(&map) {
|
|
Some(s) if !s.is_empty() => s,
|
|
_ => continue,
|
|
};
|
|
let mut domain: Vec<String> = lits.iter().cloned().collect();
|
|
domain.push(fallback);
|
|
domain.sort();
|
|
domain.dedup();
|
|
result.finite_string_values.insert(lookup_val, domain);
|
|
}
|
|
result
|
|
}
|
|
|
|
// ── Tests ───────────────────────────────────────────────────────────────
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Only `HashMap::new` / `BTreeMap::new` (optionally path-qualified) count.
    #[test]
    fn rust_map_constructor_matches() {
        assert!(is_rust_map_constructor("HashMap::new"));
        assert!(is_rust_map_constructor("std::collections::HashMap::new"));
        assert!(is_rust_map_constructor("BTreeMap::new"));
        assert!(!is_rust_map_constructor("HashMap::from"));
        assert!(!is_rust_map_constructor("HashMap::with_capacity"));
        assert!(!is_rust_map_constructor("Vec::new"));
    }

    #[test]
    fn classify_insert_call() {
        assert_eq!(classify_map_use("table.insert", "table"), MapUse::Insert);
    }

    #[test]
    fn classify_read_only_call() {
        assert_eq!(
            classify_map_use("table.contains_key", "table"),
            MapUse::ReadOnly
        );
        assert_eq!(classify_map_use("table.len", "table"), MapUse::ReadOnly);
        assert_eq!(
            classify_map_use("table.is_empty", "table"),
            MapUse::ReadOnly
        );
        // `clear` is on the allow-list too: it can only shrink the set of
        // stored values, so the computed domain stays an over-approximation.
        assert_eq!(classify_map_use("table.clear", "table"), MapUse::ReadOnly);
        // Iterator-returning methods (values/iter/keys) escape: they leak
        // references that can flow anywhere.
        assert_eq!(classify_map_use("table.values", "table"), MapUse::Escape);
        assert_eq!(classify_map_use("table.iter", "table"), MapUse::Escape);
    }

    #[test]
    fn classify_static_lookup_with_copied() {
        assert_eq!(
            classify_map_use("table.get(key.as_str()).copied().unwrap_or", "table"),
            MapUse::StaticLookup
        );
    }

    #[test]
    fn classify_static_lookup_without_identity_chain() {
        // `.unwrap_or` directly after `.get(...)` also qualifies: Rust
        // `HashMap::get` returns `Option<&V>`, so `.unwrap_or(&"safe")` is
        // syntactically valid and equally bounded.
        assert_eq!(
            classify_map_use("table.get(k).unwrap_or", "table"),
            MapUse::StaticLookup
        );
    }

    #[test]
    fn classify_static_lookup_mixed_identity_chain() {
        assert_eq!(
            classify_map_use("t.get(k).as_deref().cloned().unwrap_or", "t"),
            MapUse::StaticLookup
        );
    }

    #[test]
    fn classify_rejects_unknown_terminator() {
        // `.unwrap_or_else(|| …)` is not modelled: the closure can return
        // anything.
        assert_eq!(
            classify_map_use("t.get(k).copied().unwrap_or_else", "t"),
            MapUse::Escape
        );
        // A bare `.unwrap()` after `.get(k)` panics rather than bounding,
        // so we refuse to treat it as safe. The caller would need a proven
        // `.contains_key` guard; that is out of scope here.
        assert_eq!(classify_map_use("t.get(k).unwrap", "t"), MapUse::Escape);
    }

    #[test]
    fn classify_rejects_other_receiver() {
        // `other.insert` does not belong to `table`: receiver mismatch.
        assert_eq!(classify_map_use("other.insert", "table"), MapUse::Escape);
    }

    #[test]
    fn scan_past_balanced_parens_basic() {
        // Compare against `Some(..)` directly: going through
        // `.unwrap_or("")` would let a `None` result pass as `""`.
        assert_eq!(scan_past_balanced_parens("foo)"), Some(""));
        assert_eq!(scan_past_balanced_parens("foo).bar"), Some(".bar"));
        assert_eq!(scan_past_balanced_parens("foo(bar)baz).x"), Some(".x"));
        assert_eq!(scan_past_balanced_parens("no-close"), None);
        assert_eq!(scan_past_balanced_parens(""), None);
        assert_eq!(scan_past_balanced_parens("(()"), None);
    }

    #[test]
    fn non_rust_lang_returns_empty() {
        use petgraph::Graph;

        let body = SsaBody {
            blocks: vec![],
            entry: BlockId(0),
            value_defs: vec![],
            cfg_node_map: std::collections::HashMap::new(),
            exception_edges: vec![],
            field_interner: crate::ssa::ir::FieldInterner::default(),
            field_writes: std::collections::HashMap::new(),

            synthetic_externals: std::collections::HashSet::new(),
        };
        let cfg: Cfg = Graph::new();
        let const_values = HashMap::new();
        let result = analyze(&body, &cfg, Some(Lang::Java), &const_values);
        assert!(result.is_empty());
    }
}
|