#![allow(clippy::collapsible_if, clippy::redundant_closure)]

//! Static hash-map lookup abstract analysis.
//!
//! Recognises the idiom
//! ```ignore
//! let mut table = HashMap::new();
//! table.insert(K1, V1);
//! table.insert(K2, V2);
//! let cmd = table.get(k).copied().unwrap_or("safe");
//! ```
//! where every insert's *value* slot is a syntactic string literal and the
//! final lookup is unwrapped via a literal fallback (`.unwrap_or(LIT)`). The
//! result `cmd` is then provably bounded to the finite set
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries, taint-flavour or
//! otherwise. Downstream sink suppression consumes this finite set to
//! clear SHELL/FILE/SQL injection findings whose payload is proved to be
//! metacharacter-free.
//!
//! ## SSA shape assumption
//!
//! The taint CFG collapses each method chain into **one** SSA `Call`
//! instruction whose `callee` text is the entire chain's "function" expression
//! (e.g. `"table.get(key).copied().unwrap_or"` for
//! `table.get(key).copied().unwrap_or("safe")`) and whose `receiver` is the
//! root identifier's SSA value. We therefore do not need to walk SSA
//! `.copied()` / `.unwrap_or` instructions as separate hops; pattern-matching
//! on the callee text is the source of truth. String-literal arguments that
//! the callee text elides (e.g. the fallback `"safe"`) are read from the CFG
//! node's `arg_string_literals`, populated during CFG construction.
//!
//! Scope is deliberately narrow: only same-function static maps, only
//! literal-valued inserts, no escape beyond recognised mutate/read methods.
//! Any deviation (dynamic insert, callee not in the allow-list, map used as
//! a plain argument, map returned, map joined across a phi) invalidates the
//! candidate. Missed detection is safe: it just falls through to existing
//! behaviour.
use std::collections::{HashMap, HashSet};

use super::const_prop::ConstLattice;
use super::ir::*;
use crate::cfg::Cfg;
use crate::symbol::Lang;

/// Output of the static-map analysis: SSA values whose concrete string value
/// is provably in a finite set, plus the set itself (sorted + deduped).
#[derive(Clone, Debug, Default)]
pub struct StaticMapResult {
    /// Lookup-result SSA value → every string that value can evaluate to
    /// (the literal values inserted into the map plus the literal fallback).
    pub finite_string_values: HashMap<SsaValue, Vec<String>>,
}

impl StaticMapResult {
    /// Result carrying no finite-domain facts.
    pub fn empty() -> Self {
        Self::default()
    }

    /// `true` when no SSA value has been bounded to a finite string set.
    pub fn is_empty(&self) -> bool {
        self.finite_string_values.is_empty()
    }
}

/// Rust-specific constructors that produce an empty map value.
///
/// Accepts any `::`-separated path whose last two segments are
/// `HashMap::new` or `BTreeMap::new`; every other constructor
/// (`from`, `with_capacity`, …) is rejected.
fn is_rust_map_constructor(callee: &str) -> bool {
    let leaf_after_colon = callee.rsplit("::").next().unwrap_or(callee);
    if leaf_after_colon != "new" {
        return false;
    }
    let type_part = callee.rsplit("::").nth(1).unwrap_or("");
    matches!(type_part, "HashMap" | "BTreeMap")
}

/// Classification of a Call whose receiver is a candidate map.
#[derive(Clone, Debug, PartialEq, Eq)]
enum MapUse {
    /// `{var}.insert(K, V)` — value contributes to the finite domain.
    Insert,
    /// `{var}.get(K)[.copied()|.cloned()|.as_deref()|.as_ref()]*.unwrap_or`
    /// — lookup result is bounded by the inserted values plus the fallback
    /// literal on the CFG node.
    StaticLookup,
    /// Whitelisted read-only method (no reference leak).
    ReadOnly,
    /// Anything else — invalidates the map candidate.
    Escape,
}

/// Classify the callee of a Call whose `receiver` SSA value points to a
/// candidate map bound to `map_var`. Returns [`MapUse::Escape`] when the
/// callee doesn't match any recognised pattern so the caller invalidates
/// the map rather than trusting an unknown mutation.
fn classify_map_use(callee: &str, map_var: &str) -> MapUse {
    // The callee text must be `<map_var>.<something>`; any other receiver
    // spelling is treated as an escape.
    let method = callee
        .strip_prefix(map_var)
        .and_then(|rest| rest.strip_prefix('.'));
    if let Some(method) = method {
        // Single identifier method with no trailing chain.
        match method {
            "insert" => return MapUse::Insert,
            // `clear` mutates, but it can only remove entries, so the
            // collected value set remains an over-approximation.
            "contains_key" | "len" | "is_empty" | "clear" => return MapUse::ReadOnly,
            _ => {}
        }
        // Chained lookup: must start with `get(…)` and end with `.unwrap_or`.
        if let Some(rest) = method.strip_prefix("get(") {
            if let Some(after_args) = scan_past_balanced_parens(rest) {
                if is_identity_chain_ending_in_unwrap_or(after_args) {
                    return MapUse::StaticLookup;
                }
            }
        }
    }
    MapUse::Escape
}

/// Given `s` just after an opening `(`, return the slice after the matching
/// close `)`. Returns `None` when parens are unbalanced.
fn scan_past_balanced_parens(s: &str) -> Option<&str> {
    // We start inside one already-consumed `(`.
    let mut depth: i32 = 1;
    for (i, byte) in s.bytes().enumerate() {
        match byte {
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    // `)` is a single ASCII byte, so `i + 1` is always a
                    // valid char boundary.
                    return Some(&s[i + 1..]);
                }
            }
            _ => {}
        }
    }
    None
}

/// Return `true` when `s` is a sequence of zero or more identity chain
/// methods (`.copied()`, `.cloned()`, `.as_deref()`, `.as_ref()`) followed
/// by `.unwrap_or` (and nothing else). The trailing arg list of
/// `.unwrap_or` is elided in the callee text; it appears in the CFG node's
/// `arg_string_literals` instead.
fn is_identity_chain_ending_in_unwrap_or(mut s: &str) -> bool {
    const IDENTS: &[&str] = &[".copied()", ".cloned()", ".as_deref()", ".as_ref()"];
    loop {
        if s == ".unwrap_or" {
            return true;
        }
        // Peel off one identity hop; anything else ends the match.
        match IDENTS.iter().find_map(|id| s.strip_prefix(*id)) {
            Some(rest) => s = rest,
            None => return false,
        }
    }
}

/// Follow `aliases` from `v` until reaching a value with no further alias
/// (or one that maps to itself). The walk is capped at 64 hops so a cyclic
/// alias table cannot loop forever.
fn resolve_alias(v: SsaValue, aliases: &HashMap<SsaValue, SsaValue>) -> SsaValue {
    let mut cur = v;
    for _ in 0..64 {
        match aliases.get(&cur) {
            Some(&next) if next != cur => cur = next,
            _ => break,
        }
    }
    cur
}

/// Run the analysis. Bails out immediately for non-Rust bodies: the current
/// pattern set only models Rust `std::collections` maps
/// (`HashMap` / `BTreeMap`).
pub fn analyze( body: &SsaBody, cfg: &Cfg, lang: Option, _const_values: &HashMap, ) -> StaticMapResult { if lang != Some(Lang::Rust) { return StaticMapResult::empty(); } // ── 1. Discover candidate map allocations + their bound var name ────── // The var_name is the identifier the CFG builder attaches to the define // site of the let-binding. Without a var_name we can't pattern-match // receiver uses in callee text, so such allocations are skipped. let mut candidates: HashMap = HashMap::new(); for block in &body.blocks { for inst in block.phis.iter().chain(block.body.iter()) { if let SsaOp::Call { callee, .. } = &inst.op { if is_rust_map_constructor(callee) { if let Some(name) = inst.var_name.as_deref() { if !name.is_empty() { candidates.insert(inst.value, name.to_string()); } } } } } } if candidates.is_empty() { return StaticMapResult::empty(); } // ── 2. Build trivial alias chain: single-use Assign `v = w` where w is // a known (or aliased) candidate value. Keeps us robust to wrapper // copies SSA lowering occasionally introduces. let mut aliases: HashMap = HashMap::new(); for block in &body.blocks { for inst in &block.body { if let SsaOp::Assign(uses) = &inst.op { if uses.len() == 1 { let src = resolve_alias(uses[0], &aliases); if candidates.contains_key(&src) { aliases.insert(inst.value, src); } } } } } let canonicalise = |v: SsaValue| -> Option { let c = resolve_alias(v, &aliases); if candidates.contains_key(&c) { Some(c) } else { None } }; // ── 3. Walk every instruction, classifying references to any candidate. // Collect per-candidate inserted literal values and mark invalidating // escapes (phi operand, non-whitelisted method, plain argument use, // non-copy Assign, Return). let mut inserted: HashMap> = HashMap::new(); let mut invalid: HashSet = HashSet::new(); // Each lookup site: (map, result SSA value, fallback literal). 
let mut lookups: Vec<(SsaValue, SsaValue, String)> = Vec::new(); for c in candidates.keys() { inserted.insert(*c, HashSet::new()); } for block in &body.blocks { for inst in block.phis.iter().chain(block.body.iter()) { match &inst.op { SsaOp::Phi(operands) => { for (_, v) in operands { if let Some(canon) = canonicalise(*v) { invalid.insert(canon); } } } SsaOp::Call { callee, args, receiver, .. } => { if candidates.contains_key(&inst.value) && is_rust_map_constructor(callee) { continue; } if let Some(map) = receiver.and_then(|r| canonicalise(r)) { let map_var = candidates.get(&map).cloned().unwrap_or_default(); match classify_map_use(callee, &map_var) { MapUse::Insert => { let node_info = &cfg[inst.cfg_node]; let value_lit = node_info.call.arg_string_literals.get(1).cloned().flatten(); match value_lit { Some(lit) => { inserted.entry(map).or_default().insert(lit); } None => { invalid.insert(map); } } } MapUse::StaticLookup => { let node_info = &cfg[inst.cfg_node]; if let Some(Some(fallback)) = node_info.call.arg_string_literals.first().cloned() { lookups.push((map, inst.value, fallback)); } // A non-literal fallback silently falls // through: the map stays valid, we just // don't emit a finite domain for this site. } MapUse::ReadOnly => {} MapUse::Escape => { invalid.insert(map); } } } for group in args { for &v in group { if let Some(canon) = canonicalise(v) { invalid.insert(canon); } } } } SsaOp::Assign(uses) if uses.len() != 1 => { for &u in uses { if let Some(canon) = canonicalise(u) { invalid.insert(canon); } } } _ => {} } } if let Terminator::Return(Some(v)) = &block.terminator { if let Some(canon) = canonicalise(*v) { invalid.insert(canon); } } } // ── 4. Emit results for still-valid candidates with at least one insert. 
let mut result = StaticMapResult::default(); for (map, lookup_val, fallback) in lookups { if invalid.contains(&map) { continue; } let lits = match inserted.get(&map) { Some(s) if !s.is_empty() => s, _ => continue, }; let mut domain: Vec = lits.iter().cloned().collect(); domain.push(fallback); domain.sort(); domain.dedup(); result.finite_string_values.insert(lookup_val, domain); } result } // ── Tests ─────────────────────────────────────────────────────────────── #[cfg(test)] mod tests { use super::*; #[test] fn rust_map_constructor_matches() { assert!(is_rust_map_constructor("HashMap::new")); assert!(is_rust_map_constructor("std::collections::HashMap::new")); assert!(is_rust_map_constructor("BTreeMap::new")); assert!(!is_rust_map_constructor("HashMap::from")); assert!(!is_rust_map_constructor("HashMap::with_capacity")); assert!(!is_rust_map_constructor("Vec::new")); } #[test] fn classify_insert_call() { assert_eq!(classify_map_use("table.insert", "table"), MapUse::Insert); } #[test] fn classify_read_only_call() { assert_eq!( classify_map_use("table.contains_key", "table"), MapUse::ReadOnly ); assert_eq!(classify_map_use("table.len", "table"), MapUse::ReadOnly); // Iterator-returning methods (values/iter/keys) escape: they leak // references that can flow anywhere. assert_eq!(classify_map_use("table.values", "table"), MapUse::Escape); assert_eq!(classify_map_use("table.iter", "table"), MapUse::Escape); } #[test] fn classify_static_lookup_with_copied() { assert_eq!( classify_map_use("table.get(key.as_str()).copied().unwrap_or", "table"), MapUse::StaticLookup ); } #[test] fn classify_static_lookup_without_identity_chain() { // `.unwrap_or` directly after `.get(...)` also qualifies, Rust // `HashMap::get` returns `Option<&V>`, so `.unwrap_or(&"safe")` is // syntactically valid and equally bounded. 
assert_eq!( classify_map_use("table.get(k).unwrap_or", "table"), MapUse::StaticLookup ); } #[test] fn classify_static_lookup_mixed_identity_chain() { assert_eq!( classify_map_use("t.get(k).as_deref().cloned().unwrap_or", "t"), MapUse::StaticLookup ); } #[test] fn classify_rejects_unknown_terminator() { // `.unwrap_or_else(|| …)` is not modelled, closure can return anything. assert_eq!( classify_map_use("t.get(k).copied().unwrap_or_else", "t"), MapUse::Escape ); // A bare `.unwrap()` after `.get(k)` panics rather than bounding, // so we refuse to treat it as safe. The caller would need a proven // `.contains_key` guard; that is out of scope here. assert_eq!(classify_map_use("t.get(k).unwrap", "t"), MapUse::Escape); } #[test] fn classify_rejects_other_receiver() { // `other.insert` does not belong to `table`, receiver mismatch. assert_eq!(classify_map_use("other.insert", "table"), MapUse::Escape); } #[test] fn scan_past_balanced_parens_basic() { assert_eq!(scan_past_balanced_parens("foo)").unwrap_or(""), ""); assert_eq!(scan_past_balanced_parens("foo).bar").unwrap_or(""), ".bar"); assert_eq!( scan_past_balanced_parens("foo(bar)baz).x").unwrap_or(""), ".x" ); assert!(scan_past_balanced_parens("no-close").is_none()); } #[test] fn non_rust_lang_returns_empty() { use petgraph::Graph; let body = SsaBody { blocks: vec![], entry: BlockId(0), value_defs: vec![], cfg_node_map: std::collections::HashMap::new(), exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), synthetic_externals: std::collections::HashSet::new(), }; let cfg: Cfg = Graph::new(); let const_values = HashMap::new(); let result = analyze(&body, &cfg, Some(Lang::Java), &const_values); assert!(result.is_empty()); } }