mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
3048 lines
118 KiB
Rust
3048 lines
118 KiB
Rust
#![allow(clippy::collapsible_if)]
|
|
|
|
use super::dominators::{self, dominates};
|
|
use super::rules;
|
|
use super::{
|
|
AnalysisContext, BodyConstFacts, CfgAnalysis, CfgFinding, Confidence, is_entry_point_func,
|
|
};
|
|
use crate::callgraph::callee_leaf_name;
|
|
use crate::cfg::StmtKind;
|
|
use crate::labels::{Cap, DataLabel, RuntimeLabelRule};
|
|
use crate::patterns::Severity;
|
|
use crate::ssa::const_prop::ConstLattice;
|
|
use crate::ssa::type_facts::TypeFactResult;
|
|
use crate::ssa::{SsaOp, SsaValue};
|
|
use crate::symbol::Lang;
|
|
use crate::taint::path_state::{PredicateKind, classify_condition};
|
|
use petgraph::graph::NodeIndex;
|
|
use smallvec::SmallVec;
|
|
use std::collections::HashSet;
|
|
|
|
/// Unit marker type for the unguarded-sink check.
///
/// NOTE(review): presumably the `CfgAnalysis` implementor behind the
/// `cfg-unguarded-sink` finding — the impl is not visible in this part of
/// the file; confirm before relying on this description.
pub struct UnguardedSink;
|
|
|
|
/// Check whether **all** arguments to the sink are constants (no taint-capable
|
|
/// variable flows). Extends the inline callee-part check by tracing one hop
|
|
/// through the CFG: if a used variable is defined by a node that itself has
|
|
/// empty `uses` and no Source label, the definition is treated as a constant
|
|
/// binding (e.g. `let cmd = "git"; Command::new(cmd)`). When SSA
|
|
/// [`BodyConstFacts`] are available, falls back to walking the sink's
|
|
/// `SsaOp::Call` operands and consulting `OptimizeResult.const_values` for
|
|
/// any operand the syntactic trace can't classify (e.g. a chained method-call
|
|
/// receiver recorded as a compound identifier rather than a named binding).
|
|
fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
// Fast path: syntactic literal detection from CFG construction.
|
|
// Strictly weaker than the one-hop trace below, serves as an
|
|
// optimization for the common case of inline literal arguments.
|
|
if ctx.cfg[sink].all_args_literal {
|
|
return true;
|
|
}
|
|
let sink_info = &ctx.cfg[sink];
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("");
|
|
// Split callee description into parts and strip parenthesized arg portions.
|
|
// e.g. `exec.Command("echo", "health-ok").Run` → ["exec", "Command", "Run"]
|
|
let callee_parts: Vec<&str> = callee_desc
|
|
.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect();
|
|
// When the callee was overridden by an inner call (e.g. `db.query` inside
|
|
// `Promise.all([db.query(...)])`), the outer callee's parts (e.g. "Promise",
|
|
// "all") also belong to the callee machinery, not to arguments.
|
|
let outer_parts: Vec<&str> = sink_info
|
|
.call
|
|
.outer_callee
|
|
.as_deref()
|
|
.map(|oc| {
|
|
oc.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
|
|
sink_info.taint.uses.iter().all(|u| {
|
|
// Part of the callee name itself → not an argument, skip
|
|
// Check both individual parts and the full dotted callee path
|
|
if callee_parts.contains(&u.as_str())
|
|
|| u == callee_desc
|
|
|| outer_parts.contains(&u.as_str())
|
|
{
|
|
return true;
|
|
}
|
|
// One-hop trace: find the defining node in the same function
|
|
for idx in ctx.cfg.node_indices() {
|
|
let info = &ctx.cfg[idx];
|
|
if info.ast.enclosing_func.as_deref() != sink_func {
|
|
continue;
|
|
}
|
|
if info.taint.defines.as_deref() == Some(u.as_str()) {
|
|
// If the defining node has no uses (pure constant) and is not
|
|
// a Source, the variable is constant.
|
|
if info.taint.uses.is_empty()
|
|
&& !info
|
|
.taint
|
|
.labels
|
|
.iter()
|
|
.any(|l| matches!(l, DataLabel::Source(_)))
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
// Class-level constant scalar: Java `static final TYPE NAME = LIT;`
|
|
// field references are compile-time constants that the per-function
|
|
// CFG one-hop trace can't see (fields live outside any function
|
|
// body) and that SSA const-prop doesn't surface either (the per-
|
|
// function lowering treats the cross-scope reference as a free
|
|
// identifier).
|
|
if let Some(map) = ctx.class_constant_scalars
|
|
&& map.contains_key(u.as_str())
|
|
{
|
|
return true;
|
|
}
|
|
false
|
|
}) || ssa_all_sink_operands_constant(ctx, sink, callee_desc, &callee_parts, &outer_parts)
|
|
}
|
|
|
|
/// SSA-backed fallback for `is_all_args_constant`. Looks up the sink CFG
|
|
/// node in `cfg_node_map`, expects an `SsaOp::Call`, and checks that every
|
|
/// operand (positional args and receiver) either names a callee fragment or
|
|
/// resolves to a concrete `ConstLattice` literal.
|
|
fn ssa_all_sink_operands_constant(
|
|
ctx: &AnalysisContext,
|
|
sink: NodeIndex,
|
|
callee_desc: &str,
|
|
callee_parts: &[&str],
|
|
outer_parts: &[&str],
|
|
) -> bool {
|
|
let Some(facts) = ctx.body_const_facts else {
|
|
return false;
|
|
};
|
|
let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else {
|
|
return false;
|
|
};
|
|
let Some(inst) = find_inst(&facts.ssa, sink_val) else {
|
|
return false;
|
|
};
|
|
let SsaOp::Call { args, receiver, .. } = &inst.op else {
|
|
return false;
|
|
};
|
|
|
|
let operand_const = |v: SsaValue| -> bool {
|
|
ssa_operand_constant(v, facts, callee_desc, callee_parts, outer_parts)
|
|
};
|
|
let args_ok = args
|
|
.iter()
|
|
.all(|group| group.iter().all(|v| operand_const(*v)));
|
|
let receiver_ok = receiver.is_none_or(operand_const);
|
|
args_ok && receiver_ok
|
|
}
|
|
|
|
/// SSA-backed reassign-aware safety probe: every operand of the sink
|
|
/// resolves to a constant, callee fragment, OR a function parameter that
|
|
/// is not itself a Source. Used at the cfg-unguarded-sink site under
|
|
/// `!has_taint`, the taint engine has already proved no source-tainted
|
|
/// data reaches the sink, so a non-source Param at operand position is
|
|
/// inert payload-wise (e.g. HTTP writer in `Fprintf(w, "<h1>", "Guest")`).
|
|
///
|
|
/// Gated on the function body actually exhibiting the reassign-to-constant
|
|
/// signature, at least one named SSA def whose RHS is a literal Const
|
|
/// (`name = "Guest"`). In a thin wrapper without a same-block named
|
|
/// const assignment (`fn wrap(p) { sink(p) }`, or C `popen(buf, "r")` where
|
|
/// `buf` is filled in-place by `sprintf` with no Const Assign on `buf`),
|
|
/// the bare Param at operand position IS the payload and the suppression's
|
|
/// rationale does not apply, `cfg-unguarded-sink` must still fire.
|
|
fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
let Some(facts) = ctx.body_const_facts else {
|
|
return false;
|
|
};
|
|
let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else {
|
|
return false;
|
|
};
|
|
let Some(inst) = find_inst(&facts.ssa, sink_val) else {
|
|
return false;
|
|
};
|
|
let SsaOp::Call { args, receiver, .. } = &inst.op else {
|
|
return false;
|
|
};
|
|
|
|
if !func_body_has_named_const_assign(facts) {
|
|
return false;
|
|
}
|
|
|
|
let operand_safe = |v: SsaValue| -> bool { ssa_operand_const_or_param(v, facts, ctx.cfg) };
|
|
let args_ok = args
|
|
.iter()
|
|
.all(|group| group.iter().all(|v| operand_safe(*v)));
|
|
let receiver_ok = receiver.is_none_or(operand_safe);
|
|
args_ok && receiver_ok
|
|
}
|
|
|
|
/// Return true if the SSA body contains a *named* variable whose definition
|
|
/// is a constant, the SSA signature of an explicit `name = "literal"`
|
|
/// reassignment. Used as the gate for the broader operand-Param suppression:
|
|
/// the suppression's purpose is the reassign-to-constant idiom, which by
|
|
/// definition has at least one named const assignment. In a thin wrapper
|
|
/// (`fn wrap(p) { sink(p) }` or `popen(buf, "r")` where `buf` is filled by
|
|
/// `sprintf`), no such named const assignment exists and the suppression's
|
|
/// rationale doesn't apply, so the bare-Param structural finding fires.
|
|
fn func_body_has_named_const_assign(facts: &BodyConstFacts) -> bool {
|
|
for block in &facts.ssa.blocks {
|
|
for inst in &block.body {
|
|
if inst.var_name.is_none() {
|
|
continue;
|
|
}
|
|
let rhs_const = match &inst.op {
|
|
SsaOp::Const(_) => true,
|
|
SsaOp::Assign(vals) => vals.iter().all(|v| {
|
|
matches!(
|
|
facts.const_values.get(v),
|
|
Some(
|
|
ConstLattice::Str(_)
|
|
| ConstLattice::Int(_)
|
|
| ConstLattice::Bool(_)
|
|
| ConstLattice::Null
|
|
)
|
|
)
|
|
}),
|
|
_ => false,
|
|
};
|
|
if rhs_const {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Variant of [`ssa_operand_constant`] that also accepts non-Source Params.
|
|
/// Stricter than `ssa_operand_constant` on Source (always false) but
|
|
/// looser on bare Params (always true unless they are Source-labeled).
|
|
fn ssa_operand_const_or_param(
|
|
root: SsaValue,
|
|
facts: &BodyConstFacts,
|
|
cfg: &crate::cfg::Cfg,
|
|
) -> bool {
|
|
let mut visited: HashSet<SsaValue> = HashSet::new();
|
|
let mut stack = vec![root];
|
|
while let Some(v) = stack.pop() {
|
|
if !visited.insert(v) {
|
|
continue;
|
|
}
|
|
match facts.const_values.get(&v) {
|
|
Some(ConstLattice::Str(_))
|
|
| Some(ConstLattice::Int(_))
|
|
| Some(ConstLattice::Bool(_))
|
|
| Some(ConstLattice::Null) => continue,
|
|
_ => {}
|
|
}
|
|
let Some(inst) = find_inst(&facts.ssa, v) else {
|
|
return false;
|
|
};
|
|
// CFG-node-level Source label: when an SSA `Call` corresponds to a
|
|
// Source-labeled CFG node (e.g. `env::var(...)` whose callee
|
|
// matches a `LabelRule` Source matcher), the call's result is
|
|
// tainted user input, refuse, regardless of how the SSA
|
|
// happened to lower. Catches the `SsaOp::Call` lowering of
|
|
// labeled Source functions, which the `SsaOp::Source` arm only
|
|
// sees for callee-less pure sources like PHP `$_GET`.
|
|
let cfg_node = inst.cfg_node;
|
|
if cfg
|
|
.node_weight(cfg_node)
|
|
.map(|info| {
|
|
info.taint
|
|
.labels
|
|
.iter()
|
|
.any(|l| matches!(l, DataLabel::Source(_)))
|
|
})
|
|
.unwrap_or(false)
|
|
{
|
|
return false;
|
|
}
|
|
match &inst.op {
|
|
SsaOp::Const(_) => {}
|
|
SsaOp::Assign(vals) => stack.extend(vals.iter().copied()),
|
|
SsaOp::Phi(ops) => stack.extend(ops.iter().map(|(_, v)| *v)),
|
|
SsaOp::Call { args, receiver, .. } => {
|
|
for group in args {
|
|
stack.extend(group.iter().copied());
|
|
}
|
|
if let Some(r) = receiver {
|
|
stack.push(*r);
|
|
}
|
|
}
|
|
SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam => {
|
|
// Bare parameters are accepted: at the call site the
|
|
// taint engine has already concluded no source data
|
|
// reaches this sink (`!has_taint` gate). A Param that
|
|
// is not source-tainted contributes only its caller-
|
|
// bound value, which the gate above already filtered.
|
|
}
|
|
SsaOp::Source => return false,
|
|
SsaOp::Nop | SsaOp::Undef => {}
|
|
// FieldProj: walk the receiver, `obj.f` is constant iff `obj`
|
|
// is constant under the same definition. The field name itself
|
|
// is structural and adds no runtime value.
|
|
SsaOp::FieldProj { receiver, .. } => stack.push(*receiver),
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Return true if this SSA operand is a compile-time-known literal, a callee
|
|
/// fragment pseudo-use (not a real runtime value), or transitively composed
|
|
/// of such operands. Returns false for sources, parameters with non-callee
|
|
/// names, `Varying` const-prop facts, and any unresolved definition.
|
|
fn ssa_operand_constant(
|
|
root: SsaValue,
|
|
facts: &BodyConstFacts,
|
|
callee_desc: &str,
|
|
callee_parts: &[&str],
|
|
outer_parts: &[&str],
|
|
) -> bool {
|
|
let mut visited: HashSet<SsaValue> = HashSet::new();
|
|
let mut stack = vec![root];
|
|
while let Some(v) = stack.pop() {
|
|
if !visited.insert(v) {
|
|
continue;
|
|
}
|
|
match facts.const_values.get(&v) {
|
|
Some(ConstLattice::Str(_))
|
|
| Some(ConstLattice::Int(_))
|
|
| Some(ConstLattice::Bool(_))
|
|
| Some(ConstLattice::Null) => continue,
|
|
Some(ConstLattice::Varying) => {
|
|
// Fall through: a Varying lattice entry may still correspond
|
|
// to a callee-fragment pseudo-name that the SSA models as a
|
|
// Param. The per-op check below filters those out.
|
|
}
|
|
_ => {}
|
|
}
|
|
let Some(inst) = find_inst(&facts.ssa, v) else {
|
|
return false;
|
|
};
|
|
match &inst.op {
|
|
SsaOp::Const(_) => {}
|
|
SsaOp::Assign(vals) => stack.extend(vals.iter().copied()),
|
|
SsaOp::Phi(ops) => stack.extend(ops.iter().map(|(_, v)| *v)),
|
|
SsaOp::Call { args, receiver, .. } => {
|
|
for group in args {
|
|
stack.extend(group.iter().copied());
|
|
}
|
|
if let Some(r) = receiver {
|
|
stack.push(*r);
|
|
}
|
|
}
|
|
SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam | SsaOp::Source => {
|
|
// Only acceptable when the param's `var_name` is a callee
|
|
// fragment, i.e. an identifier that only appears because
|
|
// the CFG recorded name components of the dotted/chained
|
|
// callee as uses. Real parameters and sources are dynamic.
|
|
let name = inst.var_name.as_deref().unwrap_or("");
|
|
if matches!(inst.op, SsaOp::Source) {
|
|
return false;
|
|
}
|
|
if !is_callee_fragment(name, callee_desc, callee_parts, outer_parts) {
|
|
return false;
|
|
}
|
|
}
|
|
SsaOp::Nop => {}
|
|
// Undef is a non-user, non-dynamic sentinel, treat like Const
|
|
// (no additional operands to trace).
|
|
SsaOp::Undef => {}
|
|
// FieldProj: structural field read; constness reduces to the
|
|
// receiver's constness.
|
|
SsaOp::FieldProj { receiver, .. } => stack.push(*receiver),
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Tell whether `name` is merely a piece of the callee text rather than a
/// real argument.
///
/// `name` counts as a fragment when it is:
/// * empty,
/// * one of `callee_parts` or `outer_parts`,
/// * equal to the whole `callee_desc`, or
/// * a strict prefix of `callee_desc` that ends right before a `.` or `::`
///   (a chained-receiver prefix such as `Command::new("sh").arg("-c")` for a
///   callee that continues with `.status().unwrap`).
fn is_callee_fragment(
    name: &str,
    callee_desc: &str,
    callee_parts: &[&str],
    outer_parts: &[&str],
) -> bool {
    let is_known_part = name.is_empty()
        || name == callee_desc
        || callee_parts.contains(&name)
        || outer_parts.contains(&name);
    if is_known_part {
        return true;
    }

    // Receiver-chain prefix: whatever follows `name` inside the callee must
    // begin at a path boundary. An empty remainder cannot match either
    // separator, so only strict prefixes pass.
    callee_desc
        .strip_prefix(name)
        .is_some_and(|rest| rest.starts_with('.') || rest.starts_with("::"))
}
|
|
|
|
/// Locate the instruction that defines SSA value `v`.
///
/// Uses `value_defs` to find the defining block, then searches that block's
/// phis first and its body second for the instruction whose `value` is `v`.
/// Returns `None` when `v` has no `value_defs` entry, the recorded block
/// index is out of range, or no instruction in the block matches.
fn find_inst(ssa: &crate::ssa::SsaBody, v: SsaValue) -> Option<&crate::ssa::SsaInst> {
    let def_site = ssa.value_defs.get(v.0 as usize)?;
    let owner = ssa.blocks.get(def_site.block.0 as usize)?;
    owner
        .phis
        .iter()
        .find(|inst| inst.value == v)
        .or_else(|| owner.body.iter().find(|inst| inst.value == v))
}
|
|
|
|
/// Check whether every operand SSA value of the sink's Call instruction is
|
|
/// proven by type-fact analysis to be non-injectable for `sink_caps`.
|
|
///
|
|
/// Used to suppress `cfg-unguarded-sink` when all arguments are typed safe
|
|
/// (e.g. Rust `port: u16` flowing into `Command::new(…).arg(port.to_string())`).
|
|
/// Returns `false` when any required fact is missing so the structural finding
|
|
/// is preserved whenever typing is ambiguous.
|
|
fn sink_args_typed_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
|
let Some(facts) = ctx.body_const_facts else {
|
|
return false;
|
|
};
|
|
let Some(type_facts) = ctx.type_facts else {
|
|
return false;
|
|
};
|
|
let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else {
|
|
return false;
|
|
};
|
|
let Some(inst) = find_inst(&facts.ssa, sink_val) else {
|
|
return false;
|
|
};
|
|
let SsaOp::Call { args, receiver, .. } = &inst.op else {
|
|
return false;
|
|
};
|
|
|
|
// Chained Rust/JS calls record the whole dotted path as a single Call node.
|
|
// Its SSA operands include pseudo-uses for every identifier segment of the
|
|
// callee (e.g. `Command`, `new`, `arg`, `status`, `unwrap`) plus string
|
|
// literal arguments to intermediate calls. Filter those out so the
|
|
// is-Int check runs only against real argument values.
|
|
let sink_info = &ctx.cfg[sink];
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("");
|
|
let callee_parts: Vec<&str> = callee_desc
|
|
.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect();
|
|
let outer_parts: Vec<&str> = sink_info
|
|
.call
|
|
.outer_callee
|
|
.as_deref()
|
|
.map(|oc| {
|
|
oc.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
let is_real_arg = |v: SsaValue| -> bool {
|
|
let Some(def) = find_inst(&facts.ssa, v) else {
|
|
return true;
|
|
};
|
|
// Callee-fragment pseudo-uses appear as `Param { .. }` with a
|
|
// var_name that is a segment of the callee text. SelfParam and
|
|
// CatchParam cover `self`/exception bindings that cannot be the
|
|
// implicit callee chain.
|
|
match &def.op {
|
|
SsaOp::Param { .. } => {
|
|
let name = def.var_name.as_deref().unwrap_or("");
|
|
!is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts)
|
|
}
|
|
// Constant string literals used as inline args (e.g. `"listener"`,
|
|
// `"-c"`) are not user-controlled, treat as non-real for the
|
|
// "all int-typed" test so they don't block suppression.
|
|
SsaOp::Const(_) => false,
|
|
_ => true,
|
|
}
|
|
};
|
|
|
|
let mut values: Vec<SsaValue> = Vec::new();
|
|
if let Some(r) = receiver {
|
|
if is_real_arg(*r) {
|
|
values.push(*r);
|
|
}
|
|
}
|
|
for group in args {
|
|
for v in group.iter() {
|
|
if is_real_arg(*v) {
|
|
values.push(*v);
|
|
}
|
|
}
|
|
}
|
|
type_facts_suppress(&values, sink_caps, type_facts)
|
|
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when any positional
|
|
/// argument to the sink Call is provably a JPA / Hibernate Criteria query
|
|
/// object ([`crate::ssa::type_facts::TypeKind::JpaCriteriaQuery`]).
|
|
///
|
|
/// Receiver values are deliberately excluded, the receiver of a JPA
|
|
/// query method (`session.createQuery(cq)`, `em.createQuery(cq)`,
|
|
/// `session.executeUpdate(cq)`) is the connection / EntityManager
|
|
/// channel, never the SQL payload. Including the receiver in the type
|
|
/// check would make this suppression unreachable since `Session` /
|
|
/// `EntityManager` values are typed `Object` / `Unknown` and never
|
|
/// `JpaCriteriaQuery` themselves.
|
|
///
|
|
/// Closes the dominant FP cluster across openmrs (169 of 216
|
|
/// cfg-unguarded-sink), xwiki, and keycloak: Hibernate DAO methods
|
|
/// build a `CriteriaQuery<Foo>` via `cb.createQuery(Foo.class)` +
|
|
/// `Root` / `Predicate` API, then hand the query object to
|
|
/// `session.createQuery(cq)` for execution. No string concatenation
|
|
/// happens, JPA emits parameterized SQL by construction.
|
|
fn sink_args_jpa_criteria_query_safe(
|
|
ctx: &AnalysisContext,
|
|
sink: NodeIndex,
|
|
sink_caps: Cap,
|
|
) -> bool {
|
|
if !sink_caps.intersects(Cap::SQL_QUERY) {
|
|
return false;
|
|
}
|
|
let Some(facts) = ctx.body_const_facts else {
|
|
return false;
|
|
};
|
|
let Some(type_facts) = ctx.type_facts else {
|
|
return false;
|
|
};
|
|
let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else {
|
|
return false;
|
|
};
|
|
let Some(inst) = find_inst(&facts.ssa, sink_val) else {
|
|
return false;
|
|
};
|
|
let SsaOp::Call { args, .. } = &inst.op else {
|
|
return false;
|
|
};
|
|
let mut values: Vec<SsaValue> = Vec::new();
|
|
for group in args {
|
|
for v in group.iter() {
|
|
values.push(*v);
|
|
}
|
|
}
|
|
crate::ssa::type_facts::is_safe_query_object_arg(&values, sink_caps, type_facts)
|
|
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the call site is
|
|
/// a zero-positional-argument query-builder execute / create verb.
|
|
///
|
|
/// Doctrine DBAL `QueryBuilder` (`$qb->select(...)->from(...)->executeQuery()`),
|
|
/// JPA / Hibernate `CriteriaBuilder` (`cb.createQuery()` returning the
|
|
/// query-object factory), and any chained-builder pattern share the shape:
|
|
/// the SQL string is bound earlier on the receiver chain via parameterized
|
|
/// API calls (`->select`, `->from`, `->where(... param ...)`), and the
|
|
/// terminal verb that fires on the sink list (`executeQuery`,
|
|
/// `executeStatement`, `executeUpdate`, `createQuery`, `createNativeQuery`)
|
|
/// takes zero positional args, no SQL string ever flows through the call
|
|
/// site itself.
|
|
///
|
|
/// vs. the dangerous flat shape:
|
|
/// `$conn->executeQuery($sql, $params)` — arg 0 carries the SQL string,
|
|
/// the structural finding is correctly preserved.
|
|
///
|
|
/// Restricted to verb names where JDBC / Doctrine / JPA expose a
|
|
/// receiver-built (zero-arg) overload. PHP `stmt.execute` is excluded
|
|
/// because PDOStatement::execute() can be reached via a tainted
|
|
/// `prepare($sql)` chain where the SQL was already built unsafely;
|
|
/// the receiver-side taint check is the only thing that fires there.
|
|
fn sink_is_zero_arg_query_builder(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
|
if !sink_caps.intersects(Cap::SQL_QUERY) {
|
|
return false;
|
|
}
|
|
// Only suppress when the sink's caps are SQL_QUERY-only. Multi-cap
|
|
// sinks may carry a non-SQL injection vector through the same call.
|
|
if sink_caps != Cap::SQL_QUERY {
|
|
return false;
|
|
}
|
|
// Restrict to PHP. Java / Kotlin / JVM langs already cover the
|
|
// safe prepared-statement shape via the `prepareStatement` Sanitizer
|
|
// rule that dominates `pstmt.executeUpdate()` / `pstmt.executeQuery()`
|
|
// at the structural finding site. PHP's Doctrine DBAL `QueryBuilder`
|
|
// and Drupal `Connection::prepareStatement` shapes need explicit
|
|
// structural support because the receiver isn't always sanitized in
|
|
// a way the dominator-Sanitizer scan recognises (chain receiver,
|
|
// closure-captured helper, etc.).
|
|
if ctx.lang != Lang::Php {
|
|
return false;
|
|
}
|
|
let info = &ctx.cfg[sink];
|
|
let callee = match info.call.callee.as_deref() {
|
|
Some(c) => c,
|
|
None => return false,
|
|
};
|
|
let suffix = callee.rsplit('.').next().unwrap_or(callee);
|
|
let is_builder_verb = matches!(suffix, "executeQuery" | "executeStatement" | "createQuery");
|
|
if !is_builder_verb {
|
|
return false;
|
|
}
|
|
// Restrict to receivers that name a known query-builder. The
|
|
// root-receiver text is the leftmost segment of the callee chain;
|
|
// for `$qb->...->executeQuery()` the root is `qb`, for
|
|
// `$deleteQuery->executeStatement()` it is `deleteQuery`, etc.
|
|
// Patterns canvassed from Doctrine DBAL / Drupal Database / Nextcloud
|
|
// dav / lib idioms:
|
|
// * canonical names: qb, query, queryBuilder, builder, q
|
|
// * verb-bound builders: deleteQuery, insertQuery, selectTagQuery,
|
|
// calendarObjectIdQuery, deleteQb, qbDeleteCalendarObjectProps
|
|
// * action-named builders: insert, update, delete, select, upsert,
|
|
// forUpdate, restoreUpdate
|
|
// Receivers named after the SQL connection (`conn`, `connection`,
|
|
// `dbc`, `db`) or entity-manager (`em`, `entityManager`) are
|
|
// excluded since their `executeQuery` / `executeStatement` overloads
|
|
// accept a SQL string arg.
|
|
let root_receiver = match callee.split('.').next() {
|
|
Some(r) if !r.is_empty() => r,
|
|
_ => return false,
|
|
};
|
|
let receiver_lower = root_receiver.to_ascii_lowercase();
|
|
let is_builder_receiver_by_name = receiver_lower == "qb"
|
|
|| receiver_lower == "q"
|
|
|| receiver_lower == "query"
|
|
|| receiver_lower == "querybuilder"
|
|
|| receiver_lower == "builder"
|
|
|| receiver_lower == "insert"
|
|
|| receiver_lower == "update"
|
|
|| receiver_lower == "delete"
|
|
|| receiver_lower == "select"
|
|
|| receiver_lower == "upsert"
|
|
|| receiver_lower.starts_with("qb")
|
|
|| receiver_lower.starts_with("querybuilder")
|
|
|| receiver_lower.ends_with("qb")
|
|
|| receiver_lower.ends_with("query")
|
|
|| receiver_lower.ends_with("builder");
|
|
let is_builder_receiver_by_def = receiver_defined_by_builder_factory(ctx, sink, root_receiver);
|
|
if !is_builder_receiver_by_name && !is_builder_receiver_by_def {
|
|
return false;
|
|
}
|
|
// Once the receiver is proven to be a builder via def-call lookup, the
|
|
// call is the builder-variant of `executeQuery` / `executeStatement`
|
|
// regardless of argument count (Doctrine DBAL `QueryBuilder::executeQuery`
|
|
// accepts only an optional `?Connection`, never a SQL string). When the
|
|
// receiver was identified solely by its NAME, fall back to the byte-level
|
|
// zero-arg check that guards the closure-captured shape so an unfamiliar
|
|
// verb-named local (`$insert = "DROP TABLE..."`-bound mistake) doesn't
|
|
// unconditionally suppress.
|
|
if !is_builder_receiver_by_def && !callee_span_has_zero_args(info, ctx.source_bytes) {
|
|
return false;
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink call's first
|
|
/// positional argument is the result of a Doctrine DBAL safe-SQL accessor —
|
|
/// either `<builder>.getSQL()` (parameterised SQL from a QueryBuilder chain)
|
|
/// or a `Platform::get*SQL(...)` factory (`getTruncateTableSQL`,
|
|
/// `getCreateTableSQL`, etc., which return DDL with no user-controlled bytes).
|
|
///
|
|
/// Two paths:
|
|
/// 1. Direct arg: `arg_callees[0]` names a recognised accessor. Catches
|
|
/// `$conn->executeStatement($builder->getSQL(), ...)` and
|
|
/// `$conn->executeStatement($platform->getTruncateTableSQL('t', false))`.
|
|
/// 2. Indirect via local var: the arg is a bare identifier `$sql` whose
|
|
/// most-recent same-function defining Call has a recognised accessor as
|
|
/// its callee. Catches the migration shape
|
|
/// `$sql = $this->dbc->getDatabasePlatform()->getTruncateTableSQL(...);
|
|
/// $this->dbc->executeStatement($sql);`
|
|
///
|
|
/// PHP-only: other languages have their own builder conventions (Java JPA's
|
|
/// `CriteriaQuery` is already covered by `sink_args_jpa_criteria_query_safe`).
|
|
fn sink_first_arg_is_builder_get_sql(
|
|
ctx: &AnalysisContext,
|
|
sink: NodeIndex,
|
|
sink_caps: Cap,
|
|
) -> bool {
|
|
if !sink_caps.intersects(Cap::SQL_QUERY) {
|
|
return false;
|
|
}
|
|
if sink_caps != Cap::SQL_QUERY {
|
|
return false;
|
|
}
|
|
if ctx.lang != Lang::Php {
|
|
return false;
|
|
}
|
|
let info = &ctx.cfg[sink];
|
|
|
|
// Path 1: direct method-call arg.
|
|
if let Some(Some(arg_callee)) = info.arg_callees.first() {
|
|
let suffix = arg_callee.rsplit('.').next().unwrap_or(arg_callee);
|
|
if is_dbal_safe_sql_accessor(suffix) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// Path 2: bare-identifier arg defined earlier by a recognised accessor.
|
|
// Use `arg_uses[0]` (the first positional argument's identifier set) to
|
|
// pick the candidate variable name. When `arg_uses` is empty (e.g. the
|
|
// arg is a literal, an arithmetic expression, or a complex chain), no
|
|
// back-walk is performed.
|
|
let first_arg_use = info
|
|
.call
|
|
.arg_uses
|
|
.first()
|
|
.and_then(|grp| grp.first())
|
|
.map(|s| s.as_str());
|
|
let var_name = match first_arg_use {
|
|
Some(n) if !n.is_empty() => n,
|
|
_ => return false,
|
|
};
|
|
let sink_func = info.ast.enclosing_func.as_deref();
|
|
let sink_span_start = info.ast.span.0;
|
|
let mut best: Option<(usize, String)> = None;
|
|
for nidx in ctx.cfg.node_indices() {
|
|
let n = &ctx.cfg[nidx];
|
|
if n.kind != crate::cfg::StmtKind::Call {
|
|
continue;
|
|
}
|
|
if n.taint.defines.as_deref() != Some(var_name) {
|
|
continue;
|
|
}
|
|
if n.ast.enclosing_func.as_deref() != sink_func {
|
|
continue;
|
|
}
|
|
let span_start = n.ast.span.0;
|
|
if span_start >= sink_span_start {
|
|
continue;
|
|
}
|
|
let Some(callee) = n.call.callee.as_deref() else {
|
|
continue;
|
|
};
|
|
match best {
|
|
Some((s, _)) if s >= span_start => {}
|
|
_ => best = Some((span_start, callee.to_string())),
|
|
}
|
|
}
|
|
if let Some((_, callee)) = best {
|
|
let suffix = callee.rsplit('.').next().unwrap_or(&callee);
|
|
if is_dbal_safe_sql_accessor(suffix) {
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Recognise Doctrine DBAL safe-SQL accessor method names.
///
/// Accepts `getSQL` (the QueryBuilder accessor) and every `get…SQL` name with
/// a case-sensitive `SQL` suffix, the naming convention of the
/// `Doctrine\DBAL\Platforms\*` DDL builders (`getTruncateTableSQL`,
/// `getCreateTableSQL`, `getDropTableSQL`, …). Those methods receive schema
/// identifiers and emit DBMS-specific DDL rather than user payload.
fn is_dbal_safe_sql_accessor(name: &str) -> bool {
    // After removing the `get` prefix the remainder must still end in `SQL`;
    // the shortest accepted name is therefore `getSQL` itself.
    name.strip_prefix("get")
        .is_some_and(|tail| tail.ends_with("SQL"))
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink's first
|
|
/// positional argument *composes* a Doctrine DBAL safe-SQL accessor with
|
|
/// constant string-shaping ops. Two real-world shapes from nextcloud:
|
|
/// (a) `$conn->executeStatement(preg_replace('/^INSERT/i', 'INSERT IGNORE',
|
|
/// $builder->getSQL()), ...)`
|
|
/// (b) `$conn->executeStatement($builder->getSQL() . ' ON CONFLICT DO
|
|
/// NOTHING', ...)`
|
|
///
|
|
/// Strategy (byte-level, conservative):
|
|
/// 1. Lang-gate to PHP. Cap-gate to SQL_QUERY-only.
|
|
/// 2. Extract the sink's first-positional-arg source bytes by balanced-paren
|
|
/// walk inside the call's `ast.span`, with single/double-quoted-string
|
|
/// awareness.
|
|
/// 3. Scan arg-0 bytes for every PHP variable token `$<name>`. Every var
|
|
/// must be bound by a query-builder factory (`getQueryBuilder` /
|
|
/// `createQueryBuilder` / `*queryBuilder`). Bypasses `arg_uses` because
|
|
/// `collect_idents_with_paths` also surfaces method names (`getSQL`,
|
|
/// `getParameters`) that are not variable references in PHP.
|
|
/// 4. At least one var must appear in arg-0 bytes as the receiver of a DBAL
|
|
/// safe-SQL accessor call (`$<recv>->getSQL(` or `$<recv>->get*SQL(`).
|
|
///
|
|
/// The taint engine has already cleared this flow (gate is `!has_taint`),
|
|
/// so the suppression's job is to silence the structural cfg-unguarded-sink
|
|
/// over-fire on builder-composed SQL. PHP-only.
|
|
fn sink_first_arg_composes_safe_dbal_sql(
|
|
ctx: &AnalysisContext,
|
|
sink: NodeIndex,
|
|
sink_caps: Cap,
|
|
) -> bool {
|
|
if sink_caps != Cap::SQL_QUERY {
|
|
return false;
|
|
}
|
|
if ctx.lang != Lang::Php {
|
|
return false;
|
|
}
|
|
let info = &ctx.cfg[sink];
|
|
let Some(arg0_bytes) = first_positional_arg_bytes(info, ctx.source_bytes) else {
|
|
return false;
|
|
};
|
|
if arg0_bytes.is_empty() {
|
|
return false;
|
|
}
|
|
let vars = extract_php_variables(arg0_bytes);
|
|
if vars.is_empty() {
|
|
return false;
|
|
}
|
|
let mut accessor_seen = false;
|
|
for name in &vars {
|
|
if !receiver_defined_by_builder_factory(ctx, sink, name) {
|
|
return false;
|
|
}
|
|
if arg_bytes_call_dbal_accessor_on(arg0_bytes, name) {
|
|
accessor_seen = true;
|
|
}
|
|
}
|
|
accessor_seen
|
|
}
|
|
|
|
/// Extract the unique PHP variable names that appear as `$<name>` tokens in
/// `bytes`.
///
/// The `$` sigil is not part of the returned names. A name is the longest run
/// of identifier bytes directly after the sigil: ASCII letters, digits, `_`,
/// and — per PHP's identifier grammar — any byte in `0x80..=0xff`. Treating
/// the high bytes as part of the identifier keeps multi-byte names such as
/// `$qbé` intact instead of truncating them to `qb`, which could make two
/// different variables look identical to the caller.
///
/// Names are returned in first-appearance order without duplicates, so a
/// caller that bails on the first unacceptable variable does so
/// deterministically. A `$` that is not followed by an identifier byte yields
/// nothing. Digit-leading tokens such as `$1` are still reported, so callers
/// stay conservative about them.
///
/// A byte run that is not valid UTF-8 is converted lossily rather than
/// dropped: the resulting name will not match any real definition, so the
/// caller rejects instead of silently skipping the variable.
fn extract_php_variables(bytes: &[u8]) -> Vec<String> {
    let is_ident_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;

    let mut names: Vec<String> = Vec::new();
    let mut rest = bytes;
    while let Some(sigil) = rest.iter().position(|&b| b == b'$') {
        let after_sigil = &rest[sigil + 1..];
        let len = after_sigil
            .iter()
            .take_while(|&&b| is_ident_byte(b))
            .count();
        if len > 0 {
            let name = String::from_utf8_lossy(&after_sigil[..len]);
            if !names.iter().any(|known| known.as_str() == &*name) {
                names.push(name.into_owned());
            }
        }
        // Resume right after the identifier, or right after the sigil when
        // no identifier followed it.
        rest = &after_sigil[len..];
    }
    names
}
|
|
|
|
/// Extract the source bytes of the sink call's first positional argument.
|
|
///
|
|
/// Scans `info.ast.span` for the first `(` (outer args opener), then
|
|
/// balance-walks parens with single/double-quoted-string awareness, returning
|
|
/// the slice up to the first depth-1 `,` or the matching closing `)`.
|
|
/// PHP-shaped: handles `'...'` and `"..."` with backslash escapes; ignores
|
|
/// heredoc/nowdoc, which don't appear inside DBAL call-site argument lists
|
|
/// in practice. `callee_span` is intentionally ignored because the upstream
|
|
/// CFG narrowing path may set it to the *whole* call span (e.g. when a
|
|
/// `return $this->conn->executeStatement(...)` is lowered: `inner_text_span`
|
|
/// records the call's span via `first_call_ident_with_span`). Searching
|
|
/// from `ast.span.0` and matching the first `(` is robust across both
|
|
/// direct-call and statement-wrapped shapes.
|
|
///
|
|
/// Returns `None` if no `(` is found or the walk runs off the end of
|
|
/// `ast.span` without closing.
|
|
fn first_positional_arg_bytes<'a>(
|
|
info: &crate::cfg::NodeInfo,
|
|
bytes: &'a [u8],
|
|
) -> Option<&'a [u8]> {
|
|
let span = info.ast.span;
|
|
if span.1 > bytes.len() || span.0 >= span.1 {
|
|
return None;
|
|
}
|
|
let mut i = span.0;
|
|
while i < span.1 && bytes[i] != b'(' {
|
|
i += 1;
|
|
}
|
|
if i >= span.1 {
|
|
return None;
|
|
}
|
|
let arg_start = i + 1;
|
|
let mut j = arg_start;
|
|
let mut depth: i32 = 1;
|
|
let mut quote: Option<u8> = None;
|
|
while j < span.1 {
|
|
let b = bytes[j];
|
|
if let Some(q) = quote {
|
|
if b == b'\\' && j + 1 < span.1 {
|
|
j += 2;
|
|
continue;
|
|
}
|
|
if b == q {
|
|
quote = None;
|
|
}
|
|
j += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => {
|
|
quote = Some(b);
|
|
j += 1;
|
|
}
|
|
b'(' => {
|
|
depth += 1;
|
|
j += 1;
|
|
}
|
|
b')' => {
|
|
depth -= 1;
|
|
if depth == 0 {
|
|
return Some(&bytes[arg_start..j]);
|
|
}
|
|
j += 1;
|
|
}
|
|
b',' if depth == 1 => {
|
|
return Some(&bytes[arg_start..j]);
|
|
}
|
|
_ => j += 1,
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Return true if `arg0` contains a method-call against `recv_name` whose
|
|
/// method matches [`is_dbal_safe_sql_accessor`]. Recognises the PHP
|
|
/// member-access shape `$<recv>-><method>(`. The backward walk stops at
|
|
/// the first non-identifier byte; the immediately preceding byte must be
|
|
/// the `$` sigil so `mybuilder->getSQL` does not match `recv = "builder"`.
|
|
fn arg_bytes_call_dbal_accessor_on(arg0: &[u8], recv_name: &str) -> bool {
|
|
if recv_name.is_empty() {
|
|
return false;
|
|
}
|
|
let recv_bytes = recv_name.as_bytes();
|
|
let mut i = 0usize;
|
|
while i + 1 < arg0.len() {
|
|
if arg0[i] != b'-' || arg0[i + 1] != b'>' {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
// Walk backward to capture the receiver identifier ending at i.
|
|
let mut s = i;
|
|
while s > 0 {
|
|
let c = arg0[s - 1];
|
|
if c.is_ascii_alphanumeric() || c == b'_' {
|
|
s -= 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if s == i || s == 0 || arg0[s - 1] != b'$' || &arg0[s..i] != recv_bytes {
|
|
i += 2;
|
|
continue;
|
|
}
|
|
// Walk forward to capture the method identifier following `->`.
|
|
let mut e = i + 2;
|
|
while e < arg0.len() {
|
|
let c = arg0[e];
|
|
if c.is_ascii_alphanumeric() || c == b'_' {
|
|
e += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
// Must be followed by `(`.
|
|
if e < arg0.len() && arg0[e] == b'(' {
|
|
if let Ok(method) = std::str::from_utf8(&arg0[i + 2..e]) {
|
|
if is_dbal_safe_sql_accessor(method) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
i += 2;
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink's first
|
|
/// positional argument interpolates only PHP variables that are bound by a
|
|
/// `foreach` over a literal-keyed array within the same function body.
|
|
/// Real-world shape from nextcloud `lib/private/DB/MySqlTools.php:27`:
|
|
/// ```php
|
|
/// $variables = ['innodb_file_per_table' => 'ON'];
|
|
/// if (...) { $variables['innodb_file_format'] = 'Barracuda'; }
|
|
/// foreach ($variables as $var => $val) {
|
|
/// $connection->executeQuery("SHOW VARIABLES LIKE '$var'");
|
|
/// }
|
|
/// ```
|
|
/// The foreach-key `$var` ranges over `{innodb_file_per_table,
|
|
/// innodb_file_format, innodb_large_prefix}`, all metachar-free, so the
|
|
/// interpolated SQL is bounded.
|
|
///
|
|
/// Strategy (byte-level, conservative):
|
|
/// 1. Lang-gate to PHP. Cap-gate to SQL_QUERY-only.
|
|
/// 2. Extract the sink's first-positional-arg source bytes; collect every
|
|
/// `$<name>` interpolation token.
|
|
/// 3. For every var, walk the enclosing function bytes. Find the
|
|
/// innermost `foreach ($X as $name => $...)` or `foreach ($X as $name)`
|
|
/// pattern whose body contains the sink span, with `$name` matching
|
|
/// the use site.
|
|
/// 4. Find every assignment of `$X` in the function body. Each must be
|
|
/// either an array literal `['LIT' => 'LIT', ...]` (key-arrow form) or
|
|
/// a subscript-set `$X['LIT'] = 'LIT';`. Every key/value involved
|
|
/// must be metachar-free (alphanumeric + `_`, `-`, `.`).
|
|
/// 5. Whether the use site reads the foreach-key (`$key` slot) or
|
|
/// foreach-value (`$val` slot), the corresponding literal set must be
|
|
/// proven safe.
|
|
///
|
|
/// PHP-only. Limited to the simple foreach + literal-array shape; bare-
|
|
/// reference / by-reference foreach variants and dynamic array sources
|
|
/// fall through to the structural finding.
|
|
fn sink_arg_uses_safe_foreach_key(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
|
if sink_caps != Cap::SQL_QUERY {
|
|
return false;
|
|
}
|
|
if ctx.lang != Lang::Php {
|
|
return false;
|
|
}
|
|
let info = &ctx.cfg[sink];
|
|
let Some(arg0_bytes) = first_positional_arg_bytes(info, ctx.source_bytes) else {
|
|
return false;
|
|
};
|
|
if arg0_bytes.is_empty() {
|
|
return false;
|
|
}
|
|
let vars = extract_php_variables(arg0_bytes);
|
|
if vars.is_empty() {
|
|
return false;
|
|
}
|
|
let Some(func_scope) = enclosing_func_byte_scope(ctx, sink) else {
|
|
return false;
|
|
};
|
|
for name in &vars {
|
|
if !php_var_safe_via_foreach_literal_array(
|
|
ctx.source_bytes,
|
|
func_scope,
|
|
info.ast.span.0,
|
|
name,
|
|
) {
|
|
return false;
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Extent of the enclosing function body. Returns `None` when the sink
|
|
/// has no `enclosing_func` (e.g. file-level top-level statement) or no
|
|
/// matching CFG nodes. The byte range is `(min_span.0, max_span.1)` over
|
|
/// the function's CFG nodes, conservative against multi-statement bodies.
|
|
fn enclosing_func_byte_scope(ctx: &AnalysisContext, sink: NodeIndex) -> Option<(usize, usize)> {
|
|
let sink_func = ctx.cfg[sink].ast.enclosing_func.as_deref()?;
|
|
let mut lo = usize::MAX;
|
|
let mut hi = 0usize;
|
|
for n in ctx.cfg.node_indices() {
|
|
let info = &ctx.cfg[n];
|
|
if info.ast.enclosing_func.as_deref() != Some(sink_func) {
|
|
continue;
|
|
}
|
|
if info.ast.span.0 < lo {
|
|
lo = info.ast.span.0;
|
|
}
|
|
if info.ast.span.1 > hi {
|
|
hi = info.ast.span.1;
|
|
}
|
|
}
|
|
if lo == usize::MAX || hi == 0 || lo >= hi {
|
|
return None;
|
|
}
|
|
Some((lo, hi))
|
|
}
|
|
|
|
/// Walk `source[func_scope]` for `foreach (...)` blocks containing
|
|
/// `sink_span_start` in their body. Match the iteration pattern shape and
|
|
/// (when found) verify every assignment of the iterated identifier in the
|
|
/// function body is a literal-keyed array or a subscript-set with literal
|
|
/// key, with all keys/values metachar-free. Returns true only when *every*
|
|
/// candidate foreach proves safe; bails (returns false) on the first
|
|
/// failure to keep the suppression conservative.
|
|
fn php_var_safe_via_foreach_literal_array(
|
|
source: &[u8],
|
|
func_scope: (usize, usize),
|
|
sink_span_start: usize,
|
|
name: &str,
|
|
) -> bool {
|
|
if name.is_empty() {
|
|
return false;
|
|
}
|
|
if func_scope.0 >= func_scope.1 || func_scope.1 > source.len() {
|
|
return false;
|
|
}
|
|
let scope = &source[func_scope.0..func_scope.1];
|
|
let sink_offset = if sink_span_start >= func_scope.0 {
|
|
sink_span_start - func_scope.0
|
|
} else {
|
|
return false;
|
|
};
|
|
let needle = b"foreach";
|
|
let mut cursor = 0usize;
|
|
let mut matched_any = false;
|
|
while cursor + needle.len() <= scope.len() {
|
|
let Some(rel) = find_subslice(&scope[cursor..], needle) else {
|
|
break;
|
|
};
|
|
let pos = cursor + rel;
|
|
cursor = pos + needle.len();
|
|
// Require word boundary: prev byte (if any) must not be alnum/`_`.
|
|
if pos > 0 {
|
|
let prev = scope[pos - 1];
|
|
if prev.is_ascii_alphanumeric() || prev == b'_' {
|
|
continue;
|
|
}
|
|
}
|
|
// Skip whitespace; require `(`.
|
|
let mut p = pos + needle.len();
|
|
while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') {
|
|
p += 1;
|
|
}
|
|
if p >= scope.len() || scope[p] != b'(' {
|
|
continue;
|
|
}
|
|
// Balanced walk to closing `)`.
|
|
let header_open = p;
|
|
let mut depth = 1i32;
|
|
let mut q = p + 1;
|
|
let mut quote: Option<u8> = None;
|
|
while q < scope.len() && depth > 0 {
|
|
let b = scope[q];
|
|
if let Some(c) = quote {
|
|
if b == b'\\' && q + 1 < scope.len() {
|
|
q += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
quote = None;
|
|
}
|
|
q += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => quote = Some(b),
|
|
b'(' => depth += 1,
|
|
b')' => depth -= 1,
|
|
_ => {}
|
|
}
|
|
q += 1;
|
|
}
|
|
if depth != 0 {
|
|
continue;
|
|
}
|
|
let header_close = q - 1;
|
|
// Skip whitespace; require `{`.
|
|
let mut bp = header_close + 1;
|
|
while bp < scope.len() && matches!(scope[bp], b' ' | b'\t' | b'\n' | b'\r') {
|
|
bp += 1;
|
|
}
|
|
if bp >= scope.len() || scope[bp] != b'{' {
|
|
continue;
|
|
}
|
|
// Balanced walk to closing `}`.
|
|
let body_open = bp;
|
|
let mut bdepth = 1i32;
|
|
let mut bq = bp + 1;
|
|
let mut bquote: Option<u8> = None;
|
|
while bq < scope.len() && bdepth > 0 {
|
|
let b = scope[bq];
|
|
if let Some(c) = bquote {
|
|
if b == b'\\' && bq + 1 < scope.len() {
|
|
bq += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
bquote = None;
|
|
}
|
|
bq += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => bquote = Some(b),
|
|
b'{' => bdepth += 1,
|
|
b'}' => bdepth -= 1,
|
|
_ => {}
|
|
}
|
|
bq += 1;
|
|
}
|
|
if bdepth != 0 {
|
|
continue;
|
|
}
|
|
let body_end = bq - 1;
|
|
// Sink position must lie inside the body.
|
|
if sink_offset < body_open || sink_offset > body_end {
|
|
continue;
|
|
}
|
|
let header = &scope[header_open + 1..header_close];
|
|
let Some((iter_var, key_var, val_var)) = parse_foreach_header(header) else {
|
|
return false;
|
|
};
|
|
let used_as_key = key_var.as_deref() == Some(name);
|
|
let used_as_val = val_var.as_str() == name;
|
|
if !used_as_key && !used_as_val {
|
|
// The use site references some other variable; not bound by
|
|
// this foreach. Continue scanning (might be a nested foreach).
|
|
continue;
|
|
}
|
|
if !php_iter_var_assigns_safe_literals(scope, &iter_var, used_as_key, used_as_val) {
|
|
return false;
|
|
}
|
|
matched_any = true;
|
|
}
|
|
matched_any
|
|
}
|
|
|
|
/// Parse a foreach header text (the bytes between `(` and `)`). Returns
|
|
/// `(iter_var, key_var, value_var)`. Recognises `$X as $V` and
|
|
/// `$X as $K => $V` shapes; bails (returns `None`) on by-reference
|
|
/// (`& $V`), expressions (`call() as $V`), or any unexpected token.
|
|
fn parse_foreach_header(header: &[u8]) -> Option<(String, Option<String>, String)> {
|
|
let text = std::str::from_utf8(header).ok()?.trim();
|
|
let lower = text;
|
|
let as_pos = find_word(lower.as_bytes(), b"as")?;
|
|
let iter_part = lower[..as_pos].trim();
|
|
let body_part = lower[as_pos + 2..].trim();
|
|
let iter_var = parse_simple_var(iter_part)?;
|
|
if body_part.contains("=>") {
|
|
let mut split = body_part.splitn(2, "=>");
|
|
let k = split.next()?.trim();
|
|
let v = split.next()?.trim();
|
|
let key_var = parse_simple_var(k)?;
|
|
let val_var = parse_simple_var(v)?;
|
|
Some((iter_var, Some(key_var), val_var))
|
|
} else {
|
|
let val_var = parse_simple_var(body_part)?;
|
|
Some((iter_var, None, val_var))
|
|
}
|
|
}
|
|
|
|
/// Parse `text` as exactly one `$<name>` token and return `<name>`.
///
/// Surrounding whitespace is ignored. After the `$` sigil the name must be
/// non-empty and consist solely of ASCII alphanumerics and `_`. Anything
/// else — by-reference (`&$x`), splat (`...$x`), member access, list
/// destructuring, trailing tokens — returns `None`, so callers bail
/// conservatively.
fn parse_simple_var(text: &str) -> Option<String> {
    let ident = text.trim().strip_prefix('$')?;
    let well_formed = !ident.is_empty()
        && ident
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'_');
    well_formed.then(|| ident.to_owned())
}
|
|
|
|
/// Find a whole-word match of `word` inside `text`. Word boundaries are
|
|
/// non-alnum/non-`_` bytes (or the buffer edges). Returns the byte offset
|
|
/// of the first match.
|
|
fn find_word(text: &[u8], word: &[u8]) -> Option<usize> {
|
|
let mut cursor = 0usize;
|
|
while cursor + word.len() <= text.len() {
|
|
let rel = find_subslice(&text[cursor..], word)?;
|
|
let pos = cursor + rel;
|
|
let prev_ok = pos == 0 || {
|
|
let p = text[pos - 1];
|
|
!(p.is_ascii_alphanumeric() || p == b'_')
|
|
};
|
|
let next = pos + word.len();
|
|
let next_ok = next == text.len() || {
|
|
let p = text[next];
|
|
!(p.is_ascii_alphanumeric() || p == b'_')
|
|
};
|
|
if prev_ok && next_ok {
|
|
return Some(pos);
|
|
}
|
|
cursor = pos + 1;
|
|
}
|
|
None
|
|
}
|
|
|
|
/// For every assignment of `$<iter_var>` inside `scope` (the enclosing
|
|
/// function bytes), require every key/value referenced is a metachar-free
|
|
/// string literal (alphanumeric, `_`, `-`, `.`, space). Recognises:
|
|
/// * `$<iter_var> = ['LIT' => 'LIT', ...];` (key-arrow array literal)
|
|
/// * `$<iter_var>['LIT'] = 'LIT';` (subscript-set with literal key)
|
|
///
|
|
/// Conservative: any other assignment shape, missing literals, or empty
|
|
/// array set returns false. When `used_as_key` is true, the literal keys
|
|
/// must be safe; when `used_as_val` is true, the literal values must be
|
|
/// safe; both flags can be true at once.
|
|
fn php_iter_var_assigns_safe_literals(
|
|
scope: &[u8],
|
|
iter_var: &str,
|
|
used_as_key: bool,
|
|
used_as_val: bool,
|
|
) -> bool {
|
|
if iter_var.is_empty() {
|
|
return false;
|
|
}
|
|
let needle: Vec<u8> = std::iter::once(b'$').chain(iter_var.bytes()).collect();
|
|
let mut cursor = 0usize;
|
|
let mut saw_init = false;
|
|
while cursor + needle.len() <= scope.len() {
|
|
let Some(rel) = find_subslice(&scope[cursor..], &needle) else {
|
|
break;
|
|
};
|
|
let pos = cursor + rel;
|
|
cursor = pos + 1;
|
|
// Word-boundary on the trailing side: the next byte must not be
|
|
// alnum/`_` (no `$variables_extra`).
|
|
let after = pos + needle.len();
|
|
if after < scope.len() {
|
|
let b = scope[after];
|
|
if b.is_ascii_alphanumeric() || b == b'_' {
|
|
continue;
|
|
}
|
|
}
|
|
// Skip trailing whitespace.
|
|
let mut p = after;
|
|
while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') {
|
|
p += 1;
|
|
}
|
|
if p >= scope.len() {
|
|
continue;
|
|
}
|
|
match scope[p] {
|
|
b'=' => {
|
|
// Direct assignment: `$X = ['k' => 'v', ...];`
|
|
if p + 1 < scope.len() && scope[p + 1] == b'=' {
|
|
continue; // comparison
|
|
}
|
|
if !php_check_array_literal_assignment(scope, p + 1, used_as_key, used_as_val) {
|
|
return false;
|
|
}
|
|
saw_init = true;
|
|
}
|
|
b'['
|
|
// Subscript-set: `$X['LIT'] = 'LIT';`
|
|
if !php_check_subscript_set(scope, p, used_as_key, used_as_val) =>
|
|
{
|
|
return false;
|
|
}
|
|
_ => {
|
|
// Other usage (foreach iter, function arg, member access).
|
|
// Doesn't add to the literal set; allowed as long as no
|
|
// unrecognised assignment shape appears.
|
|
}
|
|
}
|
|
}
|
|
saw_init
|
|
}
|
|
|
|
/// Validate an array-literal assignment after `$X =` (cursor points at
|
|
/// the byte just after `=`). Allowed: optional whitespace, then `[ ... ];`
|
|
/// where every element is `'LIT' => 'LIT'` with metachar-free literals.
|
|
fn php_check_array_literal_assignment(
|
|
scope: &[u8],
|
|
after_eq: usize,
|
|
used_as_key: bool,
|
|
used_as_val: bool,
|
|
) -> bool {
|
|
let mut p = after_eq;
|
|
while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') {
|
|
p += 1;
|
|
}
|
|
if p >= scope.len() || scope[p] != b'[' {
|
|
return false;
|
|
}
|
|
let body_open = p + 1;
|
|
let mut depth = 1i32;
|
|
let mut q = body_open;
|
|
let mut quote: Option<u8> = None;
|
|
while q < scope.len() && depth > 0 {
|
|
let b = scope[q];
|
|
if let Some(c) = quote {
|
|
if b == b'\\' && q + 1 < scope.len() {
|
|
q += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
quote = None;
|
|
}
|
|
q += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => quote = Some(b),
|
|
b'[' => depth += 1,
|
|
b']' => depth -= 1,
|
|
_ => {}
|
|
}
|
|
q += 1;
|
|
}
|
|
if depth != 0 {
|
|
return false;
|
|
}
|
|
let body_close = q - 1;
|
|
let elements = &scope[body_open..body_close];
|
|
php_check_kv_array_literal(elements, used_as_key, used_as_val)
|
|
}
|
|
|
|
/// Walk an array-literal body (between `[` and `]`). Each element must
|
|
/// be `'LIT' => 'LIT'`. All keys/values used by the consumer must be
|
|
/// metachar-free.
|
|
fn php_check_kv_array_literal(elements: &[u8], used_as_key: bool, used_as_val: bool) -> bool {
|
|
if elements.iter().all(|b| b.is_ascii_whitespace()) {
|
|
return false;
|
|
}
|
|
// Split by `,` at depth 0.
|
|
let mut start = 0usize;
|
|
let mut quote: Option<u8> = None;
|
|
let mut depth = 0i32;
|
|
let mut any_pair = false;
|
|
let mut i = 0usize;
|
|
while i < elements.len() {
|
|
let b = elements[i];
|
|
if let Some(c) = quote {
|
|
if b == b'\\' && i + 1 < elements.len() {
|
|
i += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
quote = None;
|
|
}
|
|
i += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => quote = Some(b),
|
|
b'[' | b'(' => depth += 1,
|
|
b']' | b')' => depth -= 1,
|
|
b',' if depth == 0 => {
|
|
if !php_check_arrow_pair(&elements[start..i], used_as_key, used_as_val) {
|
|
return false;
|
|
}
|
|
any_pair = true;
|
|
start = i + 1;
|
|
}
|
|
_ => {}
|
|
}
|
|
i += 1;
|
|
}
|
|
let tail = &elements[start..];
|
|
if tail.iter().any(|b| !b.is_ascii_whitespace()) {
|
|
if !php_check_arrow_pair(tail, used_as_key, used_as_val) {
|
|
return false;
|
|
}
|
|
any_pair = true;
|
|
}
|
|
any_pair
|
|
}
|
|
|
|
/// Validate one `'LIT' => 'LIT'` pair. Both literals must be string
|
|
/// literals (`'...'` or `"..."`) with metachar-free contents per
|
|
/// `is_metachar_free_literal`.
|
|
fn php_check_arrow_pair(pair: &[u8], used_as_key: bool, used_as_val: bool) -> bool {
|
|
let text = std::str::from_utf8(pair).map(str::trim).unwrap_or("");
|
|
let mut split = text.splitn(2, "=>");
|
|
let k = match split.next() {
|
|
Some(s) => s.trim(),
|
|
None => return false,
|
|
};
|
|
let v = match split.next() {
|
|
Some(s) => s.trim(),
|
|
None => return false,
|
|
};
|
|
if used_as_key && !is_metachar_free_string_literal(k.as_bytes()) {
|
|
return false;
|
|
}
|
|
if used_as_val && !is_metachar_free_string_literal(v.as_bytes()) {
|
|
return false;
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Validate a subscript-set assignment `$X[...] = ...;` starting at the
|
|
/// `[` byte. Both the subscript key (when `used_as_key`) and the
|
|
/// assigned value (when `used_as_val`) must be metachar-free string
|
|
/// literals.
|
|
fn php_check_subscript_set(
|
|
scope: &[u8],
|
|
open_bracket: usize,
|
|
used_as_key: bool,
|
|
used_as_val: bool,
|
|
) -> bool {
|
|
let mut depth = 1i32;
|
|
let mut q = open_bracket + 1;
|
|
let mut quote: Option<u8> = None;
|
|
while q < scope.len() && depth > 0 {
|
|
let b = scope[q];
|
|
if let Some(c) = quote {
|
|
if b == b'\\' && q + 1 < scope.len() {
|
|
q += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
quote = None;
|
|
}
|
|
q += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => quote = Some(b),
|
|
b'[' => depth += 1,
|
|
b']' => depth -= 1,
|
|
_ => {}
|
|
}
|
|
q += 1;
|
|
}
|
|
if depth != 0 {
|
|
return false;
|
|
}
|
|
let close_bracket = q - 1;
|
|
let key_bytes = &scope[open_bracket + 1..close_bracket];
|
|
if used_as_key && !is_metachar_free_string_literal(key_bytes.trim_ascii()) {
|
|
return false;
|
|
}
|
|
// Skip whitespace; require `=`, not `==`.
|
|
let mut p = close_bracket + 1;
|
|
while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') {
|
|
p += 1;
|
|
}
|
|
if p >= scope.len() || scope[p] != b'=' {
|
|
return false;
|
|
}
|
|
if p + 1 < scope.len() && scope[p + 1] == b'=' {
|
|
return false;
|
|
}
|
|
// Read the RHS up to the next `;` at depth 0 (no string awareness needed
|
|
// beyond `;` because PHP statement separator).
|
|
let mut q = p + 1;
|
|
let mut quote: Option<u8> = None;
|
|
let mut depth = 0i32;
|
|
while q < scope.len() {
|
|
let b = scope[q];
|
|
if let Some(c) = quote {
|
|
if b == b'\\' && q + 1 < scope.len() {
|
|
q += 2;
|
|
continue;
|
|
}
|
|
if b == c {
|
|
quote = None;
|
|
}
|
|
q += 1;
|
|
continue;
|
|
}
|
|
match b {
|
|
b'\'' | b'"' => quote = Some(b),
|
|
b'(' | b'[' | b'{' => depth += 1,
|
|
b')' | b']' | b'}' => depth -= 1,
|
|
b';' if depth == 0 => break,
|
|
_ => {}
|
|
}
|
|
q += 1;
|
|
}
|
|
let rhs = &scope[p + 1..q];
|
|
if used_as_val && !is_metachar_free_string_literal(rhs.trim_ascii()) {
|
|
return false;
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Report whether `bytes` is exactly one quoted string literal with
/// harmless contents.
///
/// The first and last byte must be the same quote character (`'` or `"`),
/// and the text between them must be non-empty and consist only of ASCII
/// alphanumerics, `_`, `-`, `.` and space. Consequently empty strings,
/// escape sequences, embedded quotes, `$` interpolation, semicolons and any
/// other punctuation or control bytes are all rejected.
fn is_metachar_free_string_literal(bytes: &[u8]) -> bool {
    match bytes {
        [open @ (b'\'' | b'"'), inner @ .., close] if open == close && !inner.is_empty() => inner
            .iter()
            .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.' | b' ')),
        _ => false,
    }
}
|
|
|
|
/// Check whether the source bytes inside the sink's `callee_span` end with a
|
|
/// zero-argument call form: trailing `)` preceded by `(` with only whitespace
|
|
/// in between. Used to identify `qb.executeQuery()` / `qb.executeStatement()`
|
|
/// where the SQL was bound earlier on the receiver chain.
|
|
fn callee_span_has_zero_args(info: &crate::cfg::NodeInfo, bytes: &[u8]) -> bool {
|
|
let span = info.call.callee_span.unwrap_or(info.ast.span);
|
|
if span.0 >= span.1 || span.1 > bytes.len() {
|
|
return false;
|
|
}
|
|
let slice = &bytes[span.0..span.1];
|
|
let mut end = slice.len();
|
|
while end > 0 && matches!(slice[end - 1], b' ' | b'\t' | b'\n' | b'\r') {
|
|
end -= 1;
|
|
}
|
|
if end == 0 || slice[end - 1] != b')' {
|
|
return false;
|
|
}
|
|
end -= 1;
|
|
while end > 0 && matches!(slice[end - 1], b' ' | b'\t' | b'\n' | b'\r') {
|
|
end -= 1;
|
|
}
|
|
end > 0 && slice[end - 1] == b'('
|
|
}
|
|
|
|
/// Detect that `receiver_name` was bound earlier in the same function by a
|
|
/// query-builder factory call. Two paths:
|
|
/// 1. CFG def-call: a same-function Call node defines `receiver_name` with a
|
|
/// callee ending in `getQueryBuilder` / `createQueryBuilder`.
|
|
/// 2. Source-text scan: between the enclosing function's first byte and the
|
|
/// sink's byte offset, the source contains
|
|
/// `$<receiver_name> = ... ->getQueryBuilder(...)` (or `createQueryBuilder`).
|
|
/// Picks up assignment nodes whose CFG kind/callee text doesn't surface a
|
|
/// leaf factory name (multi-line chains, `for`/`try` block nesting,
|
|
/// unusual lowering paths).
|
|
fn receiver_defined_by_builder_factory(
|
|
ctx: &AnalysisContext,
|
|
sink: NodeIndex,
|
|
receiver_name: &str,
|
|
) -> bool {
|
|
if receiver_name.is_empty() {
|
|
return false;
|
|
}
|
|
let sink_info = &ctx.cfg[sink];
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
let sink_span_start = sink_info.ast.span.0;
|
|
|
|
// Path 1: CFG-level def lookup.
|
|
let mut best: Option<(usize, String)> = None;
|
|
for nidx in ctx.cfg.node_indices() {
|
|
let n = &ctx.cfg[nidx];
|
|
if n.kind != crate::cfg::StmtKind::Call {
|
|
continue;
|
|
}
|
|
if n.taint.defines.as_deref() != Some(receiver_name) {
|
|
continue;
|
|
}
|
|
if n.ast.enclosing_func.as_deref() != sink_func {
|
|
continue;
|
|
}
|
|
let span_start = n.ast.span.0;
|
|
if span_start >= sink_span_start {
|
|
continue;
|
|
}
|
|
let Some(callee) = n.call.callee.as_deref() else {
|
|
continue;
|
|
};
|
|
match best {
|
|
Some((s, _)) if s >= span_start => {}
|
|
_ => best = Some((span_start, callee.to_string())),
|
|
}
|
|
}
|
|
if let Some((_, callee)) = best {
|
|
let suffix = callee.rsplit('.').next().unwrap_or(&callee);
|
|
let suffix_lower = suffix.to_ascii_lowercase();
|
|
if matches!(
|
|
suffix_lower.as_str(),
|
|
"getquerybuilder" | "createquerybuilder" | "getqb" | "createqb"
|
|
) || suffix_lower.ends_with("querybuilder")
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// Path 2: source-text scan over the enclosing function's body. Some
|
|
// builder assignments (multi-line chains, deeply nested in `try`/`for`
|
|
// bodies) bind `defines` to a synthesised name that doesn't match
|
|
// `receiver_name` exactly. A direct byte scan for an assignment shape
|
|
// catches these without depending on CFG synthesis details.
|
|
let func_start = ctx
|
|
.cfg
|
|
.node_indices()
|
|
.filter_map(|i| {
|
|
let n = &ctx.cfg[i];
|
|
if n.ast.enclosing_func.as_deref() == sink_func {
|
|
Some(n.ast.span.0)
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
.min()
|
|
.unwrap_or(0);
|
|
let bytes = ctx.source_bytes;
|
|
let lo = func_start.min(bytes.len());
|
|
let hi = sink_span_start.min(bytes.len());
|
|
if lo >= hi {
|
|
return false;
|
|
}
|
|
let scope = &bytes[lo..hi];
|
|
text_contains_builder_factory_assignment(scope, receiver_name)
|
|
}
|
|
|
|
/// Search `scope` for `$<name> = ... <factory>(...)` where `<factory>` ends
|
|
/// with `getQueryBuilder` / `createQueryBuilder` (case-insensitive). Used as a
|
|
/// byte-level fallback for CFG def-lookup that misses multi-line chained
|
|
/// assignments inside nested `try` / `for` bodies.
|
|
fn text_contains_builder_factory_assignment(scope: &[u8], name: &str) -> bool {
|
|
if name.is_empty() {
|
|
return false;
|
|
}
|
|
let needle: Vec<u8> = std::iter::once(b'$').chain(name.bytes()).collect();
|
|
let mut start = 0usize;
|
|
while start + needle.len() <= scope.len() {
|
|
let Some(rel) = find_subslice(&scope[start..], &needle) else {
|
|
return false;
|
|
};
|
|
let mut cursor = start + rel + needle.len();
|
|
// Require an immediate `=` (allow whitespace before).
|
|
while cursor < scope.len() && matches!(scope[cursor], b' ' | b'\t' | b'\n' | b'\r') {
|
|
cursor += 1;
|
|
}
|
|
if cursor < scope.len()
|
|
&& scope[cursor] == b'='
|
|
&& (cursor + 1 == scope.len() || scope[cursor + 1] != b'=')
|
|
{
|
|
// Find the next `;` (statement terminator) without crossing a
|
|
// closing brace boundary, the assignment expression spans up to it.
|
|
let mut end = cursor + 1;
|
|
while end < scope.len() {
|
|
let b = scope[end];
|
|
if b == b';' || b == b'\n' && end + 1 < scope.len() && scope[end + 1] == b'\n' {
|
|
break;
|
|
}
|
|
end += 1;
|
|
}
|
|
let rhs_lower: Vec<u8> = scope[cursor + 1..end]
|
|
.iter()
|
|
.map(|b| b.to_ascii_lowercase())
|
|
.collect();
|
|
if find_subslice(&rhs_lower, b"getquerybuilder").is_some()
|
|
|| find_subslice(&rhs_lower, b"createquerybuilder").is_some()
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
start = start + rel + 1;
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Byte offset of the first occurrence of `needle` in `haystack`.
///
/// Returns `None` when `needle` is empty, longer than `haystack`, or simply
/// not present.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    match needle.len() {
        // `windows(0)` would panic, so the empty needle is handled up front.
        0 => None,
        len if len > haystack.len() => None,
        len => haystack.windows(len).position(|window| window == needle),
    }
}
|
|
|
|
/// Walk the sink's Call SSA arguments and check whether every real argument
|
|
/// resolves through a defining `SsaOp::Call` whose callee carries an SSA
|
|
/// summary with `validated_params_to_return` covering every propagating
|
|
/// parameter slot the caller's argument flows into. When that holds, the
|
|
/// helper validates each argument on every taint-carrying return path, and
|
|
/// the call result is structurally validated even though no syntactic guard
|
|
/// dominates the sink in the caller's body.
|
|
///
|
|
/// Conservative: returns `false` whenever any required fact is missing,
|
|
/// any operand is non-Call-defined and not a constant/parameter, or any
|
|
/// callee summary lacks the validated transform. Real arguments only —
|
|
/// the same `is_real_arg` filter as `sink_args_typed_safe` skips
|
|
/// callee-fragment pseudo-uses and SSA constants.
|
|
fn sink_args_summary_validated_safe(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
// Per-file SSA summary map carries the augment + rerun-pass merges
|
|
// that GlobalSummaries may not yet reflect on single-file scans;
|
|
// fall back to GlobalSummaries when the per-file map isn't threaded
|
|
// through (legacy callers).
|
|
let local_map = ctx.ssa_summaries;
|
|
let global_map = ctx.global_summaries.map(|g| g.snapshot_ssa());
|
|
if local_map.is_none() && global_map.is_none() {
|
|
return false;
|
|
}
|
|
|
|
let sink_info = &ctx.cfg[sink];
|
|
use crate::cfg::StmtKind;
|
|
|
|
// Collect per-arg use names. Prefer `call.arg_uses` (positional, tighter
|
|
// scope), fall back to `taint.uses` minus callee-fragment names when
|
|
// `arg_uses` wasn't extracted (e.g. `await db.execute(sql)` where the
|
|
// CFG saw the await wrapper rather than the underlying call_expression).
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("");
|
|
let callee_parts: Vec<&str> = callee_desc
|
|
.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect();
|
|
let outer_parts: Vec<&str> = sink_info
|
|
.call
|
|
.outer_callee
|
|
.as_deref()
|
|
.map(|oc| {
|
|
oc.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
let mut arg_use_names: Vec<String> = Vec::new();
|
|
if !sink_info.call.arg_uses.is_empty() {
|
|
for group in &sink_info.call.arg_uses {
|
|
for u in group {
|
|
if !arg_use_names.iter().any(|n| n == u) {
|
|
arg_use_names.push(u.clone());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if arg_use_names.is_empty() {
|
|
for u in &sink_info.taint.uses {
|
|
if is_callee_fragment(u, callee_desc, &callee_parts, &outer_parts) {
|
|
continue;
|
|
}
|
|
if !arg_use_names.iter().any(|n| n == u) {
|
|
arg_use_names.push(u.clone());
|
|
}
|
|
}
|
|
}
|
|
if arg_use_names.is_empty() {
|
|
return false;
|
|
}
|
|
|
|
// Match callee text against any SSA summary key registered in
|
|
// GlobalSummaries by leaf name. Conservative: require an exact
|
|
// single-match so ambiguous overloads fall through to the default
|
|
// structural-finding path.
|
|
let lookup_validated = |callee_text: &str| -> Option<bool> {
|
|
let leaf = callee_leaf_name(callee_text);
|
|
let mut matches: Vec<&crate::summary::ssa_summary::SsaFuncSummary> = Vec::new();
|
|
if let Some(map) = local_map {
|
|
for (key, sum) in map {
|
|
if key.name == leaf || key.name == callee_text {
|
|
matches.push(sum);
|
|
}
|
|
}
|
|
}
|
|
if matches.is_empty() {
|
|
if let Some(map) = global_map {
|
|
for (key, sum) in map {
|
|
if key.name == leaf || key.name == callee_text {
|
|
matches.push(sum);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if matches.len() != 1 {
|
|
return None;
|
|
}
|
|
let sum = matches[0];
|
|
if sum.validated_params_to_return.is_empty() {
|
|
return Some(false);
|
|
}
|
|
// Every propagating parameter must be in validated_params_to_return.
|
|
// When the callee doesn't propagate taint at all, the call result
|
|
// cannot carry caller-side taint, so a non-empty validation set is
|
|
// sufficient.
|
|
let propagates = sum
|
|
.param_to_return
|
|
.iter()
|
|
.map(|(idx, _)| *idx)
|
|
.collect::<Vec<usize>>();
|
|
if propagates.is_empty() {
|
|
return Some(true);
|
|
}
|
|
let all_validated = propagates
|
|
.iter()
|
|
.all(|p| sum.validated_params_to_return.contains(p));
|
|
Some(all_validated)
|
|
};
|
|
|
|
// Walk CFG predecessors of `sink` looking for nodes that define an
|
|
// arg-use name via a Call to an in-file helper. Conservative
|
|
// traversal: stops at the body entry, follows Seq/Branch edges,
|
|
// bails out on join/branch back-edges (loops) to keep the analysis
|
|
// bounded.
|
|
let mut to_validate: Vec<String> = arg_use_names.clone();
|
|
let mut visited: HashSet<NodeIndex> = HashSet::new();
|
|
let mut frontier: Vec<NodeIndex> = ctx
|
|
.cfg
|
|
.neighbors_directed(sink, petgraph::Direction::Incoming)
|
|
.collect();
|
|
let mut iter_budget = 256usize;
|
|
while let Some(n) = frontier.pop() {
|
|
if iter_budget == 0 {
|
|
return false;
|
|
}
|
|
iter_budget -= 1;
|
|
if !visited.insert(n) {
|
|
continue;
|
|
}
|
|
let info = &ctx.cfg[n];
|
|
if info.kind == StmtKind::Call {
|
|
if let Some(def_name) = info.taint.defines.as_deref() {
|
|
if let Some(pos) = to_validate.iter().position(|u| u == def_name) {
|
|
let callee = info.call.callee.as_deref().unwrap_or("");
|
|
if !matches!(lookup_validated(callee), Some(true)) {
|
|
return false;
|
|
}
|
|
to_validate.remove(pos);
|
|
if to_validate.is_empty() {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for pred in ctx.cfg.neighbors_directed(n, petgraph::Direction::Incoming) {
|
|
frontier.push(pred);
|
|
}
|
|
}
|
|
// Some arg-use names didn't map to an in-body Call definition (e.g.
|
|
// they bind to a function parameter, an import, or a literal).
|
|
// Only suppress when EVERY tainted-shaped arg has been validated by
|
|
// an in-file helper summary; otherwise fall through.
|
|
to_validate.is_empty()
|
|
}
|
|
|
|
/// Thin wrapper around [`crate::ssa::type_facts::is_type_safe_for_sink`] kept
|
|
/// local so the unit tests here can exercise the exact predicate used at the
|
|
/// `cfg-unguarded-sink` emission site.
|
|
fn type_facts_suppress(values: &[SsaValue], sink_caps: Cap, type_facts: &TypeFactResult) -> bool {
|
|
crate::ssa::type_facts::is_type_safe_for_sink(values, sink_caps, type_facts)
|
|
}
|
|
|
|
/// Suppress a `cfg-unguarded-sink` finding when every real argument SSA
|
|
/// value resolves to a finite set of metacharacter-free literals, as proved
|
|
/// by the static-map analysis. Runs in lock-step with the SSA taint
|
|
/// suppression so both findings paths agree on when a provably-bounded
|
|
/// lookup idiom (e.g. `map.get(x).unwrap_or("safe")` over literal inserts)
|
|
/// should clear a command-injection sink.
|
|
///
|
|
/// Only fires for `Cap::SHELL_ESCAPE`, SQL / path suppression from this
|
|
/// domain would require stronger reasoning (literal keys can still carry
|
|
/// SQL tokens if the inserts themselves contain them).
|
|
fn sink_args_static_map_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
|
if !sink_caps.intersects(Cap::SHELL_ESCAPE) {
|
|
return false;
|
|
}
|
|
let Some(facts) = ctx.body_const_facts else {
|
|
return false;
|
|
};
|
|
let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else {
|
|
return false;
|
|
};
|
|
let Some(inst) = find_inst(&facts.ssa, sink_val) else {
|
|
return false;
|
|
};
|
|
let SsaOp::Call { args, receiver, .. } = &inst.op else {
|
|
return false;
|
|
};
|
|
|
|
let sm =
|
|
crate::ssa::static_map::analyze(&facts.ssa, ctx.cfg, Some(ctx.lang), &facts.const_values);
|
|
if sm.is_empty() {
|
|
return false;
|
|
}
|
|
|
|
// Skip callee-fragment pseudo-uses the same way `sink_args_typed_safe`
|
|
// does so only real runtime arg values participate in the check.
|
|
let sink_info = &ctx.cfg[sink];
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("");
|
|
let callee_parts: Vec<&str> = callee_desc
|
|
.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect();
|
|
let outer_parts: Vec<&str> = sink_info
|
|
.call
|
|
.outer_callee
|
|
.as_deref()
|
|
.map(|oc| {
|
|
oc.split(['.', ':'])
|
|
.map(|p| p.split('(').next().unwrap_or(p))
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
let is_real_arg = |v: SsaValue| -> bool {
|
|
let Some(def) = find_inst(&facts.ssa, v) else {
|
|
return true;
|
|
};
|
|
match &def.op {
|
|
SsaOp::Param { .. } => {
|
|
let name = def.var_name.as_deref().unwrap_or("");
|
|
!is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts)
|
|
}
|
|
SsaOp::Const(_) => false,
|
|
_ => true,
|
|
}
|
|
};
|
|
|
|
let mut values: Vec<SsaValue> = Vec::new();
|
|
if let Some(r) = receiver {
|
|
if is_real_arg(*r) {
|
|
values.push(*r);
|
|
}
|
|
}
|
|
for group in args {
|
|
for v in group.iter() {
|
|
if is_real_arg(*v) {
|
|
values.push(*v);
|
|
}
|
|
}
|
|
}
|
|
if values.is_empty() {
|
|
return false;
|
|
}
|
|
values.iter().all(|v| match sm.finite_string_values.get(v) {
|
|
Some(set) if !set.is_empty() => set
|
|
.iter()
|
|
.all(|s| crate::abstract_interp::string_domain::is_shell_safe_literal(s)),
|
|
_ => false,
|
|
})
|
|
}
|
|
|
|
/// Check if a callee matches any of the runtime label rules that are sanitizers.
|
|
fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option<Cap> {
|
|
// Lazily compute lowercased callee only when a case-insensitive rule is hit.
|
|
let mut callee_lower: Option<String> = None;
|
|
|
|
for rule in extra {
|
|
let cap = match rule.label {
|
|
DataLabel::Sanitizer(c) => c,
|
|
_ => continue,
|
|
};
|
|
for m in &rule.matchers {
|
|
if rule.case_sensitive {
|
|
if m.ends_with('_') {
|
|
if callee.starts_with(m.as_str()) {
|
|
return Some(cap);
|
|
}
|
|
} else if callee.ends_with(m.as_str()) {
|
|
return Some(cap);
|
|
}
|
|
} else {
|
|
let cl = callee_lower.get_or_insert_with(|| callee.to_ascii_lowercase());
|
|
let ml = m.to_ascii_lowercase();
|
|
if ml.ends_with('_') {
|
|
if cl.starts_with(&ml) {
|
|
return Some(cap);
|
|
}
|
|
} else if cl.ends_with(&ml) {
|
|
return Some(cap);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Resolve the `if (X)` / `if (!X)` indirect-validator pattern: the
|
|
/// condition has exactly one bare-identifier variable whose defining
|
|
/// CFG node is a [`StmtKind::Call`] whose `defines` is the same name
|
|
/// and whose `callee` is recognised by
|
|
/// [`crate::ssa::type_facts::classify_input_validator_callee`].
|
|
///
|
|
/// Returns the validator callee name when the pattern matches, `None`
|
|
/// otherwise. Conservative: bails when the condition has zero or more
|
|
/// than one variable, when no defining call is found, or when the
|
|
/// callee doesn't match a validator pattern. Mirrors the SSA
|
|
/// branch-narrowing layer
|
|
/// ([`crate::taint::ssa_transfer::apply_input_validator_branch_narrowing`])
|
|
/// so the structural `cfg-unguarded-sink` suppression matches the
|
|
/// taint engine's validator recognition.
|
|
///
|
|
/// Driven off CFG `TaintMeta.defines` rather than the per-body SSA
|
|
/// value-defs because nested arrow-function bodies are sometimes
|
|
/// lowered with empty SSA in the cfg-analysis context, but the CFG
|
|
/// nodes themselves carry `defines` in every body.
|
|
fn cond_indirect_validator_callee(
|
|
info: &crate::cfg::NodeInfo,
|
|
ctx: &AnalysisContext,
|
|
) -> Option<String> {
|
|
if info.condition_vars.len() != 1 {
|
|
return None;
|
|
}
|
|
let var_name = info.condition_vars[0].as_str();
|
|
let cond_func = info.ast.enclosing_func.as_deref();
|
|
let cond_span_start = info.ast.span.0;
|
|
|
|
// Walk the CFG for any node that DEFINES `var_name` via a Call
|
|
// expression. Same-function only, and only consider definitions
|
|
// textually before the condition: a reassignment after the `if`
|
|
// cannot be the def reaching it. Among the eligible defs, take
|
|
// the textually-last one (highest span start), a conservative
|
|
// latest-def proxy without paying for full dominator analysis.
|
|
let mut best: Option<(usize, &str)> = None;
|
|
for nidx in ctx.cfg.node_indices() {
|
|
let n = &ctx.cfg[nidx];
|
|
if n.kind != crate::cfg::StmtKind::Call {
|
|
continue;
|
|
}
|
|
if n.taint.defines.as_deref() != Some(var_name) {
|
|
continue;
|
|
}
|
|
if n.ast.enclosing_func.as_deref() != cond_func {
|
|
continue;
|
|
}
|
|
let span_start = n.ast.span.0;
|
|
if span_start >= cond_span_start {
|
|
continue;
|
|
}
|
|
let Some(callee) = n.call.callee.as_deref() else {
|
|
continue;
|
|
};
|
|
match best {
|
|
Some((s, _)) if s >= span_start => {}
|
|
_ => best = Some((span_start, callee)),
|
|
}
|
|
}
|
|
let (_, callee) = best?;
|
|
|
|
crate::ssa::type_facts::classify_input_validator_callee(callee).map(|_| callee.to_string())
|
|
}
|
|
|
|
/// Find all nodes in the CFG that are calls to guard functions.
|
|
fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
|
|
let guard_rules = rules::guard_rules(ctx.lang);
|
|
let config_rules = ctx
|
|
.analysis_rules
|
|
.map(|r| r.extra_labels.as_slice())
|
|
.unwrap_or(&[]);
|
|
let mut result = Vec::new();
|
|
|
|
for idx in ctx.cfg.node_indices() {
|
|
let info = &ctx.cfg[idx];
|
|
|
|
// If-condition guards: allowlist checks, type checks, validation
|
|
// calls, shell-metachar rejections, and bounded-length checks in
|
|
// branch conditions act as guards for downstream sinks.
|
|
if info.kind == StmtKind::If {
|
|
if let Some(cond_text) = &info.condition_text {
|
|
let kind = classify_condition(cond_text);
|
|
// For `AllowlistCheck`, also confirm a target identifier was
|
|
// extractable. When the receiver-method form carries a
|
|
// string-literal arg (`filePath.includes("/")`,
|
|
// `path.contains("..")`), `extract_allowlist_target` returns
|
|
// `None` because the argument isn't an identifier. Those
|
|
// shapes are presence-checks, not real allowlist tests against
|
|
// a collection variable, and shouldn't dominate every
|
|
// downstream sink as a structural guard with `Cap::all()`.
|
|
// `classify_condition` itself stays unchanged (an existing
|
|
// test locks in its broad return for the receiver-method form,
|
|
// and the SSA branch-narrowing layer reads the kind for its
|
|
// own purposes).
|
|
let allowlist_has_target = if kind == PredicateKind::AllowlistCheck {
|
|
crate::taint::path_state::classify_condition_with_target(cond_text)
|
|
.1
|
|
.is_some()
|
|
} else {
|
|
true
|
|
};
|
|
if matches!(
|
|
kind,
|
|
PredicateKind::TypeCheck | PredicateKind::ValidationCall,
|
|
) || (kind == PredicateKind::AllowlistCheck && allowlist_has_target)
|
|
{
|
|
result.push((idx, Cap::all()));
|
|
} else if cond_indirect_validator_callee(info, ctx).is_some() {
|
|
// Indirect-validator pattern:
|
|
// const err = validate(x); if (err) throw …;
|
|
// const ok = isValid(x); if (!ok) throw …;
|
|
// The classifier returns Unknown / NullCheck / ErrorCheck
|
|
// because the if-condition is a bare result variable, not
|
|
// a direct call expression. `cond_indirect_validator_callee`
|
|
// handles that by scanning the CFG for nodes whose
|
|
// `TaintMeta.defines` matches the condition variable and
|
|
// checking whether any defining Call has an
|
|
// `is_input_validator_callee`-recognised callee. This keeps
|
|
// cfg-unguarded-sink suppression aligned with the same
|
|
// structural validator recognition the SSA branch-narrowing
|
|
// layer uses, without requiring the condition itself to be
|
|
// a direct call expression.
|
|
//
|
|
// Motivated by Novu CVE GHSA-4x48-cgf9-q33f.
|
|
result.push((idx, Cap::all()));
|
|
} else if matches!(
|
|
kind,
|
|
PredicateKind::ShellMetaValidated | PredicateKind::BoundedLength
|
|
) {
|
|
// Shell-metachar rejection and bounded-length checks only
|
|
// guard shell-family sinks. Keep scope tight so unrelated
|
|
// sinks (SQL, XSS) aren't silenced when a shell gate
|
|
// happens to sit upstream.
|
|
result.push((idx, Cap::SHELL_ESCAPE | Cap::CODE_EXEC));
|
|
} else {
|
|
// Path-traversal rejection guard. When the condition
|
|
// matches a path-rejection idiom recognised by
|
|
// `classify_path_rejection_axes` (`strstr(p, "..")`
|
|
// / `.contains("..")` / `strings.Contains(p, "..")`
|
|
// / `p[0] == '/'` / `path.is_absolute()` / etc.),
|
|
// it acts as a guard for FILE_IO sinks. Catches
|
|
// the C/C++ `if (strstr(p, "..") != NULL)` shape
|
|
// whose `!= NULL` wrapper otherwise falls through
|
|
// to NullCheck classification and never registers
|
|
// as a guard. Scope kept to FILE_IO so unrelated
|
|
// sinks aren't silenced.
|
|
let axes = crate::abstract_interp::path_domain::classify_path_rejection_axes(
|
|
cond_text,
|
|
);
|
|
if !axes.is_empty() {
|
|
result.push((idx, Cap::FILE_IO));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if info.kind != StmtKind::Call {
|
|
continue;
|
|
}
|
|
if let Some(callee) = &info.call.callee {
|
|
// Check config sanitizer rules first
|
|
if let Some(cap) = match_config_sanitizer(callee, config_rules) {
|
|
result.push((idx, cap));
|
|
continue;
|
|
}
|
|
|
|
// Then check built-in guard rules
|
|
let callee_lower = callee.to_ascii_lowercase();
|
|
for rule in guard_rules {
|
|
let matched = rule.matchers.iter().any(|m| {
|
|
let ml = m.to_ascii_lowercase();
|
|
if ml.ends_with('_') {
|
|
callee_lower.starts_with(&ml)
|
|
} else {
|
|
callee_lower.ends_with(&ml)
|
|
}
|
|
});
|
|
if matched {
|
|
result.push((idx, rule.applies_to_sink_caps));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Check whether taint analysis confirmed unsanitized flow to this sink node.
|
|
fn taint_confirms_sink(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
ctx.taint_findings.iter().any(|f| f.sink == sink)
|
|
}
|
|
|
|
/// Check whether any variable used by the sink is directly derived from a
|
|
/// Source node in the same function (via simple def-use chain).
|
|
fn sink_arg_is_source_derived(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
let sink_info = &ctx.cfg[sink];
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
|
|
// Collect all variables the sink reads
|
|
let sink_uses = &sink_info.taint.uses;
|
|
if sink_uses.is_empty() {
|
|
return false;
|
|
}
|
|
|
|
// Walk all nodes in the same function looking for Source nodes that define
|
|
// one of the variables the sink uses.
|
|
for idx in ctx.cfg.node_indices() {
|
|
let info = &ctx.cfg[idx];
|
|
if info.ast.enclosing_func.as_deref() != sink_func {
|
|
continue;
|
|
}
|
|
if !info
|
|
.taint
|
|
.labels
|
|
.iter()
|
|
.any(|l| matches!(l, DataLabel::Source(_)))
|
|
{
|
|
continue;
|
|
}
|
|
// Source node defines a variable that the sink reads → source-derived
|
|
if let Some(def) = &info.taint.defines
|
|
&& sink_uses.iter().any(|u| u == def)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Check whether the sink's arguments are *only* function parameters
|
|
/// (i.e. this function is a thin wrapper around the sink).
|
|
fn sink_arg_is_parameter_only(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
let sink_info = &ctx.cfg[sink];
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
|
|
let sink_uses = &sink_info.taint.uses;
|
|
if sink_uses.is_empty() {
|
|
// No identifiable arguments, could be a constant call like Command::new("ls")
|
|
return true; // treat as non-dangerous (constant arg)
|
|
}
|
|
|
|
// Collect parameter names for the enclosing function from FuncSummaries
|
|
let param_names: Vec<&str> = ctx
|
|
.func_summaries
|
|
.values()
|
|
.filter(|s| {
|
|
// Match by function entry being in the same function
|
|
ctx.cfg[s.entry].ast.enclosing_func.as_deref() == sink_func
|
|
})
|
|
.flat_map(|s| s.param_names.iter().map(|p| p.as_str()))
|
|
.collect();
|
|
|
|
if param_names.is_empty() {
|
|
return false; // can't determine params
|
|
}
|
|
|
|
// The sink's `taint.uses` includes pseudo-uses for callee-chain segments
|
|
// when the chain is rooted at a self-pseudo-receiver (`this`, `self`,
|
|
// `static`, `parent`). In that case every segment of the chain is part
|
|
// of the dotted callee path that tree-sitter records as identifier
|
|
// children of the call expression, not a real argument. This shape
|
|
// covers thin method wrappers like
|
|
// `function wrap($sql) { return $this->inner->execute($sql); }` so the
|
|
// sink is recognised as parameter-only despite `this` / `inner` /
|
|
// `execute` showing up in `taint.uses`.
|
|
//
|
|
// For other callee chains (e.g. Python `cursor.execute(name)` where
|
|
// `cursor` is a local variable from `connection.cursor()`), only the
|
|
// method name itself (`execute`) is filtered. `cursor` is a real
|
|
// identifier value — a non-param local — and must not be filtered,
|
|
// otherwise wrappers around external receivers get suppressed
|
|
// incorrectly.
|
|
//
|
|
// PHP variable receivers carry a leading `$` (`$this->inner->execute`)
|
|
// and use `->` between the receiver and member, so split on the full
|
|
// set of separators and strip a leading `$` so identifier-shaped
|
|
// fragments line up with bare identifier names in `taint.uses`.
|
|
//
|
|
// Each segment carries an `is_call` flag so chain pieces that are
|
|
// themselves method invocations (`getSession()` in
|
|
// `getSession().createQuery(qs)`) can be recognised as pseudo-uses
|
|
// alongside the terminal method name. Variable-receiver chains like
|
|
// `cursor.execute(name)` keep `cursor` as a real identifier and stay
|
|
// out of the param-only filter.
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("");
|
|
let outer_callee = sink_info.call.outer_callee.as_deref().unwrap_or("");
|
|
fn split_chain_with_flags(s: &str) -> SmallVec<[(&str, bool); 8]> {
|
|
let mut out: SmallVec<[(&str, bool); 8]> = SmallVec::new();
|
|
for piece in s.split(['.', ':', '>', '-']) {
|
|
let stripped = piece.trim_start_matches('$').trim();
|
|
if stripped.is_empty() {
|
|
continue;
|
|
}
|
|
let (name, is_call) = match stripped.find('(') {
|
|
Some(idx) => (stripped[..idx].trim(), true),
|
|
None => (stripped, false),
|
|
};
|
|
if !name.is_empty() {
|
|
out.push((name, is_call));
|
|
}
|
|
}
|
|
out
|
|
}
|
|
fn is_self_root(seg: &str) -> bool {
|
|
matches!(seg, "this" | "self" | "static" | "parent" | "cls")
|
|
}
|
|
let mut callee_fragments: SmallVec<[&str; 8]> = SmallVec::new();
|
|
for src in [callee_desc, outer_callee] {
|
|
let segs = split_chain_with_flags(src);
|
|
let Some(&(first_name, _)) = segs.first() else {
|
|
continue;
|
|
};
|
|
let last_idx = segs.len() - 1;
|
|
if is_self_root(first_name) {
|
|
// Whole chain is callee path: `$this->inner->execute` →
|
|
// every segment is a pseudo-use.
|
|
for &(name, _) in &segs {
|
|
if !callee_fragments.contains(&name) {
|
|
callee_fragments.push(name);
|
|
}
|
|
}
|
|
} else {
|
|
// The terminal method name is a pseudo-use. Any non-last
|
|
// segment that is itself a method call (`getSession()` in
|
|
// `getSession().createQuery(qs)`) is also a pseudo-use, since
|
|
// the segment text in the chain refers to a method name, not
|
|
// a local variable. Bare-identifier receivers like `cursor`
|
|
// in `cursor.execute(name)` carry no `(` and stay as real
|
|
// local-variable values.
|
|
for (i, &(name, is_call)) in segs.iter().enumerate() {
|
|
if (is_call || i == last_idx) && !callee_fragments.contains(&name) {
|
|
callee_fragments.push(name);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Source-text scan: `callee_desc` collapses chains via `root_receiver_text`,
|
|
// so `getSession().getCriteriaBuilder().createQuery(qs)` reduces to
|
|
// `"getSession().createQuery"` and the intermediate `getCriteriaBuilder`
|
|
// is missing. Walk the sink's source bytes up to the outermost args
|
|
// opener and lift every `IDENT(` pattern as a method-call pseudo-use.
|
|
// Identifiers nested inside earlier `()` groups (which open at depth 0
|
|
// for sibling method calls in a chain) are picked up too, so every
|
|
// chain hop contributes its method name.
|
|
let span = sink_info.classification_span();
|
|
let (start, end) = span;
|
|
if start < ctx.source_bytes.len() && end <= ctx.source_bytes.len() && start < end {
|
|
let span_bytes = &ctx.source_bytes[start..end];
|
|
if let Ok(span_text) = std::str::from_utf8(span_bytes) {
|
|
let bytes = span_text.as_bytes();
|
|
// Find the outermost args-opener: the last `(` at depth 0.
|
|
let mut depth: i32 = 0;
|
|
let mut last_open_at_zero: Option<usize> = None;
|
|
for (i, &b) in bytes.iter().enumerate() {
|
|
match b {
|
|
b'(' => {
|
|
if depth == 0 {
|
|
last_open_at_zero = Some(i);
|
|
}
|
|
depth += 1;
|
|
}
|
|
b')' => {
|
|
depth = depth.saturating_sub(1);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
let chain_end = last_open_at_zero.unwrap_or(bytes.len());
|
|
// Walk the chain prefix and lift every identifier directly followed
|
|
// by `(` as a method-call pseudo-use.
|
|
let mut i = 0;
|
|
while i < chain_end {
|
|
let b = bytes[i];
|
|
let is_ident_start = b.is_ascii_alphabetic() || b == b'_';
|
|
if !is_ident_start {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
let id_start = i;
|
|
while i < chain_end {
|
|
let c = bytes[i];
|
|
if c.is_ascii_alphanumeric() || c == b'_' {
|
|
i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if i < chain_end && bytes[i] == b'(' {
|
|
let name = &span_text[id_start..i];
|
|
if !callee_fragments.contains(&name) {
|
|
callee_fragments.push(name);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Strict parameter set scoped to the sink's enclosing function only.
|
|
// Used for the local-trace fallback below to prevent over-suppression
|
|
// when sibling functions in the same file happen to share param names
|
|
// with the current scope (e.g. a constructor's `dbConn` param leaking
|
|
// into the `param_names` view of an unrelated `logAuditEvent` body).
|
|
// The existing broad `param_names` view is preserved for the direct
|
|
// in-list check above so legacy suppression behaviour is unchanged.
|
|
let strict_param_names: SmallVec<[&str; 8]> = ctx
|
|
.func_summaries
|
|
.iter()
|
|
.filter(|(key, _)| sink_func.is_some_and(|name| key.name.as_str() == name))
|
|
.flat_map(|(_, s)| s.param_names.iter().map(|p| p.as_str()))
|
|
.collect();
|
|
sink_uses.iter().all(|u| {
|
|
if callee_fragments.contains(&u.as_str()) || u == callee_desc {
|
|
return true;
|
|
}
|
|
if param_names.contains(&u.as_str()) {
|
|
return true;
|
|
}
|
|
// One-hop transitive local trace: when a sink use names a body
|
|
// local whose every definition resolves to parameter-derived
|
|
// data (e.g. `Statement stmt = connection.createStatement();
|
|
// stmt.executeQuery(sql);` where `connection` is a param), the
|
|
// local is wrapper plumbing. Receiver-variable shapes whose
|
|
// definitions reach a free (non-param, non-local) identifier or
|
|
// a Source label fail the trace and keep the structural finding.
|
|
if strict_param_names.is_empty() {
|
|
return false;
|
|
}
|
|
let mut seen: SmallVec<[&str; 4]> = SmallVec::new();
|
|
local_is_param_derived(
|
|
ctx,
|
|
sink_func,
|
|
&strict_param_names,
|
|
&callee_fragments,
|
|
u.as_str(),
|
|
3,
|
|
&mut seen,
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Recursive trace, return true iff every definition of `name` inside
|
|
/// `sink_func` has its right-hand-side fully resolvable to parameter
|
|
/// names, callee fragments, or other already-cleared body locals. Bounded
|
|
/// by `depth` to prevent runaway on pathological CFGs and uses `seen` to
|
|
/// short-circuit cycles (a local whose definition mentions itself does
|
|
/// not clear). Called from `sink_arg_is_parameter_only` once the simple
|
|
/// param / callee-fragment / source-text check has failed.
|
|
fn local_is_param_derived<'a>(
|
|
ctx: &'a AnalysisContext,
|
|
sink_func: Option<&str>,
|
|
param_names: &[&'a str],
|
|
callee_fragments: &[&'a str],
|
|
name: &'a str,
|
|
depth: u8,
|
|
seen: &mut SmallVec<[&'a str; 4]>,
|
|
) -> bool {
|
|
if depth == 0 || seen.contains(&name) {
|
|
return false;
|
|
}
|
|
seen.push(name);
|
|
let mut found_def = false;
|
|
let mut all_def_clear = true;
|
|
for idx in ctx.cfg.node_indices() {
|
|
let info = &ctx.cfg[idx];
|
|
if info.ast.enclosing_func.as_deref() != sink_func {
|
|
continue;
|
|
}
|
|
if info.taint.defines.as_deref() != Some(name) {
|
|
continue;
|
|
}
|
|
found_def = true;
|
|
if info
|
|
.taint
|
|
.labels
|
|
.iter()
|
|
.any(|l| matches!(l, DataLabel::Source(_)))
|
|
{
|
|
all_def_clear = false;
|
|
break;
|
|
}
|
|
// Compute the defining node's own callee fragments so method-name
|
|
// segments (e.g. `createStatement` in `statement =
|
|
// connection.createStatement();`) are recognised as pseudo-uses
|
|
// alongside the receiver variable. Without this, the trace
|
|
// wrongly rejects every chained method initialisation. The
|
|
// source-text scan below also lifts intermediate method calls
|
|
// (`unwrap` in `connection.unwrap().createStatement`) that the
|
|
// collapsed `info.call.callee` drops.
|
|
let def_fragments = chain_callee_fragments_with_text(
|
|
info.call.callee.as_deref().unwrap_or(""),
|
|
info.call.outer_callee.as_deref().unwrap_or(""),
|
|
ctx.source_bytes,
|
|
info.classification_span(),
|
|
);
|
|
let clear = info.taint.uses.iter().all(|u| {
|
|
param_names.contains(&u.as_str())
|
|
|| callee_fragments.contains(&u.as_str())
|
|
|| def_fragments.contains(&u.as_str())
|
|
|| local_is_param_derived(
|
|
ctx,
|
|
sink_func,
|
|
param_names,
|
|
callee_fragments,
|
|
u.as_str(),
|
|
depth - 1,
|
|
seen,
|
|
)
|
|
});
|
|
if !clear {
|
|
all_def_clear = false;
|
|
break;
|
|
}
|
|
}
|
|
seen.pop();
|
|
found_def && all_def_clear
|
|
}
|
|
|
|
/// Split a callee chain like `getSession().createQuery` or
|
|
/// `connection.createStatement` into method-name segments treated as
|
|
/// pseudo-uses. Also walks `source_bytes[span]` up to the outermost
|
|
/// args-opener and lifts every `IDENT(` pattern, recovering intermediate
|
|
/// method-call segments that the collapsed `info.call.callee` text drops
|
|
/// (e.g. `unwrap` in `connection.unwrap().createStatement()`). Mirrors
|
|
/// the in-place chain split inside `sink_arg_is_parameter_only` so trace
|
|
/// nodes get the same recognition as the sink itself. Self-rooted
|
|
/// chains (`this->...`, `self.foo`) surface every segment; other chains
|
|
/// surface only the terminal method name plus any inner method-call
|
|
/// segments.
|
|
fn chain_callee_fragments_with_text<'a>(
|
|
callee: &'a str,
|
|
outer: &'a str,
|
|
source_bytes: &'a [u8],
|
|
span: (usize, usize),
|
|
) -> SmallVec<[&'a str; 8]> {
|
|
fn split_chain<'b>(s: &'b str) -> SmallVec<[(&'b str, bool); 8]> {
|
|
let mut out: SmallVec<[(&'b str, bool); 8]> = SmallVec::new();
|
|
for piece in s.split(['.', ':', '>', '-']) {
|
|
let stripped = piece.trim_start_matches('$').trim();
|
|
if stripped.is_empty() {
|
|
continue;
|
|
}
|
|
let (name, is_call) = match stripped.find('(') {
|
|
Some(idx) => (stripped[..idx].trim(), true),
|
|
None => (stripped, false),
|
|
};
|
|
if !name.is_empty() {
|
|
out.push((name, is_call));
|
|
}
|
|
}
|
|
out
|
|
}
|
|
fn is_self_root(seg: &str) -> bool {
|
|
matches!(seg, "this" | "self" | "static" | "parent" | "cls")
|
|
}
|
|
let mut frags: SmallVec<[&str; 8]> = SmallVec::new();
|
|
for src in [callee, outer] {
|
|
let segs = split_chain(src);
|
|
let Some(&(first_name, _)) = segs.first() else {
|
|
continue;
|
|
};
|
|
let last_idx = segs.len() - 1;
|
|
if is_self_root(first_name) {
|
|
for &(name, _) in &segs {
|
|
if !frags.contains(&name) {
|
|
frags.push(name);
|
|
}
|
|
}
|
|
} else {
|
|
for (i, &(name, is_call)) in segs.iter().enumerate() {
|
|
if (is_call || i == last_idx) && !frags.contains(&name) {
|
|
frags.push(name);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
let (start, end) = span;
|
|
if start < source_bytes.len() && end <= source_bytes.len() && start < end {
|
|
let span_bytes = &source_bytes[start..end];
|
|
if let Ok(span_text) = std::str::from_utf8(span_bytes) {
|
|
let bytes = span_text.as_bytes();
|
|
let mut depth: i32 = 0;
|
|
let mut last_open_at_zero: Option<usize> = None;
|
|
for (i, &b) in bytes.iter().enumerate() {
|
|
match b {
|
|
b'(' => {
|
|
if depth == 0 {
|
|
last_open_at_zero = Some(i);
|
|
}
|
|
depth += 1;
|
|
}
|
|
b')' => {
|
|
depth = depth.saturating_sub(1);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
let chain_end = last_open_at_zero.unwrap_or(bytes.len());
|
|
let mut i = 0;
|
|
while i < chain_end {
|
|
let b = bytes[i];
|
|
let is_ident_start = b.is_ascii_alphabetic() || b == b'_';
|
|
if !is_ident_start {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
let id_start = i;
|
|
while i < chain_end {
|
|
let c = bytes[i];
|
|
if c.is_ascii_alphanumeric() || c == b'_' {
|
|
i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if i < chain_end && bytes[i] == b'(' {
|
|
let name = &span_text[id_start..i];
|
|
let abs_start = start + id_start;
|
|
let abs_end = start + i;
|
|
if abs_start < source_bytes.len() && abs_end <= source_bytes.len() {
|
|
let name_slice =
|
|
std::str::from_utf8(&source_bytes[abs_start..abs_end]).unwrap_or(name);
|
|
if !frags.contains(&name_slice) {
|
|
frags.push(name_slice);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
frags
|
|
}
|
|
|
|
/// Check if the source bytes at a given span contain a call whose first
/// argument is a string or template literal starting with a single `/`,
/// indicating a server-relative path rather than an attacker-controlled URL.
///
/// The argument is taken to start after the first `(` in the span, with
/// leading spaces, tabs, CR and LF skipped. It qualifies when it begins with
/// a quote character (`` ` ``, `"` or `'`) immediately followed by `/`.
///
/// Literals beginning with `//` or `/\` are rejected: browsers resolve those
/// as protocol-relative URLs pointing at another host, so they are not
/// server-relative paths.
///
/// Returns `false` for spans that are out of bounds or inverted
/// (`start > end`) instead of panicking.
///
/// Reused by both `cfg-unguarded-sink` suppression and taint finding filtering.
pub(crate) fn has_redirect_path_prefix(source_bytes: &[u8], span: (usize, usize)) -> bool {
    let (start, end) = span;
    // `get` yields `None` for out-of-range and inverted ranges alike.
    let Some(text) = source_bytes.get(start..end) else {
        return false;
    };
    // The argument portion starts after the first '('.
    let Some(paren_pos) = text.iter().position(|&b| b == b'(') else {
        return false;
    };
    let after_paren = &text[paren_pos + 1..];
    let arg_start = after_paren
        .iter()
        .position(|b| !matches!(b, b' ' | b'\n' | b'\t' | b'\r'))
        .unwrap_or(after_paren.len());

    match &after_paren[arg_start..] {
        // Template literal `/… or string literal "/… / '/…
        [b'`' | b'"' | b'\'', b'/', rest @ ..] => {
            // `//host` and `/\host` leave the current origin.
            !matches!(rest.first(), Some(b'/' | b'\\'))
        }
        _ => false,
    }
}
|
|
|
|
/// Check if this sink is an internal redirect, a `res.redirect` (SSRF sink)
|
|
/// whose argument is a template literal or string starting with `/`, indicating
|
|
/// a server-relative path rather than an attacker-controlled URL.
|
|
fn is_internal_redirect(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
|
if !sink_caps.contains(Cap::SSRF) {
|
|
return false;
|
|
}
|
|
let sink_info = &ctx.cfg[sink];
|
|
let callee = match &sink_info.call.callee {
|
|
Some(c) => c.as_str(),
|
|
None => return false,
|
|
};
|
|
// Only applies to redirect calls
|
|
if !callee.ends_with("redirect") && !callee.ends_with("Redirect") {
|
|
return false;
|
|
}
|
|
has_redirect_path_prefix(ctx.source_bytes, sink_info.ast.span)
|
|
}
|
|
|
|
/// Check if the enclosing function qualifies as an entrypoint.
|
|
fn sink_in_entrypoint(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|
let sink_info = &ctx.cfg[sink];
|
|
if let Some(func_name) = &sink_info.ast.enclosing_func {
|
|
is_entry_point_func(func_name, ctx.lang)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
impl CfgAnalysis for UnguardedSink {
|
|
fn run(&self, ctx: &AnalysisContext) -> Vec<CfgFinding> {
|
|
let doms = dominators::compute_dominators(ctx.cfg, ctx.entry);
|
|
let sink_nodes = dominators::find_sink_nodes(ctx.cfg);
|
|
let guard_nodes = find_guard_nodes(ctx);
|
|
|
|
let mut findings = Vec::new();
|
|
|
|
for sink in &sink_nodes {
|
|
let sink_info = &ctx.cfg[*sink];
|
|
let sink_caps = sink_info.taint.labels.iter().fold(Cap::empty(), |acc, l| {
|
|
if let DataLabel::Sink(caps) = l {
|
|
acc | *caps
|
|
} else {
|
|
acc
|
|
}
|
|
});
|
|
if sink_caps.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
|
|
// Check: does any applicable guard dominate this sink?
|
|
// Guards must be in the same function to be relevant.
|
|
let is_guarded = guard_nodes.iter().any(|(guard_idx, guard_caps)| {
|
|
let guard_func = ctx.cfg[*guard_idx].ast.enclosing_func.as_deref();
|
|
(*guard_caps & sink_caps) != Cap::empty()
|
|
&& guard_func == sink_func
|
|
&& dominates(&doms, *guard_idx, *sink)
|
|
});
|
|
|
|
// Also check if an inline sanitizer dominates this sink (same function).
|
|
let has_sanitizer = ctx.cfg.node_indices().any(|idx| {
|
|
let node_func = ctx.cfg[idx].ast.enclosing_func.as_deref();
|
|
ctx.cfg[idx].taint.labels.iter().any(|l| {
|
|
if let DataLabel::Sanitizer(san_caps) = l {
|
|
(*san_caps & sink_caps) != Cap::empty()
|
|
&& node_func == sink_func
|
|
&& dominates(&doms, idx, *sink)
|
|
} else {
|
|
false
|
|
}
|
|
})
|
|
});
|
|
|
|
// Interprocedural sanitizer: check if any arg_callee resolves to a
|
|
// function with sanitizer caps that cover this sink's caps.
|
|
let has_interprocedural_sanitizer = sink_info.arg_callees.iter().any(|mc| {
|
|
if let Some(callee) = mc {
|
|
let leaf = callee_leaf_name(callee);
|
|
// Check local function summaries
|
|
ctx.func_summaries.iter().any(|(k, s)| {
|
|
k.name == leaf && (s.sanitizer_caps & sink_caps) != Cap::empty()
|
|
})
|
|
} else {
|
|
false
|
|
}
|
|
});
|
|
|
|
if is_guarded || has_sanitizer || has_interprocedural_sanitizer {
|
|
continue;
|
|
}
|
|
|
|
let callee_desc = sink_info.call.callee.as_deref().unwrap_or("(unknown sink)");
|
|
|
|
// ── Severity classification ───────────────────────────────
|
|
//
|
|
// HIGH: taint confirms flow OR source directly feeds sink
|
|
// MEDIUM: structural finding without taint confirmation
|
|
// LOW: wrapper function (param-only, non-entrypoint)
|
|
|
|
let has_taint = taint_confirms_sink(ctx, *sink);
|
|
let source_derived = sink_arg_is_source_derived(ctx, *sink);
|
|
|
|
// If sink args are all constants (including one-hop constant bindings)
|
|
// and taint didn't confirm, this is a false positive, skip it.
|
|
if is_all_args_constant(ctx, *sink) && !has_taint {
|
|
continue;
|
|
}
|
|
|
|
// SSA latest-def suppression: when the taint engine has already
|
|
// proved no source-tainted data reaches this sink (`!has_taint`)
|
|
// and every SSA operand resolves to a constant, callee-fragment
|
|
// pseudo-name, OR a function parameter that is not a Source ,
|
|
// the sink's actual arguments cannot carry an injection payload.
|
|
// Catches the reassign-to-constant idiom (`name := req.x; name =
|
|
// "Guest"; sink(name)`) where the latest SSA def is a literal
|
|
// and a non-payload parameter (e.g. an HTTP writer / receiver)
|
|
// is the only other operand. The simpler `is_all_args_constant`
|
|
// check above rejects that mixed shape because it forbids real
|
|
// parameters in operand position.
|
|
//
|
|
// Exemption: shell-array gate filters. The
|
|
// `extract_shell_array_payload_idents` detector recognises
|
|
// `[<shell>, "-c", <payload>]` arrays at any call site and emits a
|
|
// `Sink(SHELL_ESCAPE)` label with `destination_uses` narrowed to
|
|
// the payload-element idents. When the array shape itself is the
|
|
// gate, an unrelated reassign-to-const elsewhere in the body
|
|
// (`const flag = true; if (flag) {}`) does not erase the
|
|
// shell-exec intent — the construction of `[bash, -c, x]` is by
|
|
// itself the dangerous operation. Skip this suppression so the
|
|
// structural finding survives in closed-world contexts where no
|
|
// taint source has been resolved yet.
|
|
let has_shell_array_gate = sink_info.call.gate_filters.iter().any(|gf| {
|
|
gf.label_caps.contains(Cap::SHELL_ESCAPE) && gf.destination_uses.is_some()
|
|
});
|
|
if !has_taint
|
|
&& !has_shell_array_gate
|
|
&& ssa_all_sink_operands_const_or_param(ctx, *sink)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// Type-aware suppression: when all SSA operand values of the sink
|
|
// are proven to carry non-injectable types (e.g. integers parsed
|
|
// from a raw source), the arguments cannot form a payload for
|
|
// SHELL/SQL/FILE sinks. Skip the structural finding, the taint
|
|
// engine already covers the source→sink flow via type-aware
|
|
// suppression. Unknown-typed or mixed operands fall through.
|
|
if !has_taint && sink_args_typed_safe(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// JPA / Hibernate Criteria-query suppression: receiver-call SQL
|
|
// sinks like `session.createQuery(cq)` / `em.executeUpdate(cq)`
|
|
// are safe by construction when arg 0 is a structural Criteria
|
|
// object built via `CriteriaBuilder` (returns parameterized
|
|
// SQL). Receiver excluded from the check, the receiver is
|
|
// never the payload. Closes openmrs / xwiki / keycloak
|
|
// Hibernate-DAO FP cluster.
|
|
if !has_taint && sink_args_jpa_criteria_query_safe(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// Zero-arg query-builder verbs: Doctrine DBAL `QueryBuilder`,
|
|
// JPA `CriteriaBuilder`, and similar chain-builder shapes
|
|
// execute a query that was bound earlier on the receiver via
|
|
// parameterised API calls. No SQL string is concatenated at
|
|
// the terminal call site. Closes the nextcloud apps/dav and
|
|
// lib/private/DB cluster (`$qb->executeQuery()` /
|
|
// `$qb->executeStatement()` after `select`/`from`/`where`/
|
|
// `setParameter` chains).
|
|
if !has_taint && sink_is_zero_arg_query_builder(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// Builder.getSQL() arg suppression: the dangerous flat shape is
|
|
// `$conn->executeStatement($sql)` where `$sql` is user-controlled
|
|
// SQL. When `$sql` is itself the return of `<builder>.getSQL()`,
|
|
// the SQL is parameterised by construction (Doctrine DBAL),
|
|
// independent of which receiver fires the terminal verb.
|
|
if !has_taint && sink_first_arg_is_builder_get_sql(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// Composition: `<builder>.getSQL()` wrapped by string-shaping ops
|
|
// (`preg_replace('/^INSERT/i', 'INSERT IGNORE', $b->getSQL())`,
|
|
// `$b->getSQL() . ' ON CONFLICT DO NOTHING'`). Closes the
|
|
// remaining nextcloud `AdapterMySQL.php` / `AdapterSqlite.php`
|
|
// FPs after the direct accessor recognition above.
|
|
if !has_taint && sink_first_arg_composes_safe_dbal_sql(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// PHP foreach-key string interpolation: arg-0 is a SQL string
|
|
// whose interpolated `$<var>` is bound by a `foreach ($X as $var)`
|
|
// (or `as $key => $var`) over a literal-keyed array assigned
|
|
// earlier in the same function. The literal set is finite and
|
|
// metachar-free, so the interpolated SQL is bounded. Closes the
|
|
// nextcloud `lib/private/DB/MySqlTools.php:27` FP.
|
|
if !has_taint && sink_arg_uses_safe_foreach_key(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// Static-map suppression: the SSA value flowing into the sink is
|
|
// proved by the static-HashMap-lookup idiom detector to be a
|
|
// finite set of literals free of shell metacharacters. Mirrors
|
|
// the SSA-taint finite-domain suppression so both paths agree.
|
|
if !has_taint && sink_args_static_map_safe(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
// Summary-validated suppression: when the SSA value flowing into
|
|
// the sink is the return of a callee whose summary records a
|
|
// `validated_params_to_return` covering every propagating
|
|
// parameter, the helper validates its inputs on every taint-
|
|
// carrying return path (regex allowlist, type check, validation
|
|
// call, …). The SSA taint engine already cleared this flow via
|
|
// `propagate_validated_params_to_return`, so the structural
|
|
// finding is noise. Closes the patched-counterpart noise for
|
|
// CVE-2026-25544 (Payload `sanitizeValue` → `createJSONQuery`
|
|
// → `db.execute`).
|
|
if !has_taint && sink_args_summary_validated_safe(ctx, *sink) {
|
|
continue;
|
|
}
|
|
|
|
// Parameterized SQL queries: arg 0 is a string literal with
|
|
// placeholders ($1, ?, %s, :name) and a params argument exists.
|
|
// These are safe by construction, the driver handles escaping.
|
|
if sink_info.parameterized_query {
|
|
continue;
|
|
}
|
|
|
|
// Internal redirects: res.redirect(`/path/...`) with a path-prefix
|
|
// argument are server-relative, not attacker-controlled URLs.
|
|
if is_internal_redirect(ctx, *sink, sink_caps) {
|
|
continue;
|
|
}
|
|
|
|
let param_only = sink_arg_is_parameter_only(ctx, *sink);
|
|
let in_entrypoint = sink_in_entrypoint(ctx, *sink);
|
|
|
|
let (severity, confidence) = if has_taint || source_derived {
|
|
(Severity::High, Confidence::High)
|
|
} else if param_only && !in_entrypoint {
|
|
// Wrapper function with param-only args, zero signal. Suppress.
|
|
continue;
|
|
} else if !ctx.taint_active {
|
|
// AST-only / cfg-only mode, preserve as LOW (unchanged)
|
|
(Severity::Low, Confidence::Low)
|
|
} else {
|
|
// taint_active=true but found nothing.
|
|
// Keep high-risk sinks (SHELL_ESCAPE, CODE_EXEC, SQL_QUERY, DESERIALIZE)
|
|
// as structural backup. Suppress low-risk sinks (FILE_IO, SSRF, etc.).
|
|
let high_risk =
|
|
Cap::SHELL_ESCAPE | Cap::CODE_EXEC | Cap::SQL_QUERY | Cap::DESERIALIZE;
|
|
if (sink_caps & high_risk).is_empty() {
|
|
continue; // FILE_IO, SSRF, FMT_STRING etc. without taint → noise
|
|
}
|
|
// If the function containing the sink has no Source-labeled
|
|
// nodes AND no parameters (through which taint could flow
|
|
// from callers), taint ran and found nothing because there
|
|
// is nothing to find. Suppress, the structural finding
|
|
// is noise.
|
|
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
|
let has_sources = ctx.cfg.node_indices().any(|n| {
|
|
let info = &ctx.cfg[n];
|
|
info.ast.enclosing_func.as_deref() == sink_func
|
|
&& info
|
|
.taint
|
|
.labels
|
|
.iter()
|
|
.any(|l| matches!(l, DataLabel::Source(_)))
|
|
});
|
|
let has_params = ctx.func_summaries.values().any(|s| {
|
|
s.entry.index() < ctx.cfg.node_count()
|
|
&& ctx.cfg[s.entry].ast.enclosing_func.as_deref() == sink_func
|
|
&& !s.param_names.is_empty()
|
|
});
|
|
if !has_sources && !has_params {
|
|
continue; // No sources or params in scope → noise
|
|
}
|
|
(Severity::Medium, Confidence::Medium)
|
|
};
|
|
|
|
findings.push(CfgFinding {
|
|
rule_id: "cfg-unguarded-sink".to_string(),
|
|
severity,
|
|
confidence,
|
|
span: sink_info.ast.span,
|
|
message: format!("Sink `{callee_desc}` has no dominating guard or sanitizer"),
|
|
evidence: vec![*sink],
|
|
score: None,
|
|
});
|
|
}
|
|
|
|
findings
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod chain_fragments_tests {
    use super::chain_callee_fragments_with_text;

    /// Run the fragment extractor over `source`, using a span that covers the
    /// whole buffer, and return the fragments as owned strings.
    fn frags(callee: &str, outer: &str, source: &str) -> Vec<String> {
        let whole_span = (0, source.len());
        let fragments =
            chain_callee_fragments_with_text(callee, outer, source.as_bytes(), whole_span);
        let mut owned = Vec::new();
        for fragment in fragments.iter() {
            owned.push((*fragment).to_string());
        }
        owned
    }

    /// True when `name` is one of the extracted fragments.
    fn has(fragments: &[String], name: &str) -> bool {
        fragments.iter().any(|fragment| fragment == name)
    }

    #[test]
    fn java_chained_init_lifts_inner_call() {
        // Input: `Statement stmt = connection.unwrap().createStatement();`
        // The collapsed `info.call.callee` loses the inner method call, so
        // the source-text scan must recover `unwrap` in addition to the
        // `createStatement` produced by the structural split.
        let got = frags(
            "connection.createStatement",
            "",
            "Statement stmt = connection.unwrap().createStatement()",
        );
        for expected in ["createStatement", "unwrap"] {
            assert!(has(&got, expected));
        }
        for unexpected in ["connection", "stmt"] {
            assert!(!has(&got, unexpected));
        }
    }

    #[test]
    fn flat_method_invocation_terminal_only() {
        // For `connection.createStatement()` the receiver `connection`
        // remains a real local-variable use; only the terminal method is a
        // pseudo-use.
        let got = frags(
            "connection.createStatement",
            "",
            "connection.createStatement()",
        );
        assert!(has(&got, "createStatement"));
        assert!(!has(&got, "connection"));
    }

    #[test]
    fn self_rooted_chain_lifts_every_segment() {
        // For `$this->inner->execute($sql)` the chain is rooted at a self
        // pseudo-receiver, so every segment is part of the callee path.
        let got = frags("this->inner->execute", "", "$this->inner->execute($sql)");
        for expected in ["this", "inner", "execute"] {
            assert!(has(&got, expected));
        }
    }

    #[test]
    fn source_scan_skips_inside_args() {
        // The scan halts at the outermost args opener, so identifiers nested
        // inside the arguments are NOT lifted as pseudo-uses. In
        // `db.exec(transform(raw))`, `transform` stays a real local
        // reference rather than a chain segment.
        let got = frags("db.exec", "", "db.exec(transform(raw))");
        assert!(has(&got, "exec"));
        for unexpected in ["transform", "raw"] {
            assert!(!has(&got, unexpected));
        }
    }
}
|