#![allow(clippy::collapsible_if)] use super::dominators::{self, dominates}; use super::rules; use super::{ AnalysisContext, BodyConstFacts, CfgAnalysis, CfgFinding, Confidence, is_entry_point_func, }; use crate::callgraph::callee_leaf_name; use crate::cfg::StmtKind; use crate::labels::{Cap, DataLabel, RuntimeLabelRule}; use crate::patterns::Severity; use crate::ssa::const_prop::ConstLattice; use crate::ssa::type_facts::TypeFactResult; use crate::ssa::{SsaOp, SsaValue}; use crate::symbol::Lang; use crate::taint::path_state::{PredicateKind, classify_condition}; use petgraph::graph::NodeIndex; use smallvec::SmallVec; use std::collections::HashSet; pub struct UnguardedSink; /// Check whether **all** arguments to the sink are constants (no taint-capable /// variable flows). Extends the inline callee-part check by tracing one hop /// through the CFG: if a used variable is defined by a node that itself has /// empty `uses` and no Source label, the definition is treated as a constant /// binding (e.g. `let cmd = "git"; Command::new(cmd)`). When SSA /// [`BodyConstFacts`] are available, falls back to walking the sink's /// `SsaOp::Call` operands and consulting `OptimizeResult.const_values` for /// any operand the syntactic trace can't classify (e.g. a chained method-call /// receiver recorded as a compound identifier rather than a named binding). fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool { // Fast path: syntactic literal detection from CFG construction. // Strictly weaker than the one-hop trace below, serves as an // optimization for the common case of inline literal arguments. if ctx.cfg[sink].all_args_literal { return true; } let sink_info = &ctx.cfg[sink]; let callee_desc = sink_info.call.callee.as_deref().unwrap_or(""); // Split callee description into parts and strip parenthesized arg portions. // e.g. 
`exec.Command("echo", "health-ok").Run` → ["exec", "Command", "Run"] let callee_parts: Vec<&str> = callee_desc .split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect(); // When the callee was overridden by an inner call (e.g. `db.query` inside // `Promise.all([db.query(...)])`), the outer callee's parts (e.g. "Promise", // "all") also belong to the callee machinery, not to arguments. let outer_parts: Vec<&str> = sink_info .call .outer_callee .as_deref() .map(|oc| { oc.split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect() }) .unwrap_or_default(); let sink_func = sink_info.ast.enclosing_func.as_deref(); sink_info.taint.uses.iter().all(|u| { // Part of the callee name itself → not an argument, skip // Check both individual parts and the full dotted callee path if callee_parts.contains(&u.as_str()) || u == callee_desc || outer_parts.contains(&u.as_str()) { return true; } // One-hop trace: find the defining node in the same function for idx in ctx.cfg.node_indices() { let info = &ctx.cfg[idx]; if info.ast.enclosing_func.as_deref() != sink_func { continue; } if info.taint.defines.as_deref() == Some(u.as_str()) { // If the defining node has no uses (pure constant) and is not // a Source, the variable is constant. if info.taint.uses.is_empty() && !info .taint .labels .iter() .any(|l| matches!(l, DataLabel::Source(_))) { return true; } } } // Class-level constant scalar: Java `static final TYPE NAME = LIT;` // field references are compile-time constants that the per-function // CFG one-hop trace can't see (fields live outside any function // body) and that SSA const-prop doesn't surface either (the per- // function lowering treats the cross-scope reference as a free // identifier). if let Some(map) = ctx.class_constant_scalars && map.contains_key(u.as_str()) { return true; } false }) || ssa_all_sink_operands_constant(ctx, sink, callee_desc, &callee_parts, &outer_parts) } /// SSA-backed fallback for `is_all_args_constant`. 
Looks up the sink CFG /// node in `cfg_node_map`, expects an `SsaOp::Call`, and checks that every /// operand (positional args and receiver) either names a callee fragment or /// resolves to a concrete `ConstLattice` literal. fn ssa_all_sink_operands_constant( ctx: &AnalysisContext, sink: NodeIndex, callee_desc: &str, callee_parts: &[&str], outer_parts: &[&str], ) -> bool { let Some(facts) = ctx.body_const_facts else { return false; }; let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else { return false; }; let Some(inst) = find_inst(&facts.ssa, sink_val) else { return false; }; let SsaOp::Call { args, receiver, .. } = &inst.op else { return false; }; let operand_const = |v: SsaValue| -> bool { ssa_operand_constant(v, facts, callee_desc, callee_parts, outer_parts) }; let args_ok = args .iter() .all(|group| group.iter().all(|v| operand_const(*v))); let receiver_ok = receiver.is_none_or(operand_const); args_ok && receiver_ok } /// SSA-backed reassign-aware safety probe: every operand of the sink /// resolves to a constant, callee fragment, OR a function parameter that /// is not itself a Source. Used at the cfg-unguarded-sink site under /// `!has_taint`, the taint engine has already proved no source-tainted /// data reaches the sink, so a non-source Param at operand position is /// inert payload-wise (e.g. HTTP writer in `Fprintf(w, "

", "Guest")`). /// /// Gated on the function body actually exhibiting the reassign-to-constant /// signature, at least one named SSA def whose RHS is a literal Const /// (`name = "Guest"`). In a thin wrapper without a same-block named /// const assignment (`fn wrap(p) { sink(p) }`, or C `popen(buf, "r")` where /// `buf` is filled in-place by `sprintf` with no Const Assign on `buf`), /// the bare Param at operand position IS the payload and the suppression's /// rationale does not apply, `cfg-unguarded-sink` must still fire. fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) -> bool { let Some(facts) = ctx.body_const_facts else { return false; }; let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else { return false; }; let Some(inst) = find_inst(&facts.ssa, sink_val) else { return false; }; let SsaOp::Call { args, receiver, .. } = &inst.op else { return false; }; if !func_body_has_named_const_assign(facts) { return false; } let operand_safe = |v: SsaValue| -> bool { ssa_operand_const_or_param(v, facts, ctx.cfg) }; let args_ok = args .iter() .all(|group| group.iter().all(|v| operand_safe(*v))); let receiver_ok = receiver.is_none_or(operand_safe); args_ok && receiver_ok } /// Return true if the SSA body contains a *named* variable whose definition /// is a constant, the SSA signature of an explicit `name = "literal"` /// reassignment. Used as the gate for the broader operand-Param suppression: /// the suppression's purpose is the reassign-to-constant idiom, which by /// definition has at least one named const assignment. In a thin wrapper /// (`fn wrap(p) { sink(p) }` or `popen(buf, "r")` where `buf` is filled by /// `sprintf`), no such named const assignment exists and the suppression's /// rationale doesn't apply, so the bare-Param structural finding fires. 
fn func_body_has_named_const_assign(facts: &BodyConstFacts) -> bool { for block in &facts.ssa.blocks { for inst in &block.body { if inst.var_name.is_none() { continue; } let rhs_const = match &inst.op { SsaOp::Const(_) => true, SsaOp::Assign(vals) => vals.iter().all(|v| { matches!( facts.const_values.get(v), Some( ConstLattice::Str(_) | ConstLattice::Int(_) | ConstLattice::Bool(_) | ConstLattice::Null ) ) }), _ => false, }; if rhs_const { return true; } } } false } /// Variant of [`ssa_operand_constant`] that also accepts non-Source Params. /// Stricter than `ssa_operand_constant` on Source (always false) but /// looser on bare Params (always true unless they are Source-labeled). fn ssa_operand_const_or_param( root: SsaValue, facts: &BodyConstFacts, cfg: &crate::cfg::Cfg, ) -> bool { let mut visited: HashSet = HashSet::new(); let mut stack = vec![root]; while let Some(v) = stack.pop() { if !visited.insert(v) { continue; } match facts.const_values.get(&v) { Some(ConstLattice::Str(_)) | Some(ConstLattice::Int(_)) | Some(ConstLattice::Bool(_)) | Some(ConstLattice::Null) => continue, _ => {} } let Some(inst) = find_inst(&facts.ssa, v) else { return false; }; // CFG-node-level Source label: when an SSA `Call` corresponds to a // Source-labeled CFG node (e.g. `env::var(...)` whose callee // matches a `LabelRule` Source matcher), the call's result is // tainted user input, refuse, regardless of how the SSA // happened to lower. Catches the `SsaOp::Call` lowering of // labeled Source functions, which the `SsaOp::Source` arm only // sees for callee-less pure sources like PHP `$_GET`. let cfg_node = inst.cfg_node; if cfg .node_weight(cfg_node) .map(|info| { info.taint .labels .iter() .any(|l| matches!(l, DataLabel::Source(_))) }) .unwrap_or(false) { return false; } match &inst.op { SsaOp::Const(_) => {} SsaOp::Assign(vals) => stack.extend(vals.iter().copied()), SsaOp::Phi(ops) => stack.extend(ops.iter().map(|(_, v)| *v)), SsaOp::Call { args, receiver, .. 
} => { for group in args { stack.extend(group.iter().copied()); } if let Some(r) = receiver { stack.push(*r); } } SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam => { // Bare parameters are accepted: at the call site the // taint engine has already concluded no source data // reaches this sink (`!has_taint` gate). A Param that // is not source-tainted contributes only its caller- // bound value, which the gate above already filtered. } SsaOp::Source => return false, SsaOp::Nop | SsaOp::Undef => {} // FieldProj: walk the receiver, `obj.f` is constant iff `obj` // is constant under the same definition. The field name itself // is structural and adds no runtime value. SsaOp::FieldProj { receiver, .. } => stack.push(*receiver), } } true } /// Return true if this SSA operand is a compile-time-known literal, a callee /// fragment pseudo-use (not a real runtime value), or transitively composed /// of such operands. Returns false for sources, parameters with non-callee /// names, `Varying` const-prop facts, and any unresolved definition. fn ssa_operand_constant( root: SsaValue, facts: &BodyConstFacts, callee_desc: &str, callee_parts: &[&str], outer_parts: &[&str], ) -> bool { let mut visited: HashSet = HashSet::new(); let mut stack = vec![root]; while let Some(v) = stack.pop() { if !visited.insert(v) { continue; } match facts.const_values.get(&v) { Some(ConstLattice::Str(_)) | Some(ConstLattice::Int(_)) | Some(ConstLattice::Bool(_)) | Some(ConstLattice::Null) => continue, Some(ConstLattice::Varying) => { // Fall through: a Varying lattice entry may still correspond // to a callee-fragment pseudo-name that the SSA models as a // Param. The per-op check below filters those out. } _ => {} } let Some(inst) = find_inst(&facts.ssa, v) else { return false; }; match &inst.op { SsaOp::Const(_) => {} SsaOp::Assign(vals) => stack.extend(vals.iter().copied()), SsaOp::Phi(ops) => stack.extend(ops.iter().map(|(_, v)| *v)), SsaOp::Call { args, receiver, .. 
} => { for group in args { stack.extend(group.iter().copied()); } if let Some(r) = receiver { stack.push(*r); } } SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam | SsaOp::Source => { // Only acceptable when the param's `var_name` is a callee // fragment, i.e. an identifier that only appears because // the CFG recorded name components of the dotted/chained // callee as uses. Real parameters and sources are dynamic. let name = inst.var_name.as_deref().unwrap_or(""); if matches!(inst.op, SsaOp::Source) { return false; } if !is_callee_fragment(name, callee_desc, callee_parts, outer_parts) { return false; } } SsaOp::Nop => {} // Undef is a non-user, non-dynamic sentinel, treat like Const // (no additional operands to trace). SsaOp::Undef => {} // FieldProj: structural field read; constness reduces to the // receiver's constness. SsaOp::FieldProj { receiver, .. } => stack.push(*receiver), } } true } fn is_callee_fragment( name: &str, callee_desc: &str, callee_parts: &[&str], outer_parts: &[&str], ) -> bool { if name.is_empty() { return true; } if callee_parts.contains(&name) || outer_parts.contains(&name) || name == callee_desc { return true; } // Chained-receiver prefix: the name is a strict prefix of `callee_desc` // terminating at a `.` or `::` boundary (e.g. name = // `Command::new("sh").arg("-c").arg(cmd)` for callee_desc ending in // `.status().unwrap`). These are the outer callee's receiver chain, // not user-supplied arguments. 
if callee_desc.len() > name.len() && callee_desc.starts_with(name) { let rest = &callee_desc[name.len()..]; if rest.starts_with('.') || rest.starts_with("::") { return true; } } false } fn find_inst(ssa: &crate::ssa::SsaBody, v: SsaValue) -> Option<&crate::ssa::SsaInst> { let def = ssa.value_defs.get(v.0 as usize)?; let block = ssa.blocks.get(def.block.0 as usize)?; block .phis .iter() .chain(block.body.iter()) .find(|inst| inst.value == v) } /// Check whether every operand SSA value of the sink's Call instruction is /// proven by type-fact analysis to be non-injectable for `sink_caps`. /// /// Used to suppress `cfg-unguarded-sink` when all arguments are typed safe /// (e.g. Rust `port: u16` flowing into `Command::new(…).arg(port.to_string())`). /// Returns `false` when any required fact is missing so the structural finding /// is preserved whenever typing is ambiguous. fn sink_args_typed_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { let Some(facts) = ctx.body_const_facts else { return false; }; let Some(type_facts) = ctx.type_facts else { return false; }; let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else { return false; }; let Some(inst) = find_inst(&facts.ssa, sink_val) else { return false; }; let SsaOp::Call { args, receiver, .. } = &inst.op else { return false; }; // Chained Rust/JS calls record the whole dotted path as a single Call node. // Its SSA operands include pseudo-uses for every identifier segment of the // callee (e.g. `Command`, `new`, `arg`, `status`, `unwrap`) plus string // literal arguments to intermediate calls. Filter those out so the // is-Int check runs only against real argument values. 
let sink_info = &ctx.cfg[sink]; let callee_desc = sink_info.call.callee.as_deref().unwrap_or(""); let callee_parts: Vec<&str> = callee_desc .split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect(); let outer_parts: Vec<&str> = sink_info .call .outer_callee .as_deref() .map(|oc| { oc.split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect() }) .unwrap_or_default(); let is_real_arg = |v: SsaValue| -> bool { let Some(def) = find_inst(&facts.ssa, v) else { return true; }; // Callee-fragment pseudo-uses appear as `Param { .. }` with a // var_name that is a segment of the callee text. SelfParam and // CatchParam cover `self`/exception bindings that cannot be the // implicit callee chain. match &def.op { SsaOp::Param { .. } => { let name = def.var_name.as_deref().unwrap_or(""); !is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts) } // Constant string literals used as inline args (e.g. `"listener"`, // `"-c"`) are not user-controlled, treat as non-real for the // "all int-typed" test so they don't block suppression. SsaOp::Const(_) => false, _ => true, } }; let mut values: Vec = Vec::new(); if let Some(r) = receiver { if is_real_arg(*r) { values.push(*r); } } for group in args { for v in group.iter() { if is_real_arg(*v) { values.push(*v); } } } type_facts_suppress(&values, sink_caps, type_facts) } /// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when any positional /// argument to the sink Call is provably a JPA / Hibernate Criteria query /// object ([`crate::ssa::type_facts::TypeKind::JpaCriteriaQuery`]). /// /// Receiver values are deliberately excluded, the receiver of a JPA /// query method (`session.createQuery(cq)`, `em.createQuery(cq)`, /// `session.executeUpdate(cq)`) is the connection / EntityManager /// channel, never the SQL payload. 
Including the receiver in the type /// check would make this suppression unreachable since `Session` / /// `EntityManager` values are typed `Object` / `Unknown` and never /// `JpaCriteriaQuery` themselves. /// /// Closes the dominant FP cluster across openmrs (169 of 216 /// cfg-unguarded-sink), xwiki, and keycloak: Hibernate DAO methods /// build a `CriteriaQuery` via `cb.createQuery(Foo.class)` + /// `Root` / `Predicate` API, then hand the query object to /// `session.createQuery(cq)` for execution. No string concatenation /// happens, JPA emits parameterized SQL by construction. fn sink_args_jpa_criteria_query_safe( ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap, ) -> bool { if !sink_caps.intersects(Cap::SQL_QUERY) { return false; } let Some(facts) = ctx.body_const_facts else { return false; }; let Some(type_facts) = ctx.type_facts else { return false; }; let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else { return false; }; let Some(inst) = find_inst(&facts.ssa, sink_val) else { return false; }; let SsaOp::Call { args, .. } = &inst.op else { return false; }; let mut values: Vec = Vec::new(); for group in args { for v in group.iter() { values.push(*v); } } crate::ssa::type_facts::is_safe_query_object_arg(&values, sink_caps, type_facts) } /// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the call site is /// a zero-positional-argument query-builder execute / create verb. /// /// Doctrine DBAL `QueryBuilder` (`$qb->select(...)->from(...)->executeQuery()`), /// JPA / Hibernate `CriteriaBuilder` (`cb.createQuery()` returning the /// query-object factory), and any chained-builder pattern share the shape: /// the SQL string is bound earlier on the receiver chain via parameterized /// API calls (`->select`, `->from`, `->where(... 
param ...)`), and the /// terminal verb that fires on the sink list (`executeQuery`, /// `executeStatement`, `executeUpdate`, `createQuery`, `createNativeQuery`) /// takes zero positional args, no SQL string ever flows through the call /// site itself. /// /// vs. the dangerous flat shape: /// `$conn->executeQuery($sql, $params)` — arg 0 carries the SQL string, /// the structural finding is correctly preserved. /// /// Restricted to verb names where JDBC / Doctrine / JPA expose a /// receiver-built (zero-arg) overload. PHP `stmt.execute` is excluded /// because PDOStatement::execute() can be reached via a tainted /// `prepare($sql)` chain where the SQL was already built unsafely; /// the receiver-side taint check is the only thing that fires there. fn sink_is_zero_arg_query_builder(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { if !sink_caps.intersects(Cap::SQL_QUERY) { return false; } // Only suppress when the sink's caps are SQL_QUERY-only. Multi-cap // sinks may carry a non-SQL injection vector through the same call. if sink_caps != Cap::SQL_QUERY { return false; } // Restrict to PHP. Java / Kotlin / JVM langs already cover the // safe prepared-statement shape via the `prepareStatement` Sanitizer // rule that dominates `pstmt.executeUpdate()` / `pstmt.executeQuery()` // at the structural finding site. PHP's Doctrine DBAL `QueryBuilder` // and Drupal `Connection::prepareStatement` shapes need explicit // structural support because the receiver isn't always sanitized in // a way the dominator-Sanitizer scan recognises (chain receiver, // closure-captured helper, etc.). 
if ctx.lang != Lang::Php { return false; } let info = &ctx.cfg[sink]; let callee = match info.call.callee.as_deref() { Some(c) => c, None => return false, }; let suffix = callee.rsplit('.').next().unwrap_or(callee); let is_builder_verb = matches!(suffix, "executeQuery" | "executeStatement" | "createQuery"); if !is_builder_verb { return false; } // Restrict to receivers that name a known query-builder. The // root-receiver text is the leftmost segment of the callee chain; // for `$qb->...->executeQuery()` the root is `qb`, for // `$deleteQuery->executeStatement()` it is `deleteQuery`, etc. // Patterns canvassed from Doctrine DBAL / Drupal Database / Nextcloud // dav / lib idioms: // * canonical names: qb, query, queryBuilder, builder, q // * verb-bound builders: deleteQuery, insertQuery, selectTagQuery, // calendarObjectIdQuery, deleteQb, qbDeleteCalendarObjectProps // * action-named builders: insert, update, delete, select, upsert, // forUpdate, restoreUpdate // Receivers named after the SQL connection (`conn`, `connection`, // `dbc`, `db`) or entity-manager (`em`, `entityManager`) are // excluded since their `executeQuery` / `executeStatement` overloads // accept a SQL string arg. 
let root_receiver = match callee.split('.').next() { Some(r) if !r.is_empty() => r, _ => return false, }; let receiver_lower = root_receiver.to_ascii_lowercase(); let is_builder_receiver_by_name = receiver_lower == "qb" || receiver_lower == "q" || receiver_lower == "query" || receiver_lower == "querybuilder" || receiver_lower == "builder" || receiver_lower == "insert" || receiver_lower == "update" || receiver_lower == "delete" || receiver_lower == "select" || receiver_lower == "upsert" || receiver_lower.starts_with("qb") || receiver_lower.starts_with("querybuilder") || receiver_lower.ends_with("qb") || receiver_lower.ends_with("query") || receiver_lower.ends_with("builder"); let is_builder_receiver_by_def = receiver_defined_by_builder_factory(ctx, sink, root_receiver); if !is_builder_receiver_by_name && !is_builder_receiver_by_def { return false; } // Once the receiver is proven to be a builder via def-call lookup, the // call is the builder-variant of `executeQuery` / `executeStatement` // regardless of argument count (Doctrine DBAL `QueryBuilder::executeQuery` // accepts only an optional `?Connection`, never a SQL string). When the // receiver was identified solely by its NAME, fall back to the byte-level // zero-arg check that guards the closure-captured shape so an unfamiliar // verb-named local (`$insert = "DROP TABLE..."`-bound mistake) doesn't // unconditionally suppress. if !is_builder_receiver_by_def && !callee_span_has_zero_args(info, ctx.source_bytes) { return false; } true } /// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink call's first /// positional argument is the result of a Doctrine DBAL safe-SQL accessor — /// either `.getSQL()` (parameterised SQL from a QueryBuilder chain) /// or a `Platform::get*SQL(...)` factory (`getTruncateTableSQL`, /// `getCreateTableSQL`, etc., which return DDL with no user-controlled bytes). /// /// Two paths: /// 1. Direct arg: `arg_callees[0]` names a recognised accessor. 
Catches /// `$conn->executeStatement($builder->getSQL(), ...)` and /// `$conn->executeStatement($platform->getTruncateTableSQL('t', false))`. /// 2. Indirect via local var: the arg is a bare identifier `$sql` whose /// most-recent same-function defining Call has a recognised accessor as /// its callee. Catches the migration shape /// `$sql = $this->dbc->getDatabasePlatform()->getTruncateTableSQL(...); /// $this->dbc->executeStatement($sql);` /// /// PHP-only: other languages have their own builder conventions (Java JPA's /// `CriteriaQuery` is already covered by `sink_args_jpa_criteria_query_safe`). fn sink_first_arg_is_builder_get_sql( ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap, ) -> bool { if !sink_caps.intersects(Cap::SQL_QUERY) { return false; } if sink_caps != Cap::SQL_QUERY { return false; } if ctx.lang != Lang::Php { return false; } let info = &ctx.cfg[sink]; // Path 1: direct method-call arg. if let Some(Some(arg_callee)) = info.arg_callees.first() { let suffix = arg_callee.rsplit('.').next().unwrap_or(arg_callee); if is_dbal_safe_sql_accessor(suffix) { return true; } } // Path 2: bare-identifier arg defined earlier by a recognised accessor. // Use `arg_uses[0]` (the first positional argument's identifier set) to // pick the candidate variable name. When `arg_uses` is empty (e.g. the // arg is a literal, an arithmetic expression, or a complex chain), no // back-walk is performed. 
let first_arg_use = info .call .arg_uses .first() .and_then(|grp| grp.first()) .map(|s| s.as_str()); let var_name = match first_arg_use { Some(n) if !n.is_empty() => n, _ => return false, }; let sink_func = info.ast.enclosing_func.as_deref(); let sink_span_start = info.ast.span.0; let mut best: Option<(usize, String)> = None; for nidx in ctx.cfg.node_indices() { let n = &ctx.cfg[nidx]; if n.kind != crate::cfg::StmtKind::Call { continue; } if n.taint.defines.as_deref() != Some(var_name) { continue; } if n.ast.enclosing_func.as_deref() != sink_func { continue; } let span_start = n.ast.span.0; if span_start >= sink_span_start { continue; } let Some(callee) = n.call.callee.as_deref() else { continue; }; match best { Some((s, _)) if s >= span_start => {} _ => best = Some((span_start, callee.to_string())), } } if let Some((_, callee)) = best { let suffix = callee.rsplit('.').next().unwrap_or(&callee); if is_dbal_safe_sql_accessor(suffix) { return true; } } false } /// Recognise method names that Doctrine DBAL exposes as safe-SQL accessors. /// `getSQL` is the QueryBuilder accessor; `get*SQL` (case-sensitive `SQL` /// suffix) is the Platform-specific DDL builder convention used across the /// `Doctrine\DBAL\Platforms\*` hierarchy (`getTruncateTableSQL`, /// `getCreateTableSQL`, `getDropTableSQL`, etc.). All such methods receive /// schema identifiers and emit DBMS-specific DDL, never weaving user payload. fn is_dbal_safe_sql_accessor(name: &str) -> bool { if name == "getSQL" { return true; } name.starts_with("get") && name.len() > 5 && name.ends_with("SQL") } /// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink's first /// positional argument *composes* a Doctrine DBAL safe-SQL accessor with /// constant string-shaping ops. Two real-world shapes from nextcloud: /// (a) `$conn->executeStatement(preg_replace('/^INSERT/i', 'INSERT IGNORE', /// $builder->getSQL()), ...)` /// (b) `$conn->executeStatement($builder->getSQL() . 
' ON CONFLICT DO /// NOTHING', ...)` /// /// Strategy (byte-level, conservative): /// 1. Lang-gate to PHP. Cap-gate to SQL_QUERY-only. /// 2. Extract the sink's first-positional-arg source bytes by balanced-paren /// walk inside the call's `ast.span`, with single/double-quoted-string /// awareness. /// 3. Scan arg-0 bytes for every PHP variable token `$`. Every var /// must be bound by a query-builder factory (`getQueryBuilder` / /// `createQueryBuilder` / `*queryBuilder`). Bypasses `arg_uses` because /// `collect_idents_with_paths` also surfaces method names (`getSQL`, /// `getParameters`) that are not variable references in PHP. /// 4. At least one var must appear in arg-0 bytes as the receiver of a DBAL /// safe-SQL accessor call (`$->getSQL(` or `$->get*SQL(`). /// /// The taint engine has already cleared this flow (gate is `!has_taint`), /// so the suppression's job is to silence the structural cfg-unguarded-sink /// over-fire on builder-composed SQL. PHP-only. fn sink_first_arg_composes_safe_dbal_sql( ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap, ) -> bool { if sink_caps != Cap::SQL_QUERY { return false; } if ctx.lang != Lang::Php { return false; } let info = &ctx.cfg[sink]; let Some(arg0_bytes) = first_positional_arg_bytes(info, ctx.source_bytes) else { return false; }; if arg0_bytes.is_empty() { return false; } let vars = extract_php_variables(arg0_bytes); if vars.is_empty() { return false; } let mut accessor_seen = false; for name in &vars { if !receiver_defined_by_builder_factory(ctx, sink, name) { return false; } if arg_bytes_call_dbal_accessor_on(arg0_bytes, name) { accessor_seen = true; } } accessor_seen } /// Extract the unique PHP variable identifiers appearing as `$` tokens /// in `bytes`. Skips the `$` sigil; variables tokens are alphanumeric + /// underscore. Order-stable (insertion order, with deduplication), so the /// caller's any-failure-bails loop deterministically rejects the first /// non-builder-bound var. 
fn extract_php_variables(bytes: &[u8]) -> Vec<String> {
    let mut result: Vec<String> = Vec::new();
    let mut i = 0usize;
    while i < bytes.len() {
        if bytes[i] != b'$' {
            i += 1;
            continue;
        }
        // `e` ends up one past the last identifier byte following the sigil.
        let mut e = i + 1;
        while e < bytes.len() && (bytes[e].is_ascii_alphanumeric() || bytes[e] == b'_') {
            e += 1;
        }
        // A lone `$` with no identifier bytes after it is not a variable.
        if e > i + 1 {
            if let Ok(name) = std::str::from_utf8(&bytes[i + 1..e]) {
                if !result.iter().any(|n| n == name) {
                    result.push(name.to_string());
                }
            }
        }
        // Always advance by at least one byte so the scan terminates.
        i = e.max(i + 1);
    }
    result
}

/// Extract the source bytes of the sink call's first positional argument.
///
/// Scans `info.ast.span` for the first `(` (outer args opener), then
/// balance-walks parens with single/double-quoted-string awareness, returning
/// the slice up to the first depth-1 `,` or the matching closing `)`.
/// PHP-shaped: handles `'...'` and `"..."` with backslash escapes; ignores
/// heredoc/nowdoc, which don't appear inside DBAL call-site argument lists
/// in practice. `callee_span` is intentionally ignored because the upstream
/// CFG narrowing path may set it to the *whole* call span (e.g. when a
/// `return $this->conn->executeStatement(...)` is lowered: `inner_text_span`
/// records the call's span via `first_call_ident_with_span`). Searching
/// from `ast.span.0` and matching the first `(` is robust across both
/// direct-call and statement-wrapped shapes.
///
/// Returns `None` if no `(` is found or the walk runs off the end of
/// `ast.span` without closing.
fn first_positional_arg_bytes<'a>( info: &crate::cfg::NodeInfo, bytes: &'a [u8], ) -> Option<&'a [u8]> { let span = info.ast.span; if span.1 > bytes.len() || span.0 >= span.1 { return None; } let mut i = span.0; while i < span.1 && bytes[i] != b'(' { i += 1; } if i >= span.1 { return None; } let arg_start = i + 1; let mut j = arg_start; let mut depth: i32 = 1; let mut quote: Option = None; while j < span.1 { let b = bytes[j]; if let Some(q) = quote { if b == b'\\' && j + 1 < span.1 { j += 2; continue; } if b == q { quote = None; } j += 1; continue; } match b { b'\'' | b'"' => { quote = Some(b); j += 1; } b'(' => { depth += 1; j += 1; } b')' => { depth -= 1; if depth == 0 { return Some(&bytes[arg_start..j]); } j += 1; } b',' if depth == 1 => { return Some(&bytes[arg_start..j]); } _ => j += 1, } } None } /// Return true if `arg0` contains a method-call against `recv_name` whose /// method matches [`is_dbal_safe_sql_accessor`]. Recognises the PHP /// member-access shape `$->(`. The backward walk stops at /// the first non-identifier byte; the immediately preceding byte must be /// the `$` sigil so `mybuilder->getSQL` does not match `recv = "builder"`. fn arg_bytes_call_dbal_accessor_on(arg0: &[u8], recv_name: &str) -> bool { if recv_name.is_empty() { return false; } let recv_bytes = recv_name.as_bytes(); let mut i = 0usize; while i + 1 < arg0.len() { if arg0[i] != b'-' || arg0[i + 1] != b'>' { i += 1; continue; } // Walk backward to capture the receiver identifier ending at i. let mut s = i; while s > 0 { let c = arg0[s - 1]; if c.is_ascii_alphanumeric() || c == b'_' { s -= 1; } else { break; } } if s == i || s == 0 || arg0[s - 1] != b'$' || &arg0[s..i] != recv_bytes { i += 2; continue; } // Walk forward to capture the method identifier following `->`. let mut e = i + 2; while e < arg0.len() { let c = arg0[e]; if c.is_ascii_alphanumeric() || c == b'_' { e += 1; } else { break; } } // Must be followed by `(`. 
if e < arg0.len() && arg0[e] == b'(' { if let Ok(method) = std::str::from_utf8(&arg0[i + 2..e]) { if is_dbal_safe_sql_accessor(method) { return true; } } } i += 2; } false } /// Suppress a `cfg-unguarded-sink` SQL_QUERY finding when the sink's first /// positional argument interpolates only PHP variables that are bound by a /// `foreach` over a literal-keyed array within the same function body. /// Real-world shape from nextcloud `lib/private/DB/MySqlTools.php:27`: /// ```php /// $variables = ['innodb_file_per_table' => 'ON']; /// if (...) { $variables['innodb_file_format'] = 'Barracuda'; } /// foreach ($variables as $var => $val) { /// $connection->executeQuery("SHOW VARIABLES LIKE '$var'"); /// } /// ``` /// The foreach-key `$var` ranges over `{innodb_file_per_table, /// innodb_file_format, innodb_large_prefix}`, all metachar-free, so the /// interpolated SQL is bounded. /// /// Strategy (byte-level, conservative): /// 1. Lang-gate to PHP. Cap-gate to SQL_QUERY-only. /// 2. Extract the sink's first-positional-arg source bytes; collect every /// `$` interpolation token. /// 3. For every var, walk the enclosing function bytes. Find the /// innermost `foreach ($X as $name => $...)` or `foreach ($X as $name)` /// pattern whose body contains the sink span, with `$name` matching /// the use site. /// 4. Find every assignment of `$X` in the function body. Each must be /// either an array literal `['LIT' => 'LIT', ...]` (key-arrow form) or /// a subscript-set `$X['LIT'] = 'LIT';`. Every key/value involved /// must be metachar-free (alphanumeric + `_`, `-`, `.`). /// 5. Whether the use site reads the foreach-key (`$key` slot) or /// foreach-value (`$val` slot), the corresponding literal set must be /// proven safe. /// /// PHP-only. Limited to the simple foreach + literal-array shape; bare- /// reference / by-reference foreach variants and dynamic array sources /// fall through to the structural finding. 
fn sink_arg_uses_safe_foreach_key(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
    // Gate: only PHP sinks whose capability set is exactly SQL_QUERY.
    if sink_caps != Cap::SQL_QUERY || ctx.lang != Lang::Php {
        return false;
    }

    let node = &ctx.cfg[sink];
    let first_arg = match first_positional_arg_bytes(node, ctx.source_bytes) {
        Some(bytes) if !bytes.is_empty() => bytes,
        _ => return false,
    };

    // Without interpolated variables there is nothing for this check to prove.
    let interpolated = extract_php_variables(first_arg);
    if interpolated.is_empty() {
        return false;
    }

    let Some(func_scope) = enclosing_func_byte_scope(ctx, sink) else {
        return false;
    };
    // Every interpolated variable must be proven safe; one failure keeps the finding.
    let sink_start = node.ast.span.0;
    interpolated.iter().all(|var| {
        php_var_safe_via_foreach_literal_array(ctx.source_bytes, func_scope, sink_start, var)
    })
}

/// Byte extent of the function enclosing `sink`, computed as
/// `(min span.0, max span.1)` over every CFG node that shares the sink's
/// `enclosing_func`. Returns `None` when the sink has no `enclosing_func`
/// (e.g. a file-level top-level statement) or when the folded range is
/// empty or degenerate.
fn enclosing_func_byte_scope(ctx: &AnalysisContext, sink: NodeIndex) -> Option<(usize, usize)> {
    let sink_func = ctx.cfg[sink].ast.enclosing_func.as_deref()?;
    let (lo, hi) = ctx
        .cfg
        .node_indices()
        .filter(|&n| ctx.cfg[n].ast.enclosing_func.as_deref() == Some(sink_func))
        .fold((usize::MAX, 0usize), |(lo, hi), n| {
            let span = ctx.cfg[n].ast.span;
            (lo.min(span.0), hi.max(span.1))
        });
    // The untouched seeds mean no node matched; `lo >= hi` is a degenerate range.
    if lo == usize::MAX || hi == 0 || lo >= hi {
        return None;
    }
    Some((lo, hi))
}

/// Walk `source[func_scope]` for `foreach (...)` blocks containing
/// `sink_span_start` in their body. Match the iteration pattern shape and
/// (when found) verify every assignment of the iterated identifier in the
/// function body is a literal-keyed array or a subscript-set with literal
/// key, with all keys/values metachar-free. Returns true only when *every*
/// candidate foreach proves safe; bails (returns false) on the first
/// failure to keep the suppression conservative.
fn php_var_safe_via_foreach_literal_array( source: &[u8], func_scope: (usize, usize), sink_span_start: usize, name: &str, ) -> bool { if name.is_empty() { return false; } if func_scope.0 >= func_scope.1 || func_scope.1 > source.len() { return false; } let scope = &source[func_scope.0..func_scope.1]; let sink_offset = if sink_span_start >= func_scope.0 { sink_span_start - func_scope.0 } else { return false; }; let needle = b"foreach"; let mut cursor = 0usize; let mut matched_any = false; while cursor + needle.len() <= scope.len() { let Some(rel) = find_subslice(&scope[cursor..], needle) else { break; }; let pos = cursor + rel; cursor = pos + needle.len(); // Require word boundary: prev byte (if any) must not be alnum/`_`. if pos > 0 { let prev = scope[pos - 1]; if prev.is_ascii_alphanumeric() || prev == b'_' { continue; } } // Skip whitespace; require `(`. let mut p = pos + needle.len(); while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') { p += 1; } if p >= scope.len() || scope[p] != b'(' { continue; } // Balanced walk to closing `)`. let header_open = p; let mut depth = 1i32; let mut q = p + 1; let mut quote: Option = None; while q < scope.len() && depth > 0 { let b = scope[q]; if let Some(c) = quote { if b == b'\\' && q + 1 < scope.len() { q += 2; continue; } if b == c { quote = None; } q += 1; continue; } match b { b'\'' | b'"' => quote = Some(b), b'(' => depth += 1, b')' => depth -= 1, _ => {} } q += 1; } if depth != 0 { continue; } let header_close = q - 1; // Skip whitespace; require `{`. let mut bp = header_close + 1; while bp < scope.len() && matches!(scope[bp], b' ' | b'\t' | b'\n' | b'\r') { bp += 1; } if bp >= scope.len() || scope[bp] != b'{' { continue; } // Balanced walk to closing `}`. 
let body_open = bp; let mut bdepth = 1i32; let mut bq = bp + 1; let mut bquote: Option = None; while bq < scope.len() && bdepth > 0 { let b = scope[bq]; if let Some(c) = bquote { if b == b'\\' && bq + 1 < scope.len() { bq += 2; continue; } if b == c { bquote = None; } bq += 1; continue; } match b { b'\'' | b'"' => bquote = Some(b), b'{' => bdepth += 1, b'}' => bdepth -= 1, _ => {} } bq += 1; } if bdepth != 0 { continue; } let body_end = bq - 1; // Sink position must lie inside the body. if sink_offset < body_open || sink_offset > body_end { continue; } let header = &scope[header_open + 1..header_close]; let Some((iter_var, key_var, val_var)) = parse_foreach_header(header) else { return false; }; let used_as_key = key_var.as_deref() == Some(name); let used_as_val = val_var.as_str() == name; if !used_as_key && !used_as_val { // The use site references some other variable; not bound by // this foreach. Continue scanning (might be a nested foreach). continue; } if !php_iter_var_assigns_safe_literals(scope, &iter_var, used_as_key, used_as_val) { return false; } matched_any = true; } matched_any } /// Parse a foreach header text (the bytes between `(` and `)`). Returns /// `(iter_var, key_var, value_var)`. Recognises `$X as $V` and /// `$X as $K => $V` shapes; bails (returns `None`) on by-reference /// (`& $V`), expressions (`call() as $V`), or any unexpected token. 
fn parse_foreach_header(header: &[u8]) -> Option<(String, Option<String>, String)> {
    let text = std::str::from_utf8(header).ok()?.trim();
    // The first whole-word `as` separates the iterated expression from the bindings.
    let as_at = find_word(text.as_bytes(), b"as")?;
    let iter_var = parse_simple_var(&text[..as_at])?;
    let binding = &text[as_at + 2..];
    match binding.split_once("=>") {
        // `$X as $K => $V`
        Some((key, value)) => {
            let key_var = parse_simple_var(key)?;
            let val_var = parse_simple_var(value)?;
            Some((iter_var, Some(key_var), val_var))
        }
        // `$X as $V`
        None => parse_simple_var(binding).map(|val_var| (iter_var, None, val_var)),
    }
}

/// Parse a lone `$<name>` token and return `<name>` without the sigil.
/// Surrounding whitespace is ignored; the name must be non-empty and made
/// only of ASCII alphanumerics and `_`. Anything else, such as by-reference
/// (`&$x`), splat (`...$x`) or list-destructuring shapes, produces `None`
/// so the suppression bails conservatively.
fn parse_simple_var(text: &str) -> Option<String> {
    let ident = text.trim().strip_prefix('$')?;
    let well_formed =
        !ident.is_empty() && ident.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_');
    well_formed.then(|| ident.to_string())
}

/// Find a whole-word match of `word` inside `text`. Word boundaries are
/// non-alnum/non-`_` bytes (or the buffer edges). Returns the byte offset
/// of the first match.
fn find_word(text: &[u8], word: &[u8]) -> Option { let mut cursor = 0usize; while cursor + word.len() <= text.len() { let rel = find_subslice(&text[cursor..], word)?; let pos = cursor + rel; let prev_ok = pos == 0 || { let p = text[pos - 1]; !(p.is_ascii_alphanumeric() || p == b'_') }; let next = pos + word.len(); let next_ok = next == text.len() || { let p = text[next]; !(p.is_ascii_alphanumeric() || p == b'_') }; if prev_ok && next_ok { return Some(pos); } cursor = pos + 1; } None } /// For every assignment of `$` inside `scope` (the enclosing /// function bytes), require every key/value referenced is a metachar-free /// string literal (alphanumeric, `_`, `-`, `.`, space). Recognises: /// * `$ = ['LIT' => 'LIT', ...];` (key-arrow array literal) /// * `$['LIT'] = 'LIT';` (subscript-set with literal key) /// /// Conservative: any other assignment shape, missing literals, or empty /// array set returns false. When `used_as_key` is true, the literal keys /// must be safe; when `used_as_val` is true, the literal values must be /// safe; both flags can be true at once. fn php_iter_var_assigns_safe_literals( scope: &[u8], iter_var: &str, used_as_key: bool, used_as_val: bool, ) -> bool { if iter_var.is_empty() { return false; } let needle: Vec = std::iter::once(b'$').chain(iter_var.bytes()).collect(); let mut cursor = 0usize; let mut saw_init = false; while cursor + needle.len() <= scope.len() { let Some(rel) = find_subslice(&scope[cursor..], &needle) else { break; }; let pos = cursor + rel; cursor = pos + 1; // Word-boundary on the trailing side: the next byte must not be // alnum/`_` (no `$variables_extra`). let after = pos + needle.len(); if after < scope.len() { let b = scope[after]; if b.is_ascii_alphanumeric() || b == b'_' { continue; } } // Skip trailing whitespace. 
let mut p = after; while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') { p += 1; } if p >= scope.len() { continue; } match scope[p] { b'=' => { // Direct assignment: `$X = ['k' => 'v', ...];` if p + 1 < scope.len() && scope[p + 1] == b'=' { continue; // comparison } if !php_check_array_literal_assignment(scope, p + 1, used_as_key, used_as_val) { return false; } saw_init = true; } b'[' // Subscript-set: `$X['LIT'] = 'LIT';` if !php_check_subscript_set(scope, p, used_as_key, used_as_val) => { return false; } _ => { // Other usage (foreach iter, function arg, member access). // Doesn't add to the literal set; allowed as long as no // unrecognised assignment shape appears. } } } saw_init } /// Validate an array-literal assignment after `$X =` (cursor points at /// the byte just after `=`). Allowed: optional whitespace, then `[ ... ];` /// where every element is `'LIT' => 'LIT'` with metachar-free literals. fn php_check_array_literal_assignment( scope: &[u8], after_eq: usize, used_as_key: bool, used_as_val: bool, ) -> bool { let mut p = after_eq; while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') { p += 1; } if p >= scope.len() || scope[p] != b'[' { return false; } let body_open = p + 1; let mut depth = 1i32; let mut q = body_open; let mut quote: Option = None; while q < scope.len() && depth > 0 { let b = scope[q]; if let Some(c) = quote { if b == b'\\' && q + 1 < scope.len() { q += 2; continue; } if b == c { quote = None; } q += 1; continue; } match b { b'\'' | b'"' => quote = Some(b), b'[' => depth += 1, b']' => depth -= 1, _ => {} } q += 1; } if depth != 0 { return false; } let body_close = q - 1; let elements = &scope[body_open..body_close]; php_check_kv_array_literal(elements, used_as_key, used_as_val) } /// Walk an array-literal body (between `[` and `]`). Each element must /// be `'LIT' => 'LIT'`. All keys/values used by the consumer must be /// metachar-free. 
fn php_check_kv_array_literal(elements: &[u8], used_as_key: bool, used_as_val: bool) -> bool { if elements.iter().all(|b| b.is_ascii_whitespace()) { return false; } // Split by `,` at depth 0. let mut start = 0usize; let mut quote: Option = None; let mut depth = 0i32; let mut any_pair = false; let mut i = 0usize; while i < elements.len() { let b = elements[i]; if let Some(c) = quote { if b == b'\\' && i + 1 < elements.len() { i += 2; continue; } if b == c { quote = None; } i += 1; continue; } match b { b'\'' | b'"' => quote = Some(b), b'[' | b'(' => depth += 1, b']' | b')' => depth -= 1, b',' if depth == 0 => { if !php_check_arrow_pair(&elements[start..i], used_as_key, used_as_val) { return false; } any_pair = true; start = i + 1; } _ => {} } i += 1; } let tail = &elements[start..]; if tail.iter().any(|b| !b.is_ascii_whitespace()) { if !php_check_arrow_pair(tail, used_as_key, used_as_val) { return false; } any_pair = true; } any_pair } /// Validate one `'LIT' => 'LIT'` pair. Both literals must be string /// literals (`'...'` or `"..."`) with metachar-free contents per /// `is_metachar_free_literal`. fn php_check_arrow_pair(pair: &[u8], used_as_key: bool, used_as_val: bool) -> bool { let text = std::str::from_utf8(pair).map(str::trim).unwrap_or(""); let mut split = text.splitn(2, "=>"); let k = match split.next() { Some(s) => s.trim(), None => return false, }; let v = match split.next() { Some(s) => s.trim(), None => return false, }; if used_as_key && !is_metachar_free_string_literal(k.as_bytes()) { return false; } if used_as_val && !is_metachar_free_string_literal(v.as_bytes()) { return false; } true } /// Validate a subscript-set assignment `$X[...] = ...;` starting at the /// `[` byte. Both the subscript key (when `used_as_key`) and the /// assigned value (when `used_as_val`) must be metachar-free string /// literals. 
fn php_check_subscript_set( scope: &[u8], open_bracket: usize, used_as_key: bool, used_as_val: bool, ) -> bool { let mut depth = 1i32; let mut q = open_bracket + 1; let mut quote: Option = None; while q < scope.len() && depth > 0 { let b = scope[q]; if let Some(c) = quote { if b == b'\\' && q + 1 < scope.len() { q += 2; continue; } if b == c { quote = None; } q += 1; continue; } match b { b'\'' | b'"' => quote = Some(b), b'[' => depth += 1, b']' => depth -= 1, _ => {} } q += 1; } if depth != 0 { return false; } let close_bracket = q - 1; let key_bytes = &scope[open_bracket + 1..close_bracket]; if used_as_key && !is_metachar_free_string_literal(key_bytes.trim_ascii()) { return false; } // Skip whitespace; require `=`, not `==`. let mut p = close_bracket + 1; while p < scope.len() && matches!(scope[p], b' ' | b'\t' | b'\n' | b'\r') { p += 1; } if p >= scope.len() || scope[p] != b'=' { return false; } if p + 1 < scope.len() && scope[p + 1] == b'=' { return false; } // Read the RHS up to the next `;` at depth 0 (no string awareness needed // beyond `;` because PHP statement separator). let mut q = p + 1; let mut quote: Option = None; let mut depth = 0i32; while q < scope.len() { let b = scope[q]; if let Some(c) = quote { if b == b'\\' && q + 1 < scope.len() { q += 2; continue; } if b == c { quote = None; } q += 1; continue; } match b { b'\'' | b'"' => quote = Some(b), b'(' | b'[' | b'{' => depth += 1, b')' | b']' | b'}' => depth -= 1, b';' if depth == 0 => break, _ => {} } q += 1; } let rhs = &scope[p + 1..q]; if used_as_val && !is_metachar_free_string_literal(rhs.trim_ascii()) { return false; } true } /// `true` when `bytes` form a single-quoted or double-quoted string /// literal whose contents are alphanumeric, `_`, `-`, `.`, or space — /// safe for SQL pattern literal interpolation. Rejects empty string, /// any escape sequences, control characters, quotes, semicolons, or /// shell/SQL metacharacters. 
fn is_metachar_free_string_literal(bytes: &[u8]) -> bool {
    match bytes {
        // Same quote byte on both ends and a non-empty body in between.
        [open @ (b'\'' | b'"'), inner @ .., close] if open == close && !inner.is_empty() => inner
            .iter()
            .all(|&b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.' | b' ')),
        _ => false,
    }
}

/// Check whether the source bytes inside the sink's `callee_span` (falling
/// back to the node's AST span when no callee span is recorded) end with a
/// zero-argument call form: trailing `)` preceded by `(` with only
/// whitespace in between. Trailing whitespace after the `)` is ignored.
/// Used to identify `qb.executeQuery()` / `qb.executeStatement()` where the
/// SQL was bound earlier on the receiver chain.
fn callee_span_has_zero_args(info: &crate::cfg::NodeInfo, bytes: &[u8]) -> bool {
    // Drop trailing ` `, `\t`, `\n` and `\r` bytes.
    fn trim_trailing_ws(mut text: &[u8]) -> &[u8] {
        while let [head @ .., b' ' | b'\t' | b'\n' | b'\r'] = text {
            text = head;
        }
        text
    }

    let span = info.call.callee_span.unwrap_or(info.ast.span);
    if span.0 >= span.1 || span.1 > bytes.len() {
        return false;
    }
    let call_text = trim_trailing_ws(&bytes[span.0..span.1]);
    let [before_close @ .., b')'] = call_text else {
        return false;
    };
    matches!(trim_trailing_ws(before_close), [.., b'('])
}

/// Detect that `receiver_name` was bound earlier in the same function by a
/// query-builder factory call. Two paths:
///   1. CFG def-call: a same-function Call node defines `receiver_name` with a
///      callee ending in `getQueryBuilder` / `createQueryBuilder`.
///   2. Source-text scan: between the enclosing function's first byte and the
///      sink's byte offset, the source contains
///      `$<receiver_name> = ... ->getQueryBuilder(...)` (or `createQueryBuilder`).
///      Picks up assignment nodes whose CFG kind/callee text doesn't surface a
///      leaf factory name (multi-line chains, `for`/`try` block nesting,
///      unusual lowering paths).
fn receiver_defined_by_builder_factory( ctx: &AnalysisContext, sink: NodeIndex, receiver_name: &str, ) -> bool { if receiver_name.is_empty() { return false; } let sink_info = &ctx.cfg[sink]; let sink_func = sink_info.ast.enclosing_func.as_deref(); let sink_span_start = sink_info.ast.span.0; // Path 1: CFG-level def lookup. let mut best: Option<(usize, String)> = None; for nidx in ctx.cfg.node_indices() { let n = &ctx.cfg[nidx]; if n.kind != crate::cfg::StmtKind::Call { continue; } if n.taint.defines.as_deref() != Some(receiver_name) { continue; } if n.ast.enclosing_func.as_deref() != sink_func { continue; } let span_start = n.ast.span.0; if span_start >= sink_span_start { continue; } let Some(callee) = n.call.callee.as_deref() else { continue; }; match best { Some((s, _)) if s >= span_start => {} _ => best = Some((span_start, callee.to_string())), } } if let Some((_, callee)) = best { let suffix = callee.rsplit('.').next().unwrap_or(&callee); let suffix_lower = suffix.to_ascii_lowercase(); if matches!( suffix_lower.as_str(), "getquerybuilder" | "createquerybuilder" | "getqb" | "createqb" ) || suffix_lower.ends_with("querybuilder") { return true; } } // Path 2: source-text scan over the enclosing function's body. Some // builder assignments (multi-line chains, deeply nested in `try`/`for` // bodies) bind `defines` to a synthesised name that doesn't match // `receiver_name` exactly. A direct byte scan for an assignment shape // catches these without depending on CFG synthesis details. let func_start = ctx .cfg .node_indices() .filter_map(|i| { let n = &ctx.cfg[i]; if n.ast.enclosing_func.as_deref() == sink_func { Some(n.ast.span.0) } else { None } }) .min() .unwrap_or(0); let bytes = ctx.source_bytes; let lo = func_start.min(bytes.len()); let hi = sink_span_start.min(bytes.len()); if lo >= hi { return false; } let scope = &bytes[lo..hi]; text_contains_builder_factory_assignment(scope, receiver_name) } /// Search `scope` for `$ = ... 
(...)` where `` ends /// with `getQueryBuilder` / `createQueryBuilder` (case-insensitive). Used as a /// byte-level fallback for CFG def-lookup that misses multi-line chained /// assignments inside nested `try` / `for` bodies. fn text_contains_builder_factory_assignment(scope: &[u8], name: &str) -> bool { if name.is_empty() { return false; } let needle: Vec = std::iter::once(b'$').chain(name.bytes()).collect(); let mut start = 0usize; while start + needle.len() <= scope.len() { let Some(rel) = find_subslice(&scope[start..], &needle) else { return false; }; let mut cursor = start + rel + needle.len(); // Require an immediate `=` (allow whitespace before). while cursor < scope.len() && matches!(scope[cursor], b' ' | b'\t' | b'\n' | b'\r') { cursor += 1; } if cursor < scope.len() && scope[cursor] == b'=' && (cursor + 1 == scope.len() || scope[cursor + 1] != b'=') { // Find the next `;` (statement terminator) without crossing a // closing brace boundary, the assignment expression spans up to it. let mut end = cursor + 1; while end < scope.len() { let b = scope[end]; if b == b';' || b == b'\n' && end + 1 < scope.len() && scope[end + 1] == b'\n' { break; } end += 1; } let rhs_lower: Vec = scope[cursor + 1..end] .iter() .map(|b| b.to_ascii_lowercase()) .collect(); if find_subslice(&rhs_lower, b"getquerybuilder").is_some() || find_subslice(&rhs_lower, b"createquerybuilder").is_some() { return true; } } start = start + rel + 1; } false } fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option { if needle.is_empty() || needle.len() > haystack.len() { return None; } haystack.windows(needle.len()).position(|w| w == needle) } /// Walk the sink's Call SSA arguments and check whether every real argument /// resolves through a defining `SsaOp::Call` whose callee carries an SSA /// summary with `validated_params_to_return` covering every propagating /// parameter slot the caller's argument flows into. 
When that holds, the /// helper validates each argument on every taint-carrying return path, and /// the call result is structurally validated even though no syntactic guard /// dominates the sink in the caller's body. /// /// Conservative: returns `false` whenever any required fact is missing, /// any operand is non-Call-defined and not a constant/parameter, or any /// callee summary lacks the validated transform. Real arguments only — /// the same `is_real_arg` filter as `sink_args_typed_safe` skips /// callee-fragment pseudo-uses and SSA constants. fn sink_args_summary_validated_safe(ctx: &AnalysisContext, sink: NodeIndex) -> bool { // Per-file SSA summary map carries the augment + rerun-pass merges // that GlobalSummaries may not yet reflect on single-file scans; // fall back to GlobalSummaries when the per-file map isn't threaded // through (legacy callers). let local_map = ctx.ssa_summaries; let global_map = ctx.global_summaries.map(|g| g.snapshot_ssa()); if local_map.is_none() && global_map.is_none() { return false; } let sink_info = &ctx.cfg[sink]; use crate::cfg::StmtKind; // Collect per-arg use names. Prefer `call.arg_uses` (positional, tighter // scope), fall back to `taint.uses` minus callee-fragment names when // `arg_uses` wasn't extracted (e.g. `await db.execute(sql)` where the // CFG saw the await wrapper rather than the underlying call_expression). 
let callee_desc = sink_info.call.callee.as_deref().unwrap_or(""); let callee_parts: Vec<&str> = callee_desc .split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect(); let outer_parts: Vec<&str> = sink_info .call .outer_callee .as_deref() .map(|oc| { oc.split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect() }) .unwrap_or_default(); let mut arg_use_names: Vec = Vec::new(); if !sink_info.call.arg_uses.is_empty() { for group in &sink_info.call.arg_uses { for u in group { if !arg_use_names.iter().any(|n| n == u) { arg_use_names.push(u.clone()); } } } } if arg_use_names.is_empty() { for u in &sink_info.taint.uses { if is_callee_fragment(u, callee_desc, &callee_parts, &outer_parts) { continue; } if !arg_use_names.iter().any(|n| n == u) { arg_use_names.push(u.clone()); } } } if arg_use_names.is_empty() { return false; } // Match callee text against any SSA summary key registered in // GlobalSummaries by leaf name. Conservative: require an exact // single-match so ambiguous overloads fall through to the default // structural-finding path. let lookup_validated = |callee_text: &str| -> Option { let leaf = callee_leaf_name(callee_text); let mut matches: Vec<&crate::summary::ssa_summary::SsaFuncSummary> = Vec::new(); if let Some(map) = local_map { for (key, sum) in map { if key.name == leaf || key.name == callee_text { matches.push(sum); } } } if matches.is_empty() { if let Some(map) = global_map { for (key, sum) in map { if key.name == leaf || key.name == callee_text { matches.push(sum); } } } } if matches.len() != 1 { return None; } let sum = matches[0]; if sum.validated_params_to_return.is_empty() { return Some(false); } // Every propagating parameter must be in validated_params_to_return. // When the callee doesn't propagate taint at all, the call result // cannot carry caller-side taint, so a non-empty validation set is // sufficient. 
let propagates = sum .param_to_return .iter() .map(|(idx, _)| *idx) .collect::>(); if propagates.is_empty() { return Some(true); } let all_validated = propagates .iter() .all(|p| sum.validated_params_to_return.contains(p)); Some(all_validated) }; // Walk CFG predecessors of `sink` looking for nodes that define an // arg-use name via a Call to an in-file helper. Conservative // traversal: stops at the body entry, follows Seq/Branch edges, // bails out on join/branch back-edges (loops) to keep the analysis // bounded. let mut to_validate: Vec = arg_use_names.clone(); let mut visited: HashSet = HashSet::new(); let mut frontier: Vec = ctx .cfg .neighbors_directed(sink, petgraph::Direction::Incoming) .collect(); let mut iter_budget = 256usize; while let Some(n) = frontier.pop() { if iter_budget == 0 { return false; } iter_budget -= 1; if !visited.insert(n) { continue; } let info = &ctx.cfg[n]; if info.kind == StmtKind::Call { if let Some(def_name) = info.taint.defines.as_deref() { if let Some(pos) = to_validate.iter().position(|u| u == def_name) { let callee = info.call.callee.as_deref().unwrap_or(""); if !matches!(lookup_validated(callee), Some(true)) { return false; } to_validate.remove(pos); if to_validate.is_empty() { return true; } } } } for pred in ctx.cfg.neighbors_directed(n, petgraph::Direction::Incoming) { frontier.push(pred); } } // Some arg-use names didn't map to an in-body Call definition (e.g. // they bind to a function parameter, an import, or a literal). // Only suppress when EVERY tainted-shaped arg has been validated by // an in-file helper summary; otherwise fall through. to_validate.is_empty() } /// Thin wrapper around [`crate::ssa::type_facts::is_type_safe_for_sink`] kept /// local so the unit tests here can exercise the exact predicate used at the /// `cfg-unguarded-sink` emission site. 
fn type_facts_suppress(values: &[SsaValue], sink_caps: Cap, type_facts: &TypeFactResult) -> bool { crate::ssa::type_facts::is_type_safe_for_sink(values, sink_caps, type_facts) } /// Suppress a `cfg-unguarded-sink` finding when every real argument SSA /// value resolves to a finite set of metacharacter-free literals, as proved /// by the static-map analysis. Runs in lock-step with the SSA taint /// suppression so both findings paths agree on when a provably-bounded /// lookup idiom (e.g. `map.get(x).unwrap_or("safe")` over literal inserts) /// should clear a command-injection sink. /// /// Only fires for `Cap::SHELL_ESCAPE`, SQL / path suppression from this /// domain would require stronger reasoning (literal keys can still carry /// SQL tokens if the inserts themselves contain them). fn sink_args_static_map_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { if !sink_caps.intersects(Cap::SHELL_ESCAPE) { return false; } let Some(facts) = ctx.body_const_facts else { return false; }; let Some(&sink_val) = facts.ssa.cfg_node_map.get(&sink) else { return false; }; let Some(inst) = find_inst(&facts.ssa, sink_val) else { return false; }; let SsaOp::Call { args, receiver, .. } = &inst.op else { return false; }; let sm = crate::ssa::static_map::analyze(&facts.ssa, ctx.cfg, Some(ctx.lang), &facts.const_values); if sm.is_empty() { return false; } // Skip callee-fragment pseudo-uses the same way `sink_args_typed_safe` // does so only real runtime arg values participate in the check. 
let sink_info = &ctx.cfg[sink]; let callee_desc = sink_info.call.callee.as_deref().unwrap_or(""); let callee_parts: Vec<&str> = callee_desc .split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect(); let outer_parts: Vec<&str> = sink_info .call .outer_callee .as_deref() .map(|oc| { oc.split(['.', ':']) .map(|p| p.split('(').next().unwrap_or(p)) .collect() }) .unwrap_or_default(); let is_real_arg = |v: SsaValue| -> bool { let Some(def) = find_inst(&facts.ssa, v) else { return true; }; match &def.op { SsaOp::Param { .. } => { let name = def.var_name.as_deref().unwrap_or(""); !is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts) } SsaOp::Const(_) => false, _ => true, } }; let mut values: Vec = Vec::new(); if let Some(r) = receiver { if is_real_arg(*r) { values.push(*r); } } for group in args { for v in group.iter() { if is_real_arg(*v) { values.push(*v); } } } if values.is_empty() { return false; } values.iter().all(|v| match sm.finite_string_values.get(v) { Some(set) if !set.is_empty() => set .iter() .all(|s| crate::abstract_interp::string_domain::is_shell_safe_literal(s)), _ => false, }) } /// Check if a callee matches any of the runtime label rules that are sanitizers. fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option { // Lazily compute lowercased callee only when a case-insensitive rule is hit. 
let mut callee_lower: Option = None; for rule in extra { let cap = match rule.label { DataLabel::Sanitizer(c) => c, _ => continue, }; for m in &rule.matchers { if rule.case_sensitive { if m.ends_with('_') { if callee.starts_with(m.as_str()) { return Some(cap); } } else if callee.ends_with(m.as_str()) { return Some(cap); } } else { let cl = callee_lower.get_or_insert_with(|| callee.to_ascii_lowercase()); let ml = m.to_ascii_lowercase(); if ml.ends_with('_') { if cl.starts_with(&ml) { return Some(cap); } } else if cl.ends_with(&ml) { return Some(cap); } } } } None } /// Resolve the `if (X)` / `if (!X)` indirect-validator pattern: the /// condition has exactly one bare-identifier variable whose defining /// CFG node is a [`StmtKind::Call`] whose `defines` is the same name /// and whose `callee` is recognised by /// [`crate::ssa::type_facts::classify_input_validator_callee`]. /// /// Returns the validator callee name when the pattern matches, `None` /// otherwise. Conservative: bails when the condition has zero or more /// than one variable, when no defining call is found, or when the /// callee doesn't match a validator pattern. Mirrors the SSA /// branch-narrowing layer /// ([`crate::taint::ssa_transfer::apply_input_validator_branch_narrowing`]) /// so the structural `cfg-unguarded-sink` suppression matches the /// taint engine's validator recognition. /// /// Driven off CFG `TaintMeta.defines` rather than the per-body SSA /// value-defs because nested arrow-function bodies are sometimes /// lowered with empty SSA in the cfg-analysis context, but the CFG /// nodes themselves carry `defines` in every body. 
fn cond_indirect_validator_callee( info: &crate::cfg::NodeInfo, ctx: &AnalysisContext, ) -> Option { if info.condition_vars.len() != 1 { return None; } let var_name = info.condition_vars[0].as_str(); let cond_func = info.ast.enclosing_func.as_deref(); let cond_span_start = info.ast.span.0; // Walk the CFG for any node that DEFINES `var_name` via a Call // expression. Same-function only, and only consider definitions // textually before the condition: a reassignment after the `if` // cannot be the def reaching it. Among the eligible defs, take // the textually-last one (highest span start), a conservative // latest-def proxy without paying for full dominator analysis. let mut best: Option<(usize, &str)> = None; for nidx in ctx.cfg.node_indices() { let n = &ctx.cfg[nidx]; if n.kind != crate::cfg::StmtKind::Call { continue; } if n.taint.defines.as_deref() != Some(var_name) { continue; } if n.ast.enclosing_func.as_deref() != cond_func { continue; } let span_start = n.ast.span.0; if span_start >= cond_span_start { continue; } let Some(callee) = n.call.callee.as_deref() else { continue; }; match best { Some((s, _)) if s >= span_start => {} _ => best = Some((span_start, callee)), } } let (_, callee) = best?; crate::ssa::type_facts::classify_input_validator_callee(callee).map(|_| callee.to_string()) } /// Find all nodes in the CFG that are calls to guard functions. fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> { let guard_rules = rules::guard_rules(ctx.lang); let config_rules = ctx .analysis_rules .map(|r| r.extra_labels.as_slice()) .unwrap_or(&[]); let mut result = Vec::new(); for idx in ctx.cfg.node_indices() { let info = &ctx.cfg[idx]; // If-condition guards: allowlist checks, type checks, validation // calls, shell-metachar rejections, and bounded-length checks in // branch conditions act as guards for downstream sinks. 
if info.kind == StmtKind::If { if let Some(cond_text) = &info.condition_text { let kind = classify_condition(cond_text); // For `AllowlistCheck`, also confirm a target identifier was // extractable. When the receiver-method form carries a // string-literal arg (`filePath.includes("/")`, // `path.contains("..")`), `extract_allowlist_target` returns // `None` because the argument isn't an identifier. Those // shapes are presence-checks, not real allowlist tests against // a collection variable, and shouldn't dominate every // downstream sink as a structural guard with `Cap::all()`. // `classify_condition` itself stays unchanged (an existing // test locks in its broad return for the receiver-method form, // and the SSA branch-narrowing layer reads the kind for its // own purposes). let allowlist_has_target = if kind == PredicateKind::AllowlistCheck { crate::taint::path_state::classify_condition_with_target(cond_text) .1 .is_some() } else { true }; if matches!( kind, PredicateKind::TypeCheck | PredicateKind::ValidationCall, ) || (kind == PredicateKind::AllowlistCheck && allowlist_has_target) { result.push((idx, Cap::all())); } else if cond_indirect_validator_callee(info, ctx).is_some() { // Indirect-validator pattern: // const err = validate(x); if (err) throw …; // const ok = isValid(x); if (!ok) throw …; // The classifier returns Unknown / NullCheck / ErrorCheck // because the if-condition is a bare result variable, not // a direct call expression. `cond_indirect_validator_callee` // handles that by scanning the CFG for nodes whose // `TaintMeta.defines` matches the condition variable and // checking whether any defining Call has an // `is_input_validator_callee`-recognised callee. This keeps // cfg-unguarded-sink suppression aligned with the same // structural validator recognition the SSA branch-narrowing // layer uses, without requiring the condition itself to be // a direct call expression. // // Motivated by Novu CVE GHSA-4x48-cgf9-q33f. 
result.push((idx, Cap::all())); } else if matches!( kind, PredicateKind::ShellMetaValidated | PredicateKind::BoundedLength ) { // Shell-metachar rejection and bounded-length checks only // guard shell-family sinks. Keep scope tight so unrelated // sinks (SQL, XSS) aren't silenced when a shell gate // happens to sit upstream. result.push((idx, Cap::SHELL_ESCAPE | Cap::CODE_EXEC)); } else { // Path-traversal rejection guard. When the condition // matches a path-rejection idiom recognised by // `classify_path_rejection_axes` (`strstr(p, "..")` // / `.contains("..")` / `strings.Contains(p, "..")` // / `p[0] == '/'` / `path.is_absolute()` / etc.), // it acts as a guard for FILE_IO sinks. Catches // the C/C++ `if (strstr(p, "..") != NULL)` shape // whose `!= NULL` wrapper otherwise falls through // to NullCheck classification and never registers // as a guard. Scope kept to FILE_IO so unrelated // sinks aren't silenced. let axes = crate::abstract_interp::path_domain::classify_path_rejection_axes( cond_text, ); if !axes.is_empty() { result.push((idx, Cap::FILE_IO)); } } } } if info.kind != StmtKind::Call { continue; } if let Some(callee) = &info.call.callee { // Check config sanitizer rules first if let Some(cap) = match_config_sanitizer(callee, config_rules) { result.push((idx, cap)); continue; } // Then check built-in guard rules let callee_lower = callee.to_ascii_lowercase(); for rule in guard_rules { let matched = rule.matchers.iter().any(|m| { let ml = m.to_ascii_lowercase(); if ml.ends_with('_') { callee_lower.starts_with(&ml) } else { callee_lower.ends_with(&ml) } }); if matched { result.push((idx, rule.applies_to_sink_caps)); break; } } } } result } /// Check whether taint analysis confirmed unsanitized flow to this sink node. 
fn taint_confirms_sink(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
    // One taint finding terminating at this node is enough to confirm it.
    for finding in ctx.taint_findings.iter() {
        if finding.sink == sink {
            return true;
        }
    }
    false
}

/// Report whether the sink reads a variable that is defined by a
/// Source-labelled node inside the same enclosing function.
///
/// This is a single-step def-use match on variable names: a Source node
/// whose `defines` equals one of the sink's `uses`. Returns `false` when the
/// sink reads no variables at all.
fn sink_arg_is_source_derived(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
    let target = &ctx.cfg[sink];

    // Variables the sink reads; nothing to match when there are none.
    let reads = &target.taint.uses;
    if reads.is_empty() {
        return false;
    }

    let scope = target.ast.enclosing_func.as_deref();
    ctx.cfg.node_indices().any(|idx| {
        let node = &ctx.cfg[idx];
        // Only nodes of the sink's own function are relevant.
        if node.ast.enclosing_func.as_deref() != scope {
            return false;
        }
        let is_source = node
            .taint
            .labels
            .iter()
            .any(|label| matches!(label, DataLabel::Source(_)));
        // A Source node defining a variable the sink reads → source-derived.
        is_source
            && node
                .taint
                .defines
                .as_ref()
                .is_some_and(|def| reads.iter().any(|used| used == def))
    })
}

/// Check whether the sink's arguments are *only* function parameters
/// (i.e. this function is a thin wrapper around the sink).
fn sink_arg_is_parameter_only(ctx: &AnalysisContext, sink: NodeIndex) -> bool { let sink_info = &ctx.cfg[sink]; let sink_func = sink_info.ast.enclosing_func.as_deref(); let sink_uses = &sink_info.taint.uses; if sink_uses.is_empty() { // No identifiable arguments, could be a constant call like Command::new("ls") return true; // treat as non-dangerous (constant arg) } // Collect parameter names for the enclosing function from FuncSummaries let param_names: Vec<&str> = ctx .func_summaries .values() .filter(|s| { // Match by function entry being in the same function ctx.cfg[s.entry].ast.enclosing_func.as_deref() == sink_func }) .flat_map(|s| s.param_names.iter().map(|p| p.as_str())) .collect(); if param_names.is_empty() { return false; // can't determine params } // The sink's `taint.uses` includes pseudo-uses for callee-chain segments // when the chain is rooted at a self-pseudo-receiver (`this`, `self`, // `static`, `parent`). In that case every segment of the chain is part // of the dotted callee path that tree-sitter records as identifier // children of the call expression, not a real argument. This shape // covers thin method wrappers like // `function wrap($sql) { return $this->inner->execute($sql); }` so the // sink is recognised as parameter-only despite `this` / `inner` / // `execute` showing up in `taint.uses`. // // For other callee chains (e.g. Python `cursor.execute(name)` where // `cursor` is a local variable from `connection.cursor()`), only the // method name itself (`execute`) is filtered. `cursor` is a real // identifier value — a non-param local — and must not be filtered, // otherwise wrappers around external receivers get suppressed // incorrectly. // // PHP variable receivers carry a leading `$` (`$this->inner->execute`) // and use `->` between the receiver and member, so split on the full // set of separators and strip a leading `$` so identifier-shaped // fragments line up with bare identifier names in `taint.uses`. 
// // Each segment carries an `is_call` flag so chain pieces that are // themselves method invocations (`getSession()` in // `getSession().createQuery(qs)`) can be recognised as pseudo-uses // alongside the terminal method name. Variable-receiver chains like // `cursor.execute(name)` keep `cursor` as a real identifier and stay // out of the param-only filter. let callee_desc = sink_info.call.callee.as_deref().unwrap_or(""); let outer_callee = sink_info.call.outer_callee.as_deref().unwrap_or(""); fn split_chain_with_flags(s: &str) -> SmallVec<[(&str, bool); 8]> { let mut out: SmallVec<[(&str, bool); 8]> = SmallVec::new(); for piece in s.split(['.', ':', '>', '-']) { let stripped = piece.trim_start_matches('$').trim(); if stripped.is_empty() { continue; } let (name, is_call) = match stripped.find('(') { Some(idx) => (stripped[..idx].trim(), true), None => (stripped, false), }; if !name.is_empty() { out.push((name, is_call)); } } out } fn is_self_root(seg: &str) -> bool { matches!(seg, "this" | "self" | "static" | "parent" | "cls") } let mut callee_fragments: SmallVec<[&str; 8]> = SmallVec::new(); for src in [callee_desc, outer_callee] { let segs = split_chain_with_flags(src); let Some(&(first_name, _)) = segs.first() else { continue; }; let last_idx = segs.len() - 1; if is_self_root(first_name) { // Whole chain is callee path: `$this->inner->execute` → // every segment is a pseudo-use. for &(name, _) in &segs { if !callee_fragments.contains(&name) { callee_fragments.push(name); } } } else { // The terminal method name is a pseudo-use. Any non-last // segment that is itself a method call (`getSession()` in // `getSession().createQuery(qs)`) is also a pseudo-use, since // the segment text in the chain refers to a method name, not // a local variable. Bare-identifier receivers like `cursor` // in `cursor.execute(name)` carry no `(` and stay as real // local-variable values. 
for (i, &(name, is_call)) in segs.iter().enumerate() { if (is_call || i == last_idx) && !callee_fragments.contains(&name) { callee_fragments.push(name); } } } } // Source-text scan: `callee_desc` collapses chains via `root_receiver_text`, // so `getSession().getCriteriaBuilder().createQuery(qs)` reduces to // `"getSession().createQuery"` and the intermediate `getCriteriaBuilder` // is missing. Walk the sink's source bytes up to the outermost args // opener and lift every `IDENT(` pattern as a method-call pseudo-use. // Identifiers nested inside earlier `()` groups (which open at depth 0 // for sibling method calls in a chain) are picked up too, so every // chain hop contributes its method name. let span = sink_info.classification_span(); let (start, end) = span; if start < ctx.source_bytes.len() && end <= ctx.source_bytes.len() && start < end { let span_bytes = &ctx.source_bytes[start..end]; if let Ok(span_text) = std::str::from_utf8(span_bytes) { let bytes = span_text.as_bytes(); // Find the outermost args-opener: the last `(` at depth 0. let mut depth: i32 = 0; let mut last_open_at_zero: Option = None; for (i, &b) in bytes.iter().enumerate() { match b { b'(' => { if depth == 0 { last_open_at_zero = Some(i); } depth += 1; } b')' => { depth = depth.saturating_sub(1); } _ => {} } } let chain_end = last_open_at_zero.unwrap_or(bytes.len()); // Walk the chain prefix and lift every identifier directly followed // by `(` as a method-call pseudo-use. let mut i = 0; while i < chain_end { let b = bytes[i]; let is_ident_start = b.is_ascii_alphabetic() || b == b'_'; if !is_ident_start { i += 1; continue; } let id_start = i; while i < chain_end { let c = bytes[i]; if c.is_ascii_alphanumeric() || c == b'_' { i += 1; } else { break; } } if i < chain_end && bytes[i] == b'(' { let name = &span_text[id_start..i]; if !callee_fragments.contains(&name) { callee_fragments.push(name); } } } } } // Strict parameter set scoped to the sink's enclosing function only. 
// Used for the local-trace fallback below to prevent over-suppression // when sibling functions in the same file happen to share param names // with the current scope (e.g. a constructor's `dbConn` param leaking // into the `param_names` view of an unrelated `logAuditEvent` body). // The existing broad `param_names` view is preserved for the direct // in-list check above so legacy suppression behaviour is unchanged. let strict_param_names: SmallVec<[&str; 8]> = ctx .func_summaries .iter() .filter(|(key, _)| sink_func.is_some_and(|name| key.name.as_str() == name)) .flat_map(|(_, s)| s.param_names.iter().map(|p| p.as_str())) .collect(); sink_uses.iter().all(|u| { if callee_fragments.contains(&u.as_str()) || u == callee_desc { return true; } if param_names.contains(&u.as_str()) { return true; } // One-hop transitive local trace: when a sink use names a body // local whose every definition resolves to parameter-derived // data (e.g. `Statement stmt = connection.createStatement(); // stmt.executeQuery(sql);` where `connection` is a param), the // local is wrapper plumbing. Receiver-variable shapes whose // definitions reach a free (non-param, non-local) identifier or // a Source label fail the trace and keep the structural finding. if strict_param_names.is_empty() { return false; } let mut seen: SmallVec<[&str; 4]> = SmallVec::new(); local_is_param_derived( ctx, sink_func, &strict_param_names, &callee_fragments, u.as_str(), 3, &mut seen, ) }) } /// Recursive trace, return true iff every definition of `name` inside /// `sink_func` has its right-hand-side fully resolvable to parameter /// names, callee fragments, or other already-cleared body locals. Bounded /// by `depth` to prevent runaway on pathological CFGs and uses `seen` to /// short-circuit cycles (a local whose definition mentions itself does /// not clear). Called from `sink_arg_is_parameter_only` once the simple /// param / callee-fragment / source-text check has failed. 
fn local_is_param_derived<'a>( ctx: &'a AnalysisContext, sink_func: Option<&str>, param_names: &[&'a str], callee_fragments: &[&'a str], name: &'a str, depth: u8, seen: &mut SmallVec<[&'a str; 4]>, ) -> bool { if depth == 0 || seen.contains(&name) { return false; } seen.push(name); let mut found_def = false; let mut all_def_clear = true; for idx in ctx.cfg.node_indices() { let info = &ctx.cfg[idx]; if info.ast.enclosing_func.as_deref() != sink_func { continue; } if info.taint.defines.as_deref() != Some(name) { continue; } found_def = true; if info .taint .labels .iter() .any(|l| matches!(l, DataLabel::Source(_))) { all_def_clear = false; break; } // Compute the defining node's own callee fragments so method-name // segments (e.g. `createStatement` in `statement = // connection.createStatement();`) are recognised as pseudo-uses // alongside the receiver variable. Without this, the trace // wrongly rejects every chained method initialisation. The // source-text scan below also lifts intermediate method calls // (`unwrap` in `connection.unwrap().createStatement`) that the // collapsed `info.call.callee` drops. let def_fragments = chain_callee_fragments_with_text( info.call.callee.as_deref().unwrap_or(""), info.call.outer_callee.as_deref().unwrap_or(""), ctx.source_bytes, info.classification_span(), ); let clear = info.taint.uses.iter().all(|u| { param_names.contains(&u.as_str()) || callee_fragments.contains(&u.as_str()) || def_fragments.contains(&u.as_str()) || local_is_param_derived( ctx, sink_func, param_names, callee_fragments, u.as_str(), depth - 1, seen, ) }); if !clear { all_def_clear = false; break; } } seen.pop(); found_def && all_def_clear } /// Split a callee chain like `getSession().createQuery` or /// `connection.createStatement` into method-name segments treated as /// pseudo-uses. 
Also walks `source_bytes[span]` up to the outermost /// args-opener and lifts every `IDENT(` pattern, recovering intermediate /// method-call segments that the collapsed `info.call.callee` text drops /// (e.g. `unwrap` in `connection.unwrap().createStatement()`). Mirrors /// the in-place chain split inside `sink_arg_is_parameter_only` so trace /// nodes get the same recognition as the sink itself. Self-rooted /// chains (`this->...`, `self.foo`) surface every segment; other chains /// surface only the terminal method name plus any inner method-call /// segments. fn chain_callee_fragments_with_text<'a>( callee: &'a str, outer: &'a str, source_bytes: &'a [u8], span: (usize, usize), ) -> SmallVec<[&'a str; 8]> { fn split_chain<'b>(s: &'b str) -> SmallVec<[(&'b str, bool); 8]> { let mut out: SmallVec<[(&'b str, bool); 8]> = SmallVec::new(); for piece in s.split(['.', ':', '>', '-']) { let stripped = piece.trim_start_matches('$').trim(); if stripped.is_empty() { continue; } let (name, is_call) = match stripped.find('(') { Some(idx) => (stripped[..idx].trim(), true), None => (stripped, false), }; if !name.is_empty() { out.push((name, is_call)); } } out } fn is_self_root(seg: &str) -> bool { matches!(seg, "this" | "self" | "static" | "parent" | "cls") } let mut frags: SmallVec<[&str; 8]> = SmallVec::new(); for src in [callee, outer] { let segs = split_chain(src); let Some(&(first_name, _)) = segs.first() else { continue; }; let last_idx = segs.len() - 1; if is_self_root(first_name) { for &(name, _) in &segs { if !frags.contains(&name) { frags.push(name); } } } else { for (i, &(name, is_call)) in segs.iter().enumerate() { if (is_call || i == last_idx) && !frags.contains(&name) { frags.push(name); } } } } let (start, end) = span; if start < source_bytes.len() && end <= source_bytes.len() && start < end { let span_bytes = &source_bytes[start..end]; if let Ok(span_text) = std::str::from_utf8(span_bytes) { let bytes = span_text.as_bytes(); let mut depth: i32 = 0; let mut 
last_open_at_zero: Option = None; for (i, &b) in bytes.iter().enumerate() { match b { b'(' => { if depth == 0 { last_open_at_zero = Some(i); } depth += 1; } b')' => { depth = depth.saturating_sub(1); } _ => {} } } let chain_end = last_open_at_zero.unwrap_or(bytes.len()); let mut i = 0; while i < chain_end { let b = bytes[i]; let is_ident_start = b.is_ascii_alphabetic() || b == b'_'; if !is_ident_start { i += 1; continue; } let id_start = i; while i < chain_end { let c = bytes[i]; if c.is_ascii_alphanumeric() || c == b'_' { i += 1; } else { break; } } if i < chain_end && bytes[i] == b'(' { let name = &span_text[id_start..i]; let abs_start = start + id_start; let abs_end = start + i; if abs_start < source_bytes.len() && abs_end <= source_bytes.len() { let name_slice = std::str::from_utf8(&source_bytes[abs_start..abs_end]).unwrap_or(name); if !frags.contains(&name_slice) { frags.push(name_slice); } } } } } } frags } /// Check if the source bytes at a given span contain a redirect call whose /// argument starts with a path prefix (`/...`), indicating a server-relative /// path rather than an attacker-controlled URL. /// /// Reused by both `cfg-unguarded-sink` suppression and taint finding filtering. pub(crate) fn has_redirect_path_prefix(source_bytes: &[u8], span: (usize, usize)) -> bool { let (start, end) = span; if start >= source_bytes.len() || end > source_bytes.len() { return false; } let text = &source_bytes[start..end]; // Search for the argument portion after the first '(' if let Some(paren_pos) = text.iter().position(|&b| b == b'(') { let after_paren = &text[paren_pos + 1..]; let trimmed = after_paren .iter() .skip_while(|&&b| b == b' ' || b == b'\n' || b == b'\t') .copied() .collect::>(); // Template literal: `/ ... if trimmed.starts_with(b"`/") { return true; } // String literal: "/ ... or '/ ... 
if trimmed.starts_with(b"\"/") || trimmed.starts_with(b"'/") { return true; } } false } /// Check if this sink is an internal redirect, a `res.redirect` (SSRF sink) /// whose argument is a template literal or string starting with `/`, indicating /// a server-relative path rather than an attacker-controlled URL. fn is_internal_redirect(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { if !sink_caps.contains(Cap::SSRF) { return false; } let sink_info = &ctx.cfg[sink]; let callee = match &sink_info.call.callee { Some(c) => c.as_str(), None => return false, }; // Only applies to redirect calls if !callee.ends_with("redirect") && !callee.ends_with("Redirect") { return false; } has_redirect_path_prefix(ctx.source_bytes, sink_info.ast.span) } /// Check if the enclosing function qualifies as an entrypoint. fn sink_in_entrypoint(ctx: &AnalysisContext, sink: NodeIndex) -> bool { let sink_info = &ctx.cfg[sink]; if let Some(func_name) = &sink_info.ast.enclosing_func { is_entry_point_func(func_name, ctx.lang) } else { false } } impl CfgAnalysis for UnguardedSink { fn run(&self, ctx: &AnalysisContext) -> Vec { let doms = dominators::compute_dominators(ctx.cfg, ctx.entry); let sink_nodes = dominators::find_sink_nodes(ctx.cfg); let guard_nodes = find_guard_nodes(ctx); let mut findings = Vec::new(); for sink in &sink_nodes { let sink_info = &ctx.cfg[*sink]; let sink_caps = sink_info.taint.labels.iter().fold(Cap::empty(), |acc, l| { if let DataLabel::Sink(caps) = l { acc | *caps } else { acc } }); if sink_caps.is_empty() { continue; } let sink_func = sink_info.ast.enclosing_func.as_deref(); // Check: does any applicable guard dominate this sink? // Guards must be in the same function to be relevant. 
let is_guarded = guard_nodes.iter().any(|(guard_idx, guard_caps)| { let guard_func = ctx.cfg[*guard_idx].ast.enclosing_func.as_deref(); (*guard_caps & sink_caps) != Cap::empty() && guard_func == sink_func && dominates(&doms, *guard_idx, *sink) }); // Also check if an inline sanitizer dominates this sink (same function). let has_sanitizer = ctx.cfg.node_indices().any(|idx| { let node_func = ctx.cfg[idx].ast.enclosing_func.as_deref(); ctx.cfg[idx].taint.labels.iter().any(|l| { if let DataLabel::Sanitizer(san_caps) = l { (*san_caps & sink_caps) != Cap::empty() && node_func == sink_func && dominates(&doms, idx, *sink) } else { false } }) }); // Interprocedural sanitizer: check if any arg_callee resolves to a // function with sanitizer caps that cover this sink's caps. let has_interprocedural_sanitizer = sink_info.arg_callees.iter().any(|mc| { if let Some(callee) = mc { let leaf = callee_leaf_name(callee); // Check local function summaries ctx.func_summaries.iter().any(|(k, s)| { k.name == leaf && (s.sanitizer_caps & sink_caps) != Cap::empty() }) } else { false } }); if is_guarded || has_sanitizer || has_interprocedural_sanitizer { continue; } let callee_desc = sink_info.call.callee.as_deref().unwrap_or("(unknown sink)"); // ── Severity classification ─────────────────────────────── // // HIGH: taint confirms flow OR source directly feeds sink // MEDIUM: structural finding without taint confirmation // LOW: wrapper function (param-only, non-entrypoint) let has_taint = taint_confirms_sink(ctx, *sink); let source_derived = sink_arg_is_source_derived(ctx, *sink); // If sink args are all constants (including one-hop constant bindings) // and taint didn't confirm, this is a false positive, skip it. 
if is_all_args_constant(ctx, *sink) && !has_taint { continue; } // SSA latest-def suppression: when the taint engine has already // proved no source-tainted data reaches this sink (`!has_taint`) // and every SSA operand resolves to a constant, callee-fragment // pseudo-name, OR a function parameter that is not a Source , // the sink's actual arguments cannot carry an injection payload. // Catches the reassign-to-constant idiom (`name := req.x; name = // "Guest"; sink(name)`) where the latest SSA def is a literal // and a non-payload parameter (e.g. an HTTP writer / receiver) // is the only other operand. The simpler `is_all_args_constant` // check above rejects that mixed shape because it forbids real // parameters in operand position. // // Exemption: shell-array gate filters. The // `extract_shell_array_payload_idents` detector recognises // `[, "-c", ]` arrays at any call site and emits a // `Sink(SHELL_ESCAPE)` label with `destination_uses` narrowed to // the payload-element idents. When the array shape itself is the // gate, an unrelated reassign-to-const elsewhere in the body // (`const flag = true; if (flag) {}`) does not erase the // shell-exec intent — the construction of `[bash, -c, x]` is by // itself the dangerous operation. Skip this suppression so the // structural finding survives in closed-world contexts where no // taint source has been resolved yet. let has_shell_array_gate = sink_info.call.gate_filters.iter().any(|gf| { gf.label_caps.contains(Cap::SHELL_ESCAPE) && gf.destination_uses.is_some() }); if !has_taint && !has_shell_array_gate && ssa_all_sink_operands_const_or_param(ctx, *sink) { continue; } // Type-aware suppression: when all SSA operand values of the sink // are proven to carry non-injectable types (e.g. integers parsed // from a raw source), the arguments cannot form a payload for // SHELL/SQL/FILE sinks. Skip the structural finding, the taint // engine already covers the source→sink flow via type-aware // suppression. 
Unknown-typed or mixed operands fall through. if !has_taint && sink_args_typed_safe(ctx, *sink, sink_caps) { continue; } // JPA / Hibernate Criteria-query suppression: receiver-call SQL // sinks like `session.createQuery(cq)` / `em.executeUpdate(cq)` // are safe by construction when arg 0 is a structural Criteria // object built via `CriteriaBuilder` (returns parameterized // SQL). Receiver excluded from the check, the receiver is // never the payload. Closes openmrs / xwiki / keycloak // Hibernate-DAO FP cluster. if !has_taint && sink_args_jpa_criteria_query_safe(ctx, *sink, sink_caps) { continue; } // Zero-arg query-builder verbs: Doctrine DBAL `QueryBuilder`, // JPA `CriteriaBuilder`, and similar chain-builder shapes // execute a query that was bound earlier on the receiver via // parameterised API calls. No SQL string is concatenated at // the terminal call site. Closes the nextcloud apps/dav and // lib/private/DB cluster (`$qb->executeQuery()` / // `$qb->executeStatement()` after `select`/`from`/`where`/ // `setParameter` chains). if !has_taint && sink_is_zero_arg_query_builder(ctx, *sink, sink_caps) { continue; } // Builder.getSQL() arg suppression: the dangerous flat shape is // `$conn->executeStatement($sql)` where `$sql` is user-controlled // SQL. When `$sql` is itself the return of `.getSQL()`, // the SQL is parameterised by construction (Doctrine DBAL), // independent of which receiver fires the terminal verb. if !has_taint && sink_first_arg_is_builder_get_sql(ctx, *sink, sink_caps) { continue; } // Composition: `.getSQL()` wrapped by string-shaping ops // (`preg_replace('/^INSERT/i', 'INSERT IGNORE', $b->getSQL())`, // `$b->getSQL() . ' ON CONFLICT DO NOTHING'`). Closes the // remaining nextcloud `AdapterMySQL.php` / `AdapterSqlite.php` // FPs after the direct accessor recognition above. 
if !has_taint && sink_first_arg_composes_safe_dbal_sql(ctx, *sink, sink_caps) { continue; } // PHP foreach-key string interpolation: arg-0 is a SQL string // whose interpolated `$` is bound by a `foreach ($X as $var)` // (or `as $key => $var`) over a literal-keyed array assigned // earlier in the same function. The literal set is finite and // metachar-free, so the interpolated SQL is bounded. Closes the // nextcloud `lib/private/DB/MySqlTools.php:27` FP. if !has_taint && sink_arg_uses_safe_foreach_key(ctx, *sink, sink_caps) { continue; } // Static-map suppression: the SSA value flowing into the sink is // proved by the static-HashMap-lookup idiom detector to be a // finite set of literals free of shell metacharacters. Mirrors // the SSA-taint finite-domain suppression so both paths agree. if !has_taint && sink_args_static_map_safe(ctx, *sink, sink_caps) { continue; } // Summary-validated suppression: when the SSA value flowing into // the sink is the return of a callee whose summary records a // `validated_params_to_return` covering every propagating // parameter, the helper validates its inputs on every taint- // carrying return path (regex allowlist, type check, validation // call, …). The SSA taint engine already cleared this flow via // `propagate_validated_params_to_return`, so the structural // finding is noise. Closes the patched-counterpart noise for // CVE-2026-25544 (Payload `sanitizeValue` → `createJSONQuery` // → `db.execute`). if !has_taint && sink_args_summary_validated_safe(ctx, *sink) { continue; } // Parameterized SQL queries: arg 0 is a string literal with // placeholders ($1, ?, %s, :name) and a params argument exists. // These are safe by construction, the driver handles escaping. if sink_info.parameterized_query { continue; } // Internal redirects: res.redirect(`/path/...`) with a path-prefix // argument are server-relative, not attacker-controlled URLs. 
if is_internal_redirect(ctx, *sink, sink_caps) { continue; } let param_only = sink_arg_is_parameter_only(ctx, *sink); let in_entrypoint = sink_in_entrypoint(ctx, *sink); let (severity, confidence) = if has_taint || source_derived { (Severity::High, Confidence::High) } else if param_only && !in_entrypoint { // Wrapper function with param-only args, zero signal. Suppress. continue; } else if !ctx.taint_active { // AST-only / cfg-only mode, preserve as LOW (unchanged) (Severity::Low, Confidence::Low) } else { // taint_active=true but found nothing. // Keep high-risk sinks (SHELL_ESCAPE, CODE_EXEC, SQL_QUERY, DESERIALIZE) // as structural backup. Suppress low-risk sinks (FILE_IO, SSRF, etc.). let high_risk = Cap::SHELL_ESCAPE | Cap::CODE_EXEC | Cap::SQL_QUERY | Cap::DESERIALIZE; if (sink_caps & high_risk).is_empty() { continue; // FILE_IO, SSRF, FMT_STRING etc. without taint → noise } // If the function containing the sink has no Source-labeled // nodes AND no parameters (through which taint could flow // from callers), taint ran and found nothing because there // is nothing to find. Suppress, the structural finding // is noise. 
let sink_func = sink_info.ast.enclosing_func.as_deref(); let has_sources = ctx.cfg.node_indices().any(|n| { let info = &ctx.cfg[n]; info.ast.enclosing_func.as_deref() == sink_func && info .taint .labels .iter() .any(|l| matches!(l, DataLabel::Source(_))) }); let has_params = ctx.func_summaries.values().any(|s| { s.entry.index() < ctx.cfg.node_count() && ctx.cfg[s.entry].ast.enclosing_func.as_deref() == sink_func && !s.param_names.is_empty() }); if !has_sources && !has_params { continue; // No sources or params in scope → noise } (Severity::Medium, Confidence::Medium) }; findings.push(CfgFinding { rule_id: "cfg-unguarded-sink".to_string(), severity, confidence, span: sink_info.ast.span, message: format!("Sink `{callee_desc}` has no dominating guard or sanitizer"), evidence: vec![*sink], score: None, }); } findings } } #[cfg(test)] mod chain_fragments_tests { use super::chain_callee_fragments_with_text; fn frags(callee: &str, outer: &str, source: &str) -> Vec { chain_callee_fragments_with_text(callee, outer, source.as_bytes(), (0, source.len())) .iter() .map(|s| (*s).to_string()) .collect() } #[test] fn java_chained_init_lifts_inner_call() { // `Statement stmt = connection.unwrap().createStatement();` // The collapsed `info.call.callee` drops the inner method call, // so the source-text scan has to recover `unwrap` on top of the // structural split's `createStatement`. let src = "Statement stmt = connection.unwrap().createStatement()"; let got = frags("connection.createStatement", "", src); assert!(got.contains(&"createStatement".to_string())); assert!(got.contains(&"unwrap".to_string())); assert!(!got.contains(&"connection".to_string())); assert!(!got.contains(&"stmt".to_string())); } #[test] fn flat_method_invocation_terminal_only() { // `connection.createStatement()` — receiver `connection` stays a // real local-variable use, only the terminal method counts as a // pseudo-use. 
let src = "connection.createStatement()"; let got = frags("connection.createStatement", "", src); assert!(got.contains(&"createStatement".to_string())); assert!(!got.contains(&"connection".to_string())); } #[test] fn self_rooted_chain_lifts_every_segment() { // `$this->inner->execute($sql)` — every chain segment belongs to // the callee path because the chain is rooted at a self // pseudo-receiver. let src = "$this->inner->execute($sql)"; let got = frags("this->inner->execute", "", src); assert!(got.contains(&"this".to_string())); assert!(got.contains(&"inner".to_string())); assert!(got.contains(&"execute".to_string())); } #[test] fn source_scan_skips_inside_args() { // The scan stops at the outermost args opener, so identifiers // nested inside the arguments are NOT lifted as pseudo-uses. // `db.exec(transform(raw))` still treats `transform` as a real // local reference, not a chain segment. let src = "db.exec(transform(raw))"; let got = frags("db.exec", "", src); assert!(got.contains(&"exec".to_string())); assert!(!got.contains(&"transform".to_string())); assert!(!got.contains(&"raw".to_string())); } }