use super::helpers::first_member_label; use super::{ AstMeta, Cfg, EdgeKind, MAX_COND_VARS, MAX_CONDITION_TEXT_LEN, NodeInfo, StmtKind, collect_idents, connect_all, detect_eq_with_const, detect_negation, has_call_descendant, member_expr_text, push_node, text_of, try_lower_jsx_dangerous_html, }; use crate::labels::{DataLabel, LangAnalysisRules, classify}; use crate::utils::snippet::truncate_at_char_boundary; use petgraph::graph::NodeIndex; use smallvec::SmallVec; use tree_sitter::Node; // ------------------------------------------------------------------------- // Short-circuit boolean operator helpers // ------------------------------------------------------------------------- #[derive(Debug, Clone, Copy, PartialEq)] pub(super) enum BoolOp { And, Or, } /// Check if an AST node is a boolean operator (`&&`/`||`/`and`/`or`). pub(super) fn is_boolean_operator(node: Node) -> Option { match node.kind() { "binary_expression" | "boolean_operator" | "binary" => { let mut cursor = node.walk(); for child in node.children(&mut cursor) { match child.kind() { "&&" | "and" => return Some(BoolOp::And), "||" | "or" => return Some(BoolOp::Or), _ => {} } } None } _ => None, } } /// Strip parenthesized_expression wrappers. pub(super) fn unwrap_parens(node: Node) -> Node { if node.kind() == "parenthesized_expression" { if let Some(inner) = node.named_child(0) { return unwrap_parens(inner); } } node } /// Extract `left` and `right` operands from a binary boolean node. 
pub(super) fn get_boolean_operands<'a>(node: Node<'a>) -> Option<(Node<'a>, Node<'a>)> {
    // Field-based lookup (all supported grammars).
    if let (Some(left), Some(right)) = (
        node.child_by_field_name("left"),
        node.child_by_field_name("right"),
    ) {
        return Some((left, right));
    }

    // Positional fallback (safety net): pair the first and last named child.
    // The slice pattern only matches when there are at least two of them.
    let mut cursor = node.walk();
    let named: Vec<_> = node.named_children(&mut cursor).collect();
    if let [first, .., last] = named.as_slice() {
        return Some((*first, *last));
    }
    None
}

/// Create a lightweight `StmtKind::If` node for a sub-condition in a boolean chain.
///
/// The node records the condition's byte span, its (length-capped) source
/// text, the sorted/deduplicated identifiers it mentions (capped at
/// `MAX_COND_VARS`), and whether `detect_negation` reported it as negated.
/// The same identifiers are mirrored into `taint.uses`. Returns the index of
/// the node added to `g`; no edges are created here.
pub(super) fn push_condition_node<'a>(
    g: &mut Cfg,
    cond_ast: Node<'a>,
    lang: &str,
    code: &'a [u8],
    enclosing_func: Option<&str>,
) -> NodeIndex {
    // Pass cond_ast as both args: sub-conditions are never `unless` nodes.
    let (inner, negated) = detect_negation(cond_ast, cond_ast, lang);

    let mut vars = Vec::new();
    collect_idents(inner, code, &mut vars);
    vars.sort();
    vars.dedup();
    vars.truncate(MAX_COND_VARS);

    let text = text_of(cond_ast, code)
        .map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
    let span = (cond_ast.start_byte(), cond_ast.end_byte());

    // Mirror condition variables into `taint.uses` so the per-body
    // `SymbolInterner::from_cfg` pass interns them. Without this,
    // `apply_branch_predicates` (which calls `interner.get(var)` to
    // look up a Symbol id) silently no-ops on short-circuit branch
    // condition nodes: they have no `taint.uses` even though
    // `condition_vars` carries the variable names. Surfaced by
    // GHSA-h8cj-hpmg-636v: a `||`-decomposed validator like
    // `if (x == null || !regex.matcher(x).matches()) throw;` failed
    // to mark `x` as `validated_must` on the surviving branch
    // because the per-disjunct cond nodes (built via
    // `build_condition_chain`) didn't populate `taint.uses`.
    let uses_for_taint: Vec<String> = vars.clone();

    g.add_node(NodeInfo {
        kind: StmtKind::If,
        ast: AstMeta {
            span,
            enclosing_func: enclosing_func.map(|s| s.to_string()),
        },
        condition_text: text,
        condition_vars: vars,
        condition_negated: negated,
        taint: crate::cfg::TaintMeta {
            uses: uses_for_taint,
            ..Default::default()
        },
        ..Default::default()
    })
}

/// For a Rust `let <name> = match <scrutinee> { <pattern> if <guard> => .., ... }`,
/// find the first guarded `match_arm` and return the guard expression node plus
/// the primary let-binding name. Returns `None` when `ast` is not a
/// `let_declaration`, when the let-value is not a `match_expression`, when no
/// arm has a guard, or when the let pattern yields no identifier.
///
/// The guard lives on the tree-sitter `match_pattern` node as the field
/// `condition` (present whenever the pattern is followed by `if <guard>`).
pub(super) fn detect_rust_let_match_guard<'a>(
    ast: Node<'a>,
    code: &[u8],
) -> Option<(Node<'a>, String)> {
    if ast.kind() != "let_declaration" {
        return None;
    }
    let value = ast.child_by_field_name("value")?;
    if value.kind() != "match_expression" {
        return None;
    }
    let body = value.child_by_field_name("body")?;

    // First arm (in source order) whose pattern carries a `condition` field.
    let mut cursor = body.walk();
    let guard = body.children(&mut cursor).find_map(|arm| {
        if !matches!(arm.kind(), "match_arm" | "last_match_arm") {
            return None;
        }
        let pattern = arm.child_by_field_name("pattern")?;
        pattern.child_by_field_name("condition")
    })?;

    // The primary binding is the first identifier collected from the let pattern.
    let pat = ast.child_by_field_name("pattern")?;
    let mut idents = Vec::new();
    collect_idents(pat, code, &mut idents);
    let name = idents.into_iter().next()?;

    Some((guard, name))
}

/// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's
/// condition text and vars. The let-binding name is added to `condition_vars`
/// so `apply_branch_predicates` narrows validation to that specific variable:
/// the one that receives the arm's value and flows to downstream sinks.
pub(super) fn emit_rust_match_guard_if<'a>( g: &mut Cfg, guard: Node<'a>, let_name: &str, code: &'a [u8], enclosing_func: Option<&str>, ) -> NodeIndex { let mut vars = Vec::new(); collect_idents(guard, code, &mut vars); vars.push(let_name.to_string()); vars.sort(); vars.dedup(); vars.truncate(MAX_COND_VARS); let text = text_of(guard, code) .map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string()); let span = (guard.start_byte(), guard.end_byte()); g.add_node(NodeInfo { kind: StmtKind::If, ast: AstMeta { span, enclosing_func: enclosing_func.map(|s| s.to_string()), }, condition_text: text, condition_vars: vars, condition_negated: false, ..Default::default() }) } /// Decompose an assignment whose RHS is a ternary (`lhs = cond ? a : b`) into /// a proper diamond CFG: cond → {true_branch | false_branch} → join. Each /// branch defines `lhs_text` from its own operand's identifiers; a phi for /// `lhs_text` is then synthesised by SSA lowering at the join. /// /// The condition's identifiers live on the If node's `condition_vars`, **not** /// on the branch `uses`. This is the whole point of the split, cond is control /// flow, branches are data flow. /// /// Returns the exit frontier for downstream statement chaining (a single-element /// vec containing the join node). #[allow(clippy::too_many_arguments)] pub(super) fn build_ternary_diamond<'a>( lhs_text: String, lhs_labels: SmallVec<[DataLabel; 2]>, ternary_ast: Node<'a>, preds: &[NodeIndex], pred_edge: EdgeKind, g: &mut Cfg, lang: &str, code: &'a [u8], enclosing_func: Option<&str>, call_ordinal: &mut u32, analysis_rules: Option<&LangAnalysisRules>, ) -> Vec { let (Some(cond_field), Some(cons_field), Some(alt_field)) = ( ternary_ast.child_by_field_name("condition"), ternary_ast.child_by_field_name("consequence"), ternary_ast.child_by_field_name("alternative"), ) else { // Grammar mismatch: caller will fall through to the non-split path. 
return preds.to_vec(); }; let cond_ast = unwrap_parens(cond_field); let cons_ast = unwrap_parens(cons_field); let alt_ast = unwrap_parens(alt_field); // 1. Condition header. `push_condition_node` sets span/text/vars/negated // but leaves `is_eq_with_const` default; stamp it explicitly so the // taint engine's equality-narrowing fires for `x === 'literal' ? …`. let cond_if = push_condition_node(g, cond_ast, lang, code, enclosing_func); g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang); connect_all(g, preds, cond_if, pred_edge); // 2. Branches. Each branch produces its own exit frontier (≥ 1 node) , // a nested ternary recurses and returns its own join node. let true_exits = lower_ternary_branch( cons_ast, &[cond_if], EdgeKind::True, &lhs_text, &lhs_labels, g, lang, code, enclosing_func, call_ordinal, analysis_rules, ); let false_exits = lower_ternary_branch( alt_ast, &[cond_if], EdgeKind::False, &lhs_text, &lhs_labels, g, lang, code, enclosing_func, call_ordinal, analysis_rules, ); // 3. Join: a zero-width Seq node placed at the ternary's end. Phi insertion // via Cytron will synthesise `lhs_text = phi(true_def, false_def)` here // because both branches define `lhs_text` and this is their dominance // frontier. let join_pos = ternary_ast.end_byte(); let join = g.add_node(NodeInfo { kind: StmtKind::Seq, ast: AstMeta { span: (join_pos, join_pos), enclosing_func: enclosing_func.map(|s| s.to_string()), }, ..Default::default() }); connect_all(g, &true_exits, join, EdgeKind::Seq); connect_all(g, &false_exits, join, EdgeKind::Seq); vec![join] } /// Emit the CFG shape for a single ternary branch. Three cases: /// /// 1. Branch is itself a ternary → recurse via `build_ternary_diamond` so nested /// conditions also split cleanly (no `cond2` leakage into uses). /// 2. 
Branch contains a call → emit as `StmtKind::Call` via `push_node` so inner /// source/sanitizer/sink classification is preserved, then rewrite `defines` /// to the outer LHS and union in the LHS's sink labels. /// 3. Otherwise → emit as `StmtKind::Seq`, same override. #[allow(clippy::too_many_arguments)] pub(super) fn lower_ternary_branch<'a>( branch_ast: Node<'a>, preds: &[NodeIndex], pred_edge: EdgeKind, lhs_text: &str, lhs_labels: &SmallVec<[DataLabel; 2]>, g: &mut Cfg, lang: &str, code: &'a [u8], enclosing_func: Option<&str>, call_ordinal: &mut u32, analysis_rules: Option<&LangAnalysisRules>, ) -> Vec { // Case 1: nested ternary. if branch_ast.kind() == "ternary_expression" { return build_ternary_diamond( lhs_text.to_string(), lhs_labels.clone(), branch_ast, preds, pred_edge, g, lang, code, enclosing_func, call_ordinal, analysis_rules, ); } // Cases 2 and 3: leaf branch expression. let has_call = has_call_descendant(branch_ast, lang); let kind = if has_call { StmtKind::Call } else { StmtKind::Seq }; let ord = if kind == StmtKind::Call { let o = *call_ordinal; *call_ordinal += 1; o } else { 0 }; let node = push_node( g, kind, branch_ast, lang, code, enclosing_func, ord, analysis_rules, ); // The branch expression's own `defines` (if any, typically None for a // pure value expression) is replaced with the outer LHS so that both // branches agree on the target, driving phi insertion at the join. g[node].taint.defines = Some(lhs_text.to_string()); for label in lhs_labels { if !g[node].taint.labels.contains(label) { g[node].taint.labels.push(*label); } } // Bridge source recognition to ternary branches. push_node only does // suffix/prefix matching on the branch text, so a source-shaped member // expression like `req.query.lng` doesn't classify (the rule matcher // is `req.query`, which neither suffix-matches nor prefix-matches // `req.query.lng`). 
Run the segment-strip-and-retry classifier on // the branch AST to recover the source label, mirroring what // `pre_emit_arg_source_nodes` does for call arguments and what the // `Kind::CallWrapper | Kind::Assignment` gate at push_node:1827 does // for whole declarations. Without this, `let arr = cond ? req.query.lng // : "";` lowers each branch to a labelless Assign-with-empty-uses, the // join phi sees no taint, and downstream sinks miss the flow. if !g[node] .taint .labels .iter() .any(|l| matches!(l, DataLabel::Source(_))) { let extra = analysis_rules .map(|r| r.extra_labels.as_slice()) .filter(|s| !s.is_empty()); if let Some(found @ DataLabel::Source(_)) = first_member_label(branch_ast, lang, code, extra) { g[node].taint.labels.push(found); } } connect_all(g, preds, node, pred_edge); // React JSX `dangerouslySetInnerHTML={{__html: x}}` synthesis when the // branch expression is itself a JSX element (or contains one as a // descendant). Without this, `cond ?
: null` and similar ternary-RHS shapes never reach the // `Kind::Return` / `Kind::Assignment` arms that own the synthesis hook, // because `build_ternary_diamond` lowers each branch directly. let post_jsx = try_lower_jsx_dangerous_html( branch_ast, &[node], g, lang, code, enclosing_func, call_ordinal, analysis_rules, ); post_jsx } /// Extract `(lhs_ast, ternary_ast)` when `outer_ast` is an expression-statement /// or declaration whose single assignment/declarator's RHS is a ternary. /// Returns `None` for multi-declarator forms, for missing fields, and for /// any RHS that isn't a `ternary_expression` after `unwrap_parens`. pub(super) fn find_ternary_rhs_wrapper<'a>(outer_ast: Node<'a>) -> Option<(Node<'a>, Node<'a>)> { let mut cursor = outer_ast.walk(); let mut declarator_count = 0usize; let mut found: Option<(Node<'a>, Node<'a>)> = None; for child in outer_ast.children(&mut cursor) { match child.kind() { "variable_declarator" => { declarator_count += 1; if declarator_count > 1 { return None; } let (Some(name), Some(value)) = ( child.child_by_field_name("name"), child.child_by_field_name("value"), ) else { continue; }; let rhs = unwrap_parens(value); if rhs.kind() == "ternary_expression" { found = Some((name, rhs)); } } "assignment_expression" => { let (Some(left), Some(right)) = ( child.child_by_field_name("left"), child.child_by_field_name("right"), ) else { continue; }; let rhs = unwrap_parens(right); if rhs.kind() == "ternary_expression" { return Some((left, rhs)); } } _ => {} } } found } /// Classify the LHS of a ternary-split assignment. Returns `(lhs_text, labels)` /// where `labels` are any sink labels that belong to the LHS itself (e.g. /// `innerHTML`, `document.cookie`). These are applied to **each branch** so /// the sink fires on whichever branch carries tainted data. 
pub(super) fn classify_ternary_lhs( lhs_ast: Node, lang: &str, code: &[u8], analysis_rules: Option<&LangAnalysisRules>, ) -> (String, SmallVec<[DataLabel; 2]>) { let extra = analysis_rules.map(|r| r.extra_labels.as_slice()); let mut labels: SmallVec<[DataLabel; 2]> = SmallVec::new(); // Prefer full member-expression path; fall back to raw text. let lhs_text = member_expr_text(lhs_ast, code) .or_else(|| text_of(lhs_ast, code)) .unwrap_or_default(); // Try the full dotted path first (e.g. "document.cookie"), then fall back // to the property alone (e.g. "innerHTML"), mirrors the LHS classification // already performed in `push_node` for non-split assignments. if let Some(l) = classify(lang, &lhs_text, extra) { labels.push(l); } if labels.is_empty() && let Some(prop) = lhs_ast.child_by_field_name("property") && let Some(prop_text) = text_of(prop, code) && let Some(l) = classify(lang, &prop_text, extra) { labels.push(l); } (lhs_text, labels) } /// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes /// with short-circuit edges. /// /// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False /// edges should connect to the then/else branches. 
pub(super) fn build_condition_chain<'a>( cond_ast: Node<'a>, preds: &[NodeIndex], pred_edge: EdgeKind, g: &mut Cfg, lang: &str, code: &'a [u8], enclosing_func: Option<&str>, ) -> (Vec, Vec) { let inner = unwrap_parens(cond_ast); match is_boolean_operator(inner) { Some(BoolOp::And) => { if let Some((left, right)) = get_boolean_operands(inner) { // Left operand with current preds let (left_true, left_false) = build_condition_chain(left, preds, pred_edge, g, lang, code, enclosing_func); // Right operand only evaluated when left is true let (right_true, right_false) = build_condition_chain( right, &left_true, EdgeKind::True, g, lang, code, enclosing_func, ); // AND: true only when both true; false when either false let mut false_exits = left_false; false_exits.extend(right_false); (right_true, false_exits) } else { // Safety fallback: treat as leaf let node = push_condition_node(g, inner, lang, code, enclosing_func); connect_all(g, preds, node, pred_edge); (vec![node], vec![node]) } } Some(BoolOp::Or) => { if let Some((left, right)) = get_boolean_operands(inner) { // Left operand with current preds let (left_true, left_false) = build_condition_chain(left, preds, pred_edge, g, lang, code, enclosing_func); // Right operand only evaluated when left is false let (right_true, right_false) = build_condition_chain( right, &left_false, EdgeKind::False, g, lang, code, enclosing_func, ); // OR: true when either true; false only when both false let mut true_exits = left_true; true_exits.extend(right_true); (true_exits, right_false) } else { // Safety fallback: treat as leaf let node = push_condition_node(g, inner, lang, code, enclosing_func); connect_all(g, preds, node, pred_edge); (vec![node], vec![node]) } } None => { // Leaf: single condition node let node = push_condition_node(g, inner, lang, code, enclosing_func); connect_all(g, preds, node, pred_edge); (vec![node], vec![node]) } } }