Precision pass on auth and resource analysis (#63)

This commit is contained in:
Eli Peter 2026-05-03 13:51:46 -04:00 committed by GitHub
parent 064801a3a4
commit c7c5e0f3a1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
62 changed files with 4248 additions and 138 deletions

View file

@ -1390,6 +1390,116 @@ fn rust_nested_use_as_alias() {
assert_eq!(b.original, "Read");
}
/// Named-argument captures inside a format string (`format!("{x}")`)
/// must be recorded as uses of the captured variable, even though `x`
/// only appears inside the string's bytes and never as a standalone AST
/// argument. If the lift is missing, taint halts at the macro boundary
/// for every caller whose format string reads a tainted variable by
/// name (matrix-rust-sdk CVE-2025-53549, log!() / println!() across
/// most Rust 1.58+ codebases).
#[test]
fn rust_format_macro_named_arg_lifted_into_uses() {
    let source = b"fn f() { let x = 1; let y = format!(\"v={x}\"); }";
    let (cfg, _entry) =
        parse_and_build(source, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    // Every node that defines `y` must list `x` among its uses, and at
    // least one such node has to exist.
    let y_nodes: Vec<_> = cfg
        .node_indices()
        .filter(|&n| cfg[n].taint.defines.as_deref() == Some("y"))
        .collect();
    assert!(!y_nodes.is_empty(), "no node found defining `y`");
    for n in y_nodes {
        let uses = &cfg[n].taint.uses;
        assert!(
            uses.iter().any(|u| u == "x"),
            "expected `x` in uses for `let y = format!(\"v={{x}}\")`; got {:?}",
            uses
        );
    }
}
/// A format spec after the name (`{x:?}`) must not hide the capture:
/// `x` still has to be lifted into the uses of the `let y = …` node.
#[test]
fn rust_format_macro_named_arg_with_format_spec() {
    let source = b"fn f() { let x = 1; let y = format!(\"{x:?}\"); }";
    let language = Language::from(tree_sitter_rust::LANGUAGE);
    let (cfg, _entry) = parse_and_build(source, "rust", language);

    let mut matched = 0usize;
    for info in cfg.node_indices().map(|n| &cfg[n]) {
        if info.taint.defines.as_deref() != Some("y") {
            continue;
        }
        matched += 1;
        assert!(
            info.taint.uses.iter().any(|u| u == "x"),
            "expected `x` lifted past `{{x:?}}` format spec; got {:?}",
            info.taint.uses
        );
    }
    assert!(matched > 0, "no node found defining `y`");
}
/// Escaped braces must never be mistaken for a named-argument capture.
#[test]
fn rust_format_macro_escaped_braces_not_lifted() {
    // `{{` and `}}` are escapes for literal `{` / `}`, NOT named
    // argument captures. No identifier should be lifted from the
    // sequence between them.
    let src = b"fn f() { let q = format!(\"{{x}}\"); }";
    let ts_lang = Language::from(tree_sitter_rust::LANGUAGE);
    let (cfg, _entry) = parse_and_build(src, "rust", ts_lang);
    let mut found = false;
    for n in cfg.node_indices() {
        let info = &cfg[n];
        if info.taint.defines.as_deref() == Some("q") {
            assert!(
                !info.taint.uses.iter().any(|u| u == "x"),
                "must not lift `x` from escaped `{{{{x}}}}`; got {:?}",
                info.taint.uses
            );
            found = true;
        }
    }
    // Guard against a vacuous pass: without a node defining `q` the
    // negative assertion above would never run.
    assert!(found, "no node found defining `q`");
}
/// Positional placeholders must not be lifted as identifiers, while the
/// explicit positional argument still has to show up in the uses.
#[test]
fn rust_format_macro_positional_index_not_lifted() {
    // Positional placeholders like `{0}` reference args by position,
    // not by name. Don't accidentally treat a digit as an identifier.
    let src = b"fn f() { let a = 1; let q = format!(\"{0}\", a); }";
    let ts_lang = Language::from(tree_sitter_rust::LANGUAGE);
    let (cfg, _entry) = parse_and_build(src, "rust", ts_lang);
    let mut found = false;
    for n in cfg.node_indices() {
        let info = &cfg[n];
        if info.taint.defines.as_deref() == Some("q") {
            assert!(
                !info.taint.uses.iter().any(|u| u == "0"),
                "must not lift digit-only positional placeholder; got {:?}",
                info.taint.uses
            );
            assert!(
                info.taint.uses.iter().any(|u| u == "a"),
                "expected `a` in uses (positional arg) for `format!(\"{{0}}\", a)`; got {:?}",
                info.taint.uses
            );
            found = true;
        }
    }
    // Guard against a vacuous pass: if no node defines `q`, neither
    // assertion above is ever evaluated.
    assert!(found, "no node found defining `q`");
}
/// A bare `println!("hi {user}")` statement (no `let` binding) must
/// still record the captured `user` as a use on the call node.
#[test]
fn rust_println_macro_named_arg_lifted() {
    let source = b"fn f() { let user = String::from(\"x\"); println!(\"hi {user}\"); }";
    let (cfg, _entry) =
        parse_and_build(source, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    let println_nodes: Vec<_> = cfg
        .node_indices()
        .filter(|&n| cfg[n].call.callee.as_deref() == Some("println"))
        .collect();
    assert!(
        !println_nodes.is_empty(),
        "no println! macro_invocation node found"
    );
    for n in println_nodes {
        let uses = &cfg[n].taint.uses;
        assert!(
            uses.iter().any(|u| u == "user"),
            "expected `user` lifted into println! uses; got {:?}",
            uses
        );
    }
}
#[test]
fn go_no_import_bindings() {
let src = b"package main\nimport alias \"fmt\"\n";
@ -2798,6 +2908,43 @@ fn go_for_loop_back_edge() {
assert_loop_with_back_edge(&cfg, "go for");
}
/// Regression test for the `def_use` `Kind::For` arm on Go's
/// `for ident, ident := range iter` form. Tree-sitter nests the binding
/// pattern and the iterable under a `range_clause` child of the
/// `for_statement` instead of exposing direct `left`/`right` fields
/// (as Python / JS do). If that shape is not handled, the loop binding
/// never becomes a CFG def and taint from the iterable cannot reach
/// uses of the binding inside the loop body.
/// Original gap: CVE-2026-41422 (daptin) goqu.L SQL injection.
#[test]
fn go_for_range_loop_binding_is_defined() {
    let source = b"package p\nfunc f(xs []string) { for _, p := range xs { use(p) } }";
    let (cfg, _) = parse_and_build(source, "go", Language::from(tree_sitter_go::LANGUAGE));

    let header = cfg
        .node_indices()
        .find(|&n| matches!(cfg[n].kind, StmtKind::Loop))
        .expect("for-range loop should produce a Loop header");
    let taint = &cfg[header].taint;

    // Primary definition first, then any extra definitions.
    let mut all_defs: Vec<&str> = Vec::new();
    all_defs.extend(taint.defines.as_deref());
    all_defs.extend(taint.extra_defines.iter().map(String::as_str));

    assert!(
        all_defs.iter().any(|d| *d == "p"),
        "loop binding `p` should appear in defines/extra_defines, got {:?}",
        all_defs
    );
    assert!(
        taint.uses.iter().any(|u| u == "xs"),
        "iterable `xs` should appear in uses, got {:?}",
        taint.uses
    );
}
#[test]
fn ruby_while_back_edge() {
let src = b"def f\n while cond\n body\n end\nend\n";

View file

@ -83,6 +83,18 @@ pub(super) fn push_condition_node<'a>(
let text = text_of(cond_ast, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
let span = (cond_ast.start_byte(), cond_ast.end_byte());
// Mirror condition variables into `taint.uses` so the per-body
// `SymbolInterner::from_cfg` pass interns them. Without this,
// `apply_branch_predicates` (which calls `interner.get(var)` to
// look up a Symbol id) silently no-ops on short-circuit branch
// condition nodes — they have no `taint.uses` even though
// `condition_vars` carries the variable names. Surfaced by
// GHSA-h8cj-hpmg-636v: a `||`-decomposed validator like
// `if (x == null || !regex.matcher(x).matches()) throw;` failed
// to mark `x` as `validated_must` on the surviving branch
// because the per-disjunct cond nodes (built via
// `build_condition_chain`) didn't populate `taint.uses`.
let uses_for_taint: Vec<String> = vars.clone();
g.add_node(NodeInfo {
kind: StmtKind::If,
ast: AstMeta {
@ -92,6 +104,10 @@ pub(super) fn push_condition_node<'a>(
condition_text: text,
condition_vars: vars,
condition_negated: negated,
taint: crate::cfg::TaintMeta {
uses: uses_for_taint,
..Default::default()
},
..Default::default()
})
}

View file

@ -1151,6 +1151,170 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
true
}
/// Collect the identifiers that a Rust format-style macro captures by
/// name inside its format string (`format!("…{name}…")`, implicit
/// named-argument capture, stable since 1.58).
///
/// The format string is taken to be the first `string_literal` or
/// `raw_string_literal` that is a direct child of the macro's
/// `token_tree`; `{name}` and `{name:fmt-spec}` placeholders in it are
/// returned in order of appearance.
///
/// Without this lifting, `let q = format!("...{x}...")` carries no `x`
/// in its `uses`: the name lives in the format string's bytes, not in a
/// separate AST argument node, so taint would stop at the macro
/// boundary. Mirrors the Python f-string interpolation lifting in
/// `patterns/python.rs`.
///
/// Recognition is deliberately conservative. The result is empty when
/// `call_node` is not a `macro_invocation`, when the last `::` segment
/// of the macro path is not accepted by `is_rust_format_style_macro`
/// (`format`, `print`/`println`, `eprint`/`eprintln`, `write`/`writeln`,
/// `panic`, `format_args`, `assert`/`debug_assert`, `todo`,
/// `unimplemented`, `unreachable`, and the `log`-style severity macros),
/// or when the token tree has no string literal child.
pub(super) fn extract_rust_format_macro_named_idents(call_node: Node, code: &[u8]) -> Vec<String> {
    if call_node.kind() != "macro_invocation" {
        return Vec::new();
    }

    // Resolve the macro path and keep only its final segment, so that
    // `std::format!` and `log::info!` match the bare names.
    let macro_text = match call_node
        .child_by_field_name("macro")
        .and_then(|name_node| text_of(name_node, code))
    {
        Some(text) => text,
        None => return Vec::new(),
    };
    let leaf = match macro_text.rsplit_once("::") {
        Some((_, last_segment)) => last_segment,
        None => macro_text.as_str(),
    };
    if !is_rust_format_style_macro(leaf) {
        return Vec::new();
    }

    // Prefer a `token_tree` field; otherwise fall back to scanning the
    // direct children for a node of that kind.
    let token_tree = call_node.child_by_field_name("token_tree").or_else(|| {
        let mut walker = call_node.walk();
        // Bind before returning so the child iterator is dropped before
        // `walker` goes out of scope.
        let hit = call_node
            .children(&mut walker)
            .find(|c| c.kind() == "token_tree");
        hit
    });
    let Some(tt) = token_tree else {
        return Vec::new();
    };

    // The first string literal in the token tree is treated as the
    // format string.
    let mut lit_walker = tt.walk();
    let first_literal = tt
        .children(&mut lit_walker)
        .find(|c| matches!(c.kind(), "string_literal" | "raw_string_literal"));
    let Some(fmt_lit) = first_literal else {
        return Vec::new();
    };
    let Some(raw) = text_of(fmt_lit, code) else {
        return Vec::new();
    };

    // Parse the literal's contents; if the quotes cannot be stripped,
    // fall back to the literal's full text.
    let content = strip_literal_quotes(&raw, fmt_lit, code).unwrap_or_else(|| raw.clone());
    parse_rust_format_named_idents(&content)
}
/// Gather named-format-argument identifiers from `n` and from the
/// `macro_invocation` nodes nested beneath it.
///
/// The walk visits every child of every node (calls, fields, await,
/// references, blocks, ...) down to the depth limit enforced by
/// `collect_format_macro_idents_recursive`, so the def-use collectors
/// can lift `format!("...{x}...")` captures through expression wrapping
/// such as `let q = format!("{x}").to_owned();` or chained method calls
/// on the right-hand side. Identifiers are returned in visit order.
pub(super) fn extract_rust_format_macro_named_idents_in(n: Node, code: &[u8]) -> Vec<String> {
    let mut collected: Vec<String> = Vec::new();
    collect_format_macro_idents_recursive(n, code, &mut collected, 0);
    collected
}
/// Pre-order walk that appends to `out` the named format arguments of
/// every `macro_invocation` found at `n` or below it.
///
/// `depth` is the distance from the node the walk started at; nodes
/// deeper than `MAX_DEPTH` are not visited, which bounds the recursion.
fn collect_format_macro_idents_recursive(n: Node, code: &[u8], out: &mut Vec<String>, depth: u32) {
    const MAX_DEPTH: u32 = 6;
    if depth > MAX_DEPTH {
        return;
    }

    if n.kind() == "macro_invocation" {
        out.extend(extract_rust_format_macro_named_idents(n, code));
    }

    let mut walker = n.walk();
    n.children(&mut walker)
        .for_each(|child| collect_format_macro_idents_recursive(child, code, out, depth + 1));
}
/// Returns `true` when `name` (a bare macro name, without path or `!`)
/// is one of the macros treated as taking a format string. The match is
/// exact and case-sensitive.
fn is_rust_format_style_macro(name: &str) -> bool {
    const FORMAT_STYLE_MACROS: &[&str] = &[
        // Formatting and printing.
        "format",
        "print",
        "println",
        "eprint",
        "eprintln",
        "write",
        "writeln",
        "format_args",
        // Panicking and assertions.
        "panic",
        "assert",
        "debug_assert",
        "todo",
        "unimplemented",
        "unreachable",
        // Logging severities.
        "info",
        "warn",
        "error",
        "debug",
        "trace",
    ];
    FORMAT_STYLE_MACROS.iter().any(|known| *known == name)
}
/// Scan the contents of a Rust format string and return, in order of
/// appearance, every identifier used as a named argument (`{name}` or
/// `{name:spec}`).
///
/// * `{{` and `}}` are escapes for literal braces and are skipped.
/// * Empty (`{}`) and positional (`{0}`) placeholders yield nothing.
/// * Everything from `:` up to the closing `}` (the format spec) is
///   ignored.
/// * Whitespace between the argument and its terminator is trimmed. The
///   `std::fmt` grammar permits whitespace before the closing `}`, so
///   `{x }` names `x`.
/// * An unterminated placeholder is scanned to the end of the input.
///
/// Duplicates are preserved: `"{x}{x}"` yields `x` twice.
fn parse_rust_format_named_idents(s: &str) -> Vec<String> {
    let bytes = s.as_bytes();
    let mut out: Vec<String> = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            // `{{` is an escaped literal brace.
            b'{' if bytes.get(i + 1) == Some(&b'{') => i += 2,
            b'{' => {
                let start = i + 1;
                let rest = &bytes[start..];
                // The argument runs up to the first `:` or `}`.
                let arg_len = rest
                    .iter()
                    .position(|&c| c == b'}' || c == b':')
                    .unwrap_or(rest.len());
                let mut arg = &rest[..arg_len];
                // Drop trailing whitespace so `{x }` is seen as `x`.
                while let [head @ .., last] = arg {
                    if !last.is_ascii_whitespace() {
                        break;
                    }
                    arg = head;
                }
                if is_valid_rust_format_ident(arg) {
                    // Validation guarantees pure ASCII, so this cannot fail.
                    if let Ok(name) = std::str::from_utf8(arg) {
                        out.push(name.to_string());
                    }
                }
                // Resume after the closing `}`, skipping any format spec.
                let close = rest[arg_len..]
                    .iter()
                    .position(|&c| c == b'}')
                    .map_or(bytes.len(), |offset| start + arg_len + offset);
                i = close + 1;
            }
            // `}}` is an escaped literal brace.
            b'}' if bytes.get(i + 1) == Some(&b'}') => i += 2,
            _ => i += 1,
        }
    }
    out
}
/// Returns `true` when `b` is a non-empty ASCII identifier: a letter or
/// `_` followed by letters, digits or `_`. Digit-led input such as the
/// positional index `0` is rejected by the first-byte rule.
fn is_valid_rust_format_ident(b: &[u8]) -> bool {
    match b.split_first() {
        Some((&first, tail)) => {
            (first.is_ascii_alphabetic() || first == b'_')
                && tail.iter().all(|c| c.is_ascii_alphanumeric() || *c == b'_')
        }
        None => false,
    }
}
/// Extract per-argument identifiers from a call node's argument list.
/// Returns one `Vec<String>` per argument (in parameter-position order).
/// Returns empty if argument list can't be found or contains spread/keyword args.
@ -1663,6 +1827,11 @@ pub(super) fn def_use(
collect_idents_with_paths(val, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
// Rust format-string named-arg capture: `let q =
// format!("...{x}...")` reads `x`, but `x` lives in
// the format-string bytes, not as a separate AST
// argument node, so collect_idents misses it.
uses.extend(extract_rust_format_macro_named_idents_in(val, code));
}
} else {
// Try nested declarator pattern (JS/TS `lexical_declaration` → `variable_declarator`,
@ -1716,6 +1885,7 @@ pub(super) fn def_use(
collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
uses.extend(extract_rust_format_macro_named_idents_in(val_node, code));
}
}
}
@ -1728,6 +1898,7 @@ pub(super) fn def_use(
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
uses.extend(extract_rust_format_macro_named_idents_in(ast, code));
}
}
(defs, uses, extra_defs)
@ -1750,6 +1921,7 @@ pub(super) fn def_use(
collect_idents_with_paths(rhs, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
uses.extend(extract_rust_format_macro_named_idents_in(rhs, code));
}
(defs, uses, vec![])
}
@ -1801,9 +1973,26 @@ pub(super) fn def_use(
// `initializer`/`condition`/`increment`), so this path falls through
// to the default-collecting behaviour for those, preserving today's
// semantics.
//
// Go's `for ident := range iter` shape places the binding pattern
// and iterable on a `range_clause` child of the `for_statement`
// rather than as direct fields. Without the range_clause lookup
// below, taint from the iterable never reaches the loop binding
// (CVE-2026-41422 daptin: `c.QueryArray("col")` loop var `project`
// flows into `goqu.L(project)` SQL_QUERY sink).
Kind::For => {
let left = ast.child_by_field_name("left");
let right = ast.child_by_field_name("right");
let mut left = ast.child_by_field_name("left");
let mut right = ast.child_by_field_name("right");
if left.is_none() && right.is_none() {
let mut cursor = ast.walk();
for child in ast.children(&mut cursor) {
if child.kind() == "range_clause" {
left = child.child_by_field_name("left");
right = child.child_by_field_name("right");
break;
}
}
}
if left.is_none() && right.is_none() {
// C-style for, defer to default ident collection.
let mut idents = Vec::new();

View file

@ -69,6 +69,36 @@ pub(super) fn extract_param_meta<'a>(
}
return out;
};
// Java lambda shorthand: tree-sitter-java exposes the `parameters` field
// on `lambda_expression` as either a single bare identifier (`cmd -> …`)
// or an `inferred_parameters` wrapper around identifiers (`(a, b) -> …`).
// Neither shape matches the formal_parameter / spread_parameter kinds in
// PARAM_CONFIG, so the per-child loop below would otherwise see no
// params and the lambda would appear parameterless. Without this, the
// SSA pipeline treats the lambda binding as a free / closure-captured
// variable, defeating the JS/TS / Java auto-seed distinction between
// real handler-param formals and bubbled-up captures. Mirrors the JS/TS
// arrow shorthand handled above.
if func_node.kind() == "lambda_expression" {
if params.kind() == "identifier" {
if let Some(name) = text_of(params, code) {
out.push((name, None, Vec::new()));
return out;
}
} else if params.kind() == "inferred_parameters" {
let mut cursor = params.walk();
for child in params.named_children(&mut cursor) {
if child.kind() == "identifier" {
if let Some(name) = text_of(child, code) {
out.push((name, None, Vec::new()));
}
}
}
if !out.is_empty() {
return out;
}
}
}
let mut cursor = params.walk();
for child in params.children(&mut cursor) {
// Self/this parameter (e.g. Rust's `self_parameter`)