mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-30 20:39:39 +02:00
Precision pass on auth and resource analysis (#63)
This commit is contained in:
parent
064801a3a4
commit
c7c5e0f3a1
62 changed files with 4248 additions and 138 deletions
|
|
@ -1390,6 +1390,116 @@ fn rust_nested_use_as_alias() {
|
|||
assert_eq!(b.original, "Read");
|
||||
}
|
||||
|
||||
/// A named-argument capture inside a format string (`format!("{x}")`) must
/// count as a use of `x`, even though `x` never shows up as a separate AST
/// argument. If the identifier is not lifted, taint stops at the macro
/// boundary for every caller whose format string reads a tainted variable
/// by name (matrix-rust-sdk CVE-2025-53549, log!() / println!() across
/// most Rust 1.58+ codebases).
#[test]
fn rust_format_macro_named_arg_lifted_into_uses() {
    let src = b"fn f() { let x = 1; let y = format!(\"v={x}\"); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    // Count the nodes that define `y`; each one must list `x` among its uses.
    let mut defs_of_y = 0usize;
    for idx in cfg.node_indices() {
        let node = &cfg[idx];
        if node.taint.defines.as_deref() != Some("y") {
            continue;
        }
        let lifted = node.taint.uses.iter().any(|u| u == "x");
        assert!(
            lifted,
            "expected `x` in uses for `let y = format!(\"v={{x}}\")`; got {:?}",
            node.taint.uses
        );
        defs_of_y += 1;
    }
    assert!(defs_of_y > 0, "no node found defining `y`");
}
|
||||
|
||||
/// A named capture followed by a format spec (`{x:?}`) must still be lifted:
/// the identifier ends at the `:` that introduces the spec.
#[test]
fn rust_format_macro_named_arg_with_format_spec() {
    let src = b"fn f() { let x = 1; let y = format!(\"{x:?}\"); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    let mut saw_def = false;
    for idx in cfg.node_indices() {
        let node = &cfg[idx];
        let defines_y = node.taint.defines.as_deref() == Some("y");
        if !defines_y {
            continue;
        }
        assert!(
            node.taint.uses.iter().any(|u| u == "x"),
            "expected `x` lifted past `{{x:?}}` format spec; got {:?}",
            node.taint.uses
        );
        saw_def = true;
    }
    assert!(saw_def, "no node found defining `y`");
}
|
||||
|
||||
/// `{{` and `}}` are escapes for a literal `{` / `}`, NOT named-argument
/// captures, so nothing between them may be lifted into `uses`.
#[test]
fn rust_format_macro_escaped_braces_not_lifted() {
    let src = b"fn f() { let q = format!(\"{{x}}\"); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    for idx in cfg.node_indices() {
        let node = &cfg[idx];
        if node.taint.defines.as_deref() != Some("q") {
            continue;
        }
        let lifted_x = node.taint.uses.iter().any(|u| u == "x");
        assert!(
            !lifted_x,
            "must not lift `x` from escaped `{{{{x}}}}`; got {:?}",
            node.taint.uses
        );
    }
}
|
||||
|
||||
/// Positional placeholders such as `{0}` select an argument by position,
/// not by name: the digit must never be treated as an identifier, while the
/// real positional argument `a` must still be recorded as a use.
#[test]
fn rust_format_macro_positional_index_not_lifted() {
    let src = b"fn f() { let a = 1; let q = format!(\"{0}\", a); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    for idx in cfg.node_indices() {
        let node = &cfg[idx];
        if node.taint.defines.as_deref() != Some("q") {
            continue;
        }
        let uses = &node.taint.uses;
        assert!(
            !uses.iter().any(|u| u == "0"),
            "must not lift digit-only positional placeholder; got {:?}",
            node.taint.uses
        );
        assert!(
            uses.iter().any(|u| u == "a"),
            "expected `a` in uses (positional arg) for `format!(\"{{0}}\", a)`; got {:?}",
            node.taint.uses
        );
    }
}
|
||||
|
||||
/// The lift also applies to a bare `println!` statement (no `let` binding):
/// the node whose callee is `println` must list the captured `user`.
#[test]
fn rust_println_macro_named_arg_lifted() {
    let src = b"fn f() { let user = String::from(\"x\"); println!(\"hi {user}\"); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    let mut println_nodes = 0usize;
    for idx in cfg.node_indices() {
        let node = &cfg[idx];
        if node.call.callee.as_deref() != Some("println") {
            continue;
        }
        let lifted = node.taint.uses.iter().any(|u| u == "user");
        assert!(
            lifted,
            "expected `user` lifted into println! uses; got {:?}",
            node.taint.uses
        );
        println_nodes += 1;
    }
    assert!(println_nodes > 0, "no println! macro_invocation node found");
}
|
||||
|
||||
#[test]
|
||||
fn go_no_import_bindings() {
|
||||
let src = b"package main\nimport alias \"fmt\"\n";
|
||||
|
|
@ -2798,6 +2908,43 @@ fn go_for_loop_back_edge() {
|
|||
assert_loop_with_back_edge(&cfg, "go for");
|
||||
}
|
||||
|
||||
/// Regression pin for the `def_use` `Kind::For` arm on Go's
/// `for ident, ident := range iter` shape. Tree-sitter nests the binding
/// pattern and the iterable inside a `range_clause` child of the
/// `for_statement` (instead of exposing direct `left`/`right` fields the way
/// Python / JS do). If that clause is not consulted, the loop binding never
/// becomes a CFG def and taint from the iterable cannot reach uses of the
/// binding inside the loop body.
/// Original gap: CVE-2026-41422 (daptin) goqu.L SQL injection.
#[test]
fn go_for_range_loop_binding_is_defined() {
    let src = b"package p\nfunc f(xs []string) { for _, p := range xs { use(p) } }";
    let (cfg, _) = parse_and_build(src, "go", Language::from(tree_sitter_go::LANGUAGE));

    let header = cfg
        .node_indices()
        .find(|&n| matches!(cfg[n].kind, StmtKind::Loop))
        .expect("for-range loop should produce a Loop header");
    let taint = &cfg[header].taint;

    // Primary def first, then any extra defs, in that order.
    let mut all_defs: Vec<&str> = Vec::new();
    all_defs.extend(taint.defines.iter().map(String::as_str));
    all_defs.extend(taint.extra_defines.iter().map(String::as_str));

    assert!(
        all_defs.contains(&"p"),
        "loop binding `p` should appear in defines/extra_defines, got {:?}",
        all_defs
    );
    assert!(
        taint.uses.iter().any(|u| u == "xs"),
        "iterable `xs` should appear in uses, got {:?}",
        taint.uses
    );
}
|
||||
|
||||
#[test]
|
||||
fn ruby_while_back_edge() {
|
||||
let src = b"def f\n while cond\n body\n end\nend\n";
|
||||
|
|
|
|||
|
|
@ -83,6 +83,18 @@ pub(super) fn push_condition_node<'a>(
|
|||
let text = text_of(cond_ast, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
let span = (cond_ast.start_byte(), cond_ast.end_byte());
|
||||
// Mirror condition variables into `taint.uses` so the per-body
|
||||
// `SymbolInterner::from_cfg` pass interns them. Without this,
|
||||
// `apply_branch_predicates` (which calls `interner.get(var)` to
|
||||
// look up a Symbol id) silently no-ops on short-circuit branch
|
||||
// condition nodes — they have no `taint.uses` even though
|
||||
// `condition_vars` carries the variable names. Surfaced by
|
||||
// GHSA-h8cj-hpmg-636v: a `||`-decomposed validator like
|
||||
// `if (x == null || !regex.matcher(x).matches()) throw;` failed
|
||||
// to mark `x` as `validated_must` on the surviving branch
|
||||
// because the per-disjunct cond nodes (built via
|
||||
// `build_condition_chain`) didn't populate `taint.uses`.
|
||||
let uses_for_taint: Vec<String> = vars.clone();
|
||||
g.add_node(NodeInfo {
|
||||
kind: StmtKind::If,
|
||||
ast: AstMeta {
|
||||
|
|
@ -92,6 +104,10 @@ pub(super) fn push_condition_node<'a>(
|
|||
condition_text: text,
|
||||
condition_vars: vars,
|
||||
condition_negated: negated,
|
||||
taint: crate::cfg::TaintMeta {
|
||||
uses: uses_for_taint,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1151,6 +1151,170 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
|
|||
true
|
||||
}
|
||||
|
||||
/// Extract identifiers captured by Rust format-string named-argument syntax
|
||||
/// (`format!("…{name}…")`, stable since 1.58) from a `macro_invocation`
|
||||
/// node. Returns the identifier names referenced by `{name}` /
|
||||
/// `{name:fmt-spec}` patterns inside the first `string_literal` child of
|
||||
/// the macro's `token_tree`.
|
||||
///
|
||||
/// Without this lifting, `let q = format!("...{x}...")` carries no `x` in
|
||||
/// its `uses` because `x` lives in the format string's bytes rather than
|
||||
/// as a separate AST argument node, so taint stops at the macro
|
||||
/// boundary. Mirrors the Python f-string interpolation lifting in
|
||||
/// `patterns/python.rs`.
|
||||
///
|
||||
/// Conservative recognition: only fires for known format-style macros
|
||||
/// (`format`, `print`/`println`, `eprint`/`eprintln`, `write`/`writeln`,
|
||||
/// `panic`, `format_args`, `assert`/`debug_assert`, the common `log`
|
||||
/// crate severity macros). Empty for any non-Rust call node, any other
|
||||
/// macro, or a token_tree whose first string is not present.
|
||||
pub(super) fn extract_rust_format_macro_named_idents(call_node: Node, code: &[u8]) -> Vec<String> {
|
||||
if call_node.kind() != "macro_invocation" {
|
||||
return Vec::new();
|
||||
}
|
||||
let Some(macro_node) = call_node.child_by_field_name("macro") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Some(macro_text) = text_of(macro_node, code) else {
|
||||
return Vec::new();
|
||||
};
|
||||
let leaf = macro_text
|
||||
.rsplit("::")
|
||||
.next()
|
||||
.unwrap_or(macro_text.as_str());
|
||||
if !is_rust_format_style_macro(leaf) {
|
||||
return Vec::new();
|
||||
}
|
||||
let tt = match call_node.child_by_field_name("token_tree") {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
let mut cursor = call_node.walk();
|
||||
match call_node
|
||||
.children(&mut cursor)
|
||||
.find(|c| c.kind() == "token_tree")
|
||||
{
|
||||
Some(t) => t,
|
||||
None => return Vec::new(),
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut cursor = tt.walk();
|
||||
let fmt_lit = match tt
|
||||
.children(&mut cursor)
|
||||
.find(|c| matches!(c.kind(), "string_literal" | "raw_string_literal"))
|
||||
{
|
||||
Some(n) => n,
|
||||
None => return Vec::new(),
|
||||
};
|
||||
let raw = match text_of(fmt_lit, code) {
|
||||
Some(s) => s,
|
||||
None => return Vec::new(),
|
||||
};
|
||||
let content = strip_literal_quotes(&raw, fmt_lit, code).unwrap_or_else(|| raw.clone());
|
||||
parse_rust_format_named_idents(&content)
|
||||
}
|
||||
|
||||
/// Walk `n` and any descendants, accumulating named-format-arg idents from
|
||||
/// every Rust `macro_invocation` reachable through structural expression
|
||||
/// children (calls, fields, await, references, blocks, ...). Lets the
|
||||
/// def-use collectors lift `format!("...{x}...")` named args through one
|
||||
/// or two levels of expression wrapping (e.g.
|
||||
/// `let q = format!("{x}").to_owned();` or RHS chained method calls).
|
||||
pub(super) fn extract_rust_format_macro_named_idents_in(n: Node, code: &[u8]) -> Vec<String> {
|
||||
let mut out = Vec::new();
|
||||
collect_format_macro_idents_recursive(n, code, &mut out, 0);
|
||||
out
|
||||
}
|
||||
|
||||
fn collect_format_macro_idents_recursive(n: Node, code: &[u8], out: &mut Vec<String>, depth: u32) {
|
||||
if depth > 6 {
|
||||
return;
|
||||
}
|
||||
if n.kind() == "macro_invocation" {
|
||||
for ident in extract_rust_format_macro_named_idents(n, code) {
|
||||
out.push(ident);
|
||||
}
|
||||
}
|
||||
let mut cursor = n.walk();
|
||||
for child in n.children(&mut cursor) {
|
||||
collect_format_macro_idents_recursive(child, code, out, depth + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Report whether `name` (a bare macro name, without path or `!`) is one of
/// the macros whose string argument is treated as a format string for
/// named-argument lifting. The comparison is exact and case-sensitive.
fn is_rust_format_style_macro(name: &str) -> bool {
    // Formatting / printing / writing macros, panicking and assertion
    // macros, and the severity-named logging macros.
    const FORMAT_STYLE_MACROS: &[&str] = &[
        "format",
        "print",
        "println",
        "eprint",
        "eprintln",
        "write",
        "writeln",
        "panic",
        "format_args",
        "assert",
        "debug_assert",
        "todo",
        "unimplemented",
        "unreachable",
        "info",
        "warn",
        "error",
        "debug",
        "trace",
    ];
    FORMAT_STYLE_MACROS.iter().any(|known| *known == name)
}
|
||||
|
||||
/// Scan the contents of a Rust format string and return, in order of
/// appearance, every identifier used as a named argument (`{name}`,
/// `{name:spec}`). Duplicates are kept.
///
/// Rules applied:
/// - `{{` and `}}` are escapes for literal braces and are skipped as a pair.
/// - A placeholder runs from `{` to the next `}`; the argument is the text
///   before the first `:` inside it (or the whole placeholder when there is
///   no `:`).
/// - Whitespace between the argument and the closing `}` is ignored, so
///   `{x }` yields `x` (the `std::fmt` grammar allows whitespace there).
/// - Empty (`{}`), positional (`{0}`) and otherwise non-identifier arguments
///   are ignored; see `is_valid_rust_format_ident`.
/// - A `{` with no closing `}` is not a placeholder: nothing is lifted from
///   it and scanning stops, since no later `}` can exist either.
///
/// Not handled: names referenced from inside the format spec (`{:width$}`),
/// and non-ASCII identifiers.
///
/// NOTE(review): `s` is presumably the literal's source text with escape
/// sequences still unprocessed; if so, `\u{e9}`-style escapes whose hex
/// digits start with a letter would be read as a placeholder — confirm
/// against `strip_literal_quotes`.
fn parse_rust_format_named_idents(s: &str) -> Vec<String> {
    let bytes = s.as_bytes();
    let mut out: Vec<String> = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            // `{{` or `}}`: escaped literal brace, consume both bytes.
            b'{' | b'}' if bytes.get(i + 1) == Some(&bytes[i]) => i += 2,
            b'{' => {
                let start = i + 1;
                let close = match bytes[start..].iter().position(|&c| c == b'}') {
                    Some(offset) => start + offset,
                    // Unterminated placeholder: not a valid format string.
                    None => break,
                };
                // The argument ends at the first `:` (start of the format
                // spec) or, failing that, at the closing brace.
                let name_end = bytes[start..close]
                    .iter()
                    .position(|&c| c == b':')
                    .map_or(close, |offset| start + offset);
                // `start` and `name_end` sit next to ASCII delimiters, so
                // both are valid UTF-8 boundaries for slicing `s`.
                let name = s[start..name_end].trim_end();
                if is_valid_rust_format_ident(name.as_bytes()) {
                    out.push(name.to_string());
                }
                i = close + 1;
            }
            _ => i += 1,
        }
    }
    out
}

/// Return `true` when `b` is an ASCII identifier: non-empty, first byte a
/// letter or `_`, every following byte a letter, digit or `_`.
///
/// Digit-only positional placeholders such as `0` are rejected by the
/// first-byte rule, so no separate all-digits check is needed.
fn is_valid_rust_format_ident(b: &[u8]) -> bool {
    match b.split_first() {
        Some((&first, rest)) => {
            (first.is_ascii_alphabetic() || first == b'_')
                && rest.iter().all(|c| c.is_ascii_alphanumeric() || *c == b'_')
        }
        None => false,
    }
}
|
||||
|
||||
/// Extract per-argument identifiers from a call node's argument list.
|
||||
/// Returns one `Vec<String>` per argument (in parameter-position order).
|
||||
/// Returns empty if argument list can't be found or contains spread/keyword args.
|
||||
|
|
@ -1663,6 +1827,11 @@ pub(super) fn def_use(
|
|||
collect_idents_with_paths(val, code, &mut idents, &mut paths);
|
||||
uses.extend(paths);
|
||||
uses.extend(idents);
|
||||
// Rust format-string named-arg capture: `let q =
|
||||
// format!("...{x}...")` reads `x`, but `x` lives in
|
||||
// the format-string bytes, not as a separate AST
|
||||
// argument node, so collect_idents misses it.
|
||||
uses.extend(extract_rust_format_macro_named_idents_in(val, code));
|
||||
}
|
||||
} else {
|
||||
// Try nested declarator pattern (JS/TS `lexical_declaration` → `variable_declarator`,
|
||||
|
|
@ -1716,6 +1885,7 @@ pub(super) fn def_use(
|
|||
collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
|
||||
uses.extend(paths);
|
||||
uses.extend(idents);
|
||||
uses.extend(extract_rust_format_macro_named_idents_in(val_node, code));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1728,6 +1898,7 @@ pub(super) fn def_use(
|
|||
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
|
||||
uses.extend(paths);
|
||||
uses.extend(idents);
|
||||
uses.extend(extract_rust_format_macro_named_idents_in(ast, code));
|
||||
}
|
||||
}
|
||||
(defs, uses, extra_defs)
|
||||
|
|
@ -1750,6 +1921,7 @@ pub(super) fn def_use(
|
|||
collect_idents_with_paths(rhs, code, &mut idents, &mut paths);
|
||||
uses.extend(paths);
|
||||
uses.extend(idents);
|
||||
uses.extend(extract_rust_format_macro_named_idents_in(rhs, code));
|
||||
}
|
||||
(defs, uses, vec![])
|
||||
}
|
||||
|
|
@ -1801,9 +1973,26 @@ pub(super) fn def_use(
|
|||
// `initializer`/`condition`/`increment`), so this path falls through
|
||||
// to the default-collecting behaviour for those, preserving today's
|
||||
// semantics.
|
||||
//
|
||||
// Go's `for ident := range iter` shape places the binding pattern
|
||||
// and iterable on a `range_clause` child of the `for_statement`
|
||||
// rather than as direct fields. Without the range_clause lookup
|
||||
// below, taint from the iterable never reaches the loop binding
|
||||
// (CVE-2026-41422 daptin: `c.QueryArray("col")` loop var `project`
|
||||
// flows into `goqu.L(project)` SQL_QUERY sink).
|
||||
Kind::For => {
|
||||
let left = ast.child_by_field_name("left");
|
||||
let right = ast.child_by_field_name("right");
|
||||
let mut left = ast.child_by_field_name("left");
|
||||
let mut right = ast.child_by_field_name("right");
|
||||
if left.is_none() && right.is_none() {
|
||||
let mut cursor = ast.walk();
|
||||
for child in ast.children(&mut cursor) {
|
||||
if child.kind() == "range_clause" {
|
||||
left = child.child_by_field_name("left");
|
||||
right = child.child_by_field_name("right");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if left.is_none() && right.is_none() {
|
||||
// C-style for, defer to default ident collection.
|
||||
let mut idents = Vec::new();
|
||||
|
|
|
|||
|
|
@ -69,6 +69,36 @@ pub(super) fn extract_param_meta<'a>(
|
|||
}
|
||||
return out;
|
||||
};
|
||||
// Java lambda shorthand: tree-sitter-java exposes the `parameters` field
|
||||
// on `lambda_expression` as either a single bare identifier (`cmd -> …`)
|
||||
// or an `inferred_parameters` wrapper around identifiers (`(a, b) -> …`).
|
||||
// Neither shape matches the formal_parameter / spread_parameter kinds in
|
||||
// PARAM_CONFIG, so the per-child loop below would otherwise see no
|
||||
// params and the lambda would appear parameterless. Without this, the
|
||||
// SSA pipeline treats the lambda binding as a free / closure-captured
|
||||
// variable, defeating the JS/TS / Java auto-seed distinction between
|
||||
// real handler-param formals and bubbled-up captures. Mirrors the JS/TS
|
||||
// arrow shorthand handled above.
|
||||
if func_node.kind() == "lambda_expression" {
|
||||
if params.kind() == "identifier" {
|
||||
if let Some(name) = text_of(params, code) {
|
||||
out.push((name, None, Vec::new()));
|
||||
return out;
|
||||
}
|
||||
} else if params.kind() == "inferred_parameters" {
|
||||
let mut cursor = params.walk();
|
||||
for child in params.named_children(&mut cursor) {
|
||||
if child.kind() == "identifier" {
|
||||
if let Some(name) = text_of(child, code) {
|
||||
out.push((name, None, Vec::new()));
|
||||
}
|
||||
}
|
||||
}
|
||||
if !out.is_empty() {
|
||||
return out;
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut cursor = params.walk();
|
||||
for child in params.children(&mut cursor) {
|
||||
// Self/this parameter (e.g. Rust's `self_parameter`)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue