Authorization analysis logic improvements (#61)

This commit is contained in:
Eli Peter 2026-05-02 16:44:49 -04:00 committed by GitHub
parent 3c89bddbf2
commit 40995e45e7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
55 changed files with 4193 additions and 134 deletions

View file

@ -345,6 +345,126 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8])
false
}
/// Extract the literal value of property `prop_name` from the object
/// literal passed as positional argument `arg_index` of `call_node`.
///
/// Returns `Some(text)` — the value's source text — only when the
/// property's *effective* value is a literal (`true`, `false`, `null`,
/// `undefined`, a number, or a string). Returns `None` when:
///
/// * the call has no `arguments` field or no argument at `arg_index`;
/// * the argument (after unwrapping parentheses) is not an `object` /
///   `dictionary` node — including a bare identifier passed as the
///   options argument;
/// * no `pair` with a matching key is present;
/// * the matching value is not a literal; or
/// * a later entry may override the literal: a duplicate key with a
///   dynamic value, a same-named shorthand property, or any spread.
///
/// Object literals are last-wins, so `{ evaluate: false, ...opts }` and
/// `{ evaluate: false, evaluate: x }` must not be reported as the
/// literal `false`; doing so would let a safe-flag gate trust a value
/// that can be replaced at runtime. A spread *before* the key is fine:
/// `{ ...defaults, evaluate: false }` still yields `"false"`.
///
/// `None` conflates "absent" and "present but dynamic"; callers that
/// need the distinction pair this with [`has_object_arg_property`].
///
/// Used by JS/TS-style "options object as kwargs" gates — e.g.
/// `_.template(tpl, { evaluate: false })` — where the safe-flag lives
/// in an inline object literal rather than a dedicated kwarg node.
/// Computed keys (`[k]: v`) are skipped, as they cannot be matched
/// statically.
pub(super) fn extract_object_arg_property(
    call_node: Node,
    arg_index: usize,
    prop_name: &str,
    code: &[u8],
) -> Option<String> {
    let args = call_node.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    let arg = args.named_children(&mut cursor).nth(arg_index)?;
    let arg = unwrap_parens(arg);
    if !matches!(arg.kind(), "object" | "dictionary") {
        return None;
    }

    // Effective value of `prop_name` after processing entries in source
    // order; later entries override earlier ones.
    let mut effective: Option<String> = None;
    let mut c = arg.walk();
    for child in arg.named_children(&mut c) {
        match child.kind() {
            // A spread may carry `prop_name` with an unknown value, so any
            // literal seen before it can no longer be trusted.
            "spread_element" | "dictionary_splat" => effective = None,
            // `{ prop }` binds the property to a variable: dynamic.
            "shorthand_property_identifier" | "shorthand_property_identifier_pattern" => {
                if text_of(child, code).as_deref() == Some(prop_name) {
                    effective = None;
                }
            }
            "pair" => {
                let Some(key_node) = child.child_by_field_name("key") else {
                    continue;
                };
                let key_text = match key_node.kind() {
                    // Quoted keys: strip the surrounding quote characters.
                    "string" | "string_literal" => text_of(key_node, code).map(|raw| {
                        if raw.len() >= 2 {
                            raw[1..raw.len() - 1].to_string()
                        } else {
                            raw
                        }
                    }),
                    "computed_property_name" => continue,
                    _ => text_of(key_node, code),
                };
                if key_text.as_deref() != Some(prop_name) {
                    continue;
                }
                effective = match child.child_by_field_name("value") {
                    Some(value) => {
                        let value = unwrap_parens(value);
                        match value.kind() {
                            "true" | "false" | "null" | "undefined" | "number" | "string"
                            | "string_literal" => text_of(value, code),
                            // Some grammar versions surface these keywords as
                            // plain identifiers; accept only those spellings.
                            "identifier" => text_of(value, code).filter(|s| {
                                matches!(s.as_str(), "true" | "false" | "null" | "undefined")
                            }),
                            _ => None,
                        }
                    }
                    None => None,
                };
            }
            _ => {}
        }
    }
    effective
}
/// Report whether the positional argument at `arg_index` of `call_node`
/// is an object literal that names a property `prop_name`, regardless
/// of whether that property's value is a literal or a dynamic
/// expression.
///
/// Both `key: value` pairs (plain or quoted keys) and shorthand
/// properties (`{ prop_name }`) count; computed keys never match.
/// Returns `false` when the call has no `arguments` field, the argument
/// is missing, or the (parenthesis-unwrapped) argument is not an
/// `object` / `dictionary` node.
///
/// Companion to [`extract_object_arg_property`]: together they let
/// gated-sink classification tell "options key absent" (language
/// default applies) apart from "options key present with a dynamic
/// value" (treated conservatively as dangerous).
pub(super) fn has_object_arg_property(
    call_node: Node,
    arg_index: usize,
    prop_name: &str,
    code: &[u8],
) -> bool {
    let Some(arg_list) = call_node.child_by_field_name("arguments") else {
        return false;
    };
    let mut arg_cursor = arg_list.walk();
    let Some(options) = arg_list.named_children(&mut arg_cursor).nth(arg_index) else {
        return false;
    };
    let options = unwrap_parens(options);
    if !matches!(options.kind(), "object" | "dictionary") {
        return false;
    }

    let mut member_cursor = options.walk();
    for member in options.named_children(&mut member_cursor) {
        let names_prop = match member.kind() {
            "shorthand_property_identifier" | "shorthand_property_identifier_pattern" => {
                text_of(member, code).as_deref() == Some(prop_name)
            }
            "pair" => member.child_by_field_name("key").is_some_and(|key| {
                let name = match key.kind() {
                    // Computed keys cannot be resolved statically.
                    "computed_property_name" => None,
                    // Quoted keys: drop the surrounding quote characters.
                    "string" | "string_literal" => text_of(key, code).map(|quoted| {
                        if quoted.len() >= 2 {
                            quoted[1..quoted.len() - 1].to_string()
                        } else {
                            quoted
                        }
                    }),
                    _ => text_of(key, code),
                };
                name.as_deref() == Some(prop_name)
            }),
            _ => false,
        };
        if names_prop {
            return true;
        }
    }
    false
}
/// Inspect the first positional argument of a call node and return its
/// tree-sitter `kind()` plus a flag indicating whether any descendant is an
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
@ -584,6 +704,29 @@ pub(super) fn find_chained_inner_call<'a>(
let function = outer
.child_by_field_name("function")
.or_else(|| outer.child_by_field_name("method"))?;
// Direct double-call form (`f()(x)`): the outer call's `function`
// field IS itself a call_expression, with no intermediate
// member-chain. Treat the inner call as the chain's innermost.
// Without this, lodash-style template-render chains like
// `_.template(t)(data)` evade the chained-inner rebinding because
// the outer's function field is a `call_expression`, not the
// `member_expression` shape the original branch below expects.
if matches!(
lookup(lang, function.kind()),
Kind::CallFn | Kind::CallMethod
) {
// Recurse: the inner call may itself be chained.
if let Some(inner) = find_chained_inner_call(function, lang, code) {
return Some(inner);
}
let inner_func = function
.child_by_field_name("function")
.or_else(|| function.child_by_field_name("method"))
.or_else(|| function.child_by_field_name("name"))?;
let raw = text_of(inner_func, code)?;
let inner_text: String = raw.chars().filter(|c| !c.is_whitespace()).collect();
return Some((function, inner_text));
}
// The function/method field for a chained call is a member_expression
// (JS/TS) or attribute (Python) etc.; its `object` field is the
// receiver expression. Only proceed when that receiver is itself a

View file

@ -54,8 +54,9 @@ use literals::{
detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals,
extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg,
extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs,
extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep,
find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call,
extract_literal_rhs, extract_object_arg_property, extract_shell_array_payload_idents,
find_call_node, find_call_node_deep, find_chained_inner_call, has_keyword_arg,
has_object_arg_property, has_only_literal_args, is_parameterized_query_call,
java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
@ -67,11 +68,33 @@ use params::{
/// Test-only wrapper around [`extract_param_meta`] for the external
/// `tests/typed_extractors_audit.rs` harness, which drives the
/// per-param classifier directly instead of running the full scan
/// pipeline.
///
/// Each returned entry is `(name, type_kind)`. The third tuple slot
/// produced by `extract_param_meta` (destructured sibling names) is
/// dropped here so the harness's existing two-element tuple assertions
/// keep working; sibling info reaches consumers through `BodyMeta`.
pub fn extract_param_meta_for_test<'a>(
    func_node: tree_sitter::Node<'a>,
    lang: &str,
    code: &'a [u8],
) -> Vec<(String, Option<crate::ssa::type_facts::TypeKind>)> {
    let full_meta = extract_param_meta(func_node, lang, code);
    let mut projected = Vec::with_capacity(full_meta.len());
    for (param_name, type_kind, _destructured) in full_meta {
        projected.push((param_name, type_kind));
    }
    projected
}
/// Test-only re-export that returns the full per-slot tuple including
/// destructured sibling names. Used by the destructured-arg-probe
/// regression tests in `src/taint/tests.rs` and the params unit tests
/// in `src/cfg/cfg_tests.rs`.
///
/// Forwards its arguments unchanged to `extract_param_meta` and returns
/// that result as-is. Each entry is `(name, type_kind, siblings)`:
/// the parameter name, its optional `TypeKind`, and the names of any
/// other bindings destructured from the same parameter slot.
pub fn extract_param_meta_with_destructured_for_test<'a>(
func_node: tree_sitter::Node<'a>,
lang: &str,
code: &'a [u8],
) -> Vec<(
String,
Option<crate::ssa::type_facts::TypeKind>,
Vec<String>,
)> {
extract_param_meta(func_node, lang, code)
}
@ -567,6 +590,17 @@ pub struct BodyMeta {
/// `None`, downstream behaviour is identical to the pre-Phase-1
/// engine.
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
/// Per-parameter destructured-binding sibling names. Same length
/// as `params`; entry `i` lists field names bound by the same
/// argument slot as `params[i]`, excluding the primary name itself.
/// Empty for non-destructured params. Today populated only for
/// JS/TS object-pattern formals (`({ a, b, c })` → params=["a"],
/// destructured=[["b","c"]]). Used by per-parameter taint-summary
/// probing in `extract_ssa_func_summary` so destructured bindings
/// inside the body share the slot's seeded caps and any of them
/// being in `validated_must` at a return path counts as the slot
/// being validated. Closes the residual gap behind CVE-2026-25544.
pub param_destructured_fields: Vec<Vec<String>>,
pub param_count: usize,
pub span: (usize, usize),
pub parent_body_id: Option<BodyId>,
@ -1909,8 +1943,27 @@ pub(super) fn push_node<'a>(
}
})
},
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
|kw| {
// For JS/TS, options-bearing args are passed as inline
// object literals (`fn(x, { evaluate: false })`) rather
// than language-level keyword arguments. When the
// standard `keyword_argument`-walking extractor returns
// None, fall back to inspecting arg 1's object literal
// for a property named `kw`. This lets gates like
// `_.template` consult `{ evaluate: false }` literally.
extract_const_keyword_arg(cn, kw, code).or_else(|| {
if matches!(lang, "javascript" | "typescript") {
extract_object_arg_property(cn, 1, kw, code)
} else {
None
}
})
},
|kw| {
has_keyword_arg(cn, kw, code)
|| (matches!(lang, "javascript" | "typescript")
&& has_object_arg_property(cn, 1, kw, code))
},
);
if !matches.is_empty() {
@ -3871,9 +3924,13 @@ pub(super) fn build_sub<'a>(
let is_anon = is_anon_fn_name(&fn_name);
let param_meta = extract_param_meta(ast, lang, code);
let param_count = param_meta.len();
let param_names: Vec<String> = param_meta.iter().map(|(n, _)| n.clone()).collect();
let param_names: Vec<String> = param_meta.iter().map(|(n, _, _)| n.clone()).collect();
let param_types: Vec<Option<crate::ssa::type_facts::TypeKind>> =
param_meta.iter().map(|(_, t)| t.clone()).collect();
param_meta.iter().map(|(_, t, _)| t.clone()).collect();
let param_destructured_fields: Vec<Vec<String>> = param_meta
.iter()
.map(|(_, _, siblings)| siblings.clone())
.collect();
// ── 1b) Compute identity discriminators ───────────────────────────
let (fn_container, fn_kind) =
@ -4130,6 +4187,7 @@ pub(super) fn build_sub<'a>(
name: if is_anon { None } else { Some(fn_name.clone()) },
params: param_names,
param_types,
param_destructured_fields,
param_count,
span: (ast.start_byte(), ast.end_byte()),
parent_body_id: Some(current_body_id),
@ -4628,6 +4686,7 @@ pub(crate) fn build_cfg<'a>(
name: None,
params: Vec::new(),
param_types: Vec::new(),
param_destructured_fields: Vec::new(),
param_count: 0,
span: (0, code.len()),
parent_body_id: None,

View file

@ -21,16 +21,27 @@ fn lookup_dto_class(class_name: &str) -> Option<TypeKind> {
/// Extract parameter names + per-position [`TypeKind`] from a function
/// AST node. Each entry's second slot is `Some(TypeKind)` when the
/// parameter's decorator, attribute, or static type annotation maps to
/// a known kind, and `None` otherwise. Strictly additive, when no
/// type info is recoverable, behaviour is identical to the names-only
/// path.
/// a known kind, and `None` otherwise. The third slot lists
/// destructured field names bound by the same parameter slot — empty
/// for non-destructured params and for the primary name itself. E.g.
/// for the JS/TS object-pattern formal `({ a, b, c })`, the entry is
/// `("a", None, ["b", "c"])`. Strictly additive: when the param is
/// not a destructured pattern (or the language has no destructure
/// concept), behaviour is identical to the pre-Phase-5 names-only path.
///
/// Closes the residual gap behind CVE-2026-25544 (PayloadCMS Drizzle
/// SQL injection): a per-parameter taint probe that seeds only the
/// primary name `column` cannot see flow through sibling destructured
/// bindings (`value` etc.) inside the body, so summary extraction
/// misses `validated_params_to_return` when a validator helper is
/// applied to one of the siblings.
pub(super) fn extract_param_meta<'a>(
func_node: Node<'a>,
lang: &str,
code: &'a [u8],
) -> Vec<(String, Option<TypeKind>)> {
) -> Vec<(String, Option<TypeKind>, Vec<String>)> {
let cfg = param_config(lang);
let mut out: Vec<(String, Option<TypeKind>)> = Vec::new();
let mut out: Vec<(String, Option<TypeKind>, Vec<String>)> = Vec::new();
// Try the params_field directly on the function node first.
// For C/C++, the parameter list is nested inside the declarator
// (function_definition > declarator:function_declarator > parameters:parameter_list),
@ -51,7 +62,7 @@ pub(super) fn extract_param_meta<'a>(
if let Some(p) = func_node.child_by_field_name("parameter") {
if p.kind() == "identifier" {
if let Some(name) = text_of(p, code) {
out.push((name, None));
out.push((name, None, Vec::new()));
}
}
}
@ -62,7 +73,7 @@ pub(super) fn extract_param_meta<'a>(
for child in params.children(&mut cursor) {
// Self/this parameter (e.g. Rust's `self_parameter`)
if cfg.self_param_kinds.contains(&child.kind()) {
out.push(("self".into(), None));
out.push(("self".into(), None, Vec::new()));
continue;
}
@ -74,14 +85,26 @@ pub(super) fn extract_param_meta<'a>(
if let Some(node) = child.child_by_field_name(field) {
let mut tmp = Vec::new();
collect_idents(node, code, &mut tmp);
let candidate = if lang == "rust" {
tmp.into_iter().last()
let primary = if lang == "rust" {
// Rust: last ident is the binding name (e.g.
// `Path(project_id): Path<i64>` → `project_id`).
tmp.pop()
} else if tmp.is_empty() {
None
} else {
tmp.into_iter().next()
Some(tmp.remove(0))
};
if let Some(name) = candidate {
if let Some(name) = primary {
let ty = classify_param_type(child, lang, code);
out.push((name, ty));
// Surface destructured siblings only when the
// pattern node is a destructure container. For
// ordinary (non-destructured) params, `tmp` is
// already empty after `pop()` / `remove(0)`.
// Object-pattern children of the same slot
// (`{ a, b, c }`) leave the remaining names in
// `tmp`, which become the slot's siblings.
let siblings = sibling_names_for_destructure(node, &tmp, lang);
out.push((name, ty, siblings));
found = true;
break;
}
@ -92,7 +115,7 @@ pub(super) fn extract_param_meta<'a>(
&& child.kind() == "identifier"
&& let Some(txt) = text_of(child, code)
{
out.push((txt, None));
out.push((txt, None, Vec::new()));
found = true;
}
// Fallback for C/C++: look for nested declarator → identifier
@ -101,7 +124,7 @@ pub(super) fn extract_param_meta<'a>(
collect_idents(child, code, &mut tmp);
if let Some(last) = tmp.pop() {
let ty = classify_param_type(child, lang, code);
out.push((last, ty));
out.push((last, ty, Vec::new()));
found = true;
}
}
@ -112,12 +135,22 @@ pub(super) fn extract_param_meta<'a>(
// *first* identifier, that is the parameter name; subsequent
// identifiers are part of the type annotation or default
// expression.
//
// Destructure-container case (JS arrow `({ a, b }) => …`):
// when the child node IS a destructure pattern itself (no
// `required_parameter` / `assignment_pattern` wrapper), the
// remaining idents after the primary are destructured
// bindings sharing this slot — surface them as siblings so
// per-parameter summary probing seeds every binding the
// slot produces.
if !found {
let mut tmp = Vec::new();
collect_idents(child, code, &mut tmp);
if let Some(first) = tmp.into_iter().next() {
if !tmp.is_empty() {
let first = tmp.remove(0);
let ty = classify_param_type(child, lang, code);
out.push((first, ty));
let siblings = sibling_names_for_destructure(child, &tmp, lang);
out.push((first, ty, siblings));
}
}
continue;
@ -127,13 +160,52 @@ pub(super) fn extract_param_meta<'a>(
// where the child is an `identifier` node, not a `parameter` wrapper.
if child.kind() == "identifier" {
if let Some(txt) = text_of(child, code) {
out.push((txt, None));
out.push((txt, None, Vec::new()));
}
}
}
out
}
/// Compute the destructured sibling names for one parameter slot.
///
/// `remaining` holds the identifiers collected from the parameter's
/// pattern after the primary name has been removed. They are returned
/// (as an owned copy) only when `pattern` is a destructure container
/// for `lang`, as decided by `is_destructure_container_kind`; in every
/// other case — including an empty `remaining` — the result is an
/// empty vector.
///
/// Gating on the container kind keeps typed-parameter idioms
/// (`Path<i64>`, `@PathVariable Long userId`, Rust extractor wrappers)
/// from leaking a type identifier into the sibling list.
fn sibling_names_for_destructure(
    pattern: Node<'_>,
    remaining: &[String],
    lang: &str,
) -> Vec<String> {
    let has_siblings =
        !remaining.is_empty() && is_destructure_container_kind(pattern.kind(), lang);
    if has_siblings {
        remaining.to_vec()
    } else {
        Vec::new()
    }
}
/// Decide whether a tree-sitter pattern node `kind` destructures a
/// single argument into several named bindings for language `lang`.
///
/// Only the JS/TS `object_pattern` is accepted today. Every other
/// `(lang, kind)` combination returns `false`, which leaves the
/// single-name fallback path untouched. Array patterns (`[a, b]`) are
/// deliberately excluded: their unpacking is positional, so the bound
/// names do not map cleanly onto "all share this parameter slot".
fn is_destructure_container_kind(kind: &str, lang: &str) -> bool {
    let is_js_family = lang == "javascript" || lang == "typescript";
    is_js_family && kind == "object_pattern"
}
/// Walk up from a function definition node and build a container path.
///
/// Records the names of enclosing classes / impls / modules / namespaces /