Critical bug fixes and recall improvements (#68)

This commit is contained in:
Eli Peter 2026-05-11 12:42:39 -04:00 committed by GitHub
parent 7d0e7320e2
commit 55247b7fcd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
352 changed files with 60069 additions and 900 deletions

View file

@ -521,10 +521,21 @@ pub(super) fn build_switch<'a>(
) -> Vec<NodeIndex> {
// Locate the case container. Most grammars expose it as field "body"
// (JS/TS, Java, C, C++); Go puts cases as direct children of the switch.
//
// Per-language gotcha: Go's `expression_case` / `default_case` /
// `type_case` / `communication_case` map to `Kind::Block` (so the case
// body is iterated by the Block handler), so a naive "first Block
// child" fallback latches onto the FIRST case as the container, then
// walks the case's interior looking for case-like children, finds none,
// and falls through to the empty-cases early return (CFG dead-end:
// dispatch If has no False edge, every post-switch statement becomes
// unreachable). Skip case-kind nodes when picking the container so
// Go's flat "cases-as-direct-children" shape uses `ast` itself.
let body = ast.child_by_field_name("body").or_else(|| {
let mut c = ast.walk();
ast.children(&mut c)
.find(|n| matches!(lookup(lang, n.kind()), Kind::Block))
ast.children(&mut c).find(|n| {
matches!(lookup(lang, n.kind()), Kind::Block) && !is_switch_case_kind(n.kind())
})
});
let container = body.unwrap_or(ast);

View file

@ -1202,6 +1202,8 @@ fn clone_preserves_all_sub_structs() {
defines: Some("r".into()),
uses: vec!["a".into(), "b".into()],
extra_defines: vec!["c".into()],
array_pattern_indices: smallvec::SmallVec::new(),
rhs_array_elements: smallvec::SmallVec::new(),
},
ast: AstMeta {
span: (10, 100),
@ -1501,6 +1503,105 @@ fn rust_println_macro_named_arg_lifted() {
assert!(found, "no println! macro_invocation node found");
}
/// A `format!` whose first argument is an identifier naming a file-level
/// `const &str` must behave like a literal format string: the node that
/// defines the let-bound variable should carry the constant's text up to
/// the first placeholder as its `string_prefix`, so that
/// `is_string_safe_for_ssrf` can lock the host exactly as it does for
/// `format!("https://api/{}", path)`.
#[test]
fn rust_format_macro_const_first_arg_seeds_string_prefix() {
    let src = b"const URL_FMT: &str = \"https://api.example.com/users/{}\";\n\
                fn f(path: String) { let u = format!(URL_FMT, path); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    // If several nodes define `u` and carry a prefix, the last one wins.
    let prefix: Option<String> = cfg
        .node_indices()
        .filter(|&idx| cfg[idx].taint.defines.as_deref() == Some("u"))
        .filter_map(|idx| cfg[idx].string_prefix.as_deref().map(|p| p.to_string()))
        .last();

    assert_eq!(
        prefix.as_deref(),
        Some("https://api.example.com/users/"),
        "expected URL_FMT const to bridge into the format!() string_prefix",
    );
}
/// Negative companion: a const whose initializer is not a string literal
/// (here `const N: usize = 4;`) must never produce a `string_prefix` on
/// the node defining the let-bound variable.
#[test]
fn rust_format_macro_const_first_arg_non_string_skipped() {
    let src = b"const N: usize = 4;\n\
                fn f(path: String) { let u = format!(N, path); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    let nodes_defining_u = cfg
        .node_indices()
        .map(|idx| &cfg[idx])
        .filter(|info| info.taint.defines.as_deref() == Some("u"));

    for info in nodes_defining_u {
        assert!(
            info.string_prefix.is_none(),
            "non-string const must not seed a prefix; got {:?}",
            info.string_prefix
        );
    }
}
/// `static NAME: &str = "...";` is treated the same as a `const_item`:
/// both declaration shapes expose a `name` and a string-literal `value`,
/// so the prefix bridge must resolve a `static` first argument to
/// `format!` just like a `const` one.
#[test]
fn rust_format_macro_static_first_arg_seeds_string_prefix() {
    let src = b"static API_BASE: &str = \"https://api.example.com/users/{}\";\n\
                fn f(path: String) { let u = format!(API_BASE, path); }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    // If several nodes define `u` and carry a prefix, the last one wins.
    let prefix: Option<String> = cfg
        .node_indices()
        .filter(|&idx| cfg[idx].taint.defines.as_deref() == Some("u"))
        .filter_map(|idx| cfg[idx].string_prefix.as_deref().map(|p| p.to_string()))
        .last();

    assert_eq!(
        prefix.as_deref(),
        Some("https://api.example.com/users/"),
        "expected static API_BASE to bridge into the format!() string_prefix",
    );
}
/// Only file-level constants take part in the AST-time prefix bridge.
/// A `const` declared inside the function body is out of scope for the
/// lookup (keeping it deterministic even when an inner const shadows a
/// file-level one), so no `string_prefix` may be attached to the node
/// defining the let-bound variable.
#[test]
fn rust_format_macro_inner_const_not_bridged() {
    let src = b"fn f(path: String) {\n\
                const URL_FMT: &str = \"https://api/{}\";\n\
                let u = format!(URL_FMT, path);\n\
                }";
    let (cfg, _entry) = parse_and_build(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));

    let nodes_defining_u = cfg
        .node_indices()
        .map(|idx| &cfg[idx])
        .filter(|info| info.taint.defines.as_deref() == Some("u"));

    for info in nodes_defining_u {
        assert!(
            info.string_prefix.is_none(),
            "inner-fn const must not bridge; got {:?}",
            info.string_prefix
        );
    }
}
#[test]
fn go_no_import_bindings() {
let src = b"package main\nimport alias \"fmt\"\n";
@ -2354,6 +2455,29 @@ fn py_subscript_write_lowers_to_index_set_call() {
});
}
#[test]
fn go_selector_expression_call_sets_receiver() {
    // Regression for the Phase 15 deferred GORM tuple-return case.
    //
    // In Go, `userDb.Raw(sql)` is a `call_expression` whose `function`
    // field is a `selector_expression` (operand=userDb, field=Raw). The
    // CFG-side `Kind::CallFn` arm has to surface `userDb` as the call's
    // receiver; type-qualified resolution relies on it to rewrite
    // `userDb.Raw` → `GormDb.Raw` once `userDb`'s SSA value is tagged via
    // `constructor_type(Lang::Go, "gorm.Open")`. Before the fix only JS/TS
    // `member_expression`, Python `attribute`, and Rust `field_expression`
    // were recognised, so Go calls ended up with receiver=None.
    let source = br#"package main
func f(userDb int) {
userDb.Raw("SELECT 1")
}
"#;
    let (cfg, _entry) = parse_and_build(source, "go", Language::from(tree_sitter_go::LANGUAGE));

    let call_node = find_node_with_callee(&cfg, "userDb.Raw");
    let call_node = call_node.expect("go: userDb.Raw node should be present");

    assert_eq!(call_node.call.receiver.as_deref(), Some("userDb"));
}
#[test]
fn go_index_expr_read_lowers_to_index_get_call() {
with_pointer_on(|| {
@ -3217,3 +3341,620 @@ fn js_ternary_branch_subscript_source_classified() {
"expected ternary subscript branch defining `x` to carry a Source label"
);
}
/// Regression: a Go `switch` that has no `default` arm and whose only
/// case returns must still leave the statements after the switch
/// reachable from the function entry.
///
/// Go's `expression_case` / `default_case` / `type_case` /
/// `communication_case` all map to `Kind::Block` so their bodies are
/// walked by the Block handler. `build_switch`'s "first Block child"
/// container fallback therefore used to pick the FIRST case as the
/// container, found no case-like children inside it, hit the empty-cases
/// early return, and left the dispatch If without a False edge. Every
/// post-switch statement became unreachable, which surfaced as
/// `cfg-unreachable-sanitizer` on real code (gin's
/// `binding/form_mapping.go::setTimeField`, line 469
/// `if isUTC, _ := strconv.ParseBool(...); isUTC` following a no-default
/// `switch tf := strings.ToLower(timeFormat); tf` on the unix epoch
/// formats).
#[test]
fn go_switch_no_default_keeps_post_switch_reachable() {
    use petgraph::visit::Bfs;
    use std::collections::HashSet;

    let source = br#"package p
func f(x string) bool {
switch tf := x; tf {
case "unix":
return false
}
after()
return true
}
"#;
    let (cfg, entry) = parse_and_build(source, "go", Language::from(tree_sitter_go::LANGUAGE));

    // Everything a breadth-first walk from the entry node can visit.
    let mut walk = Bfs::new(&cfg, entry);
    let reachable: HashSet<NodeIndex> = std::iter::from_fn(|| walk.next(&cfg)).collect();

    let after = cfg
        .node_indices()
        .find(|&idx| cfg[idx].call.callee.as_deref() == Some("after"))
        .expect("expected after() Call node");

    assert!(
        reachable.contains(&after),
        "post-switch `after()` must be reachable from entry; got reachable={:?}",
        reachable
    );
}
/// Python lowers `qs = User.objects` as an `expression_statement` that
/// wraps an `assignment`. The CFG-level `member_field` detector has to
/// look through that wrapper and report `Some("objects")` from the inner
/// RHS, which is what lets the type-fact pass tag the bound value as
/// `DjangoQuerySet`.
#[test]
fn python_member_field_assignment_detected_for_bare_objects() {
    let source = b"def view(req):\n qs = User.objects\n";
    let (cfg, _entry) =
        parse_and_build(source, "python", Language::from(tree_sitter_python::LANGUAGE));

    // `member_field` of every CFG node that defines `qs`.
    let detected: Vec<Option<String>> = cfg
        .node_indices()
        .map(|idx| &cfg[idx])
        .filter(|info| info.taint.defines.as_deref() == Some("qs"))
        .map(|info| info.member_field.clone())
        .collect();

    let saw_objects = detected
        .iter()
        .any(|field| matches!(field.as_deref(), Some("objects")));
    assert!(
        saw_objects,
        "expected at least one `qs = ...` CFG node with member_field=Some(\"objects\"); got {:?}",
        detected
    );
}
/// Negative shape: for `qs = User.profile` the detector must report the
/// actual field name (`profile`) and must never report `objects`. This
/// guards against the wrapper-unwrapping logic picking up the wrong
/// field name.
#[test]
fn python_member_field_assignment_non_objects_does_not_match() {
    let source = b"def view(req):\n qs = User.profile\n";
    let (cfg, _entry) =
        parse_and_build(source, "python", Language::from(tree_sitter_python::LANGUAGE));

    // `member_field` of every CFG node that defines `qs`.
    let detected: Vec<Option<String>> = cfg
        .node_indices()
        .map(|idx| &cfg[idx])
        .filter(|info| info.taint.defines.as_deref() == Some("qs"))
        .map(|info| info.member_field.clone())
        .collect();

    let saw = |wanted: &str| detected.iter().any(|field| field.as_deref() == Some(wanted));

    assert!(
        saw("profile"),
        "expected `qs = User.profile` to detect member_field=Some(\"profile\"); got {:?}",
        detected
    );
    assert!(
        !saw("objects"),
        "must not falsely tag non-`objects` field; got {:?}",
        detected
    );
}
/// Phase 15 chained-shape closure: a Java local declared as
/// `Session sess = sf.openSession();` must be recorded in the per-file
/// local-receiver-types map as `(fn_start, "sess")` →
/// `TypeKind::HibernateSession`. That entry is what allows
/// `find_classifiable_inner_call` to rewrite the chained inner call
/// `sess.createNativeQuery(...)` to `HibernateSession.createNativeQuery`
/// when the legacy literal-receiver classification misses.
#[test]
fn java_hibernate_session_open_registers_local_receiver_type() {
    /// Byte offset of the first `method_declaration` in pre-order.
    fn find_method_start(node: tree_sitter::Node<'_>) -> Option<usize> {
        match node.kind() {
            "method_declaration" => Some(node.start_byte()),
            _ => {
                let mut cursor = node.walk();
                let mut kids = node.children(&mut cursor);
                kids.find_map(find_method_start)
            }
        }
    }

    let src = br#"
class Foo {
void bar(SessionFactory sf, String sql) {
Session sess = sf.openSession();
sess.createNativeQuery(sql).getResultList();
}
}
"#;
    let _ = parse_to_file_cfg(src, "java", Language::from(tree_sitter_java::LANGUAGE));

    // `build_cfg` clears the thread-local map when it finishes, while the
    // public lookup helper only sees it during construction. Populate it
    // again by hand so the assertion has something to inspect.
    let java = Language::from(tree_sitter_java::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&java).unwrap();
    let tree = parser.parse(src.as_slice(), None).unwrap();
    super::populate_local_receiver_types(&tree, "java", src);

    // The map is keyed by the start byte of the enclosing method.
    let fn_start = find_method_start(tree.root_node()).expect("method_declaration in fixture");
    let registered = super::lookup_local_receiver_type(fn_start, "sess");
    assert_eq!(
        registered,
        Some(crate::ssa::type_facts::TypeKind::HibernateSession),
        "local `Session sess = sf.openSession()` should bind to HibernateSession"
    );

    // Reset the thread-local state so it cannot leak into other tests.
    super::LOCAL_RECEIVER_TYPES.with(|cell| cell.borrow_mut().clear());
}
/// Companion to the Hibernate registration test: a local whose RHS has
/// no `constructor_type` match must NOT be registered in the per-file
/// map. This confirms the recogniser keys on `constructor_type`'s callee
/// classifier rather than on the declared receiver type, so a generic
/// `Session foo = computeFoo()` cannot pull an unrelated method into the
/// type-qualified pool.
#[test]
fn java_unrecognised_rhs_does_not_register_local_receiver_type() {
    /// Byte offset of the first `method_declaration` in pre-order.
    fn find_method_start(node: tree_sitter::Node<'_>) -> Option<usize> {
        match node.kind() {
            "method_declaration" => Some(node.start_byte()),
            _ => {
                let mut cursor = node.walk();
                let mut kids = node.children(&mut cursor);
                kids.find_map(find_method_start)
            }
        }
    }

    let src = br#"
class Foo {
void bar() {
Session sess = computeSomethingUnrelated();
sess.doSomething();
}
}
"#;
    let java = Language::from(tree_sitter_java::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&java).unwrap();
    let tree = parser.parse(src.as_slice(), None).unwrap();
    super::populate_local_receiver_types(&tree, "java", src);

    let fn_start = find_method_start(tree.root_node()).expect("method_declaration in fixture");
    let registered = super::lookup_local_receiver_type(fn_start, "sess");
    assert_eq!(
        registered, None,
        "unrecognised RHS `computeSomethingUnrelated()` must not register a receiver-type"
    );

    // Reset the thread-local state so it cannot leak into other tests.
    super::LOCAL_RECEIVER_TYPES.with(|cell| cell.borrow_mut().clear());
}
/// `collect_array_pattern_bindings_indexed` visits the children of a
/// JS/TS `array_pattern` in source order and yields `(name, position)`
/// for every simple-identifier binding. An elided slot (two commas with
/// nothing between them) bumps the position without producing a binding:
/// `const [, b]` gives `[("b", 1)]` and `const [a, ,]` gives
/// `[("a", 0)]`. Any complex sub-pattern (`assignment_pattern`,
/// `rest_pattern`, nested `array_pattern`) makes the helper return an
/// empty collection so the lowering rewrite falls back to scalar union.
#[test]
fn array_pattern_indexed_bindings_recognise_skip_slots() {
    use super::helpers::collect_array_pattern_bindings_indexed;

    /// First `array_pattern` node in pre-order, if any.
    fn first_array_pattern<'t>(node: tree_sitter::Node<'t>) -> Option<tree_sitter::Node<'t>> {
        if node.kind() == "array_pattern" {
            return Some(node);
        }
        let mut cursor = node.walk();
        let mut kids = node.children(&mut cursor);
        kids.find_map(first_array_pattern)
    }

    /// Parse `src` as JavaScript and collect the indexed bindings of its
    /// first `array_pattern`.
    fn bindings_of(src: &[u8]) -> Vec<(String, usize)> {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&Language::from(tree_sitter_javascript::LANGUAGE))
            .unwrap();
        let tree = parser.parse(src, None).unwrap();
        let pattern = first_array_pattern(tree.root_node()).expect("array_pattern in fixture");
        collect_array_pattern_bindings_indexed(pattern, src)
            .into_iter()
            .collect()
    }

    /// Owned `(name, position)` pairs for comparison.
    fn expected(pairs: &[(&str, usize)]) -> Vec<(String, usize)> {
        pairs
            .iter()
            .map(|&(name, pos)| (name.to_string(), pos))
            .collect()
    }

    assert_eq!(bindings_of(b"const [a, b] = x;"), expected(&[("a", 0), ("b", 1)]));
    assert_eq!(bindings_of(b"const [, b] = x;"), expected(&[("b", 1)]));
    assert_eq!(bindings_of(b"const [a, ,] = x;"), expected(&[("a", 0)]));
    assert_eq!(bindings_of(b"const [a, , c] = x;"), expected(&[("a", 0), ("c", 2)]));

    // Rest patterns bail to empty so callers fall back to scalar union.
    assert!(bindings_of(b"const [a, ...rest] = x;").is_empty());
    // Default value patterns also bail.
    assert!(bindings_of(b"const [a = 1, b] = x;").is_empty());
    // Nested array patterns bail.
    assert!(bindings_of(b"const [[a, b], c] = x;").is_empty());
}
/// The same helper handles Rust `tuple_pattern`. A `_` wildcard
/// (`_pattern` node) consumes a position without binding anything,
/// mirroring the JS skip-slot behaviour. Other complex sub-patterns
/// (tuple-struct, parenthesized) bail to empty.
#[test]
fn tuple_pattern_indexed_bindings_recognise_rust_wildcards() {
    use super::helpers::collect_array_pattern_bindings_indexed;

    /// First `tuple_pattern` node in pre-order, if any.
    fn first_tuple_pattern<'t>(node: tree_sitter::Node<'t>) -> Option<tree_sitter::Node<'t>> {
        if node.kind() == "tuple_pattern" {
            return Some(node);
        }
        let mut cursor = node.walk();
        let mut kids = node.children(&mut cursor);
        kids.find_map(first_tuple_pattern)
    }

    /// Parse `src` as Rust and collect the indexed bindings of its first
    /// `tuple_pattern`.
    fn bindings_of(src: &[u8]) -> Vec<(String, usize)> {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&Language::from(tree_sitter_rust::LANGUAGE))
            .unwrap();
        let tree = parser.parse(src, None).unwrap();
        let pattern = first_tuple_pattern(tree.root_node()).expect("tuple_pattern in fixture");
        collect_array_pattern_bindings_indexed(pattern, src)
            .into_iter()
            .collect()
    }

    /// Owned `(name, position)` pairs for comparison.
    fn expected(pairs: &[(&str, usize)]) -> Vec<(String, usize)> {
        pairs
            .iter()
            .map(|&(name, pos)| (name.to_string(), pos))
            .collect()
    }

    assert_eq!(
        bindings_of(b"fn f() { let (a, b) = (1, 2); }"),
        expected(&[("a", 0), ("b", 1)]),
    );
    assert_eq!(
        bindings_of(b"fn f() { let (_, b) = (1, 2); }"),
        expected(&[("b", 1)]),
    );
    assert_eq!(
        bindings_of(b"fn f() { let (a, _) = (1, 2); }"),
        expected(&[("a", 0)]),
    );
    assert_eq!(
        bindings_of(b"fn f() { let (a, _, c) = (1, 2, 3); }"),
        expected(&[("a", 0), ("c", 2)]),
    );
}
/// The helper also covers Python's `pattern_list` (bare `a, b = ...`)
/// and `tuple_pattern` (parenthesised `(a, b) = ...`). In Python `_` is
/// an ordinary identifier, not a wildcard, so it is reported at its
/// source position like any other name. A `list_splat_pattern`
/// (`a, *rest`) makes the helper bail to empty so callers fall back to
/// scalar union.
#[test]
fn pattern_list_indexed_bindings_recognise_python_destructure() {
    use super::helpers::collect_array_pattern_bindings_indexed;

    /// First node in pre-order whose kind is one of `kinds`.
    fn first_pattern<'t>(
        node: tree_sitter::Node<'t>,
        kinds: &[&str],
    ) -> Option<tree_sitter::Node<'t>> {
        if kinds.contains(&node.kind()) {
            return Some(node);
        }
        let mut cursor = node.walk();
        let mut kids = node.children(&mut cursor);
        kids.find_map(|child| first_pattern(child, kinds))
    }

    /// Parse `src` as Python and collect the indexed bindings of the
    /// first node matching `kinds`.
    fn bindings_of(src: &[u8], kinds: &[&str]) -> Vec<(String, usize)> {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&Language::from(tree_sitter_python::LANGUAGE))
            .unwrap();
        let tree = parser.parse(src, None).unwrap();
        let pattern = first_pattern(tree.root_node(), kinds)
            .unwrap_or_else(|| panic!("no {kinds:?} in fixture"));
        collect_array_pattern_bindings_indexed(pattern, src)
            .into_iter()
            .collect()
    }

    /// Owned `(name, position)` pairs for comparison.
    fn expected(pairs: &[(&str, usize)]) -> Vec<(String, usize)> {
        pairs
            .iter()
            .map(|&(name, pos)| (name.to_string(), pos))
            .collect()
    }

    // Bare comma-list `a, b = ...` is `pattern_list`.
    assert_eq!(
        bindings_of(b"a, b = (1, 2)\n", &["pattern_list"]),
        expected(&[("a", 0), ("b", 1)]),
    );
    // Three-binding bare comma list.
    assert_eq!(
        bindings_of(b"a, b, c = (1, 2, 3)\n", &["pattern_list"]),
        expected(&[("a", 0), ("b", 1), ("c", 2)]),
    );
    // Underscore is a regular identifier binding in Python.
    assert_eq!(
        bindings_of(b"_, b = (1, 2)\n", &["pattern_list"]),
        expected(&[("_", 0), ("b", 1)]),
    );
    assert_eq!(
        bindings_of(b"a, _ = (1, 2)\n", &["pattern_list"]),
        expected(&[("a", 0), ("_", 1)]),
    );
    // Parenthesised destructure surfaces as `tuple_pattern`.
    assert_eq!(
        bindings_of(b"(a, b) = (1, 2)\n", &["tuple_pattern"]),
        expected(&[("a", 0), ("b", 1)]),
    );
    // Splat / rest bindings bail because positional mapping breaks.
    assert!(bindings_of(b"a, *rest = (1, 2, 3)\n", &["pattern_list"]).is_empty());
    // Nested destructure bails — the recogniser keeps flat-binding-only
    // semantics and does not recurse into sub-patterns.
    assert!(bindings_of(b"(a, b), c = ((1, 2), 3)\n", &["pattern_list"]).is_empty());
}
/// tree-sitter-ruby represents the LHS of `a, b = ...` as a
/// `left_assignment_list`. The helper walks its comma-separated
/// identifier children in source order and yields `(name, position)`
/// for each one. As in Python, `_` is an ordinary identifier.
/// `rest_assignment` (`*rest`) and `destructured_left_assignment`
/// (parenthesised nested destructure) take the bail branch, so callers
/// fall back to scalar union for those shapes.
#[test]
fn left_assignment_list_indexed_bindings_recognise_ruby_destructure() {
    use super::helpers::collect_array_pattern_bindings_indexed;

    /// First `left_assignment_list` node in pre-order, if any.
    fn first_left_assignment_list<'t>(
        node: tree_sitter::Node<'t>,
    ) -> Option<tree_sitter::Node<'t>> {
        if node.kind() == "left_assignment_list" {
            return Some(node);
        }
        let mut cursor = node.walk();
        let mut kids = node.children(&mut cursor);
        kids.find_map(first_left_assignment_list)
    }

    /// Parse `src` as Ruby and collect the indexed bindings of its first
    /// `left_assignment_list`.
    fn bindings_of(src: &[u8]) -> Vec<(String, usize)> {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&Language::from(tree_sitter_ruby::LANGUAGE))
            .unwrap();
        let tree = parser.parse(src, None).unwrap();
        let pattern =
            first_left_assignment_list(tree.root_node()).expect("left_assignment_list in fixture");
        collect_array_pattern_bindings_indexed(pattern, src)
            .into_iter()
            .collect()
    }

    /// Owned `(name, position)` pairs for comparison.
    fn expected(pairs: &[(&str, usize)]) -> Vec<(String, usize)> {
        pairs
            .iter()
            .map(|&(name, pos)| (name.to_string(), pos))
            .collect()
    }

    assert_eq!(bindings_of(b"a, b = [x, y]\n"), expected(&[("a", 0), ("b", 1)]));
    assert_eq!(
        bindings_of(b"a, b, c = [x, y, z]\n"),
        expected(&[("a", 0), ("b", 1), ("c", 2)]),
    );
    // Underscore is a regular identifier binding in Ruby (idiomatic
    // "unused" marker, but still resolvable in scope).
    assert_eq!(bindings_of(b"_, b = [x, y]\n"), expected(&[("_", 0), ("b", 1)]));
    assert_eq!(bindings_of(b"a, _ = [x, y]\n"), expected(&[("a", 0), ("_", 1)]));
    // Call return value: the helper walks the LHS regardless of RHS shape.
    assert_eq!(bindings_of(b"a, b = func()\n"), expected(&[("a", 0), ("b", 1)]));
    // Splat tail bails because rest_assignment is a complex sub-pattern.
    assert!(bindings_of(b"a, *rest = [x, y, z]\n").is_empty());
    // Parenthesised nested destructure bails because
    // destructured_left_assignment isn't in the simple-identifier
    // whitelist.
    assert!(bindings_of(b"(a, b) = [x, y]\n").is_empty());
}
/// Exercises the helper behind the bare-array destructure rewrite in
/// `src/ssa/lower.rs`. Given the RHS of a destructure assignment it
/// yields one slot per element in source order, each being
/// `Ident(name)`, `Literal`, or `Complex(inner_uses)`. Shapes that would
/// shift index alignment (spread / list splat) make it bail with an
/// empty result.
#[test]
fn rhs_array_literal_elements_recognise_per_language_shapes() {
    use super::RhsArraySlot;
    use super::helpers::collect_rhs_array_literal_elements;

    /// Map a language label to its tree-sitter grammar.
    fn grammar_for(lang_label: &str) -> Language {
        match lang_label {
            "javascript" => Language::from(tree_sitter_javascript::LANGUAGE),
            "typescript" => Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT),
            "python" => Language::from(tree_sitter_python::LANGUAGE),
            "ruby" => Language::from(tree_sitter_ruby::LANGUAGE),
            "rust" => Language::from(tree_sitter_rust::LANGUAGE),
            other => panic!("unsupported lang: {}", other),
        }
    }

    /// First node in pre-order whose kind is one of `kinds`.
    fn find_first<'t>(
        node: tree_sitter::Node<'t>,
        kinds: &[&str],
    ) -> Option<tree_sitter::Node<'t>> {
        if kinds.contains(&node.kind()) {
            return Some(node);
        }
        let mut cursor = node.walk();
        let mut kids = node.children(&mut cursor);
        kids.find_map(|child| find_first(child, kinds))
    }

    /// Parse `src` in `lang`, locate the first node of one of `rhs_kinds`
    /// and return the slots the helper produces for it.
    fn slots(lang: &str, src: &[u8], rhs_kinds: &[&str]) -> Vec<RhsArraySlot> {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar_for(lang)).unwrap();
        let tree = parser.parse(src, None).unwrap();
        let rhs = find_first(tree.root_node(), rhs_kinds).expect("rhs in fixture");
        collect_rhs_array_literal_elements(rhs, lang, src, None)
            .into_iter()
            .collect()
    }

    fn ident(name: &str) -> RhsArraySlot {
        RhsArraySlot::Ident(name.to_string())
    }

    /// `Complex` slot with the given uses and source capability set.
    fn complex_with(uses: &[&str], source_cap: crate::labels::Cap) -> RhsArraySlot {
        RhsArraySlot::Complex {
            uses: uses.iter().map(|s| s.to_string()).collect(),
            source_cap,
        }
    }

    /// `Complex` slot that is not classified as a Source.
    fn complex(uses: &[&str]) -> RhsArraySlot {
        complex_with(uses, crate::labels::Cap::empty())
    }

    /// `Complex` slot classified as a Source.
    fn complex_source(uses: &[&str]) -> RhsArraySlot {
        complex_with(uses, crate::labels::Cap::all())
    }

    // JS/TS `array` literal: two bare idents.
    assert_eq!(
        slots("javascript", b"const _ = [safe, tainted];\n", &["array"]),
        vec![ident("safe"), ident("tainted")],
    );

    // JS/TS `array` mixed ident + string literal.
    assert_eq!(
        slots("javascript", b"const _ = [tainted, \"ok\"];\n", &["array"]),
        vec![ident("tainted"), RhsArraySlot::Literal],
    );

    // A call is classified as `Complex` carrying its inner idents instead
    // of bailing. `collect_idents_with_paths` lifts both paths and bare
    // idents, so a member access surfaces as the dotted path
    // (e.g. `req.query.x`) followed by its component idents.
    assert_eq!(
        slots("javascript", b"const _ = [fn(x), 'lit'];\n", &["array"]),
        vec![complex(&["fn", "x"]), RhsArraySlot::Literal],
    );

    // JS/TS member access becomes Complex: dotted path + component idents.
    // Per-slot Source classification fires when the slot's subtree carries
    // a member-expression that strip-and-retry-classifies as Source
    // (`req.query.x` → strip `.x` → `req.query` matches the JS Source rule).
    assert_eq!(
        slots(
            "javascript",
            b"const _ = [req.query.x, 'lit'];\n",
            &["array"],
        ),
        vec![
            complex_source(&["req.query.x", "req", "query", "x"]),
            RhsArraySlot::Literal,
        ],
    );

    // Sibling-precision: a Source-classified Complex slot ALONGSIDE a
    // Complex slot whose subtree does NOT classify as Source. Pre-session
    // 0047 every Complex slot was conservatively re-emitted as Source by
    // the outer-node fallback in `src/ssa/lower.rs`; with per-slot
    // classification the safe sibling stays empty so the SSA lowering can
    // emit `Assign(safe)` instead.
    assert_eq!(
        slots(
            "javascript",
            b"const _ = [process.env.X, helper(local)];\n",
            &["array"],
        ),
        vec![
            complex_source(&["process.env.X", "process", "env", "X"]),
            complex(&["helper", "local"]),
        ],
    );

    // JS/TS spread bails entirely (index alignment shifts).
    assert!(slots("javascript", b"const _ = [...arr, b];\n", &["array"]).is_empty());

    // JS/TS binary expression becomes Complex with the inner ident.
    assert_eq!(
        slots(
            "javascript",
            b"const _ = ['log-' + x, 'lit'];\n",
            &["array"],
        ),
        vec![complex(&["x"]), RhsArraySlot::Literal],
    );

    // Python `list` shape.
    assert_eq!(
        slots("python", b"a = [safe, tainted]\n", &["list"]),
        vec![ident("safe"), ident("tainted")],
    );

    // Python `expression_list` (bare commas RHS in `a, b = x, y`).
    assert_eq!(
        slots("python", b"a, b = safe, tainted\n", &["expression_list"]),
        vec![ident("safe"), ident("tainted")],
    );

    // Python `tuple` (parenthesised).
    assert_eq!(
        slots("python", b"x = (safe, 42)\n", &["tuple"]),
        vec![ident("safe"), RhsArraySlot::Literal],
    );

    // Python list-splat bails.
    assert!(slots("python", b"x = [*a, b]\n", &["list"]).is_empty());

    // Ruby `array`.
    assert_eq!(
        slots("ruby", b"a, b = [safe, tainted]\n", &["array"]),
        vec![ident("safe"), ident("tainted")],
    );

    // Ruby `array` with literal + ident.
    assert_eq!(
        slots("ruby", b"a, b = [tainted, \"safe\"]\n", &["array"]),
        vec![ident("tainted"), RhsArraySlot::Literal],
    );

    // Rust `tuple_expression`.
    assert_eq!(
        slots(
            "rust",
            b"fn f(safe: &str, tainted: &str) { let _ = (safe, tainted); }\n",
            &["tuple_expression"]
        ),
        vec![ident("safe"), ident("tainted")],
    );

    // Non-array-shape node returns empty (defensive guard).
    assert!(slots("javascript", b"const x = tainted;\n", &["identifier"]).is_empty());
}

View file

@ -2,7 +2,7 @@ use super::helpers::first_member_label;
use super::{
AstMeta, Cfg, EdgeKind, MAX_COND_VARS, MAX_CONDITION_TEXT_LEN, NodeInfo, StmtKind,
collect_idents, connect_all, detect_eq_with_const, detect_negation, has_call_descendant,
member_expr_text, push_node, text_of,
member_expr_text, push_node, text_of, try_lower_jsx_dangerous_html,
};
use crate::labels::{DataLabel, LangAnalysisRules, classify};
use crate::utils::snippet::truncate_at_char_boundary;
@ -378,7 +378,24 @@ pub(super) fn lower_ternary_branch<'a>(
}
connect_all(g, preds, node, pred_edge);
vec![node]
// React JSX `dangerouslySetInnerHTML={{__html: x}}` synthesis when the
// branch expression is itself a JSX element (or contains one as a
// descendant). Without this, `cond ? <div dangerouslySetInnerHTML=...
// /> : null` and similar ternary-RHS shapes never reach the
// `Kind::Return` / `Kind::Assignment` arms that own the synthesis hook,
// because `build_ternary_diamond` lowers each branch directly.
let post_jsx = try_lower_jsx_dangerous_html(
branch_ast,
&[node],
g,
lang,
code,
enclosing_func,
call_ordinal,
analysis_rules,
);
post_jsx
}
/// Extract `(lhs_ast, ternary_ast)` when `outer_ast` is an expression-statement

View file

@ -554,3 +554,469 @@ fn collect_ruby_symbol_list(node: Node<'_>, code: &[u8], out: &mut Vec<String>)
_ => {}
}
}
/// Collect the names of route-path capture variables declared by the
/// framework routing construct attached to a function AST node.
///
/// Dispatches on `lang`:
/// * `"python"`: inspects the decorators of the enclosing
///   `decorated_definition`, e.g. Flask-style
///   `@app.route("/users/<name>")`, blueprint-prefixed
///   `@bp.get("/u/<int:id>")`, or verb-shaped
///   `@router.post("/<path:slug>")`. The pattern string is handed to both
///   the Flask and the FastAPI capture collectors.
/// * `"ruby"`: handles Sinatra `get "/u/:name" do |name| ... end`, where
///   `func_node` is the block and its parent `call` carries the verb in
///   the `method` field and the path pattern as its first positional
///   string argument. Names come from `:name` colon-segments.
///
/// Any other language, or a function without a recognised routing
/// pattern, yields an empty `Vec`. The result is strictly additive:
/// downstream consumers gate on `param.contains(name)`, so an empty
/// result preserves existing behaviour.
pub(super) fn extract_route_path_captures<'a>(
    func_node: Node<'a>,
    lang: &str,
    code: &'a [u8],
) -> Vec<String> {
    let mut captures = Vec::new();
    if lang == "python" {
        extract_python_route_captures(func_node, code, &mut captures);
    } else if lang == "ruby" {
        extract_ruby_route_captures(func_node, code, &mut captures);
    }
    captures
}
/// Append to `out` the path captures declared by routing decorators on a
/// Python function.
///
/// `func_node` must sit directly inside a `decorated_definition`;
/// otherwise nothing is collected. Every `decorator` child whose
/// expression is a call on an attribute named (case-insensitively)
/// `route` or an HTTP verb contributes the captures found in its first
/// positional string argument. Each such pattern is passed to both
/// `collect_flask_path_captures` and `collect_fastapi_path_captures`.
fn extract_python_route_captures<'a>(func_node: Node<'a>, code: &'a [u8], out: &mut Vec<String>) {
    /// Route pattern string of a single `decorator` node, or `None` when
    /// the decorator is not a recognised routing call.
    fn route_pattern(decorator: Node<'_>, code: &[u8]) -> Option<String> {
        // The decorator expression is the first child that is not `@`.
        let mut cursor = decorator.walk();
        let expr = decorator.children(&mut cursor).find(|c| c.kind() != "@")?;
        if expr.kind() != "call" {
            return None;
        }

        // Only `<receiver>.<verb>(...)` shapes are considered.
        let target = expr.child_by_field_name("function")?;
        if target.kind() != "attribute" {
            return None;
        }
        let attr = target.child_by_field_name("attribute")?;
        let verb = text_of(attr, code)?.to_ascii_lowercase();
        let is_route_verb = matches!(
            verb.as_str(),
            "route" | "get" | "post" | "put" | "patch" | "delete" | "head" | "options"
        );
        if !is_route_verb {
            return None;
        }

        let args = expr.child_by_field_name("arguments")?;
        first_positional_string_arg(args, code)
    }

    let decorated = match func_node.parent() {
        Some(p) if p.kind() == "decorated_definition" => p,
        _ => return,
    };

    let mut cursor = decorated.walk();
    let decorators = decorated
        .children(&mut cursor)
        .filter(|child| child.kind() == "decorator");
    for decorator in decorators {
        if let Some(pattern) = route_pattern(decorator, code) {
            collect_flask_path_captures(&pattern, out);
            collect_fastapi_path_captures(&pattern, out);
        }
    }
}
/// Append to `out` the Sinatra `:name` path captures of the route that
/// owns a Ruby `do_block` / `block`.
///
/// The block's parent must be a `call` whose `method` is
/// (case-insensitively) a Sinatra-style HTTP verb and whose first
/// positional argument is a static string literal; in every other case
/// `out` is left untouched.
fn extract_ruby_route_captures<'a>(func_node: Node<'a>, code: &'a [u8], out: &mut Vec<String>) {
    let call = match func_node.parent() {
        Some(p) if p.kind() == "call" => p,
        _ => return,
    };

    let is_sinatra_verb = call
        .child_by_field_name("method")
        .and_then(|method| text_of(method, code))
        .map(|verb| verb.to_ascii_lowercase())
        .is_some_and(|verb| {
            matches!(
                verb.as_str(),
                "get" | "post" | "put" | "patch" | "delete" | "head" | "options" | "link" | "unlink"
            )
        });
    if !is_sinatra_verb {
        return;
    }

    let pattern = call
        .child_by_field_name("arguments")
        .and_then(|args| first_positional_string_arg_ruby(args, code));
    if let Some(pattern) = pattern {
        collect_sinatra_path_captures(&pattern, out);
    }
}
/// Return the literal text of the first positional argument inside a Python
/// `argument_list`, provided that argument is a static string literal.
///
/// Punctuation, comments and keyword arguments are stepped over. The first
/// remaining child decides the result:
/// * a `string` node yields whatever [`python_string_text`] extracts from it
///   (which is `None` for interpolated f-strings);
/// * any other node means the first positional is not a string literal, so
///   the pattern is not statically known and `None` is returned — later
///   string arguments are deliberately NOT considered.
///
/// Returns `None` as well when the list holds no positional argument at all.
fn first_positional_string_arg(args: Node<'_>, code: &[u8]) -> Option<String> {
    let mut cursor = args.walk();
    for arg in args.children(&mut cursor) {
        match arg.kind() {
            "(" | ")" | "," => continue,
            // Comments may sit between the parentheses; they are not
            // arguments and must not mask the string that follows them.
            "comment" => continue,
            "keyword_argument" => continue,
            "string" => return python_string_text(arg, code),
            _ => return None,
        }
    }
    None
}
/// Extract the body of a Python `string` AST node without its quoting.
///
/// Any direct `interpolation` child makes the literal dynamic, so `None` is
/// returned for it. Otherwise the node text is trimmed, leading prefix
/// letters (`r`, `b`, `u`, `f` in either case) are dropped, and the first
/// matching quote style — `"""`, `'''`, `"`, `'`, tried in that order — is
/// removed from both ends. Returns `None` when the text cannot be read or no
/// quote style encloses it. Escape sequences are left as written.
fn python_string_text(node: Node<'_>, code: &[u8]) -> Option<String> {
    const QUOTE_STYLES: [&str; 4] = ["\"\"\"", "'''", "\"", "'"];

    let mut cursor = node.walk();
    if node
        .children(&mut cursor)
        .any(|part| part.kind() == "interpolation")
    {
        return None;
    }

    let raw = text_of(node, code)?;
    let unprefixed = raw
        .trim()
        .trim_start_matches(['r', 'R', 'b', 'B', 'u', 'U', 'f', 'F']);

    QUOTE_STYLES
        .iter()
        .find_map(|quote| unprefixed.strip_prefix(*quote)?.strip_suffix(*quote))
        .map(str::to_string)
}
/// Return the literal text of the first positional argument inside a Ruby
/// `argument_list` when that argument is a static string.
///
/// Only `(`, `)` and `,` tokens are stepped over; the first other child
/// decides the outcome. A `string` node is handed to [`ruby_string_text`];
/// everything else — hash literals / `pair`s, block arguments, hash-splat
/// arguments and any non-string positional — produces `None`.
fn first_positional_string_arg_ruby(args: Node<'_>, code: &[u8]) -> Option<String> {
    let mut cursor = args.walk();
    let first = args
        .children(&mut cursor)
        .find(|child| !matches!(child.kind(), "(" | ")" | ","))?;

    match first.kind() {
        "string" => ruby_string_text(first, code),
        _ => None,
    }
}
/// Strip Ruby string-literal quoting from a `string` AST node. Rejects
/// strings with `#{...}` interpolation (the captured pattern is not
/// statically known). Returns the concatenation of `string_content`
/// children.
fn ruby_string_text(node: Node<'_>, code: &[u8]) -> Option<String> {
let mut cursor = node.walk();
let mut content = String::new();
let mut had_content = false;
for ch in node.children(&mut cursor) {
match ch.kind() {
"interpolation" => return None,
"string_content" => {
if let Some(t) = text_of(ch, code) {
content.push_str(&t);
had_content = true;
}
}
_ => continue,
}
}
if had_content { Some(content) } else { None }
}
/// Collect Sinatra-style `:name` captures from a route pattern.
///
/// The pattern is examined one `/`-separated segment at a time. A segment
/// contributes a capture only when it *begins* with `:`; the capture name is
/// the run of `[A-Za-z0-9_]` bytes directly after that colon. Colons in any
/// other position (for example `Foo::Bar`) are therefore never treated as
/// captures, and a colon followed by no identifier byte is ignored.
///
/// Names are lowercased and appended to `out` unless already present.
fn collect_sinatra_path_captures(pattern: &str, out: &mut Vec<String>) {
    for segment in pattern.split('/') {
        if let Some(after_colon) = segment.strip_prefix(':') {
            let name_len = after_colon
                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '_'))
                .unwrap_or(after_colon.len());
            if name_len == 0 {
                continue;
            }
            let name = after_colon[..name_len].to_ascii_lowercase();
            if !out.contains(&name) {
                out.push(name);
            }
        }
    }
}
/// Collect FastAPI / Starlette-style `{name}` and `{name:converter}`
/// captures from a route pattern.
///
/// FastAPI puts the parameter name FIRST (`{item_id:int}`), the opposite of
/// Flask's `<int:item_id>`, so the name is whatever precedes the first `:`
/// inside the braces, with surrounding whitespace trimmed. A name is
/// accepted only when it is non-empty and consists solely of
/// `[A-Za-z0-9_]`; accepted names are lowercased and appended to `out`
/// unless already present. Scanning stops at the first `{` that has no
/// closing `}`.
fn collect_fastapi_path_captures(pattern: &str, out: &mut Vec<String>) {
    let mut remaining = pattern;
    while let Some(open) = remaining.find('{') {
        let after_open = &remaining[open + 1..];
        let close = match after_open.find('}') {
            Some(idx) => idx,
            // Unterminated segment: nothing further can be parsed.
            None => break,
        };

        let inner = &after_open[..close];
        let candidate = inner.split(':').next().unwrap_or(inner).trim();
        let is_identifier = !candidate.is_empty()
            && candidate
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '_');
        if is_identifier {
            let lowered = candidate.to_ascii_lowercase();
            if !out.contains(&lowered) {
                out.push(lowered);
            }
        }

        remaining = &after_open[close + 1..];
    }
}
/// Collect Flask-style `<name>` and `<converter:name>` captures from a
/// route pattern.
///
/// Flask puts the converter first, so the name is whatever follows the LAST
/// `:` inside the angle brackets (or the whole content when there is no
/// colon), with surrounding whitespace trimmed. A name is accepted only when
/// it is non-empty and consists solely of `[A-Za-z0-9_]`; accepted names are
/// lowercased and appended to `out` unless already present. Scanning stops
/// at the first `<` that has no closing `>`.
fn collect_flask_path_captures(pattern: &str, out: &mut Vec<String>) {
    let mut remaining = pattern;
    while let Some(open) = remaining.find('<') {
        let after_open = &remaining[open + 1..];
        let close = match after_open.find('>') {
            Some(idx) => idx,
            // Unterminated segment: nothing further can be parsed.
            None => break,
        };

        let inner = &after_open[..close];
        let candidate = inner
            .rfind(':')
            .map_or(inner, |colon| &inner[colon + 1..])
            .trim();
        let is_identifier = !candidate.is_empty()
            && candidate
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '_');
        if is_identifier {
            let lowered = candidate.to_ascii_lowercase();
            if !out.contains(&lowered) {
                out.push(lowered);
            }
        }

        remaining = &after_open[close + 1..];
    }
}
/// Unit tests for the three route-pattern capture parsers (Flask, Sinatra,
/// FastAPI). Un-prefixed test names exercise the Flask parser.
#[cfg(test)]
mod path_capture_tests {
    use super::*;

    /// Run `collector` over `pat` with a fresh output vector and return it.
    fn run(collector: fn(&str, &mut Vec<String>), pat: &str) -> Vec<String> {
        let mut captured = Vec::new();
        collector(pat, &mut captured);
        captured
    }

    fn flask(pat: &str) -> Vec<String> {
        run(collect_flask_path_captures, pat)
    }

    fn sinatra(pat: &str) -> Vec<String> {
        run(collect_sinatra_path_captures, pat)
    }

    fn fastapi(pat: &str) -> Vec<String> {
        run(collect_fastapi_path_captures, pat)
    }

    // ---- Flask: `<name>` / `<converter:name>` ----

    #[test]
    fn extracts_bare_capture() {
        assert_eq!(flask("/users/<name>"), ["name"]);
    }

    #[test]
    fn extracts_converter_capture() {
        assert_eq!(flask("/items/<int:item_id>"), ["item_id"]);
    }

    #[test]
    fn extracts_path_converter() {
        assert_eq!(flask("/x/<path:slug>"), ["slug"]);
    }

    #[test]
    fn extracts_multiple_captures() {
        assert_eq!(flask("/u/<uid>/post/<int:pid>"), ["uid", "pid"]);
    }

    #[test]
    fn dedupes_repeated_names() {
        assert_eq!(flask("/<a>/<a>"), ["a"]);
    }

    #[test]
    fn rejects_unclosed_brace() {
        assert!(flask("/<oops").is_empty());
    }

    #[test]
    fn rejects_non_ident_chars() {
        assert!(flask("/<bad name>").is_empty());
        assert!(flask("/<name!>").is_empty());
    }

    #[test]
    fn empty_when_no_captures() {
        assert!(flask("/static/path").is_empty());
    }

    // ---- Sinatra: `:name` at the start of a path segment ----

    #[test]
    fn sinatra_extracts_bare_capture() {
        assert_eq!(sinatra("/users/:name"), ["name"]);
    }

    #[test]
    fn sinatra_extracts_multiple_captures() {
        assert_eq!(sinatra("/u/:uid/post/:pid"), ["uid", "pid"]);
    }

    #[test]
    fn sinatra_extracts_leading_capture() {
        assert_eq!(sinatra(":root"), ["root"]);
    }

    #[test]
    fn sinatra_dedupes_repeated_names() {
        assert_eq!(sinatra("/:a/:a"), ["a"]);
    }

    #[test]
    fn sinatra_ignores_double_colon() {
        assert!(sinatra("/Foo::Bar").is_empty());
    }

    #[test]
    fn sinatra_ignores_lone_colon() {
        assert!(sinatra("/users/:").is_empty());
    }

    #[test]
    fn sinatra_empty_when_no_captures() {
        assert!(sinatra("/static/path").is_empty());
    }

    // ---- FastAPI: `{name}` / `{name:converter}` ----

    #[test]
    fn fastapi_extracts_bare_capture() {
        assert_eq!(fastapi("/items/{item_id}"), ["item_id"]);
    }

    #[test]
    fn fastapi_extracts_converter_capture() {
        assert_eq!(fastapi("/items/{item_id:int}"), ["item_id"]);
    }

    #[test]
    fn fastapi_extracts_path_converter() {
        assert_eq!(fastapi("/files/{file_path:path}"), ["file_path"]);
    }

    #[test]
    fn fastapi_extracts_multiple_captures() {
        assert_eq!(fastapi("/u/{uid}/post/{pid:int}"), ["uid", "pid"]);
    }

    #[test]
    fn fastapi_dedupes_repeated_names() {
        assert_eq!(fastapi("/{a}/{a}"), ["a"]);
    }

    #[test]
    fn fastapi_rejects_unclosed_brace() {
        assert!(fastapi("/{oops").is_empty());
    }

    #[test]
    fn fastapi_rejects_non_ident_chars() {
        assert!(fastapi("/{bad name}").is_empty());
        assert!(fastapi("/{name!}").is_empty());
    }

    #[test]
    fn fastapi_empty_when_no_captures() {
        assert!(fastapi("/static/path").is_empty());
    }
}

View file

@ -1,6 +1,7 @@
use super::anon_fn_name;
use super::conditions::unwrap_parens;
use crate::labels::{DataLabel, Kind, classify, lookup};
use smallvec::SmallVec;
use tree_sitter::Node;
// -------------------------------------------------------------------------
@ -210,7 +211,7 @@ pub(crate) fn first_call_ident_with_span<'a>(
.and_then(|f| root_receiver_text(f, lang, code));
match (recv, func) {
(Some(r), Some(f)) => Some(format!("{r}.{f}")),
(_, Some(f)) => Some(f.to_string()),
(_, Some(f)) => Some(f),
_ => None,
}
}
@ -269,6 +270,11 @@ pub(crate) fn find_classifiable_inner_call<'a>(
}
match lookup(lang, c.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => {
// For CallMethod we also remember the bare receiver
// identifier so we can try a type-qualified rewrite
// when the literal classify misses.
let mut method_receiver: Option<String> = None;
let mut method_name: Option<String> = None;
let ident = match lookup(lang, c.kind()) {
Kind::CallFn => c
.child_by_field_name("function")
@ -286,6 +292,8 @@ pub(crate) fn find_classifiable_inner_call<'a>(
.or_else(|| c.child_by_field_name("receiver"))
.or_else(|| c.child_by_field_name("scope"))
.and_then(|f| root_receiver_text(f, lang, code));
method_receiver = recv.clone();
method_name = func.clone();
match (recv, func) {
(Some(r), Some(f)) => Some(format!("{r}.{f}")),
(_, Some(f)) => Some(f),
@ -302,6 +310,36 @@ pub(crate) fn find_classifiable_inner_call<'a>(
{
return Some((id.clone(), lbl, (c.start_byte(), c.end_byte())));
}
// Receiver-type rewrite fallback: when the literal
// `recv.method` text didn't classify, AND we're inside
// a chained call (parent `n` is itself a call), look
// up `recv`'s locally-bound type and retry with the
// type prefix. E.g. for
// `sess.createNativeQuery(sql).getResultList()`, the
// inner `sess.createNativeQuery` rewrites to
// `HibernateSession.createNativeQuery` (rule fires).
//
// Gated on `n` being a Call-kind so the rewrite only
// fires on chain-hop inner calls. When `n` is an
// expression-statement / variable-declarator / etc.
// the candidate `c` IS the outermost call of the
// statement, and the SSA-time
// `resolve_type_qualified_labels` path handles it
// with multi-label semantics that single-label
// `classify` here would erase.
let parent_is_call = matches!(
lookup(lang, n.kind()),
Kind::CallFn | Kind::CallMethod | Kind::CallMacro
);
if parent_is_call
&& let (Some(recv), Some(method)) = (method_receiver, method_name)
&& let Some(prefix) = crate::cfg::local_receiver_type_prefix(c, &recv, lang)
{
let alt = format!("{prefix}.{method}");
if let Some(lbl) = classify(lang, &alt, extra) {
return Some((alt, lbl, (c.start_byte(), c.end_byte())));
}
}
// Recurse into arguments of this call
if let Some(found) = find_classifiable_inner_call(c, lang, code, extra) {
return Some(found);
@ -412,6 +450,16 @@ pub(crate) fn first_member_label(
}
// PHP/Python/Ruby subscript access: `$_GET['cmd']`, `os.environ['KEY']`, `params[:cmd]`
// Try to classify the object (before the `[`) as a source.
//
// Source-only on the receiver: a subscript reads a value from the
// receiver, so a Sink label found on the receiver text (e.g.
// `response.headers['content-type']`, where `response.headers`
// matches the JS HEADER_INJECTION sink rule) describes the
// *target* of a hypothetical write, not this read. Promoting it
// would fire phantom sinks at every `body =
// response.headers["X"]`-shape line. Sinks/Sanitizers reachable
// via callable positions (function-arg, method-receiver) still
// flow through the outer recursive walk below.
"subscript_expression" | "subscript" | "element_reference" => {
if let Some(obj) = n
.child_by_field_name("object")
@ -419,15 +467,23 @@ pub(crate) fn first_member_label(
.or_else(|| n.child(0))
{
if let Some(txt) = text_of(obj, code)
&& let Some(lbl) = classify(lang, &txt, extra_labels)
&& let Some(lbl @ DataLabel::Source(_)) = classify(lang, &txt, extra_labels)
{
return Some(lbl);
}
// Recurse into the object for nested member accesses
if let Some(lbl) = first_member_label(obj, lang, code, extra_labels) {
// Recurse into the object for nested member accesses, but
// keep the same Source-only restriction as above by passing
// through the dedicated source-only walker.
if let Some(lbl @ DataLabel::Source(_)) =
first_member_label(obj, lang, code, extra_labels)
{
return Some(lbl);
}
}
// Suppress further descent into this subscript node, the outer
// child-walk loop would otherwise enter the receiver via the
// member_expression arm and reattach a value-extraction Sink.
return None;
}
_ => {}
}
@ -678,6 +734,7 @@ pub(crate) fn collect_idents_with_paths(
"identifier"
| "field_identifier"
| "property_identifier"
| "shorthand_property_identifier"
| "shorthand_property_identifier_pattern" => {
if let Some(txt) = text_of(n, code) {
idents.push(txt);
@ -697,16 +754,241 @@ pub(crate) fn collect_idents_with_paths(
}
}
/// Pair every simple identifier bound by an array/tuple destructuring
/// pattern with its positional index, in source order.
///
/// Accepted pattern kinds:
/// * JS/TS `array_pattern` — `const [a, b] = ...`, `const [, b] = ...`,
///   `const [a, ,] = ...`. An empty slot between commas consumes a position
///   without producing a binding.
/// * Rust `tuple_pattern` — `let (a, _, b) = ...`. The wildcard (a leaf whose
///   `kind()` is literally `"_"`) consumes a position without producing a
///   binding.
/// * Python `pattern_list` / `tuple_pattern` — `a, b = ...` and
///   `(a, b) = ...`. Python's `_` parses as an ordinary `identifier`, so it
///   is reported like any other name.
/// * Ruby `left_assignment_list` — `a, b = ...`. Ruby's `_` is likewise an
///   ordinary identifier.
///
/// The position counter advances on every `,` child; brackets and
/// parentheses are ignored.
///
/// An empty `SmallVec` is returned when `pat` is not one of the kinds above,
/// or when any child is something other than punctuation, a wildcard or a
/// plain identifier — e.g. `assignment_pattern` (`[a = 1, b]`),
/// `rest_pattern` (`[a, ...rest]`), Python `list_splat_pattern`, Ruby
/// `rest_assignment` / `destructured_left_assignment`, or a nested
/// array/object pattern. Callers read the empty result as "no
/// position-aware rewrite available; fall back to scalar union".
pub(crate) fn collect_array_pattern_bindings_indexed(
    pat: Node,
    code: &[u8],
) -> SmallVec<[(String, usize); 4]> {
    let mut bindings: SmallVec<[(String, usize); 4]> = SmallVec::new();

    let is_destructure = matches!(
        pat.kind(),
        "array_pattern" | "tuple_pattern" | "pattern_list" | "left_assignment_list"
    );
    if !is_destructure {
        return bindings;
    }

    let mut slot: usize = 0;
    let mut cursor = pat.walk();
    for element in pat.children(&mut cursor) {
        let kind = element.kind();

        // A comma closes the current slot, whether or not it held a binding.
        if kind == "," {
            slot += 1;
            continue;
        }
        // Delimiters and the Rust wildcard occupy no binding.
        if matches!(kind, "[" | "]" | "(" | ")" | "_") {
            continue;
        }
        // Anything that is not a plain identifier is a complex sub-pattern:
        // give up so the caller keeps its scalar-union behaviour.
        if !matches!(kind, "identifier" | "shorthand_property_identifier_pattern") {
            return SmallVec::new();
        }
        if let Some(name) = text_of(element, code) {
            bindings.push((name, slot));
        }
    }

    bindings
}
/// Walk an array-literal-shape RHS node and return one slot per source-order
/// element. Each slot is one of:
/// * `RhsArraySlot::Ident(name)` — bare identifier element (a PHP-style
///   `variable_name` has its leading `$` stripped).
/// * `RhsArraySlot::Literal` — syntactic literal (string, number, bool,
///   null/nil/none/undefined).
/// * `RhsArraySlot::Complex { uses, source_cap }` — call / binary /
///   subscript / member access / interpolated string / nested array literal
///   / etc. `uses` carries the inner identifier names (member-access paths
///   first, bare idents second, de-duplicated) harvested from the slot's
///   subtree via `collect_idents_with_paths`. `source_cap` is the capability
///   of a Source label found inside the slot by `first_member_label`, or
///   `Cap::empty()` when there is none.
///
/// Recognised RHS kinds:
/// * JS/TS / Ruby `array` (and `array_literal`) — `[a, b]`
/// * Python `list` — `[a, b]`
/// * Python `tuple` — `(a, b)`
/// * Python `expression_list` — bare comma form `a, b`
/// * Rust `tuple_expression` — `(a, b)`
///
/// Slot indices must line up with the positions produced for the LHS
/// pattern, so anything that would shift alignment is handled explicitly:
/// * comment nodes between elements are skipped and occupy no slot;
/// * an elided element (two separators with nothing between them, e.g. the
///   JS hole in `[a, , b]`) makes the function bail;
/// * spread / splat elements make the function bail.
///
/// Bailing means returning an empty `SmallVec`; the same is returned when
/// `rhs` is not one of the recognised kinds or an identifier's text cannot
/// be read. Callers treat empty as "no per-element rewrite available; fall
/// back to scalar union".
pub(crate) fn collect_rhs_array_literal_elements(
    rhs: Node,
    lang: &str,
    code: &[u8],
    extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> SmallVec<[crate::cfg::RhsArraySlot; 4]> {
    use crate::cfg::RhsArraySlot;
    use crate::labels::{Cap, DataLabel};

    // Per-slot source classification: when a slot's own subtree carries a
    // Source-labeled member-expression / subscript, capture the Cap so the
    // SSA destructure rewrite emits Source for THIS slot specifically and
    // lets sibling Complex slots stay slot-scoped Assign. Falls back to
    // Cap::empty() when no per-slot source is recognised.
    let slot_source_cap = |slot: Node| -> Cap {
        match first_member_label(slot, lang, code, extra_labels) {
            Some(DataLabel::Source(c)) => c,
            _ => Cap::empty(),
        }
    };

    // Build a Complex slot: harvest the identifiers used inside `slot` so the
    // SSA lowering can paint the binding with this slot's contributions only,
    // not the union of every ident on the RHS.
    let complex_slot = |slot: Node| -> RhsArraySlot {
        let mut idents = Vec::new();
        let mut paths = Vec::new();
        collect_idents_with_paths(slot, code, &mut idents, &mut paths);
        let mut uses: SmallVec<[String; 4]> = SmallVec::new();
        for p in paths {
            uses.push(p);
        }
        for ident in idents {
            if !uses.iter().any(|u| u == &ident) {
                uses.push(ident);
            }
        }
        let source_cap = slot_source_cap(slot);
        RhsArraySlot::Complex { uses, source_cap }
    };

    let mut out: SmallVec<[RhsArraySlot; 4]> = SmallVec::new();
    if !matches!(
        rhs.kind(),
        "array" | "array_literal" | "list" | "tuple" | "tuple_expression" | "expression_list"
    ) {
        return out;
    }

    // True once the current comma-delimited position has produced an element.
    let mut slot_filled = false;
    let mut cursor = rhs.walk();
    for child in rhs.children(&mut cursor) {
        let ck = child.kind();

        if !child.is_named() {
            if ck == "," {
                if !slot_filled {
                    // Elision: the position exists at runtime but has no
                    // node, so later elements would be mis-indexed. Bail.
                    return SmallVec::new();
                }
                slot_filled = false;
            }
            // Brackets, parentheses and other anonymous tokens are not
            // elements.
            continue;
        }

        // Comments are named nodes but not elements; counting them would
        // shift every following slot by one.
        if matches!(ck, "comment" | "line_comment" | "block_comment") {
            continue;
        }

        slot_filled = true;
        let slot = match ck {
            "identifier"
            | "shorthand_property_identifier"
            | "shorthand_property_identifier_pattern"
            | "field_identifier"
            | "property_identifier" => match text_of(child, code) {
                Some(txt) => RhsArraySlot::Ident(txt),
                None => return SmallVec::new(),
            },
            "variable_name" => match text_of(child, code) {
                Some(txt) => RhsArraySlot::Ident(txt.trim_start_matches('$').to_string()),
                None => return SmallVec::new(),
            },
            // Syntactic literal slots: no ident, no taint contribution.
            // Names follow tree-sitter's per-grammar literal kinds across
            // the supported languages.
            "string"
            | "string_literal"
            | "raw_string_literal"
            | "interpreted_string_literal"
            | "concatenated_string"
            | "integer"
            | "integer_literal"
            | "float"
            | "float_literal"
            | "number"
            | "numeric_literal"
            | "true"
            | "false"
            | "boolean_literal"
            | "boolean"
            | "null"
            | "null_literal"
            | "nil"
            | "none"
            | "None"
            | "undefined" => RhsArraySlot::Literal,
            // Spread / list-splat shift index alignment unpredictably
            // (`[...arr, b]` may expand to N elements at index 0). Bail
            // so callers fall back to scalar union.
            "spread_element" | "list_splat" | "list_splat_pattern" | "splat_argument"
            | "unary_splat" | "splat_expression" => return SmallVec::new(),
            // Everything else is a complex slot. That includes interpolated
            // strings (`template_string`, `string_interpolation`,
            // `interpolation`, `encapsed_string`), whose `${user.id}`-style
            // parts contribute identifier uses, as well as calls, member
            // accesses, binaries, subscripts, unaries, ternaries and nested
            // array literals.
            _ => complex_slot(child),
        };
        out.push(slot);
    }
    out
}
/// Recursively collect every identifier that occurs inside `n`.
///
/// Recognises `identifier` (most languages), `variable_name` (PHP),
/// `field_identifier` (Go), `property_identifier` (JS/TS), and
/// `shorthand_property_identifier_pattern` (JS/TS destructuring).
/// `shorthand_property_identifier` / `shorthand_property_identifier_pattern`
/// (JS/TS object-literal shorthand uses and destructuring binding patterns).
pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
match n.kind() {
"identifier"
| "field_identifier"
| "property_identifier"
| "shorthand_property_identifier"
| "shorthand_property_identifier_pattern"
// PHP `name`: leaf node carrying the bare identifier text for
// function/method names and similar grammar slots. Without this

View file

@ -337,7 +337,7 @@ fn collect_ruby<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
&& let Some(t) = text_of(c, code)
{
let leaf = t.rsplit("::").next().unwrap_or(&t).to_string();
push(sub.clone(), leaf);
push(sub, leaf);
break;
}
}

View file

@ -1,8 +1,140 @@
use super::{
ImportBinding, ImportBindings, PromisifyAlias, PromisifyAliases, member_expr_text, text_of,
};
use std::collections::HashMap;
use tree_sitter::{Node, Tree};
/// Build the file-local map of JS/TS import bindings: local name → source
/// module specifier, exactly as written at the `import` / `require` site (no
/// `node:` stripping). Built once per CFG pass and consumed by the
/// gated-label post-pass via
/// [`crate::labels::ClassificationContext::local_imports`].
///
/// Unlike the legacy [`extract_import_bindings`], which only keeps *renamed*
/// bindings, every binding is recorded — including
/// `import { readFile } from 'fs/promises'` where the local and imported
/// names coincide.
///
/// The raw bindings come from [`crate::resolve::walk_js_top_level_imports`],
/// so import-clause / require-declarator parsing lives in one place; entries
/// with an empty local name are dropped, and a later binding of the same
/// local name replaces an earlier one. Finally, `fs.promises`-style aliases
/// are layered on top by `extend_with_promises_alias`.
pub(super) fn extract_local_import_view(tree: &Tree, code: &[u8]) -> HashMap<String, String> {
    let mut view: HashMap<String, String> = crate::resolve::walk_js_top_level_imports(tree, code)
        .into_iter()
        .filter(|raw| !raw.local.is_empty())
        .map(|raw| (raw.local, raw.source_spec))
        .collect();
    extend_with_promises_alias(tree, code, &mut view);
    view
}
/// Add `fs.promises` aliases to the import view.
///
/// Looks at the direct children of the root node for
/// `lexical_declaration` / `variable_declaration` statements such as
/// `const fsp = fs.promises;` or `const fsp = require('fs').promises;` and
/// records the declared name as an import of `fs/promises` (or
/// `node:fs/promises`, following the spelling of the underlying `fs`
/// binding).
///
/// Why: the `LabelGate::ImportedFromModule(&["fs/promises", ...])` gate only
/// consults `local_imports[leading_identifier(callee)]`. Without this
/// extension `fsp.readFile(x)` would evade the gate, because `fsp` itself is
/// not an import binding — only the `fs` namespace behind it is.
///
/// An existing entry for the same local name is never overwritten.
fn extend_with_promises_alias(tree: &Tree, code: &[u8], out: &mut HashMap<String, String>) {
    let root = tree.root_node();
    let mut stmt_cursor = root.walk();
    for stmt in root.children(&mut stmt_cursor) {
        let is_var_decl = matches!(stmt.kind(), "lexical_declaration" | "variable_declaration");
        if !is_var_decl {
            continue;
        }

        let mut declarator_cursor = stmt.walk();
        let declarators = stmt
            .children(&mut declarator_cursor)
            .filter(|d| d.kind() == "variable_declarator");
        for declarator in declarators {
            // Shape required: `<identifier> = <member_expression>`.
            let name = declarator
                .child_by_field_name("name")
                .filter(|n| n.kind() == "identifier");
            let value = declarator
                .child_by_field_name("value")
                .filter(|v| v.kind() == "member_expression");
            let (Some(name), Some(value)) = (name, value) else {
                continue;
            };
            let Some(alias) = text_of(name, code) else {
                continue;
            };

            // The accessed property must be exactly `promises`.
            let reads_promises = value
                .child_by_field_name("property")
                .and_then(|p| text_of(p, code))
                .is_some_and(|p| p == "promises");
            if !reads_promises {
                continue;
            }

            let Some(object) = value.child_by_field_name("object") else {
                continue;
            };
            if let Some(source) = promises_alias_source(object, code, out) {
                // An explicit import of the same name already states what
                // would be inferred here, so keep it.
                out.entry(alias).or_insert(source);
            }
        }
    }
}
/// Map the object of a `<lhs> = <obj>.promises` member expression to the
/// promises module specifier, when `<obj>` denotes the `fs` module.
///
/// Two shapes are understood:
/// - an `identifier` whose entry in `imports_so_far` is `fs` or `node:fs`;
/// - a `call_expression` whose callee text is `require` and whose first
///   `string` / `template_string` argument, with surrounding `'`, `"` and
///   `` ` `` characters removed, is `fs` or `node:fs`.
///
/// Every other shape yields `None`.
fn promises_alias_source(
    obj: Node,
    code: &[u8],
    imports_so_far: &HashMap<String, String>,
) -> Option<String> {
    if obj.kind() == "identifier" {
        let binding = text_of(obj, code)?;
        return imports_so_far
            .get(&binding)
            .and_then(|module| map_fs_module_to_promises(module));
    }
    if obj.kind() != "call_expression" {
        return None;
    }

    let callee = obj.child_by_field_name("function")?;
    if text_of(callee, code).as_deref() != Some("require") {
        return None;
    }

    let args = obj.child_by_field_name("arguments")?;
    let mut cursor = args.walk();
    let literal = args
        .children(&mut cursor)
        .find(|arg| matches!(arg.kind(), "string" | "template_string"))?;
    let raw = text_of(literal, code)?;
    let spec = raw.trim_matches(|c: char| matches!(c, '\'' | '"' | '`'));
    map_fs_module_to_promises(spec)
}
/// Translate an `fs` module specifier into its promises counterpart.
///
/// `fs` maps to `fs/promises` and `node:fs` maps to `node:fs/promises`; the
/// comparison ignores ASCII case and the returned specifier is always the
/// lowercase canonical form. Any other specifier yields `None`.
fn map_fs_module_to_promises(module: &str) -> Option<String> {
    const PROMISES_SPECS: [(&str, &str); 2] =
        [("fs", "fs/promises"), ("node:fs", "node:fs/promises")];

    for (base, promises) in PROMISES_SPECS {
        if module.eq_ignore_ascii_case(base) {
            return Some(promises.to_string());
        }
    }
    None
}
// -------------------------------------------------------------------------
// Import binding extraction
// -------------------------------------------------------------------------
@ -360,6 +492,129 @@ fn extract_require_module(node: Node, code: &[u8]) -> Option<String> {
None
}
/// Decide which crate a bare `join!` / `try_join!` macro call belongs to.
///
/// Scans the direct `use_declaration` children of `root` and reports the
/// crate prefix — `"tokio"` or `"futures"` — whose `<crate>::<leaf>` item is
/// imported.
///
/// Rationale: tree-sitter records `tokio::join!(...)` with a fully qualified
/// `macro` field text, but `use tokio::join; ... join!(a, b)` records only
/// the bare leaf. Without this lookup the SSA-level promise-combinator
/// recogniser (`crate::labels::is_promise_combinator`) misses the bare form
/// and the macro's argument taint is dropped.
///
/// Conservative by design: returns `None` when `leaf` is neither `join` nor
/// `try_join`, when both crates import the leaf (ambiguous), or when neither
/// does — leaving the bare callee untouched.
pub(super) fn rust_bare_join_crate_prefix(
    root: Node,
    code: &[u8],
    leaf: &str,
) -> Option<&'static str> {
    if leaf != "join" && leaf != "try_join" {
        return None;
    }

    let (mut from_tokio, mut from_futures) = (false, false);
    let mut cursor = root.walk();
    for decl in root
        .children(&mut cursor)
        .filter(|n| n.kind() == "use_declaration")
    {
        from_tokio |= rust_use_decl_imports_leaf(decl, code, "tokio", leaf);
        from_futures |= rust_use_decl_imports_leaf(decl, code, "futures", leaf);
    }

    if from_tokio == from_futures {
        // Imported from both crates, or from neither.
        None
    } else if from_tokio {
        Some("tokio")
    } else {
        Some("futures")
    }
}
/// Report whether `use_decl` (or any node beneath it) brings
/// `<crate_prefix>::<leaf>` into scope.
///
/// Shapes handled:
/// * `use tokio::join;` — a `scoped_identifier` whose path is the crate and
///   whose name is the leaf.
/// * `use tokio::{join, select};` — a `scoped_use_list` whose `path` is the
///   crate and whose `list` names the leaf, either as a plain `identifier`
///   or as the path side of a `use_as_clause`.
/// * `use tokio::join as my_join;` — a `use_as_clause` whose path is a
///   matching `scoped_identifier`. The original path is detected even though
///   the macro is then usually invoked under its alias; if the alias and the
///   bare form collide the rewrite is still safe.
/// * `use tokio::*;` is NOT recognised — wildcard imports are too permissive
///   for the bare-leaf rewrite to stay precise.
///
/// Nodes of any other kind are searched through their children, which
/// covers the `use_declaration` wrapper itself and bare `use_list` groups.
/// The three kinds listed above are decided on the spot and never descended
/// into.
fn rust_use_decl_imports_leaf(use_decl: Node, code: &[u8], crate_prefix: &str, leaf: &str) -> bool {
    match use_decl.kind() {
        "scoped_identifier" => scoped_identifier_matches(use_decl, code, crate_prefix, leaf),
        "scoped_use_list" => {
            let rooted_at_crate = use_decl
                .child_by_field_name("path")
                .and_then(|p| text_of(p, code))
                .is_some_and(|p| p == crate_prefix);
            if !rooted_at_crate {
                return false;
            }
            let Some(list) = use_decl.child_by_field_name("list") else {
                return false;
            };
            let mut cursor = list.walk();
            list.named_children(&mut cursor).any(|entry| {
                // Name the entry imports, before any `as` alias.
                let imported = match entry.kind() {
                    "identifier" => text_of(entry, code),
                    "use_as_clause" => entry
                        .child_by_field_name("path")
                        .and_then(|p| text_of(p, code)),
                    _ => None,
                };
                imported.as_deref() == Some(leaf)
            })
        }
        "use_as_clause" => use_decl.child_by_field_name("path").is_some_and(|p| {
            p.kind() == "scoped_identifier"
                && scoped_identifier_matches(p, code, crate_prefix, leaf)
        }),
        _ => {
            let mut cursor = use_decl.walk();
            use_decl
                .children(&mut cursor)
                .any(|child| rust_use_decl_imports_leaf(child, code, crate_prefix, leaf))
        }
    }
}
/// True when the `scoped_identifier` `node` reads exactly
/// `<crate_prefix>::<leaf>`: its `path` field text equals `crate_prefix` and
/// its `name` field text equals `leaf`. A missing or unreadable field counts
/// as a mismatch.
fn scoped_identifier_matches(node: Node, code: &[u8], crate_prefix: &str, leaf: &str) -> bool {
    let field_is = |field: &str, expected: &str| -> bool {
        node.child_by_field_name(field)
            .and_then(|n| text_of(n, code))
            .as_deref()
            == Some(expected)
    };
    field_is("path", crate_prefix) && field_is("name", leaf)
}
// -------------------------------------------------------------------------
// === PUBLIC ENTRY POINT =================================================
// -------------------------------------------------------------------------

View file

@ -1,22 +1,45 @@
use super::conditions::unwrap_parens;
use super::helpers::{collect_array_pattern_bindings_indexed, collect_rhs_array_literal_elements};
use super::{
anon_fn_name, collect_idents, collect_idents_with_paths, find_constructor_type_child,
first_call_ident, root_receiver_text, text_of,
};
use crate::labels::{Cap, Kind, lookup};
use smallvec::SmallVec;
use tree_sitter::Node;
/// Find the inner CallFn/CallMethod/CallMacro node within an AST node.
/// For direct call nodes, returns the node itself. For wrappers, searches
/// up to two levels of children.
/// up to two levels of children, transparently descending through
/// `await_expression` / `yield_expression` (`Kind::AwaitForward`) wrappers
/// so `const x = await foo(y)` reaches the inner `call_expression` at
/// effective depth 3 (`lexical_declaration > variable_declarator >
/// await_expression > call_expression`).
pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
match lookup(lang, n.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => Some(n),
Kind::AwaitForward => {
// Transparent wrapper: descend into the awaited expression.
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
if let Some(found) = find_call_node(c, lang) {
return Some(found);
}
}
None
}
_ => {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
match lookup(lang, c.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => return Some(c),
// Skip past await/yield wrappers without consuming a
// recursion level — the wrapper itself is transparent.
Kind::AwaitForward => {
if let Some(found) = find_call_node(c, lang) {
return Some(found);
}
}
_ => {}
}
}
@ -25,11 +48,14 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
for c in n.children(&mut cursor2) {
let mut cursor3 = c.walk();
for gc in c.children(&mut cursor3) {
if matches!(
lookup(lang, gc.kind()),
Kind::CallFn | Kind::CallMethod | Kind::CallMacro
) {
return Some(gc);
match lookup(lang, gc.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => return Some(gc),
Kind::AwaitForward => {
if let Some(found) = find_call_node(gc, lang) {
return Some(found);
}
}
_ => {}
}
}
}
@ -108,9 +134,43 @@ pub(super) fn extract_destination_field_pairs(
raw
}
}),
// Computed keys like `[someVar]` can't be statically
// resolved, skip (conservative: not a destination field).
"computed_property_name" => continue,
// Computed keys: resolve only when the inner expression
// is a pure string literal (`['url']`). Dynamic forms
// (`[someVar]`, `[`url-${i}`]`, ``[`url`]`` with
// interpolation) stay conservative-skip.
"computed_property_name" => {
let mut inner_cursor = key_node.walk();
let inner = key_node.named_children(&mut inner_cursor).find(|c| {
!matches!(c.kind(), "comment" | "block_comment" | "line_comment")
});
match inner.map(|n| (n.kind(), n)) {
Some(("string" | "string_literal", n)) => text_of(n, code).map(|raw| {
if raw.len() >= 2 {
raw[1..raw.len() - 1].to_string()
} else {
raw
}
}),
// Template strings only when no interpolation
// (no `template_substitution` children).
Some(("template_string", n))
if {
let mut tc = n.walk();
!n.named_children(&mut tc)
.any(|c| c.kind() == "template_substitution")
} =>
{
text_of(n, code).map(|raw| {
if raw.len() >= 2 {
raw[1..raw.len() - 1].to_string()
} else {
raw
}
})
}
_ => continue,
}
}
_ => text_of(key_node, code),
};
let Some(key) = key_text else {
@ -144,6 +204,13 @@ pub(super) fn extract_destination_field_pairs(
/// `requests.post(url, data=tainted, json=safe)` where `data` and `json` are
/// `keyword_argument` siblings of the positional URL.
///
/// Also covers Ruby, where tree-sitter-ruby emits `pair` nodes (with
/// `key`/`value` fields) directly under `argument_list` for the
/// `Faraday.new(url: x)` / `Net::HTTP.start(host, port, proxy_addr: prx)`
/// kwarg shape. The `key` is typically a `hash_key_symbol` whose text is the
/// bare identifier (`url`); `simple_symbol` (`:url`) and string keys are
/// normalised by stripping a leading `:` or wrapping quotes.
///
/// Returns the union of matching kwargs, preserving the kwarg name in the
/// `field` slot so callers can still attribute findings per-field. Empty
/// when no matching kwargs exist or the call has no `arguments` field.
@ -162,22 +229,38 @@ pub(super) fn extract_destination_kwarg_pairs(
let mut cursor = args_node.walk();
for child in args_node.named_children(&mut cursor) {
let kind = child.kind();
if kind != "keyword_argument" && kind != "named_argument" {
let (name_node, value_node) = if kind == "keyword_argument" || kind == "named_argument" {
let named_count = child.named_child_count();
(
child
.child_by_field_name("name")
.or_else(|| child.named_child(0)),
child
.child_by_field_name("value")
.or_else(|| child.named_child(named_count.saturating_sub(1) as u32)),
)
} else if kind == "pair" {
// Ruby `pair` node sits directly under `argument_list` for
// kwarg-style call args (`f(url: x)`). `key`/`value` fields
// are populated; key text is `hash_key_symbol` ("url"),
// `simple_symbol` (":url"), or a string literal.
(
child.child_by_field_name("key"),
child.child_by_field_name("value"),
)
} else {
continue;
}
let named_count = child.named_child_count();
let name_node = child
.child_by_field_name("name")
.or_else(|| child.named_child(0));
let value_node = child
.child_by_field_name("value")
.or_else(|| child.named_child(named_count.saturating_sub(1) as u32));
};
let (Some(nn), Some(vn)) = (name_node, value_node) else {
continue;
};
let Some(name) = text_of(nn, code) else {
let Some(name_raw) = text_of(nn, code) else {
continue;
};
let name = name_raw
.trim_start_matches(':')
.trim_matches(['"', '\''])
.to_string();
if !fields.iter().any(|&f| f == name) {
continue;
}
@ -387,11 +470,9 @@ pub(super) fn extract_const_macro_arg(
// C/C++ identifier / PHP `name` node for define-style constants.
// Scoped C++ identifiers (`Curl::OPT_POSTFIELDS`) and PHP namespaced
// names also surface here so the dangerous_values match catches them.
"identifier" | "name" | "qualified_name" | "scoped_identifier" => {
text_of(arg, code).map(|s| s.to_string())
}
"identifier" | "name" | "qualified_name" | "scoped_identifier" => text_of(arg, code),
// Ruby bare constant (`NOENT`) — leaf form.
"constant" => text_of(arg, code).map(|s| s.to_string()),
"constant" => text_of(arg, code),
// Ruby scope-qualified constant (`Nokogiri::XML::ParseOptions::NOENT`).
// Return only the rightmost `name` segment so the gate's
// `dangerous_values` list can stay identifier-bare instead of
@ -400,8 +481,7 @@ pub(super) fn extract_const_macro_arg(
"scope_resolution" => arg
.child_by_field_name("name")
.and_then(|n| text_of(n, code))
.map(|s| s.to_string())
.or_else(|| text_of(arg, code).map(|s| s.to_string())),
.or_else(|| text_of(arg, code)),
// Integer literals at the activation arg position. PHP / C / C++
// commonly use plain `0` to opt into the safe-default option set
// (e.g. `simplexml_load_string($xml, "SimpleXMLElement", 0)`). The
@ -409,7 +489,7 @@ pub(super) fn extract_const_macro_arg(
// the literal text lets the comparison fail against `LIBXML_NOENT`
// and suppresses the conservative-fire branch.
"integer" | "integer_literal" | "number_literal" | "decimal_integer_literal" => {
text_of(arg, code).map(|s| s.to_string())
text_of(arg, code)
}
_ => None,
}
@ -443,7 +523,7 @@ pub(super) fn extract_const_keyword_arg(
// distinguish literal-safe from dynamic.
return match value_node.kind() {
"true" | "false" | "none" | "integer" | "float" | "string" | "string_literal"
| "identifier" => text_of(value_node, code).map(|s| s.to_string()),
| "identifier" => text_of(value_node, code),
_ => None,
}
.filter(|_| {
@ -537,7 +617,7 @@ pub(super) fn extract_object_arg_property(
let val_node = unwrap_parens(val_node);
return match val_node.kind() {
"true" | "false" | "null" | "undefined" | "number" | "string" | "string_literal" => {
text_of(val_node, code).map(|s| s.to_string())
text_of(val_node, code)
}
// JS booleans true/false are their own node kinds (above), but
// some grammar versions wrap them as identifier literals; surface
@ -811,7 +891,7 @@ pub(super) fn js_chain_outer_method_for_inner<'a>(
if inner_matched {
return function
.child_by_field_name("property")
.and_then(|p| text_of(p, code).map(|s| s.to_string()));
.and_then(|p| text_of(p, code));
}
}
// Recurse: outer chain may have more depth (`a.b().c().d()` ,
@ -1518,6 +1598,18 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
return result;
}
// Rust `tokio::join!` / `futures::join!` (and their `try_*` variants).
// tree-sitter-rust models macro args as a `token_tree` rather than an
// `arguments` field, so a vanilla extraction returns nothing. Walk the
// top-level token_tree splitting on `,` separators, lifting identifiers
// out of each chunk so the existing PromiseCombinator transfer can union
// arg-side taint into the resulting tuple value.
if call_node.kind() == "macro_invocation"
&& let Some(arg_uses) = extract_rust_macro_join_arg_uses(call_node, code)
{
return arg_uses;
}
let Some(args_node) = call_node.child_by_field_name("arguments") else {
return Vec::new();
};
@ -1551,6 +1643,82 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
result
}
/// Lift one positional argument per future out of a Rust join macro call.
///
/// `tokio::join!` / `futures::join!` (and their `try_*` variants) bundle
/// concurrently-awaited futures into a tuple result. tree-sitter-rust
/// represents the macro args as a `token_tree` whose children alternate
/// between expressions and `,` separators (nested parenthesised groups such
/// as the `(x)` inside `fetch(x)` are themselves `token_tree` nodes, so
/// their commas never appear at the top level). This walks the top-level
/// token_tree, splits it on `,` leaves, and collects the identifiers of
/// each segment so the SSA Call op carries one positional arg per future.
///
/// Returns `Some(arg_uses)` only when the macro is one of the recognised
/// join macros; for every other macro shape (`format!`, `println!`, custom
/// DSL macros) it returns `None` so `extract_arg_uses` can fall through to
/// its normal `arguments`-field path. Also returns `None` when the `macro`
/// field, its text, or a `token_tree` child cannot be found.
pub(super) fn extract_rust_macro_join_arg_uses(
    call_node: Node,
    code: &[u8],
) -> Option<Vec<Vec<String>>> {
    let macro_name = call_node
        .child_by_field_name("macro")
        .and_then(|m| text_of(m, code))?;
    if !is_rust_join_macro(&macro_name) {
        return None;
    }

    // Prefer the named field; otherwise scan the direct children by kind.
    let token_tree = if let Some(by_field) = call_node.child_by_field_name("token_tree") {
        by_field
    } else {
        let mut walker = call_node.walk();
        let located = call_node
            .children(&mut walker)
            .find(|c| c.kind() == "token_tree");
        located?
    };

    // `segments` always holds at least one entry: the segment currently
    // being filled. A top-level `,` opens the next one.
    let mut segments: Vec<Vec<Node>> = vec![Vec::new()];
    let mut tt_walker = token_tree.walk();
    for token in token_tree.children(&mut tt_walker) {
        let punctuation = (!token.is_named()).then(|| token.kind());
        match punctuation {
            Some(",") => segments.push(Vec::new()),
            // The surrounding `(` / `)` delimiters carry no argument content.
            Some("(" | ")") => {}
            // Named nodes and any other anonymous token belong to the
            // current segment.
            _ => {
                if let Some(current) = segments.last_mut() {
                    current.push(token);
                }
            }
        }
    }

    let arg_uses = segments
        .into_iter()
        // Empty segments (e.g. after a trailing comma) produce no argument.
        .filter(|segment| !segment.is_empty())
        .map(|segment| {
            let mut idents = Vec::new();
            let mut paths = Vec::new();
            for piece in segment {
                collect_idents_with_paths(piece, code, &mut idents, &mut paths);
            }
            // Dotted paths first, then bare identifiers.
            paths.extend(idents);
            paths
        })
        .collect();
    Some(arg_uses)
}
/// Returns `true` when `macro_text` is exactly one of the recognised join
/// macro names: `join` / `try_join`, either bare or qualified with
/// `tokio::` or `futures::`.
fn is_rust_join_macro(macro_text: &str) -> bool {
    const JOIN_MACROS: [&str; 6] = [
        "tokio::join",
        "tokio::try_join",
        "futures::join",
        "futures::try_join",
        "join",
        "try_join",
    ];
    JOIN_MACROS.contains(&macro_text)
}
/// Extract keyword / named argument bindings for a call node.
///
/// Returns `Vec<(name, uses)>` where `uses` are the identifier references
@ -1891,11 +2059,31 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
.child_by_field_name("method")
.or_else(|| n.child_by_field_name("name"))
.and_then(|f| text_of(f, code));
let recv = n
let recv_node = n
.child_by_field_name("object")
.or_else(|| n.child_by_field_name("receiver"))
.or_else(|| n.child_by_field_name("scope"))
.and_then(|f| root_receiver_text(f, lang, code));
.or_else(|| n.child_by_field_name("scope"));
let recv = recv_node.and_then(|f| root_receiver_text(f, lang, code));
// Preserve Java `.getClass()` segment in the chained callee text
// so downstream predicates (e.g.
// [`crate::ssa::type_facts::is_safe_string_producing_callee`])
// can recognise idiomatic `obj.getClass().<accessor>()` chains.
// Without this, `root_receiver_text` collapses the chain to
// `obj.<accessor>`, indistinguishable from a user-defined method.
let recv = if lang == "java"
&& let Some(rn) = recv_node
&& lookup(lang, rn.kind()) == Kind::CallMethod
&& let Some(inner_method) = rn
.child_by_field_name("method")
.or_else(|| rn.child_by_field_name("name"))
.and_then(|f| text_of(f, code))
&& inner_method == "getClass"
&& let Some(r) = recv
{
Some(format!("{r}.getClass"))
} else {
recv
};
match (recv, func) {
(Some(r), Some(f)) => Some(format!("{r}.{f}")),
(_, Some(f)) => Some(f),
@ -1984,7 +2172,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
| "integer"
| "number"
| "number_literal"
| "decimal_literal" => text_of(target, code).map(|s| s.to_string()),
| "decimal_literal" => text_of(target, code),
_ => None,
};
result.push(literal);
@ -2003,7 +2191,7 @@ pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option
let mut cursor = node.walk();
for child in node.named_children(&mut cursor) {
if child.kind() == "string_content" {
return text_of(child, code).map(|s| s.to_string());
return text_of(child, code);
}
}
if raw.len() >= 2 {
@ -2044,20 +2232,43 @@ pub(super) fn extract_arg_callees(call_node: Node, lang: &str, code: &[u8]) -> V
result
}
/// Return `(defines, uses)` for the AST fragment `ast`.
/// Returns (defines, uses, extra_defines) where extra_defines captures additional
/// bindings from destructuring patterns beyond the primary define.
/// Return `(defines, uses, extra_defines, array_pattern_indices,
/// rhs_array_elements)` for the AST fragment `ast`.
///
/// `extra_defines` captures additional bindings from destructuring patterns
/// beyond the primary define. `array_pattern_indices`, when non-empty, gives
/// the source-order position of each binding in `iter::once(defines).chain(
/// extra_defines)` for `array_pattern` / `tuple_pattern` LHS shapes. Empty
/// for non-array destructures and for non-skip array patterns where callers
/// can derive sequential 0..N indices implicitly.
///
/// `rhs_array_elements`, when non-empty, gives source-order RHS slots for
/// destructure-from-array-literal shapes (`const [a, b] = [safe, tainted]`,
/// `let (a, b) = (safe, tainted)`, Python `a, b = safe, tainted`). Each slot
/// is `Some(ident)` for a bare-ident element or `None` for a syntactic
/// literal. Empty when RHS isn't an array-literal shape or any element is
/// too complex; callers fall back to scalar union in that case.
#[allow(clippy::type_complexity)]
pub(super) fn def_use(
ast: Node,
lang: &str,
code: &[u8],
) -> (Option<String>, Vec<String>, Vec<String>) {
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> (
Option<String>,
Vec<String>,
Vec<String>,
SmallVec<[usize; 4]>,
SmallVec<[crate::cfg::RhsArraySlot; 4]>,
) {
match lookup(lang, ast.kind()) {
// Declaration wrappers (let, var, short_var_declaration, etc.)
Kind::CallWrapper => {
let mut defs = None;
let mut extra_defs = Vec::new();
let mut uses = Vec::new();
let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();
// Try direct field names first (Rust `let_declaration`, Go `short_var_declaration`)
let def_node = ast
@ -2076,17 +2287,30 @@ pub(super) fn def_use(
if def_node.is_some() || val_node.is_some() {
if let Some(pat) = def_node {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(pat, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
// Remaining idents are extra defines (for destructuring)
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
let bindings = collect_array_pattern_bindings_indexed(pat, code);
if !bindings.is_empty() {
let mut iter = bindings.into_iter();
if let Some((first_name, first_idx)) = iter.next() {
defs = Some(first_name);
pattern_indices.push(first_idx);
}
for (name, idx) in iter {
extra_defs.push(name);
pattern_indices.push(idx);
}
} else {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(pat, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
// Remaining idents are extra defines (for destructuring)
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
}
}
defs = first;
}
defs = first;
}
if let Some(val) = val_node {
let mut idents = Vec::new();
@ -2099,6 +2323,14 @@ pub(super) fn def_use(
// the format-string bytes, not as a separate AST
// argument node, so collect_idents misses it.
uses.extend(extract_rust_format_macro_named_idents_in(val, code));
// When the LHS is a recognised destructure pattern AND
// the RHS is a bare array-literal shape (no call), record
// per-element idents so the SSA destructure rewrite can
// map each binding to its specific RHS slot.
if !pattern_indices.is_empty() {
rhs_array_elements =
collect_rhs_array_literal_elements(val, lang, code, extra_labels);
}
}
} else {
// Try nested declarator pattern (JS/TS `lexical_declaration` → `variable_declarator`,
@ -2135,16 +2367,29 @@ pub(super) fn def_use(
if let Some(name_node) = child_name
&& defs.is_none()
{
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(name_node, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
let bindings = collect_array_pattern_bindings_indexed(name_node, code);
if !bindings.is_empty() {
let mut iter = bindings.into_iter();
if let Some((first_name, first_idx)) = iter.next() {
defs = Some(first_name);
pattern_indices.push(first_idx);
}
for (name, idx) in iter {
extra_defs.push(name);
pattern_indices.push(idx);
}
} else {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(name_node, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
}
}
defs = first;
}
defs = first;
}
if let Some(val_node) = child_value {
let mut idents = Vec::new();
@ -2153,6 +2398,14 @@ pub(super) fn def_use(
uses.extend(paths);
uses.extend(idents);
uses.extend(extract_rust_format_macro_named_idents_in(val_node, code));
if !pattern_indices.is_empty() && rhs_array_elements.is_empty() {
rhs_array_elements = collect_rhs_array_literal_elements(
val_node,
lang,
code,
extra_labels,
);
}
}
}
}
@ -2168,19 +2421,42 @@ pub(super) fn def_use(
uses.extend(extract_rust_format_macro_named_idents_in(ast, code));
}
}
(defs, uses, extra_defs)
(defs, uses, extra_defs, pattern_indices, rhs_array_elements)
}
// Plain assignment `x = y`
// Plain assignment `x = y` or destructuring assignment such as
// Python `a, b = await asyncio.gather(...)` whose LHS surfaces as
// a `pattern_list` / `tuple_pattern`. When the LHS is a
// destructure pattern that the indexed helper recognises, the
// primary binding lands in `defs`, the rest land in `extra_defs`,
// and `pattern_indices` carries source-order positions so the
// SSA lowering's destructure-promise rewrite can paint each
// binding from the matching combinator argument.
Kind::Assignment => {
let mut defs = None;
let mut extra_defs = Vec::new();
let mut pattern_indices: SmallVec<[usize; 4]> = SmallVec::new();
let mut rhs_array_elements: SmallVec<[crate::cfg::RhsArraySlot; 4]> = SmallVec::new();
let mut uses = Vec::new();
if let Some(lhs) = ast.child_by_field_name("left") {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(lhs, code, &mut idents, &mut paths);
// Prefer dotted path (member expression) over last ident
defs = paths.pop().or_else(|| idents.pop());
let bindings = collect_array_pattern_bindings_indexed(lhs, code);
if !bindings.is_empty() {
let mut iter = bindings.into_iter();
if let Some((first_name, first_idx)) = iter.next() {
defs = Some(first_name);
pattern_indices.push(first_idx);
}
for (name, idx) in iter {
extra_defs.push(name);
pattern_indices.push(idx);
}
} else {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(lhs, code, &mut idents, &mut paths);
// Prefer dotted path (member expression) over last ident
defs = paths.pop().or_else(|| idents.pop());
}
}
if let Some(rhs) = ast.child_by_field_name("right") {
let mut idents = Vec::new();
@ -2189,8 +2465,16 @@ pub(super) fn def_use(
uses.extend(paths);
uses.extend(idents);
uses.extend(extract_rust_format_macro_named_idents_in(rhs, code));
// When the LHS is a recognised destructure pattern AND the
// RHS is a bare array-literal shape, record per-element
// idents so the SSA destructure rewrite can map each
// binding to its specific RHS slot.
if !pattern_indices.is_empty() {
rhs_array_elements =
collect_rhs_array_literal_elements(rhs, lang, code, extra_labels);
}
}
(defs, uses, vec![])
(defs, uses, extra_defs, pattern_indices, rhs_array_elements)
}
// iflet / whilelet, the `let_condition` binds a variable from
@ -2215,7 +2499,7 @@ pub(super) fn def_use(
if let Some(val) = c.child_by_field_name("value") {
collect_idents(val, code, &mut uses);
}
return (defs, uses, vec![]);
return (defs, uses, vec![], SmallVec::new(), SmallVec::new());
}
let mut idents = Vec::new();
@ -2223,7 +2507,7 @@ pub(super) fn def_use(
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
let mut uses = paths;
uses.extend(idents);
(None, uses, vec![])
(None, uses, vec![], SmallVec::new(), SmallVec::new())
}
// for-in / for-of / Python `for x in iter:` ─────────────────────────
@ -2267,7 +2551,7 @@ pub(super) fn def_use(
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
let mut uses = paths;
uses.extend(idents);
return (None, uses, vec![]);
return (None, uses, vec![], SmallVec::new(), SmallVec::new());
}
let mut defs: Option<String> = None;
@ -2293,7 +2577,7 @@ pub(super) fn def_use(
uses.extend(paths);
uses.extend(idents);
}
(defs, uses, extra_defs)
(defs, uses, extra_defs, SmallVec::new(), SmallVec::new())
}
// everything else no definition, but may read vars
@ -2303,7 +2587,7 @@ pub(super) fn def_use(
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
let mut uses = paths;
uses.extend(idents);
(None, uses, vec![])
(None, uses, vec![], SmallVec::new(), SmallVec::new())
}
}
}

File diff suppressed because it is too large Load diff

882
src/cfg/safe_fields.rs Normal file
View file

@ -0,0 +1,882 @@
//! Per-file extraction of class fields whose `.get(...)` lookups are
//! provably safe.
//!
//! Recognises Java `final` fields whose initializer is `Map.of(K1, V1,
//! K2, V2, ...)` with all string-literal arguments. At a downstream
//! `<FIELD>.get(taintedKey)` call the result is bounded to the literal
//! value set, so the SSA taint engine can suppress propagation from the
//! key to the result. Without this pre-pass the engine sees `<FIELD>`
//! as a free identifier with no SSA value, fails to resolve the
//! container, and falls back to default arg-to-result propagation.
//!
//! Strictly additive: unrecognised initializer shapes (factory chains,
//! `Map.ofEntries`, builders) produce no entry and the engine keeps
//! its prior behaviour.
use std::cell::RefCell;
use std::collections::HashMap;
use tree_sitter::Node;
use super::helpers::text_of;
thread_local! {
    /// Per-file safe-lookup field map published by [`with_safe_lookup_fields`]
    /// around taint passes that need it. The SSA taint engine's container
    /// Load fallback consults this view via [`safe_lookup_field_values`] when
    /// the receiver is a free identifier (no SSA value to resolve against).
    ///
    /// `None` means no view is currently published on this thread.
    static SAFE_LOOKUP_FIELDS_TLS: RefCell<Option<HashMap<String, Vec<String>>>> =
        const { RefCell::new(None) };
}

/// Run `f` with `fields` published as the per-thread safe-lookup view.
///
/// While `f` runs, [`safe_lookup_field_values`] resolves against a clone of
/// `fields`. Passing `None` publishes an empty view, which suppresses the
/// gate for callers that lack a file context (every lookup misses).
///
/// Whatever view was published before the call is put back when `f`
/// returns or unwinds, regardless of whether `fields` was `Some` or `None`,
/// so calls nest: an inner call never clobbers the view of an enclosing one.
pub fn with_safe_lookup_fields<R>(
    fields: Option<&HashMap<String, Vec<String>>>,
    f: impl FnOnce() -> R,
) -> R {
    /// Puts the saved view back on drop, so restoration also happens when
    /// `f` panics and the stack unwinds.
    struct RestoreOnDrop(Option<HashMap<String, Vec<String>>>);

    impl Drop for RestoreOnDrop {
        fn drop(&mut self) {
            SAFE_LOOKUP_FIELDS_TLS.with(|cell| *cell.borrow_mut() = self.0.take());
        }
    }

    let published = fields.cloned().unwrap_or_default();
    let previous = SAFE_LOOKUP_FIELDS_TLS.with(|cell| cell.borrow_mut().replace(published));
    // Always restore the saved view. Restoring `None` for the `fields ==
    // None` case would wipe an enclosing caller's map for the remainder of
    // its closure.
    let _restore = RestoreOnDrop(previous);
    f()
}

/// Look up the literal value set for a safe field. Returns `None` when
/// no view is published, the field is not a known safe lookup, or the
/// value list is empty.
pub fn safe_lookup_field_values(name: &str) -> Option<Vec<String>> {
    SAFE_LOOKUP_FIELDS_TLS.with(|cell| {
        let view = cell.borrow();
        view.as_ref()?
            .get(name)
            .filter(|values| !values.is_empty())
            .cloned()
    })
}
/// Build the per-file safe-lookup field map: field name → finite set of
/// literal values that `<field>.get(...)` may return.
///
/// Only Java is handled (via `collect_java`); every other `lang` yields an
/// empty map.
pub fn collect_safe_lookup_fields(
    root: Node<'_>,
    lang: &str,
    code: &[u8],
) -> HashMap<String, Vec<String>> {
    let mut fields = HashMap::new();
    if lang != "java" {
        return fields;
    }
    collect_java(root, code, &mut fields);
    fields
}
/// Per-file file-level constant scalar map: name → literal value text.
///
/// Recognises declarations that bind a name to a primitive scalar literal at
/// file or class scope, where the per-function SSA const-prop has no view of
/// the binding (the name is a free identifier from inside any function body):
///
/// - Java: `static final TYPE NAME = LITERAL;` fields (any class depth).
/// - Python: `NAME = LITERAL` at module scope.
/// - Go: `const NAME = LITERAL` and `const NAME TYPE = LITERAL` at package scope.
/// - Rust: `const NAME: TYPE = LITERAL;` and `static NAME: TYPE = LITERAL;` at
///   crate or module scope.
///
/// Used by `cfg_analysis::guards` to suppress `cfg-unguarded-sink` when a
/// sink's argument is one of these bindings. `LITERAL` covers strings (no
/// interpolation), integers in any supported base, floats, booleans, null /
/// nil / None, and unary negation / not over those.
///
/// Empty for unsupported languages. Scalar means single-value, not
/// container; the `Map.of(...)` form is captured by
/// [`collect_safe_lookup_fields`].
pub fn collect_class_constant_scalars(
    root: Node<'_>,
    lang: &str,
    code: &[u8],
) -> HashMap<String, String> {
    let mut scalars = HashMap::new();
    // Dispatch to the per-language collector; unknown languages fall
    // through and leave the map empty.
    if lang == "java" {
        collect_java_constant_scalars(root, code, &mut scalars);
    } else if lang == "python" {
        collect_python_constant_scalars(root, code, &mut scalars);
    } else if lang == "go" {
        collect_go_constant_scalars(root, code, &mut scalars);
    } else if lang == "rust" {
        collect_rust_constant_scalars(root, code, &mut scalars);
    }
    scalars
}
/// Java: record every `static final` field whose initializer is a scalar
/// literal, visiting nodes through `walk` starting at `root`.
fn collect_java_constant_scalars(root: Node<'_>, code: &[u8], out: &mut HashMap<String, String>) {
    walk(root, &mut |node| {
        let is_constant_field = node.kind() == "field_declaration"
            && has_static_modifier(node)
            && has_final_modifier(node);
        if !is_constant_field {
            return;
        }
        // One `field_declaration` may hold several `variable_declarator`
        // children (`static final int A = 1, B = 2;`); they are exposed as
        // repeated entries of the `declarator` field, so visit each one.
        let mut cursor = node.walk();
        for declarator in node.children_by_field_name("declarator", &mut cursor) {
            let name = declarator
                .child_by_field_name("name")
                .and_then(|n| text_of(n, code));
            let literal = declarator
                .child_by_field_name("value")
                .and_then(|v| scalar_literal_text(v, code));
            // Declarators without a name, an initializer, or a literal
            // initializer contribute nothing.
            if let (Some(name), Some(literal)) = (name, literal) {
                out.insert(name, literal);
            }
        }
    });
}
/// Python: module-level `NAME = LITERAL` assignments.
///
/// Only direct `expression_statement` children of the `module` root are
/// inspected; assignments inside function bodies, class bodies, or other
/// blocks are out of scope (a per-function SSA pass already sees those).
/// The target must be a plain `identifier` and the right-hand side must be
/// accepted by `python_scalar_literal_text`.
fn collect_python_constant_scalars(root: Node<'_>, code: &[u8], out: &mut HashMap<String, String>) {
    if root.kind() != "module" {
        return;
    }
    let mut cursor = root.walk();
    let bindings = root.named_children(&mut cursor).filter_map(|stmt| {
        if stmt.kind() != "expression_statement" {
            return None;
        }
        let assign = stmt.named_child(0).filter(|a| a.kind() == "assignment")?;
        let target = assign
            .child_by_field_name("left")
            .filter(|t| t.kind() == "identifier")?;
        let name = text_of(target, code)?;
        let value = assign.child_by_field_name("right")?;
        let literal = python_scalar_literal_text(value, code)?;
        Some((name, literal))
    });
    // Source order is preserved, so a later binding of the same name
    // overwrites an earlier one.
    out.extend(bindings);
}
/// Go: package-level `const NAME = LITERAL` and `const NAME TYPE = LITERAL`,
/// including the grouped `const (...)` form.
///
/// Visits the direct `const_declaration` children of the `source_file`
/// root and hands each `const_spec` inside them to `collect_go_const_spec`.
/// Does nothing when `root` is not a `source_file`.
fn collect_go_constant_scalars(root: Node<'_>, code: &[u8], out: &mut HashMap<String, String>) {
    if root.kind() != "source_file" {
        return;
    }
    let mut decl_cursor = root.walk();
    let const_decls = root
        .named_children(&mut decl_cursor)
        .filter(|decl| decl.kind() == "const_declaration");
    for decl in const_decls {
        let mut spec_cursor = decl.walk();
        let specs = decl
            .named_children(&mut spec_cursor)
            .filter(|spec| spec.kind() == "const_spec");
        for spec in specs {
            collect_go_const_spec(spec, code, out);
        }
    }
}
/// Record the scalar-literal bindings of a single Go `const_spec`.
///
/// tree-sitter-go `const_spec`:
///   name:  one or more identifiers (comma separated)
///   value: `expression_list` of value expressions
///
/// For a multi-target spec `const A, B = 1, 2`, identifiers and values pair
/// up positionally; the single-target form parses the same way with one
/// entry per side. Specs without a `value` (e.g. implicit-repetition lines
/// in an `iota` group) and specs whose identifier and value counts differ
/// are skipped entirely. Within a spec, a pair is skipped when the value is
/// not accepted by `go_scalar_literal_text`.
fn collect_go_const_spec(spec: Node<'_>, code: &[u8], out: &mut HashMap<String, String>) {
    // Keep only real identifiers before counting. The grammar applies the
    // `name` field to the whole comma-separated list, so the anonymous `,`
    // separators can be returned under that field too; counting them would
    // make every multi-target spec fail the arity check below and would
    // misalign the positional pairing.
    let mut name_cursor = spec.walk();
    let names: Vec<Node<'_>> = spec
        .children_by_field_name("name", &mut name_cursor)
        .filter(|n| n.kind() == "identifier")
        .collect();
    if names.is_empty() {
        return;
    }
    let Some(value_list) = spec.child_by_field_name("value") else {
        return;
    };
    let mut value_cursor = value_list.walk();
    let values: Vec<Node<'_>> = value_list.named_children(&mut value_cursor).collect();
    // Positional pairing is only meaningful when both sides have the same
    // arity.
    if values.len() != names.len() {
        return;
    }
    for (name_node, value_node) in names.iter().zip(values.iter()) {
        let Some(name) = text_of(*name_node, code) else {
            continue;
        };
        let Some(literal) = go_scalar_literal_text(*value_node, code) else {
            continue;
        };
        out.insert(name, literal);
    }
}
/// Rust: module-level `const NAME: TYPE = LITERAL;` and `static NAME: TYPE =
/// LITERAL;`.
///
/// Only direct children of `source_file` participate so a `const` defined
/// inside a function body does not bleed across scopes. `static mut` items
/// are skipped: their value can change at runtime, so the initializer
/// literal is not a reliable constant for the binding.
fn collect_rust_constant_scalars(root: Node<'_>, code: &[u8], out: &mut HashMap<String, String>) {
    if root.kind() != "source_file" {
        return;
    }
    let mut cursor = root.walk();
    for child in root.named_children(&mut cursor) {
        if !matches!(child.kind(), "const_item" | "static_item") {
            continue;
        }
        // tree-sitter-rust marks `static mut` with a `mutable_specifier`
        // child placed before the name.
        if child.kind() == "static_item" {
            let mut item_cursor = child.walk();
            let is_mutable = child
                .children(&mut item_cursor)
                .any(|part| part.kind() == "mutable_specifier");
            if is_mutable {
                continue;
            }
        }
        let Some(name_node) = child.child_by_field_name("name") else {
            continue;
        };
        let Some(name) = text_of(name_node, code) else {
            continue;
        };
        let Some(value_node) = child.child_by_field_name("value") else {
            continue;
        };
        let Some(literal) = rust_scalar_literal_text(value_node, code) else {
            continue;
        };
        out.insert(name, literal);
    }
}
/// `true` when `field_declaration` carries a `static` modifier, i.e. when
/// any direct `modifiers` child has a direct child of kind `static`.
fn has_static_modifier(field_decl: Node<'_>) -> bool {
    let mut decl_cursor = field_decl.walk();
    let found = field_decl
        .children(&mut decl_cursor)
        .filter(|child| child.kind() == "modifiers")
        .any(|modifiers| {
            let mut mod_cursor = modifiers.walk();
            let has_static = modifiers
                .children(&mut mod_cursor)
                .any(|m| m.kind() == "static");
            has_static
        });
    found
}
/// Return the source text when `value` is a primitive scalar literal node.
///
/// Covers the Java grammar's literal kinds. A `unary_expression` is
/// resolved through its `operand` field, so the returned text is that of
/// the operand literal. Returns `None` for compound expressions, identifier
/// references, method invocations, and other non-literal initializers.
fn scalar_literal_text(value: Node<'_>, code: &[u8]) -> Option<String> {
    const JAVA_LITERAL_KINDS: [&str; 11] = [
        "string_literal",
        "decimal_integer_literal",
        "hex_integer_literal",
        "octal_integer_literal",
        "binary_integer_literal",
        "decimal_floating_point_literal",
        "hex_floating_point_literal",
        "character_literal",
        "true",
        "false",
        "null_literal",
    ];

    let kind = value.kind();
    if kind == "unary_expression" {
        // Unary `-1`, `+0`, `!true` over a literal child still resolve to a
        // compile-time constant; recurse into the operand.
        return value
            .child_by_field_name("operand")
            .and_then(|operand| scalar_literal_text(operand, code));
    }
    if JAVA_LITERAL_KINDS.contains(&kind) {
        text_of(value, code)
    } else {
        None
    }
}
/// Python scalar literal classifier.
///
/// Accepts `string`, `integer`, `float`, `true`, `false` and `none` nodes
/// and returns their source text. Strings with interpolation (`f"x{var}"`
/// parses as `string` with an `interpolation` child) are rejected. A
/// `unary_operator` is resolved through its `argument` field, so the
/// returned text is that of the operand literal.
fn python_scalar_literal_text(value: Node<'_>, code: &[u8]) -> Option<String> {
    match value.kind() {
        "unary_operator" => value
            .child_by_field_name("argument")
            .and_then(|operand| python_scalar_literal_text(operand, code)),
        // Guarded arm must precede the plain-literal arm below.
        "string" if python_string_has_interpolation(value) => None,
        "string" | "integer" | "float" | "true" | "false" | "none" => text_of(value, code),
        _ => None,
    }
}
/// `true` when any direct child of `node` is an `interpolation` node.
fn python_string_has_interpolation(node: Node<'_>) -> bool {
    let mut cursor = node.walk();
    let mut parts = node.children(&mut cursor);
    parts.any(|part| part.kind() == "interpolation")
}
/// Go scalar literal classifier.
///
/// Returns the source text of string literals (`interpreted_string_literal`
/// for `"x"`, `raw_string_literal` for `` `x` ``), numeric, imaginary and rune
/// literals, `true`, `false` and `nil`. A `unary_expression` is unwrapped
/// through its `operand` field and the operand's text is returned. Any other
/// node kind yields `None`.
fn go_scalar_literal_text(value: Node<'_>, code: &[u8]) -> Option<String> {
    let mut current = value;
    loop {
        match current.kind() {
            // Step into the operand and classify that instead.
            "unary_expression" => current = current.child_by_field_name("operand")?,
            "interpreted_string_literal"
            | "raw_string_literal"
            | "int_literal"
            | "float_literal"
            | "imaginary_literal"
            | "rune_literal"
            | "true"
            | "false"
            | "nil" => return text_of(current, code),
            _ => return None,
        }
    }
}
/// Rust scalar literal classifier.
///
/// Returns the source text of `string_literal`, `raw_string_literal`,
/// integer, float, char and boolean literal nodes. For a `unary_expression`
/// the named children are tried in order and the first one that classifies as
/// a scalar literal supplies the result. Every other node kind yields `None`.
fn rust_scalar_literal_text(value: Node<'_>, code: &[u8]) -> Option<String> {
    let kind = value.kind();

    if kind == "unary_expression" {
        let mut cursor = value.walk();
        for operand in value.named_children(&mut cursor) {
            if let Some(text) = rust_scalar_literal_text(operand, code) {
                return Some(text);
            }
        }
        return None;
    }

    // tree-sitter-rust reports booleans as `boolean_literal`; the bare
    // `true` / `false` leaf kinds are accepted defensively in case a grammar
    // upgrade starts exposing them directly.
    let is_literal = matches!(
        kind,
        "string_literal"
            | "raw_string_literal"
            | "integer_literal"
            | "float_literal"
            | "char_literal"
            | "boolean_literal"
            | "true"
            | "false"
    );
    if is_literal {
        text_of(value, code)
    } else {
        None
    }
}
/// Collect Java `final` fields initialised with an all-string-literal
/// `Map.of(...)` call.
///
/// Walks every named node under `root`. For each `field_declaration` that
/// carries a `final` modifier, every declarator is inspected, so a
/// multi-declarator field such as `final Map<..> A = Map.of(..), B = Map.of(..)`
/// contributes both names. A declarator is recorded in `out` as
/// field name -> value-position literals when its initializer is accepted by
/// `extract_map_of_literal_values`; declarators without a name, without an
/// initializer, or with any other initializer are skipped.
fn collect_java(root: Node<'_>, code: &[u8], out: &mut HashMap<String, Vec<String>>) {
    walk(root, &mut |node| {
        if node.kind() != "field_declaration" || !has_final_modifier(node) {
            return;
        }
        // The modifier list applies to every declarator of the field, so each
        // one is an independent candidate.
        let mut cursor = node.walk();
        for decl in node.children_by_field_name("declarator", &mut cursor) {
            let Some(field_name) = decl
                .child_by_field_name("name")
                .and_then(|n| text_of(n, code))
            else {
                continue;
            };
            let Some(values) = decl
                .child_by_field_name("value")
                .and_then(|v| extract_map_of_literal_values(v, code))
            else {
                continue;
            };
            out.insert(field_name, values);
        }
    });
}
/// Report whether `field_decl` has a `modifiers` child that contains a
/// `final` token. Static and instance fields are treated alike: either way
/// the field cannot be reassigned after construction.
fn has_final_modifier(field_decl: Node<'_>) -> bool {
    let mut outer = field_decl.walk();
    for child in field_decl.children(&mut outer) {
        if child.kind() == "modifiers" {
            let mut inner = child.walk();
            let mut tokens = child.children(&mut inner);
            if tokens.any(|token| token.kind() == "final") {
                return true;
            }
        }
    }
    false
}
/// Recognise `Map.of(K, V, K, V, ...)` where every argument is a
/// `string_literal`, and return the literals in value position (argument
/// indices 1, 3, 5, ...).
///
/// Returns `None` when `value_node` is not a `method_invocation`, the method
/// is not named `of`, the receiver is not accepted by `receiver_is_map_class`,
/// the argument list is empty or has an odd length, any argument is not a
/// `string_literal`, or a value literal is rejected by `string_literal_value`.
fn extract_map_of_literal_values(value_node: Node<'_>, code: &[u8]) -> Option<Vec<String>> {
    if value_node.kind() != "method_invocation" {
        return None;
    }
    let receiver = value_node.child_by_field_name("object")?;
    let method = value_node.child_by_field_name("name")?;
    let is_map_of = text_of(method, code)? == "of" && receiver_is_map_class(receiver, code);
    if !is_map_of {
        return None;
    }

    let arg_list = value_node.child_by_field_name("arguments")?;
    let mut cursor = arg_list.walk();
    let args: Vec<Node<'_>> = arg_list.named_children(&mut cursor).collect();

    // Group the arguments into (key, value) pairs; a leftover argument means
    // the count is odd.
    let pairs = args.chunks_exact(2);
    if args.is_empty() || !pairs.remainder().is_empty() {
        return None;
    }

    let mut values = Vec::with_capacity(pairs.len());
    for pair in pairs {
        // `chunks_exact(2)` only yields two-element slices.
        let [key, val] = pair else {
            return None;
        };
        if key.kind() != "string_literal" || val.kind() != "string_literal" {
            return None;
        }
        values.push(string_literal_value(*val, code)?);
    }
    Some(values)
}
/// `true` when the receiver `node` names the `Map` class: either a bare
/// `identifier` whose text is `Map`, or a `field_access` whose `field`
/// segment is `Map` (as in `java.util.Map.of(...)`).
fn receiver_is_map_class(node: Node<'_>, code: &[u8]) -> bool {
    // Pick the node that carries the class name, if this receiver shape has one.
    let class_name_node = match node.kind() {
        "identifier" => Some(node),
        "field_access" => node.child_by_field_name("field"),
        _ => None,
    };
    class_name_node
        .and_then(|name| text_of(name, code))
        .is_some_and(|text| text == "Map")
}
/// Extract the unquoted content of a Java `string_literal` node.
///
/// The text of every `string_fragment` child is concatenated in order. Any
/// other named child makes the result `None`. That includes
/// `escape_sequence`: an escape keeps the literal a pure string, but it is
/// not decoded here, so it is rejected to stay conservative on
/// header-injection safety. A literal with no named children is accepted
/// only when its source text is exactly `""`, giving the empty string.
fn string_literal_value(node: Node<'_>, code: &[u8]) -> Option<String> {
    let mut cursor = node.walk();
    let mut fragments: Vec<String> = Vec::new();
    for part in node.named_children(&mut cursor) {
        if part.kind() != "string_fragment" {
            return None;
        }
        fragments.push(text_of(part, code)?);
    }

    if !fragments.is_empty() {
        return Some(fragments.concat());
    }

    // No fragments at all: only the empty literal `""` is a valid match.
    let raw = text_of(node, code)?;
    (raw == "\"\"").then(String::new)
}
/// Call `f` on `node` and then on every named descendant, in pre-order
/// (a parent before its children, siblings left to right).
///
/// The traversal keeps its own work stack instead of recursing, so a deeply
/// nested syntax tree consumes heap memory rather than call-stack frames.
fn walk<'a, F: FnMut(Node<'a>)>(node: Node<'a>, f: &mut F) {
    let mut pending = vec![node];
    while let Some(current) = pending.pop() {
        f(current);
        let first_child = pending.len();
        let mut cursor = current.walk();
        pending.extend(current.named_children(&mut cursor));
        // Nodes are popped from the back, so flip the freshly pushed children
        // to make the leftmost one come out first.
        pending[first_child..].reverse();
    }
}
#[cfg(test)]
mod tests {
use super::*;
use tree_sitter::Parser;
/// Parse `src` with the Java grammar and return what
/// `collect_safe_lookup_fields` reports for the resulting root node.
fn collect(src: &str) -> HashMap<String, Vec<String>> {
let mut p = Parser::new();
p.set_language(&tree_sitter_java::LANGUAGE.into()).unwrap();
let tree = p.parse(src, None).unwrap();
collect_safe_lookup_fields(tree.root_node(), "java", src.as_bytes())
}
/// A `static final` field initialised with a two-pair `Map.of` records both
/// value-position literals, in order.
#[test]
fn static_final_map_of_two_pairs() {
let src = r#"
class C {
private static final java.util.Map<String, String> T = Map.of(
"a", "x", "b", "y"
);
}
"#;
let out = collect(src);
assert_eq!(out.get("T"), Some(&vec!["x".to_string(), "y".to_string()]));
}
/// A non-static `final` field is recorded as well.
#[test]
fn instance_final_map_of_one_pair() {
let src = r#"
class C {
private final java.util.Map<String, String> T = Map.of("a", "x");
}
"#;
let out = collect(src);
assert_eq!(out.get("T"), Some(&vec!["x".to_string()]));
}
/// A `static` field without `final` must not be recorded.
#[test]
fn rejects_non_final_field() {
let src = r#"
class C {
private static java.util.Map<String, String> T = Map.of("a", "x");
}
"#;
let out = collect(src);
assert!(out.is_empty());
}
/// A `Map.of` call with an identifier in value position is not recorded.
#[test]
fn rejects_non_literal_value() {
let src = r#"
class C {
private static final String SAFE = "x";
private static final java.util.Map<String, String> T = Map.of("a", SAFE);
}
"#;
let out = collect(src);
// SAFE is an identifier, not a string_literal — even though const-
// foldable, the syntactic check rejects to stay simple.
assert!(!out.contains_key("T"));
}
/// Three arguments cannot form key/value pairs, so nothing is recorded.
#[test]
fn rejects_odd_arg_count() {
// Compiler would reject this too, but the extractor must not panic.
let src = r#"
class C {
private static final java.util.Map<String, String> T = Map.of("a", "x", "b");
}
"#;
let out = collect(src);
assert!(out.is_empty());
}
/// `Map.of()` with no arguments is not recorded.
#[test]
fn rejects_empty_map_of() {
let src = r#"
class C {
private static final java.util.Map<String, String> T = Map.of();
}
"#;
let out = collect(src);
assert!(out.is_empty());
}
/// The fully qualified receiver `java.util.Map` is accepted like bare `Map`.
#[test]
fn fully_qualified_map_of() {
let src = r#"
class C {
private static final java.util.Map<String, String> T = java.util.Map.of(
"a", "x", "b", "y"
);
}
"#;
let out = collect(src);
assert_eq!(out.get("T"), Some(&vec!["x".to_string(), "y".to_string()]));
}
/// A value literal containing an escape sequence causes the whole field to
/// be left out.
#[test]
fn rejects_escape_sequence_value() {
let src = r#"
class C {
private static final java.util.Map<String, String> T = Map.of(
"a", "with\nnewline"
);
}
"#;
let out = collect(src);
// `\n` would smuggle a CRLF-style metachar through the static
// gate; conservative reject keeps header-injection suppression
// honest.
assert!(!out.contains_key("T"));
}
/// Passing `"javascript"` as the language yields an empty map.
#[test]
fn ignores_non_java_lang() {
let src = "const x = 1;";
let mut p = Parser::new();
p.set_language(&tree_sitter_javascript::LANGUAGE.into())
.unwrap();
let tree = p.parse(src, None).unwrap();
let out = collect_safe_lookup_fields(tree.root_node(), "javascript", src.as_bytes());
assert!(out.is_empty());
}
/// Parse `src` with the Java grammar and return what
/// `collect_class_constant_scalars` reports for the resulting root node.
fn collect_consts(src: &str) -> HashMap<String, String> {
let mut p = Parser::new();
p.set_language(&tree_sitter_java::LANGUAGE.into()).unwrap();
let tree = p.parse(src, None).unwrap();
collect_class_constant_scalars(tree.root_node(), "java", src.as_bytes())
}
/// String, int and boolean `static final` fields are captured; the stored
/// string value keeps its surrounding quotes.
#[test]
fn class_constants_capture_string_int_bool() {
let src = r#"
class C {
private static final String DRIVER = "com.mysql.cj.jdbc.Driver";
public static final int LIMIT = 100;
static final boolean DEBUG = false;
}
"#;
let out = collect_consts(src);
assert_eq!(
out.get("DRIVER"),
Some(&"\"com.mysql.cj.jdbc.Driver\"".to_string())
);
assert_eq!(out.get("LIMIT"), Some(&"100".to_string()));
assert_eq!(out.get("DEBUG"), Some(&"false".to_string()));
}
/// Every declarator of a multi-declarator field is captured.
#[test]
fn class_constants_capture_multi_declarator() {
let src = r#"
class C {
private static final int A = 1, B = 2, C2 = 3;
}
"#;
let out = collect_consts(src);
assert_eq!(out.get("A"), Some(&"1".to_string()));
assert_eq!(out.get("B"), Some(&"2".to_string()));
assert_eq!(out.get("C2"), Some(&"3".to_string()));
}
/// A unary-negated literal is captured, with the operand text as its value.
#[test]
fn class_constants_capture_unary_negation() {
let src = r#"
class C {
private static final int OFFSET = -1;
}
"#;
let out = collect_consts(src);
// The Java literal classifier recurses into the unary operand, so the
// stored text is the operand (`1`), not the whole `-1` expression.
assert_eq!(out.get("OFFSET"), Some(&"1".to_string()));
}
/// A `final` field that is not `static` is not captured.
#[test]
fn class_constants_reject_non_static() {
let src = r#"
class C {
private final String NAME = "x";
}
"#;
let out = collect_consts(src);
assert!(!out.contains_key("NAME"));
}
/// A `static` field that is not `final` is not captured.
#[test]
fn class_constants_reject_non_final() {
let src = r#"
class C {
private static String NAME = "x";
}
"#;
let out = collect_consts(src);
assert!(!out.contains_key("NAME"));
}
/// Initializers that are a method call or a reference to another constant
/// are not captured.
#[test]
fn class_constants_reject_identifier_value() {
let src = r#"
class C {
private static final String OTHER = computed();
private static final String COPY = OTHER;
}
"#;
let out = collect_consts(src);
assert!(!out.contains_key("OTHER"));
assert!(!out.contains_key("COPY"));
}
/// Constants declared in a nested class are captured under their simple name.
#[test]
fn class_constants_capture_inside_inner_class() {
let src = r#"
class Outer {
static class Inner {
private static final String DRIVER = "x";
}
}
"#;
let out = collect_consts(src);
assert_eq!(out.get("DRIVER"), Some(&"\"x\"".to_string()));
}
/// Passing `"javascript"` as the language yields an empty map.
#[test]
fn class_constants_ignore_non_supported_lang() {
let src = "const x = 1;";
let mut p = Parser::new();
p.set_language(&tree_sitter_javascript::LANGUAGE.into())
.unwrap();
let tree = p.parse(src, None).unwrap();
let out = collect_class_constant_scalars(tree.root_node(), "javascript", src.as_bytes());
assert!(out.is_empty());
}
/// Parse `src` with the grammar selected by `lang` (`"python"`, `"go"` or
/// `"rust"`) and return what `collect_class_constant_scalars` reports for the
/// root node. Any other `lang` hits `unreachable!`.
fn collect_consts_lang(src: &str, lang: &str) -> HashMap<String, String> {
let mut p = Parser::new();
match lang {
"python" => p
.set_language(&tree_sitter_python::LANGUAGE.into())
.unwrap(),
"go" => p.set_language(&tree_sitter_go::LANGUAGE.into()).unwrap(),
"rust" => p.set_language(&tree_sitter_rust::LANGUAGE.into()).unwrap(),
_ => unreachable!("unsupported lang in test helper: {lang}"),
};
let tree = p.parse(src, None).unwrap();
collect_class_constant_scalars(tree.root_node(), lang, src.as_bytes())
}
/// Module-level Python assignments of string, integer, `False` and `None`
/// literals are captured with their source text.
#[test]
fn python_module_constants_capture_scalars() {
let src = "DRIVER = \"sqlite3\"\nLIMIT = 100\nDEBUG = False\nNAME = None\n";
let out = collect_consts_lang(src, "python");
assert_eq!(out.get("DRIVER"), Some(&"\"sqlite3\"".to_string()));
assert_eq!(out.get("LIMIT"), Some(&"100".to_string()));
assert_eq!(out.get("DEBUG"), Some(&"False".to_string()));
assert_eq!(out.get("NAME"), Some(&"None".to_string()));
}
/// A unary-negated Python literal is captured, with the operand text as its
/// value.
#[test]
fn python_module_constants_capture_unary_negation() {
// The recogniser recurses into the operand and returns its text, so
// `OFFSET = -1` stores `"1"`. The downstream suppression consumer
// only cares about name binding, not the decoded numeric value.
let src = "OFFSET = -1\n";
let out = collect_consts_lang(src, "python");
assert_eq!(out.get("OFFSET"), Some(&"1".to_string()));
}
/// An f-string containing an interpolation is not captured.
#[test]
fn python_module_constants_reject_fstring_with_interpolation() {
let src = "import os\nVAR = f\"hi {os.getcwd()}\"\n";
let out = collect_consts_lang(src, "python");
assert!(!out.contains_key("VAR"));
}
/// A call expression on the right-hand side is not captured.
#[test]
fn python_module_constants_reject_call_value() {
let src = "from os import getcwd\nPATH = getcwd()\n";
let out = collect_consts_lang(src, "python");
assert!(!out.contains_key("PATH"));
}
/// Assignments inside a Python function body are not captured.
#[test]
fn python_module_constants_skip_inside_function_body() {
// An assignment inside a function body is per-function SSA's job.
// Only top-level module assignments should land in the map.
let src = "def f():\n INNER = \"x\"\n return INNER\n";
let out = collect_consts_lang(src, "python");
assert!(!out.contains_key("INNER"));
}
/// Package-level Go `const` declarations with string, int and bool literals
/// are captured.
#[test]
fn go_package_constants_capture_scalars() {
let src =
"package main\nconst DRIVER = \"postgres\"\nconst LIMIT = 100\nconst FLAG = true\n";
let out = collect_consts_lang(src, "go");
assert_eq!(out.get("DRIVER"), Some(&"\"postgres\"".to_string()));
assert_eq!(out.get("LIMIT"), Some(&"100".to_string()));
assert_eq!(out.get("FLAG"), Some(&"true".to_string()));
}
/// Entries of a parenthesised Go `const ( ... )` group are captured, with or
/// without an explicit type.
#[test]
fn go_package_constants_capture_grouped_const_block() {
let src = "package main\nconst (\n A = \"x\"\n B int = 42\n C = false\n)\n";
let out = collect_consts_lang(src, "go");
assert_eq!(out.get("A"), Some(&"\"x\"".to_string()));
assert_eq!(out.get("B"), Some(&"42".to_string()));
assert_eq!(out.get("C"), Some(&"false".to_string()));
}
/// A Go `const` initialised with a call expression is not captured.
#[test]
fn go_package_constants_reject_non_literal() {
let src = "package main\nconst OTHER = foo()\n";
let out = collect_consts_lang(src, "go");
assert!(!out.contains_key("OTHER"));
}
/// A Go `const` declared inside a function body is not captured.
#[test]
fn go_package_constants_skip_inside_function_body() {
// `const` inside a function body is per-function SSA's territory.
let src = "package main\nfunc f() string { const INNER = \"x\"; return INNER }\n";
let out = collect_consts_lang(src, "go");
assert!(!out.contains_key("INNER"));
}
/// Top-level Rust `const` and `static` items with string, integer and bool
/// literals are captured.
#[test]
fn rust_module_consts_capture_scalars() {
let src = "const DRIVER: &str = \"sqlite\";\nconst LIMIT: i32 = 100;\nstatic FLAG: bool = false;\n";
let out = collect_consts_lang(src, "rust");
assert_eq!(out.get("DRIVER"), Some(&"\"sqlite\"".to_string()));
assert_eq!(out.get("LIMIT"), Some(&"100".to_string()));
assert_eq!(out.get("FLAG"), Some(&"false".to_string()));
}
/// A Rust `const` initialised with a call expression is not captured.
#[test]
fn rust_module_consts_reject_non_literal() {
let src = "const VAL: i32 = some_func();\n";
let out = collect_consts_lang(src, "rust");
assert!(!out.contains_key("VAL"));
}
/// A Rust `const` declared inside a function body is not captured.
#[test]
fn rust_module_consts_skip_inside_function_body() {
let src = "fn f() -> &'static str { const INNER: &str = \"x\"; INNER }\n";
let out = collect_consts_lang(src, "rust");
assert!(!out.contains_key("INNER"));
}
}