Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -10,7 +10,7 @@ use tree_sitter::Node;
/// at the *case-level* shape `build_switch` sees here. Rust `match`, Go
/// `switch`, and Java arrow-switches qualify; classic Java/C/C++/JS switches
/// with fall-through do not. The check is per-language because Java mixes
/// arrow and classic shapes that's handled by inspecting the case kind in
/// arrow and classic shapes, that's handled by inspecting the case kind in
/// [`extract_case_literal_text`].
fn lang_has_exclusive_cases(lang: &str) -> bool {
matches!(lang, "rust" | "go")
@ -19,7 +19,7 @@ fn lang_has_exclusive_cases(lang: &str) -> bool {
/// Extract the scrutinee subtree from a switch-like AST node.
///
/// Returns the AST node referenced by the language's scrutinee field. Only
/// fires for Rust `match`, Go `switch`, and Java `switch` statements other
/// fires for Rust `match`, Go `switch`, and Java `switch` statements, other
/// languages return `None` so [`build_switch`] keeps its legacy behavior.
fn extract_scrutinee_node<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
let field = match lang {
@ -39,7 +39,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
let kind = case.kind();
match (lang, kind) {
("rust", "match_arm") => {
// Reject guarded arms `match x { y if cond => ... }`.
// Reject guarded arms, `match x { y if cond => ... }`.
if case.child_by_field_name("guard").is_some() {
return None;
}
@ -71,7 +71,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
text_of(inner, code)
}
("go", "expression_case") => {
// Go case `case v1, v2: ...` only handle exactly one expression.
// Go case `case v1, v2: ...`, only handle exactly one expression.
let value = case.child_by_field_name("value")?;
let mut named_children: Vec<Node> = Vec::new();
let mut cursor = value.walk();
@ -195,7 +195,7 @@ pub(super) fn extract_catch_param_name<'a>(
// -------------------------------------------------------------------------
/// Builds CFG for Ruby's `begin`/`rescue`/`ensure` blocks (and `body_statement`
/// with inline rescue). Ruby's `begin` has no `body` field the try-body
/// with inline rescue). Ruby's `begin` has no `body` field, the try-body
/// statements are direct children before `rescue`/`else`/`ensure` nodes.
#[allow(clippy::too_many_arguments)]
pub(super) fn build_begin_rescue<'a>(
@ -305,7 +305,7 @@ pub(super) fn build_begin_rescue<'a>(
vec![synth]
} else {
// No param name will wire exception edges to first rescue body node
// No param name, will wire exception edges to first rescue body node
Vec::new()
};
@ -333,7 +333,7 @@ pub(super) fn build_begin_rescue<'a>(
current_body_id,
)
} else {
// No body field build rescue node itself as a block.
// No body field, build rescue node itself as a block.
// Filter out meta-children (exceptions, exception_variable) by
// iterating and building only statement children.
let mut rescue_cursor = rescue_node.walk();
@ -407,7 +407,7 @@ pub(super) fn build_begin_rescue<'a>(
try_exits
};
// 6. Build ensure clause (Ruby's finally always runs)
// 6. Build ensure clause (Ruby's finally, always runs)
if let Some(ensure_node) = ensure_clause {
let mut ensure_preds: Vec<NodeIndex> = Vec::new();
ensure_preds.extend(&normal_exits);
@ -443,7 +443,7 @@ pub(super) fn build_begin_rescue<'a>(
}
// -------------------------------------------------------------------------
// switch handler multi-way dispatch with fallthrough
// switch handler, multi-way dispatch with fallthrough
// -------------------------------------------------------------------------
/// True for AST kinds that wrap a single switch case body.
@ -490,7 +490,7 @@ pub(super) fn case_has_default_label(case: Node<'_>) -> bool {
/// Build CFG for a switch statement.
///
/// The dispatch is decomposed into a chain of binary `StmtKind::If` headers
/// — one per non-default case — because the SSA terminator only models 0/1/2
/// (one per non-default case), because the SSA terminator only models 0/1/2
/// successors. A monolithic N-way header would otherwise be collapsed to
/// `Goto(first)` and silently drop every other case. Each header's True edge
/// reaches its case body; the False edge falls through to the next header (or
@ -544,7 +544,7 @@ pub(super) fn build_switch<'a>(
}
}
// Grammar didn't expose recognisable case nodes fall back to a single
// Grammar didn't expose recognisable case nodes, fall back to a single
// header + Block-style walk so nodes still get linked.
if cases.is_empty() {
let header = push_node(
@ -603,7 +603,7 @@ pub(super) fn build_switch<'a>(
// arrow-switch), pre-extract the scrutinee text + idents so the synthetic
// dispatch headers can carry a `<scrutinee> == <case_literal>` condition.
// Falls back to `None` when the scrutinee is structurally complex (calls,
// member chains, parenthesized expressions in Go) the existing first-
// member chains, parenthesized expressions in Go), the existing first-
// reachable behavior remains correct in that case.
let supports_exclusive_cases = lang_has_exclusive_cases(lang) || lang == "java";
let (scrutinee_text, scrutinee_idents) = if supports_exclusive_cases {
@ -647,7 +647,7 @@ pub(super) fn build_switch<'a>(
for (idx, (case, is_default)) in cases.iter().copied().enumerate() {
let is_last = idx + 1 == cases.len();
// Default at the chain tail doesn't get its own dispatch If the
// Default at the chain tail doesn't get its own dispatch If, the
// previous header's False edge already targets it directly.
let case_first_preds: Vec<NodeIndex> = if is_default && is_last {
// First node of the default body becomes the False target of the
@ -675,12 +675,13 @@ pub(super) fn build_switch<'a>(
);
// The dispatch header is purely structural (it stands in for the
// discriminant comparison). It must not inherit Sink/Source labels
// from the case body's text push_node uses `text_of(ast)` for
// from the case body's text, push_node uses `text_of(ast)` for
// non-call kinds, which would let the body text drive classification.
g[header].taint.labels.clear();
g[header].call.callee = None;
g[header].call.sink_payload_args = None;
g[header].call.destination_uses = None;
g[header].call.gate_filters.clear();
// For mutually-exclusive switch shapes with a single-ident
// scrutinee, synthesize a `<scrutinee> == <case_literal>`
// structured condition on the dispatch header so SSA lowering
@ -958,7 +959,7 @@ pub(super) fn build_try<'a>(
vec![synth]
} else {
// No param name wire exception edges directly to first catch body node
// No param name, wire exception edges directly to first catch body node
Vec::new()
};

View file

@ -43,7 +43,7 @@ fn js_try_catch_has_exception_edges() {
/// When a classifiable call (here `eval`, a built-in JS sink) is nested
/// inside a multi-line statement, the CFG node's `classification_span()`
/// should point at the inner call, not at the outer statement's start
/// should point at the inner call, not at the outer statement's start,
/// so finding display reports the line the dangerous call actually lives
/// on. `ast.span` must still cover the whole outer statement for
/// structural passes that need the statement grain.
@ -86,7 +86,7 @@ fn inner_call_override_narrows_classification_span() {
}
/// `classification_span()` must fall back to `ast.span` when no narrower
/// sub-expression was recorded so existing structural code paths keep
/// sub-expression was recorded, so existing structural code paths keep
/// working unchanged for nodes whose classification applies to the whole
/// outer node.
#[test]
@ -125,7 +125,7 @@ fn callee_span_unset_when_no_narrowing_is_possible() {
// A bare `eval(x);` on one line: `first_call_ident` finds the
// call_expression whose span is nearly the whole expression_statement
// (different by the trailing `;`). `classification_span` still
// returns a sensible line but the exact trimming is an
// returns a sensible line, but the exact trimming is an
// implementation detail. What we assert here is the invariant:
// if callee_span *is* set, it must be contained in ast.span.
let src = b"function f() { eval(x); }";
@ -708,7 +708,7 @@ fn python_if_and() {
#[test]
fn ruby_unless_and() {
// `unless a && b` chain built, branches swapped
// `unless a && b`, chain built, branches swapped
// Body should run when condition is false
let src = b"def f\n unless a && b\n x\n end\nend\n";
let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE);
@ -848,7 +848,7 @@ fn parse_tree(src: &[u8], ts_lang: Language) -> tree_sitter::Tree {
#[test]
fn first_call_ident_skips_lambda_body() {
// `process(lambda: eval(dangerous))` Python-style.
// `process(lambda: eval(dangerous))`, Python-style.
// first_call_ident should return "process", not "eval".
let src = b"process(lambda: eval(dangerous))";
let ts_lang = Language::from(tree_sitter_python::LANGUAGE);
@ -860,7 +860,7 @@ fn first_call_ident_skips_lambda_body() {
#[test]
fn first_call_ident_skips_arrow_function_body() {
// `process(() => eval(dangerous))` JS arrow function in argument.
// `process(() => eval(dangerous))`, JS arrow function in argument.
let src = b"process(() => eval(dangerous))";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -871,7 +871,7 @@ fn first_call_ident_skips_arrow_function_body() {
#[test]
fn first_call_ident_skips_named_function_in_arg() {
// `process(function inner() { eval(dangerous); })` named function expression in arg.
// `process(function inner() { eval(dangerous); })`, named function expression in arg.
let src = b"process(function inner() { eval(dangerous); })";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -882,7 +882,7 @@ fn first_call_ident_skips_named_function_in_arg() {
#[test]
fn first_call_ident_normal_nested_call() {
// `outer(inner(x))` inner is NOT behind a function boundary, should be reachable.
// `outer(inner(x))`, inner is NOT behind a function boundary, should be reachable.
let src = b"outer(inner(x))";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -895,7 +895,7 @@ fn first_call_ident_normal_nested_call() {
#[test]
fn first_call_ident_finds_call_not_blocked_by_function() {
// Ensure a call at the same level as a function literal is still found.
// `[function() {}, actual_call()]` array with function and call.
// `[function() {}, actual_call()]`, array with function and call.
let src = b"[function() {}, actual_call()]";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -908,7 +908,7 @@ fn first_call_ident_finds_call_not_blocked_by_function() {
#[test]
fn callee_not_resolved_from_nested_function_arg() {
// `safe_wrapper(function() { eval(user_input); })` the CFG for the
// `safe_wrapper(function() { eval(user_input); })`, the CFG for the
// outer call should resolve the callee as "safe_wrapper", never "eval".
let src = b"function f() { safe_wrapper(function() { eval(user_input); }); }";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
@ -923,7 +923,7 @@ fn callee_not_resolved_from_nested_function_arg() {
assert!(has_safe, "expected a node with callee 'safe_wrapper'");
// The outer body should NOT have a node with callee "eval" attributed
// to the outer expression eval lives inside the nested function body.
// to the outer expression, eval lives inside the nested function body.
let outer_eval = body.graph.node_weights().any(|info| {
info.call.callee.as_deref() == Some("eval") && info.ast.enclosing_func.is_none()
});
@ -1117,6 +1117,7 @@ fn clone_preserves_all_sub_structs() {
kwargs: vec![("shell".into(), vec!["True".into()])],
arg_string_literals: vec![Some("lit".into())],
destination_uses: None,
gate_filters: Vec::new(),
},
taint: TaintMeta {
labels: {
@ -1399,7 +1400,7 @@ fn js_promisify_ignored_for_non_js_langs() {
#[test]
fn js_promisify_non_call_value_ignored() {
// RHS is not a promisify call no binding should be captured.
// RHS is not a promisify call, no binding should be captured.
let src = b"const execAsync = child_process.exec;";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang);
@ -1471,7 +1472,7 @@ fn cpp_function_extracts_param_names() {
// ── callee-site metadata extraction ──────────────────────────────────
/// Callees collected into `LocalFuncSummary` should now carry structured
/// arity, receiver, and qualifier fields not just a bare name.
/// arity, receiver, and qualifier fields, not just a bare name.
#[test]
fn local_summary_callees_carry_arity_and_receiver() {
// Two calls: one is a plain function call with 2 args, the other is
@ -1703,7 +1704,7 @@ fn local_summary_callees_have_distinct_ordinals() {
.find(|(k, _)| k.name == "outer")
.unwrap();
// Dedup key is (name, arity, receiver, qualifier, ordinal) the two
// Dedup key is (name, arity, receiver, qualifier, ordinal), the two
// `a()` sites have different ordinals, so both must appear.
let a_sites: Vec<_> = outer.callees.iter().filter(|c| c.name == "a").collect();
assert_eq!(
@ -1825,7 +1826,7 @@ fn anon_fn_named_from_short_var_decl_go() {
#[test]
fn iife_callee_resolves_to_anon_body_js() {
// `(function(arg){eval(arg);})(q)` the CallFn arm must produce
// `(function(arg){eval(arg);})(q)`, the CallFn arm must produce
// a synthetic anon callee name so that taint can match the
// inline body's FuncKey.
let src = b"(function(arg){ eval(arg); })(q);";
@ -1898,7 +1899,7 @@ fn strip_tags(s: &str) -> String {
#[test]
fn replace_chain_rejects_unrecognised_literals() {
// `.replace("foo", "bar")` contains no dangerous pattern must NOT be
// `.replace("foo", "bar")` contains no dangerous pattern, must NOT be
// credited as a sanitizer. Preserves the FP→TN guard: replace calls
// that don't strip anything dangerous must stay transparent to taint.
let src = br#"
@ -1916,7 +1917,7 @@ fn rewrite(s: &str) -> String {
#[test]
fn replace_chain_rejects_when_replacement_reintroduces_pattern() {
// `.replace("x", "..")` strips `x` but *reintroduces* `..` be
// `.replace("x", "..")` strips `x` but *reintroduces* `..`, be
// maximally conservative and abandon all credit for this chain.
let src = br#"
fn evil(s: &str) -> String {
@ -1933,7 +1934,7 @@ fn evil(s: &str) -> String {
#[test]
fn replace_chain_rejects_dynamic_arg() {
// `.replace(var, "")` search is not a literal; pattern analysis can
// `.replace(var, "")`, search is not a literal; pattern analysis can
// say nothing about what was stripped. Must not earn credit.
let src = br#"
fn dynamic(s: &str, needle: &str) -> String {
@ -1950,7 +1951,7 @@ fn dynamic(s: &str, needle: &str) -> String {
#[test]
fn replace_chain_rejects_non_identifier_base() {
// `get_s().replace("..", "")` innermost receiver is a call, not a
// `get_s().replace("..", "")`, innermost receiver is a call, not a
// parameter. We have no reason to believe `get_s()` returns a value
// that benefits the caller; refuse credit.
let src = br#"
@ -1976,7 +1977,7 @@ fn find_node_defining<'a>(cfg: &'a Cfg, var: &str) -> Option<&'a NodeInfo> {
#[test]
fn numeric_length_access_detected_on_js_property_read() {
// `var count = items.length` property access on a member expression
// `var count = items.length`, property access on a member expression
// should mark the CFG node as a numeric-length access so the
// type-fact analysis infers TypeKind::Int for `count`.
let src = br#"function f(items) {
@ -1994,7 +1995,7 @@ fn numeric_length_access_detected_on_js_property_read() {
#[test]
fn numeric_length_access_detected_on_js_zero_arg_method_call() {
// `var n = str.length()` zero-arg method call form (uncommon in JS
// `var n = str.length()`, zero-arg method call form (uncommon in JS
// but present in other languages). Detector should unwrap a
// zero-arg call around a member expression.
let src = br#"function f(list) {
@ -2012,7 +2013,7 @@ fn numeric_length_access_detected_on_js_zero_arg_method_call() {
#[test]
fn numeric_length_access_ignores_unrelated_properties() {
// `var v = arr.foo` arbitrary property reads must not be flagged.
// `var v = arr.foo`, arbitrary property reads must not be flagged.
let src = br#"function f(arr) {
var v = arr.foo;
return v;
@ -2028,7 +2029,7 @@ fn numeric_length_access_ignores_unrelated_properties() {
#[test]
fn numeric_length_access_ignores_method_calls_with_args() {
// `var r = s.indexOf('x')` the detector must reject any call with
// `var r = s.indexOf('x')`, the detector must reject any call with
// positional arguments because those aren't pure length reads.
let src = br#"function f(s) {
var r = s.indexOf('x');
@ -2043,7 +2044,7 @@ fn numeric_length_access_ignores_method_calls_with_args() {
);
}
// ── Pointer-Phase 6 / W5: subscript lowering tests ────────────────────────
// ── subscript lowering tests ────────────────────────
/// Scope for tests that flip `NYX_POINTER_ANALYSIS=1` so the CFG-side
/// subscript synthesis activates. The env-var is restored afterwards
@ -2290,7 +2291,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
);
}
/// JS switch fall-through (`case 1: a(); case 2: b();`) case 1's
/// JS switch fall-through (`case 1: a(); case 2: b();`), case 1's
/// exit should flow into case 2's body so taint from `first()`
/// reaches `second()`'s sinks.
///
@ -2301,7 +2302,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
/// structural shape.
/// (b) `first()` has a non-Back forward out-edge that lands inside
/// the case-2 sub-graph (the actual fall-through wire), so we
/// prove there *is* a fall-through edge not just an
/// prove there *is* a fall-through edge, not just an
/// Entry→…→Exit path that happens to walk through both calls
/// via the dispatch chain.
///
@ -2309,7 +2310,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
/// Seq passthrough nodes (one per surrounding scope), so the
/// fall-through edge from `first()` lands on the *first wrapper
/// Seq node* of case 2, not on `second()` itself. Asserting that
/// `second()` has ≥2 in-edges would therefore be wrong the True
/// `second()` has ≥2 in-edges would therefore be wrong, the True
/// edge from the case-2 dispatch If targets the wrapper node, and
/// only a single Seq chain leads from there to `second()`.
#[test]
@ -2800,7 +2801,7 @@ fn nested_loops_two_headers_two_back_edges() {
#[test]
fn loop_with_break_no_back_edge_from_break() {
// A `break` short-circuits the loop body its edge must NOT be a
// A `break` short-circuits the loop body, its edge must NOT be a
// back edge to the header (it leaves the loop entirely).
let src = b"function f() { while (cond()) { if (done()) break; body(); } }";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
@ -2879,7 +2880,7 @@ fn chained_method_call_rebinds_to_inner_gated_sink() {
// no longer be the recorded callee for this node.
if callee.ends_with("https.get") {
// The inner-gate path must have populated sink_payload_args
// (the gate's payload arg is position 0 the URL string).
// (the gate's payload arg is position 0, the URL string).
assert!(
info.call.sink_payload_args.is_some(),
"expected sink_payload_args to be populated for chained \

View file

@ -4,6 +4,7 @@ use super::{
member_expr_text, push_node, text_of,
};
use crate::labels::{DataLabel, LangAnalysisRules, classify};
use crate::utils::snippet::truncate_at_char_boundary;
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
use tree_sitter::Node;
@ -72,20 +73,15 @@ pub(super) fn push_condition_node<'a>(
code: &'a [u8],
enclosing_func: Option<&str>,
) -> NodeIndex {
// Pass cond_ast as both args sub-conditions are never `unless` nodes
// Pass cond_ast as both args, sub-conditions are never `unless` nodes
let (inner, negated) = detect_negation(cond_ast, cond_ast, lang);
let mut vars = Vec::new();
collect_idents(inner, code, &mut vars);
vars.sort();
vars.dedup();
vars.truncate(MAX_COND_VARS);
let text = text_of(cond_ast, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
let text = text_of(cond_ast, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
let span = (cond_ast.start_byte(), cond_ast.end_byte());
g.add_node(NodeInfo {
kind: StmtKind::If,
@ -140,7 +136,7 @@ pub(super) fn detect_rust_let_match_guard<'a>(
/// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's
/// condition text and vars. The let-binding name is added to `condition_vars`
/// so `apply_branch_predicates` narrows validation to that specific variable
/// the variable that receives the arm's value and flows to downstream sinks.
/// (the variable that receives the arm's value and flows to downstream sinks).
pub(super) fn emit_rust_match_guard_if<'a>(
g: &mut Cfg,
guard: Node<'a>,
@ -154,13 +150,8 @@ pub(super) fn emit_rust_match_guard_if<'a>(
vars.sort();
vars.dedup();
vars.truncate(MAX_COND_VARS);
let text = text_of(guard, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
let text = text_of(guard, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
let span = (guard.start_byte(), guard.end_byte());
g.add_node(NodeInfo {
kind: StmtKind::If,
@ -181,7 +172,7 @@ pub(super) fn emit_rust_match_guard_if<'a>(
/// `lhs_text` is then synthesised by SSA lowering at the join.
///
/// The condition's identifiers live on the If node's `condition_vars`, **not**
/// on the branch `uses`. This is the whole point of the split cond is control
/// on the branch `uses`. This is the whole point of the split, cond is control
/// flow, branches are data flow.
///
/// Returns the exit frontier for downstream statement chaining (a single-element
@ -219,7 +210,7 @@ pub(super) fn build_ternary_diamond<'a>(
g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang);
connect_all(g, preds, cond_if, pred_edge);
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node)
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node);
// a nested ternary recurses and returns its own join node.
let true_exits = lower_ternary_branch(
cons_ast,
@ -332,7 +323,7 @@ pub(super) fn lower_ternary_branch<'a>(
analysis_rules,
);
// The branch expression's own `defines` (if any typically None for a
// The branch expression's own `defines` (if any, typically None for a
// pure value expression) is replaced with the outer LHS so that both
// branches agree on the target, driving phi insertion at the join.
g[node].taint.defines = Some(lhs_text.to_string());
@ -410,7 +401,7 @@ pub(super) fn classify_ternary_lhs(
.unwrap_or_default();
// Try the full dotted path first (e.g. "document.cookie"), then fall back
// to the property alone (e.g. "innerHTML") mirrors the LHS classification
// to the property alone (e.g. "innerHTML"), mirrors the LHS classification
// already performed in `push_node` for non-split assignments.
if let Some(l) = classify(lang, &lhs_text, extra) {
labels.push(l);
@ -429,7 +420,7 @@ pub(super) fn classify_ternary_lhs(
/// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes
/// with short-circuit edges.
///
/// Returns `(true_exits, false_exits)` the sets of nodes from which True/False
/// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False
/// edges should connect to the then/else branches.
pub(super) fn build_condition_chain<'a>(
cond_ast: Node<'a>,

View file

@ -5,7 +5,7 @@ use tree_sitter::Node;
///
/// Used by decorator extraction to reduce `login_required`, `permission_required(...)`,
/// `flask_login.login_required`, `hasRole('ADMIN')` to their first identifier
/// name the matcher target.
/// name, the matcher target.
fn leading_ident_text(node: Node<'_>, code: &[u8]) -> Option<String> {
let mut cur = node;
loop {
@ -56,7 +56,7 @@ fn normalize_decorator_name(raw: &str) -> String {
let trimmed = raw.trim();
let trimmed = trimmed.trim_start_matches(':').trim_start_matches('@');
// If a call syntax leaked through (e.g. `UseGuards(AuthGuard)`), keep only
// the head callers that want the arg handle it separately.
// the head, callers that want the arg handle it separately.
let head = trimmed
.split(['(', ' ', '\t', '\n'])
.next()
@ -115,7 +115,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
/// are `decorator` nodes containing an `identifier` or `call` expression.
/// - **JS/TS**: decorators attach to `method_definition` children or appear
/// as siblings inside `class_body`; stage-3 decorators use `decorator` nodes.
/// `@UseGuards(AuthGuard)` we include the call args too.
/// `@UseGuards(AuthGuard)`, we include the call args too.
/// - **Java**: annotations live in the `modifiers` child of `method_declaration`;
/// kinds are `marker_annotation` / `annotation`.
/// - **Rust**: `function_item` has `attribute_item` siblings (outer `#[..]`).
@ -127,7 +127,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
/// at class body scope applies to every method in the class. `only:` /
/// `except:` hash args scope the filter to the listed action names; the
/// filter is only recorded for the current method when the scope matches.
/// Conditional filters (`if:` / `unless:`) are not honored those require
/// Conditional filters (`if:` / `unless:`) are not honored, those require
/// predicate evaluation and are deferred.
pub(super) fn extract_auth_decorators<'a>(
func_node: Node<'a>,
@ -379,12 +379,12 @@ pub(super) fn extract_auth_decorators<'a>(
}
/// If a Ruby statement is `before_action :name` (or `before_filter :name`),
/// push the normalized filter name into `out` honoring any `only:` / `except:`
/// push the normalized filter name into `out`, honoring any `only:` / `except:`
/// hash arguments against `method_name`.
///
/// Positional symbol args (`before_action :a, :b, only: [:x]`) all share the
/// single trailing scope. Conditional filters (`if:` / `unless:`) are not
/// honored here those require predicate evaluation and are deferred.
/// honored here, those require predicate evaluation and are deferred.
fn collect_ruby_before_action(
node: Node<'_>,
code: &[u8],
@ -499,7 +499,7 @@ fn collect_ruby_before_action(
/// Parse a single `only:` / `except:` hash pair and append the symbol list into
/// the corresponding out-vec. Sets the `*_present` flag when the key is seen,
/// regardless of whether the value parses into any symbols treating
/// regardless of whether the value parses into any symbols, treating
/// `only: []` as "no actions match" is safer than ignoring the scope.
fn collect_ruby_filter_pair(
pair_node: Node<'_>,

View file

@ -1,26 +1,28 @@
//! Phase 6.1: per-language DTO definition collectors.
//! Per-language DTO definition collectors.
//!
//! Walks a parsed file's AST and emits `(class_name, DtoFields)` pairs
//! for class / interface / struct / Pydantic-model declarations whose
//! field types resolve to a recognised [`TypeKind`].
//!
//! Strictly additive: classes whose fields cannot be classified produce
//! a `DtoFields` with an empty `fields` map the caller must decide
//! a `DtoFields` with an empty `fields` map, the caller must decide
//! whether to use that as a "Dto with no inferred fields" or fall back
//! to the pre-Phase-6 Object/Unknown classification.
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use tree_sitter::Node;
use super::helpers::text_of;
use super::params::{java_type_to_kind, python_primitive_to_kind, ts_type_to_kind};
use super::params::{
java_type_to_kind, python_primitive_to_kind, ts_type_to_kind, ts_type_to_local_collection,
};
use crate::ssa::type_facts::{DtoFields, TypeKind};
/// Collect all DTO-shaped class definitions in a parsed file.
///
/// Dispatches per-language; returns an empty map for languages without
/// a Phase 6 collector (Go, Ruby, PHP, C/C++ — DTOs in those ecosystems
/// a collector (Go, Ruby, PHP, C/C++; DTOs in those ecosystems
/// either don't follow framework conventions Nyx tracks today, or are
/// already covered by other type-inference paths).
pub(super) fn collect_dto_classes(
@ -39,6 +41,55 @@ pub(super) fn collect_dto_classes(
out
}
/// Gather the names of same-file TS / JS type aliases whose target is a
/// local collection (`type X = Map<...>`, `Set<...>`, or `T[]`).
///
/// The param classifier uses the returned set to resolve a parameter
/// typed through an alias, e.g. `m: ElementsMap` where
/// `type ElementsMap = Map<K, V>`, to [`TypeKind::LocalCollection`].
///
/// Returns an empty set for every language other than TS / JS.
///
/// Only aliases declared in the file being analysed are seen. Cross-file
/// aliases would need a multi-file type-resolution pipeline, which does
/// not exist for TS yet. For example, Excalidraw declares
/// `type ElementsMap = Map<...>` in `packages/element/src/types.ts`, so
/// files that import that alias without a same-file copy still produce
/// the original FP. Most real-repo aliases in the FP cluster were
/// declared in the same file as their consumers (see fixture).
pub(super) fn collect_type_alias_local_collections(
    root: Node<'_>,
    lang: &str,
    code: &[u8],
) -> HashSet<String> {
    let mut aliases: HashSet<String> = HashSet::new();
    match lang {
        // Only the TS / JS language tags are scanned for alias declarations.
        "typescript" | "ts" | "javascript" | "js" => {
            collect_ts_type_alias_local_collections(root, code, &mut aliases);
        }
        _ => {}
    }
    aliases
}
/// Record in `out` the name of each `type_alias_declaration` whose `value`
/// text is accepted by [`ts_type_to_local_collection`].
///
/// Every node that `walk` visits under `root` is inspected. Declarations
/// missing a `name` or `value` field, or whose text cannot be read, are
/// skipped silently.
fn collect_ts_type_alias_local_collections(root: Node<'_>, code: &[u8], out: &mut HashSet<String>) {
    walk(root, &mut |node| {
        if node.kind() != "type_alias_declaration" {
            return;
        }
        // Left-hand side of `type <name> = <value>`.
        let Some(alias) = node
            .child_by_field_name("name")
            .and_then(|name| text_of(name, code))
        else {
            return;
        };
        // Right-hand side; the alias is kept only when the trimmed text
        // classifies as a local collection.
        let target = node
            .child_by_field_name("value")
            .and_then(|value| text_of(value, code));
        if let Some(target) = target {
            if ts_type_to_local_collection(target.trim()).is_some() {
                out.insert(alias);
            }
        }
    });
}
// ─────────────────────────────────────────────────────────────────────
// Java
// ─────────────────────────────────────────────────────────────────────
@ -163,7 +214,7 @@ fn extract_ts_property<'a>(node: Node<'a>, code: &'a [u8]) -> Option<(String, Ty
let name_node = node.child_by_field_name("name")?;
let field_name = text_of(name_node, code)?;
let type_anno = node.child_by_field_name("type")?;
// type_annotation node text is `: T` walk to the inner type.
// type_annotation node text is `: T`, walk to the inner type.
let type_text = type_anno
.named_child(0)
.and_then(|t| text_of(t, code))
@ -193,7 +244,7 @@ fn collect_rust(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFields
return;
};
if body.kind() != "field_declaration_list" {
// Tuple struct or unit struct no named fields.
// Tuple struct or unit struct, no named fields.
return;
}
let mut fields = DtoFields::new(class_name.clone());
@ -291,7 +342,7 @@ fn collect_python(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFiel
/// Conservative supertype scan: returns true when the class definition
/// has a superclass list whose text mentions `BaseModel` (covers both
/// `BaseModel` and `pydantic.BaseModel`). No false positives on
/// non-Pydantic classes named `BaseModel`-something match is on the
/// non-Pydantic classes named `BaseModel`-something, match is on the
/// full token, not a substring.
fn python_inherits_basemodel<'a>(class_node: Node<'a>, code: &'a [u8]) -> bool {
let Some(supers) = class_node.child_by_field_name("superclasses") else {
@ -418,7 +469,7 @@ mod tests {
"#;
let dtos = collect("rust", src);
// Tuple structs have no named fields and must NOT produce a
// DtoFields entry — Phase 6 only handles named-field DTOs.
// DtoFields entry; this collector only handles named-field DTOs.
assert!(!dtos.contains_key("Wrap"));
}

View file

@ -19,11 +19,11 @@ pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option<String> {
///
/// For `Runtime.getRuntime().exec(cmd)`, the receiver of `exec` is the call
/// `Runtime.getRuntime()`. This function drills through that to return
/// `"Runtime"` the outermost non-call object. This lets labels like
/// `"Runtime"`, the outermost non-call object. This lets labels like
/// `"Runtime.exec"` match correctly.
pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<String> {
match lookup(lang, n.kind()) {
// The receiver is itself a call drill into ITS receiver.
// The receiver is itself a call, drill into ITS receiver.
// e.g. for `Runtime.getRuntime()`, the object is `Runtime`.
Kind::CallFn | Kind::CallMethod => {
let inner = n
@ -53,7 +53,7 @@ pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<Str
/// identifier (e.g. call expressions, subscripts, `this`/`self`, etc.).
pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
let mut cur = n;
// Bounded walk tree-sitter can nest deeply but we only need a handful
// Bounded walk, tree-sitter can nest deeply but we only need a handful
// of hops for real code.
for _ in 0..16 {
match cur.kind() {
@ -68,7 +68,7 @@ pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
cur = cur.child_by_field_name("value")?;
}
// Drill through nested calls / method chains to find the base
// identifier. E.g. `Connection::open(p).unwrap().execute(...)`
// identifier. E.g. in `Connection::open(p).unwrap().execute(...)`,
// the receiver of `.execute` is the `.unwrap()` call whose
// object is `Connection::open(p)`; we want the leftmost plain
// identifier the chain resolves to (for SSA var_stacks lookup).
@ -212,7 +212,7 @@ pub(crate) fn first_call_ident_with_span<'a>(
return ident.map(|s| (s, span));
}
Kind::Function => {
// Do not descend into nested function/lambda bodies
// Do not descend into nested function/lambda bodies;
// they are separate scopes and should not contribute
// callee identifiers to the parent expression.
continue;
@ -240,7 +240,7 @@ pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> O
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
///
/// Returns `(callee_text, label, span)` where `span` is the byte range of the
/// inner call node itself used to populate `CallMeta.callee_span` so that
/// inner call node itself, used to populate `CallMeta.callee_span` so that
/// display sites can report the actual call location rather than the enclosing
/// statement's span.
pub(crate) fn find_classifiable_inner_call<'a>(
@ -251,7 +251,7 @@ pub(crate) fn find_classifiable_inner_call<'a>(
) -> Option<(String, DataLabel, (usize, usize))> {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
// Do not descend into Kind::Function nodes they will be extracted
// Do not descend into Kind::Function nodes, they will be extracted
// as separate BodyCfg entries and should not contribute inner callees
// to the parent expression.
if lookup(lang, c.kind()) == Kind::Function {
@ -329,7 +329,7 @@ pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option<String> {
match n.kind() {
"member_expression" | "attribute" | "selector_expression" => {
// Tree-sitter exposes the receiver under `object` (JS/TS, Python),
// `value` (Rust field_expression handled in the matching arm
// `value` (Rust field_expression, handled in the matching arm
// above), or `operand` (Go selector_expression). Without the
// `operand` fallback, Go member access like `r.Body` collapsed to
// just the trailing field (`Body`), so source rules keyed on the
@ -442,7 +442,7 @@ pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
/// This finds anonymous functions / arrow functions / closures that are
/// passed as arguments to a call and should be analysed as separate
/// function scopes. Only direct function-argument children are collected
/// (not functions nested inside other functions those get handled when
/// (not functions nested inside other functions, those get handled when
/// the outer function is recursed into).
pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
let mut funcs = Vec::new();
@ -558,7 +558,7 @@ pub(crate) fn derive_anon_fn_name_from_context<'a>(
}
// Python: `h = lambda: ...` parents as `assignment`, handled above.
// Python `default_parameter` assigning `def foo(x=lambda: 0)` ambiguous, skip.
// Python `default_parameter` assigning `def foo(x=lambda: 0)`, ambiguous, skip.
_ => {
// Some grammars wrap the RHS in an `expression`, `expression_list`,
// or similar node between the binding site and the function literal.
@ -709,7 +709,7 @@ pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
}
}
/// Pointer-Phase 6 / W5: AST kind names for subscript / index expressions
/// AST kind names for subscript / index expressions
/// across the languages whose container-element flow we model.
///
/// JS/TS use `subscript_expression`; Python uses `subscript`; Go uses
@ -724,7 +724,7 @@ pub(crate) fn is_subscript_kind(kind: &str) -> bool {
)
}
/// Pointer-Phase 6 / W5: when the LHS of an assignment statement is a
/// When the LHS of an assignment statement is a
/// subscript / index expression (or a single-element wrapper around
/// one), return that node. Returns `None` for multi-target Go
/// `expression_list`s, identifier LHSs, member-expression LHSs, etc.
@ -745,10 +745,10 @@ pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option<Node<'
None
}
/// Pointer-Phase 6 / W5: extract `(array_text, index_text)` from a
/// Extract `(array_text, index_text)` from a
/// subscript / index AST node.
///
/// Returns `None` when the array operand is not a plain identifier we
/// Returns `None` when the array operand is not a plain identifier, we
/// only synthesise `__index_get__` / `__index_set__` calls when the
/// receiver resolves cleanly to a SSA-renamed local, since the W2/W4
/// container hooks need a stable receiver var_name to drive
@ -771,7 +771,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
n.named_children(&mut cur).nth(1)
})?;
let arr_kind = arr.kind();
// Only proceed when the array is a plain identifier otherwise
// Only proceed when the array is a plain identifier, otherwise
// we can't bind a stable receiver name for the synth Call.
if !matches!(
arr_kind,
@ -780,7 +780,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
return None;
}
let arr_text = text_of(arr, code)?;
// PHP-style `$x` strip not needed here Go/JS/Python don't use it.
// PHP-style `$x` strip not needed here, Go/JS/Python don't use it.
let idx_text = text_of(idx, code)?;
Some((arr_text, idx_text))
}

View file

@ -1,4 +1,4 @@
//! Phase 6: per-language class / trait / interface hierarchy extraction.
//! Per-language class / trait / interface hierarchy extraction.
//!
//! Walks a parsed file's AST and emits `(sub_container, super_container)`
//! pairs for every declared inheritance / impl / implements relationship.
@ -47,7 +47,7 @@ pub(crate) fn collect_hierarchy_edges(
"php" => collect_php(root, code, &mut push),
"cpp" | "c++" => collect_cpp(root, code, &mut push),
// Go: structural / implicit interface satisfaction is intractable
// per-file; Phase 6 deliberately skips it.
// per-file; deliberately skipped.
// C: no inheritance.
_ => {}
}
@ -70,7 +70,7 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
let Some(sub) = text_of(name_node, code) else {
return;
};
// `superclass` field on class_declaration singular `extends Y`.
// `superclass` field on class_declaration, singular `extends Y`.
if let Some(superclass) = node.child_by_field_name("superclass") {
let mut cursor = superclass.walk();
for c in superclass.named_children(&mut cursor) {
@ -79,13 +79,13 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
}
}
}
// `interfaces` field on class_declaration `implements I, J`
// `interfaces` field on class_declaration, `implements I, J`
// wraps a `super_interfaces` → `type_list`.
if let Some(ifaces) = node.child_by_field_name("interfaces") {
collect_java_type_list(ifaces, code, &sub, push);
}
// `extends_interfaces` is an unnamed child on
// interface_declaration `extends Foo, Bar` for an
// interface_declaration, `extends Foo, Bar` for an
// interface. Walk children directly since it's not a field.
let mut cursor = node.walk();
for c in node.named_children(&mut cursor) {
@ -123,7 +123,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"type_identifier" | "identifier" => text_of(n, code),
"generic_type" => {
// `Foo<T>` the leading child is the bare type identifier.
// `Foo<T>`, the leading child is the bare type identifier.
let mut cursor = n.walk();
for c in n.named_children(&mut cursor) {
if matches!(
@ -136,7 +136,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
None
}
"scoped_type_identifier" => {
// `pkg.Foo` return last segment.
// `pkg.Foo`, return last segment.
text_of(n, code).map(|s| {
let last = s.rsplit('.').next().unwrap_or(&s);
last.to_string()
@ -152,7 +152,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
/// Walk for `impl_item` nodes and emit edges from the concrete type to
/// the trait being implemented. Inherent impls (`impl Foo {}`) emit
/// no edge there is no super-trait relationship to record.
/// no edge, there is no super-trait relationship to record.
fn collect_rust<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mut F) {
walk(root, &mut |node| {
if node.kind() != "impl_item" {
@ -179,7 +179,7 @@ fn rust_path_leaf(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"type_identifier" | "identifier" => text_of(n, code),
"scoped_type_identifier" | "scoped_identifier" => {
// `crate::foo::Bar` last segment.
// `crate::foo::Bar`, last segment.
let s = text_of(n, code)?;
Some(s.rsplit("::").next().unwrap_or(&s).to_string())
}
@ -286,12 +286,12 @@ fn collect_python<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &
let Some(superclasses) = node.child_by_field_name("superclasses") else {
return; // no parents
};
// `superclasses` is an `argument_list` each non-keyword
// `superclasses` is an `argument_list`, each non-keyword
// argument is a base class.
let mut cursor = superclasses.walk();
for arg in superclasses.named_children(&mut cursor) {
if let Some(t) = python_base_text(arg, code) {
// Skip Python `object` not informative.
// Skip Python `object`, not informative.
if t != "object" {
push(sub.clone(), t);
}
@ -304,7 +304,7 @@ fn python_base_text(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"identifier" => text_of(n, code),
"attribute" => {
// `pkg.Base` last segment.
// `pkg.Base`, last segment.
let s = text_of(n, code)?;
Some(s.rsplit('.').next().unwrap_or(&s).to_string())
}
@ -474,7 +474,7 @@ mod tests {
let src = "interface Mine extends Foo, Bar {}";
let edges = collect("java", src);
// tree-sitter-java models `extends` on interface as `extends_interfaces`
// rooted at the same node at least one of the parents should land.
// rooted at the same node, at least one of the parents should land.
assert!(
edges.iter().any(|(s, _)| s == "Mine"),
"interface extends should emit at least one edge; got {edges:?}"
@ -516,8 +516,8 @@ mod tests {
#[test]
fn python_class_object_base_skipped() {
// Inheriting from `object` is not informative Python's
// implicit root. Phase 6 omits these edges to keep the
// Inheriting from `object` is not informative, Python's
// implicit root. We omit these edges to keep the
// hierarchy index focused on user-defined relationships.
let src = "class Plain(object):\n pass\n";
let edges = collect("python", src);

View file

@ -12,7 +12,7 @@ use tree_sitter::{Node, Tree};
/// - ES6: `import { A as B } from 'mod'` → B → ImportBinding { original: A, module: mod }
/// - CommonJS: `const { A: B } = require('mod')` → B → ImportBinding { original: A, module: mod }
///
/// Only aliased (renamed) bindings are recorded same-name imports (e.g.
/// Only aliased (renamed) bindings are recorded, same-name imports (e.g.
/// `import { exec }`) are already resolvable by their original name.
pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBindings {
let mut bindings = ImportBindings::new();
@ -149,7 +149,7 @@ pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBinding
continue;
}
// The alias is accessed via the "alias" field (a `name` node).
// The qualified name has no field find it by kind.
// The qualified name has no field, find it by kind.
let alias_node = clause.child_by_field_name("alias");
let mut c2 = clause.walk();
let qname_node = clause

View file

@ -45,7 +45,7 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
/// identifiers lifted from pair values whose key matches any entry in
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
/// pairs are present, returns `Some(vec![])` the sink is effectively
/// pairs are present, returns `Some(vec![])`, the sink is effectively
/// silenced because no destination identifier exists.
/// * `None` if the arg is absent, is not an object literal (plain string
/// / ident / expression), or has splat/spread children that break static
@ -77,7 +77,7 @@ pub(super) fn extract_destination_field_idents(
match child.kind() {
// `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't
// statically attribute spread contents to specific fields, so
// bail out caller falls back to the whole-arg filter, matching
// bail out, caller falls back to the whole-arg filter, matching
// the conservative posture used by arg_uses for splats.
"spread_element" | "dictionary_splat" => {
return None;
@ -107,7 +107,7 @@ pub(super) fn extract_destination_field_idents(
}
}),
// Computed keys like `[someVar]` can't be statically
// resolved skip (conservative: not a destination field).
// resolved, skip (conservative: not a destination field).
"computed_property_name" => continue,
_ => text_of(key_node, code),
};
@ -200,7 +200,7 @@ pub(super) fn extract_const_keyword_arg(
continue;
}
let value_node = child.child_by_field_name("value")?;
// Only return a literal identifiers / calls / complex exprs are
// Only return a literal, identifiers / calls / complex exprs are
// "dynamic" and must be reported as `None` so the gate can
// distinguish literal-safe from dynamic.
return match value_node.kind() {
@ -252,7 +252,7 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8])
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
/// `arg0`). Returns `None` when the call has no arguments.
///
/// Used by per-language shape-aware sink suppression for example, Ruby
/// Used by per-language shape-aware sink suppression, for example, Ruby
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
/// regardless of taint reaching that argument.
@ -268,7 +268,7 @@ pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bo
/// Walk a Java method-chain receiver looking for an inner `method_invocation`
/// whose method name matches one of `target_methods` (e.g. `createQuery`,
/// `prepareStatement`). Returns the kind of that inner call's arg 0 used
/// `prepareStatement`). Returns the kind of that inner call's arg 0, used
/// to verify the SQL-bearing call up-chain was given a string literal rather
/// than a concatenation / method call.
///
@ -307,7 +307,7 @@ pub(super) fn java_chain_arg0_kind_for_method(
/// method identifier matches one of `target_methods`, then return that
/// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node
/// represents a chained expression like `Model.where(...).preload(...).to_a`
/// the outermost call (`to_a`) has no arguments, so the shape suppressor
/// where the outermost call (`to_a`) has no arguments, so the shape suppressor
/// must reach down the chain to inspect `where`'s arg 0.
///
/// Conservative: returns `None` if the chain doesn't contain a matching
@ -353,6 +353,116 @@ fn subtree_has_interpolation(n: Node) -> bool {
n.named_children(&mut cursor).any(subtree_has_interpolation)
}
/// Returns `true` when the callee expression `function` (the `function`
/// field of a JS/TS `call_expression`) names one of `targets`.
///
/// A target matches when it equals the callee's `property` text, the text
/// after the callee's last `.`, or the callee's full text, or when the full
/// text ends with `.<target>` (which lets dotted targets such as `db.query`
/// match a longer path).
fn js_callee_matches_any(function: Node, targets: &[&str], code: &[u8]) -> bool {
    let prop_text = function
        .child_by_field_name("property")
        .and_then(|p| text_of(p, code));
    let full_text = text_of(function, code);
    let prop = prop_text.as_deref();
    let full = full_text.as_deref();
    // Bare-identifier callees have no `.`, so the leaf is the whole text.
    let leaf = full.map(|s| s.rsplit('.').next().unwrap_or(s));
    targets.iter().any(|&target| {
        prop == Some(target)
            || leaf == Some(target)
            || full == Some(target)
            // Same as `ends_with(".{target}")`, without building a string.
            || full.is_some_and(|s| {
                s.strip_suffix(target)
                    .is_some_and(|head| head.ends_with('.'))
            })
    })
}

/// Walk a JS/TS method chain down its receiver side to find an inner
/// `call_expression` whose callee matches one of `target_methods` (e.g.
/// `query`, `execute`). Returns [`arg0_kind_and_interpolation`] of that
/// inner call.
///
/// Used to recognise ORM-accessor chains where a labelled SQL sink sits on
/// the receiver side of a parameterised execute method:
/// `strapi.db.query('admin::api-token').findOne({...})`. The outer call
/// (`findOne`) is the CFG node; the inner labelled `db.query` call carries
/// the literal model UID that proves the chain is parameterised.
///
/// Conservative: returns `None` when no matching inner call is found, so
/// callers fall through to the no-suppression path. The walk stops at the
/// first link that is not a `call_expression` or has no `object` receiver.
pub(super) fn js_chain_arg0_kind_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<(String, bool)> {
    let mut current = expr;
    loop {
        let call = unwrap_parens(current);
        // tree-sitter-typescript / -javascript: call_expression with fields
        // `function` (member_expression / identifier) and `arguments`.
        if call.kind() != "call_expression" {
            return None;
        }
        let function = call.child_by_field_name("function")?;
        if js_callee_matches_any(function, target_methods, code) {
            return arg0_kind_and_interpolation(call);
        }
        // Step down the receiver spine: `function.object` is the prior
        // link in the chain.
        current = function.child_by_field_name("object")?;
    }
}

/// Walk the receiver chain of a JS/TS call looking for an inner call whose
/// callee matches one of `target_inner` (e.g. `query`, `execute`). When one
/// is found, returns the name of the method invoked directly on that inner
/// call's result (e.g. `findOne`); otherwise returns `None`.
///
/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain
/// shape `<inner>.query(LITERAL).<orm_method>(...)`. A bare
/// `connection.query("SELECT ...")` returns `None` because there is no
/// outer chain method.
///
/// For `a.b().c().d()`, `d` is outermost and `c` is next; the target may be
/// at `b` or further in. The walk stops as soon as a receiver is not itself
/// a `call_expression`.
pub(super) fn js_chain_outer_method_for_inner<'a>(
    outer: Node<'a>,
    target_inner: &[&str],
    code: &'a [u8],
) -> Option<String> {
    let mut current = outer;
    loop {
        let call = unwrap_parens(current);
        if call.kind() != "call_expression" {
            return None;
        }
        let function = call.child_by_field_name("function")?;
        let object = function.child_by_field_name("object")?;
        if object.kind() != "call_expression" {
            return None;
        }
        // If the receiver call is the labelled inner call, the method we
        // are looking for is this call's own `function.property`.
        let inner_matches = object
            .child_by_field_name("function")
            .is_some_and(|inner| js_callee_matches_any(inner, target_inner, code));
        if inner_matches {
            return function
                .child_by_field_name("property")
                .and_then(|p| text_of(p, code).map(|name| name.to_string()));
        }
        current = object;
    }
}
/// For a chained method call (`a.b().c().d()`), walk down the receiver
/// chain (`function.object`) and return the innermost call_expression
/// alongside its callee text (e.g. `"http.get"`).
@ -385,7 +495,7 @@ pub(super) fn find_chained_inner_call<'a>(
return None;
}
// Recurse: the inner call may itself be chained
// (`axios.get(u).then(h).catch(h)` innermost is `axios.get`).
// (`axios.get(u).then(h).catch(h)`, innermost is `axios.get`).
if let Some(inner) = find_chained_inner_call(object, lang, code) {
return Some(inner);
}
@ -398,7 +508,7 @@ pub(super) fn find_chained_inner_call<'a>(
.or_else(|| object.child_by_field_name("name"))?;
// Multi-line dotted member expressions (`http\n .get`) include
// formatting whitespace in the source-text slice. The labels map
// keys are literal `"http.get"` etc. strip whitespace so the
// keys are literal `"http.get"` etc., strip whitespace so the
// chained-call inner-gate rebinding fires for both single-line and
// multi-line chain styles. Also strips `\r` for CRLF sources.
// Motivated by upstream Parse Server CVE-2025-64430 which uses the
@ -410,18 +520,18 @@ pub(super) fn find_chained_inner_call<'a>(
/// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod
/// node) and yield each *named argument* of every inner call along the
/// way. Outer's own arguments are NOT included the caller already
/// way. Outer's own arguments are NOT included, the caller already
/// handles those via the standard `pre_emit_arg_source_nodes` pass over
/// `outer.arguments`.
///
/// For `json.NewDecoder(r.Body).Decode(emoji)`:
/// outer = `.Decode(emoji)` caller iterates `emoji`
/// inner = `json.NewDecoder(r.Body)` yielded arg: `r.Body`
///   outer = `.Decode(emoji)`: caller iterates `emoji`
///   inner = `json.NewDecoder(r.Body)`: yielded arg is `r.Body`
///
/// We only pull from each inner call's `arguments` field, never from its
/// `function`/`method`/receiver expressions. That distinction matters
/// because chained source-receivers like `r.URL.Query()` expose a
/// member-text path that classifies as a Source but it's the OUTER
/// member-text path that classifies as a Source, but it's the OUTER
/// chain text (`r.URL.Query.Get`) that already classifies, so emitting
/// a synth source for the inner-call's own callee would double-count.
///
@ -498,7 +608,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
return false;
}
let first_arg = named[0];
// Extract the raw text of arg 0 must be a string literal or
// Extract the raw text of arg 0, must be a string literal or
// template string without interpolation.
let query_text = match first_arg.kind() {
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
@ -511,7 +621,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
.named_children(&mut c)
.any(|ch| ch.kind() == "template_substitution")
{
return false; // dynamic not safe
return false; // dynamic, not safe
}
text_of(first_arg, code)
}
@ -534,7 +644,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
/// - `$1`, `$2`, …, `$N` (PostgreSQL positional)
/// - `?` (MySQL / SQLite positional)
/// - `%s` (Python DB-API / psycopg2)
/// - `:identifier` (Oracle / named parameters) requires the colon to be
/// - `:identifier` (Oracle / named parameters), requires the colon to be
/// preceded by a space or `=` (to avoid matching JS ternary / object
/// literals).
pub(super) fn has_sql_placeholders(s: &str) -> bool {
@ -559,7 +669,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
&& i + 1 < len
&& bytes[i + 1].is_ascii_alphabetic() =>
{
// :identifier must be preceded by whitespace/= to avoid
// :identifier, must be preceded by whitespace/= to avoid
// false positives on object literals or ternary operators.
return true;
}
@ -581,7 +691,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
#[allow(clippy::only_used_in_recursion)]
pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
match node.kind() {
// Scalar strings but reject if they contain interpolation
// Scalar strings, but reject if they contain interpolation
// (e.g. Ruby `"hello #{name}"`, Python f-strings).
"string"
| "string_literal"
@ -602,7 +712,7 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
// PHP encapsed_string: safe only if no variable interpolation
"encapsed_string" => !has_interpolation_cfg(node),
// Wrapper: PHP/Go wrap each arg in an `argument` node unwrap
// Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap
"argument" => {
node.named_child_count() == 1
&& node
@ -765,7 +875,7 @@ pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool {
return false;
}
}
// Zero-arg calls are not "all literal" taint can still flow via a
// Zero-arg calls are not "all literal", taint can still flow via a
// non-literal receiver (e.g. `tainted.readObject()`), and the sink-
// suppression gate (`info.all_args_literal`) must not skip these.
if !any_arg {
@ -781,7 +891,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
let kind = child.kind();
// Skip argument lists those are checked by the caller.
// Skip argument lists, those are checked by the caller.
if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" {
continue;
}
@ -804,7 +914,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
/// Returns one `Vec<String>` per argument (in parameter-position order).
/// Returns empty if argument list can't be found or contains spread/keyword args.
pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>> {
// Ruby `subshell` (backticks) has no `arguments` field its children are
// Ruby `subshell` (backticks) has no `arguments` field, its children are
// string fragments and `interpolation` nodes. Lift each interpolation's
// identifiers into a positional arg so taint flows from `#{var}` into the
// synthetic "subshell" sink.
@ -834,7 +944,7 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
for child in args_node.named_children(&mut cursor) {
let kind = child.kind();
// Named / keyword arguments are tracked separately in `CallMeta.kwargs`
// and do not participate in positional indexing skip them here so
// and do not participate in positional indexing, skip them here so
// `arg_uses` remains strictly positional. Splats (spread/dict splat)
// still invalidate positional mapping; bail out in that case.
if kind == "spread_element"
@ -1058,13 +1168,13 @@ pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
/// (non-method-chain) Go shape. The caller wires the resulting cap into
/// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the
/// taint engine consumes via the standard sanitizer pathway taint flows
/// taint engine consumes via the standard sanitizer pathway, taint flows
/// in on `s`, the matching cap is stripped from the result.
pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
if call_ast.kind() != "call_expression" {
return None;
}
// The call's `function` field is a `selector_expression` `operand`
// The call's `function` field is a `selector_expression`, `operand`
// is the package ident (`strings`), `field` is the method ident.
let func = call_ast.child_by_field_name("function")?;
if func.kind() != "selector_expression" {
@ -1085,7 +1195,7 @@ pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> O
let new_lit = extract_const_string_arg(call_ast, 2, code)?;
// If the replacement itself reintroduces a dangerous sequence, don't
// credit the strip matches the Rust chain detector's policy.
// credit the strip, matches the Rust chain detector's policy.
if !caps_stripped_by_literal_pattern(&new_lit).is_empty() {
return None;
}
@ -1106,7 +1216,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
}
match lookup(lang, n.kind()) {
Kind::Function => {
// Function/closure expression passed as argument return the same
// Function/closure expression passed as argument, return the same
// synthetic anon name used by build_sub so callback_bindings and
// source_to_callback can match it to the extracted BodyCfg.
n.child_by_field_name("name")
@ -1155,7 +1265,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
/// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`].
///
/// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces
/// an empty vector positional indices past the splat are meaningless and
/// an empty vector, positional indices past the splat are meaningless and
/// downstream passes already treat an empty vector as "no info".
pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<Option<String>> {
let Some(args_node) = call_node.child_by_field_name("arguments") else {
@ -1175,7 +1285,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
return Vec::new();
}
// Named / keyword arguments are tracked separately in `kwargs` and
// don't participate in positional indexing skip them here so this
// don't participate in positional indexing, skip them here so this
// vector stays aligned with `arg_uses`.
if kind == "keyword_argument" || kind == "named_argument" {
continue;
@ -1198,7 +1308,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
| "raw_string_literal"
// PHP's double-quoted form (single-quoted maps to `string`).
// Only safe to lift when there is no `encapsed_string` /
// `embedded_expression` interpolation child checked below.
// `embedded_expression` interpolation child, checked below.
| "encapsed_string" => {
let raw = text_of(target, code);
raw.and_then(|s| strip_literal_quotes(&s, target, code))
@ -1212,7 +1322,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
/// Strip surrounding quotes from a syntactic string literal, resolving the
/// `string_content` child for Rust-style two-level string nodes. Returns the
/// raw inner text (no escape-sequence processing) sufficient for whitelist
/// raw inner text (no escape-sequence processing), sufficient for whitelist
/// matching against shell-metachar sets.
pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option<String> {
// Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child.
@ -1320,7 +1430,7 @@ pub(super) fn def_use(
// Python/Ruby `expression_statement` → `assignment`)
let mut cursor = ast.walk();
for child in ast.children(&mut cursor) {
// Only use left/right fields for actual assignment nodes binary
// Only use left/right fields for actual assignment nodes, binary
// expressions also have left/right but are not definitions.
let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
let child_name = child
@ -1403,7 +1513,7 @@ pub(super) fn def_use(
(defs, uses, vec![])
}
// iflet / whilelet the `let_condition` binds a variable from
// iflet / whilelet, the `let_condition` binds a variable from
// the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
// defines `cmd` and uses `env`, `var`, `CMD`.
Kind::If | Kind::While => {
@ -1418,7 +1528,7 @@ pub(super) fn def_use(
let mut tmp = Vec::<String>::new();
collect_idents(pat, code, &mut tmp);
// The first plain identifier in the pattern is the binding.
// Skip type identifiers (e.g. "Ok" in Ok(cmd)) take the
// Skip type identifiers (e.g. "Ok" in Ok(cmd)), take the
// last ident which is the inner binding name.
defs = tmp.into_iter().last();
}

View file

@ -14,6 +14,7 @@ use crate::labels::{
};
use crate::summary::FuncSummary;
use crate::symbol::{FuncKey, Lang};
use crate::utils::snippet::truncate_at_char_boundary;
use smallvec::SmallVec;
use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
@ -54,8 +55,8 @@ use literals::{
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, ruby_chain_arg0_for_method,
walk_chain_inner_call_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
use params::{
compute_container_and_kind, extract_param_meta, inject_framework_param_sources,
@ -74,7 +75,7 @@ pub fn extract_param_meta_for_test<'a>(
}
/// Test-only helper to populate the per-file DTO class map without
/// running `build_cfg`. Used by the Phase 6 audit harness in
/// running `build_cfg`. Used by the DTO audit harness in
/// `tests/typed_extractors_audit.rs` to verify that
/// `classify_param_type_*` resolves a same-file DTO via the
/// thread-local map.
@ -91,30 +92,26 @@ pub fn clear_dto_classes_for_test() {
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
}
// -------------------------------------------------------------------------
// Structural DFS index for function bodies
// -------------------------------------------------------------------------
//
// Per-file map of function-node start_byte → depth-first preorder index.
// Populated at the start of `build_cfg`, consumed by every site that
// previously formatted `<anon@{start_byte}>` or stored `start_byte` as
// the disambig. The DFS index is stable against edits elsewhere in the
// file (inserting a line above a function does not change its index).
//
// Thread-local is safe because `build_cfg` is not re-entrant within a
// single rayon worker: each file is parsed and CFG-built to completion
// before the next one starts.
// Per-file map of function-node start_byte → DFS preorder index. Stable
// against unrelated edits (inserting a line above a function doesn't
// change its index). Thread-local is safe because `build_cfg` is not
// re-entrant within a single rayon worker.
thread_local! {
static FN_DFS_INDICES: RefCell<HashMap<usize, u32>> = RefCell::new(HashMap::new());
/// Phase 6: per-file DTO class definitions. Populated at the top
/// of [`build_cfg`] by [`dto::collect_dto_classes`] so per-parameter
/// classifiers can resolve `@RequestBody T dto` /
/// `Json<CreateUser>` / `Annotated[CreateUser, Body()]` to a
/// [`crate::ssa::type_facts::TypeKind::Dto`] when the DTO type is
/// declared in the same file. Cleared at the end of `build_cfg`
/// so thread-local state never leaks between files.
/// Per-file DTO class definitions, populated at the top of
/// [`build_cfg`] so per-parameter classifiers can resolve typed
/// extractors against same-file DTOs.
pub(crate) static DTO_CLASSES: RefCell<HashMap<String, crate::ssa::type_facts::DtoFields>>
= RefCell::new(HashMap::new());
/// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` /
/// `Array<...>` / `T[]`) aliases, populated at the top of
/// [`build_cfg`]. Lets `classify_param_type_ts` resolve a
/// parameter typed `m: ElementsMap` to
/// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via
/// same-file alias lookup. Cross-file aliases are not yet
/// resolved.
pub(crate) static TYPE_ALIAS_LC: RefCell<std::collections::HashSet<String>>
= RefCell::new(std::collections::HashSet::new());
}
/// Populate the per-file DFS-index map from a preorder walk of the
@ -148,11 +145,8 @@ fn fn_dfs_index(start_byte: usize) -> Option<u32> {
FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied())
}
/// Synthetic name for an anonymous function. Uses the DFS index when
/// available (`<anon#N>`), falls back to the byte offset when the map
/// is empty (e.g. during tests that bypass `build_cfg`). The `#`
/// sigil is intentionally different from `@` so the two formats are
/// distinguishable by downstream consumers.
/// Synthetic name for an anonymous function: `<anon#N>` from the DFS
/// index when available, `<anon@OFFSET>` as fallback.
pub(crate) fn anon_fn_name(start_byte: usize) -> String {
match fn_dfs_index(start_byte) {
Some(idx) => format!("<anon#{idx}>"),
@ -160,9 +154,7 @@ pub(crate) fn anon_fn_name(start_byte: usize) -> String {
}
}
/// Prefix check that accepts both the new `<anon#...>` and legacy
/// `<anon@...>` formats. Used by code paths that classify whether a
/// function name came from anonymous synthesis.
/// Reports whether `name` carries one of the anonymous-function synthesis
/// prefixes, `<anon#` or `<anon@`.
pub(crate) fn is_anon_fn_name(name: &str) -> bool {
    ["<anon#", "<anon@"]
        .iter()
        .any(|prefix| name.starts_with(prefix))
}
@ -235,9 +227,9 @@ pub struct CallMeta {
///
/// CFG construction does NOT populate this field today (callee already
/// carries the full path). It is the canonical place to read the original
/// textual callee for **debug/display only** analysis code should walk
/// SSA `FieldProj` receivers (Phase 4) or use the
/// [`crate::labels::bare_method_name`] textual fallback (Phase 5).
/// textual callee for **debug/display only**, analysis code should walk
/// SSA `FieldProj` receivers or use the
/// [`crate::labels::bare_method_name`] textual fallback.
#[doc(hidden)]
#[serde(default)]
pub callee_text: Option<String>,
@ -248,14 +240,14 @@ pub struct CallMeta {
pub outer_callee: Option<String>,
/// Byte span of the inner call that supplied the classification, when
/// `find_classifiable_inner_call` overrode the outer callee. `None` when
/// the classification came from the outer AST node directly in that
/// the classification came from the outer AST node directly, in that
/// case `AstMeta.span` already points at the classified expression.
///
/// Consumers that want the location of the *labeled* call (sink/source/
/// sanitizer display, flow-step rendering, taint origin attribution)
/// should use [`NodeInfo::classification_span`] rather than reading this
/// field directly. `AstMeta.span` remains the authoritative "whole
/// statement" span used by structural passes (unreachability,
/// statement" span, used by structural passes (unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup).
#[serde(default)]
pub callee_span: Option<(usize, usize)>,
@ -283,7 +275,7 @@ pub struct CallMeta {
/// only positional arguments.
pub kwargs: Vec<(String, Vec<String>)>,
/// String-literal value at each positional argument of this call, parallel
/// to `arg_uses` `Some(s)` when the argument is a syntactic string
/// to `arg_uses`: `Some(s)` when the argument is a syntactic string
/// literal, `None` otherwise. Empty for non-call nodes or when positional
/// boundaries can't be determined. Consumed by the static-map abstract
/// analysis (and future literal-aware passes) so they don't need the
@ -302,10 +294,41 @@ pub struct CallMeta {
///
/// Takes priority over `sink_payload_args` in the SSA sink scan: when a
/// call has an object-literal destination arg, only idents under the
/// listed fields may contribute sink findings not every ident in the
/// listed fields may contribute sink findings, not every ident in the
/// positional slot.
///
/// Legacy single-gate path: populated only when this call site matched
/// exactly one gate. When a callee carries multiple gates (e.g. `fetch`
/// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in
/// [`Self::gate_filters`] and this field is left `None`.
#[serde(default)]
pub destination_uses: Option<Vec<String>>,
/// Per-gate filters for callees that carry multiple gated-sink rules.
///
/// Each entry preserves one matching gate's `(label_caps, payload_args,
/// destination_uses)` so the SSA sink scan can attribute findings
/// per-cap. Empty when the call site matches zero or exactly one gate
/// (the single-gate case continues to use [`Self::sink_payload_args`] +
/// [`Self::destination_uses`]).
#[serde(default)]
pub gate_filters: Vec<GateFilter>,
}
/// One gate's contribution at a call site whose callee matches multiple
/// gates. The SSA taint engine processes each filter independently so a
/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF
/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each
/// carrying its own cap mask rather than a conflated union.
///
/// `push_node` builds one `GateFilter` for every match returned by
/// `classify_gated_sink`; see [`CallMeta::gate_filters`] for how the
/// single-gate and multi-gate cases are distinguished.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct GateFilter {
/// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`).
/// Taken from the gate's `DataLabel::Sink` label; `push_node` stores an
/// empty mask when the gate's label is any other variant.
pub label_caps: crate::labels::Cap,
/// Argument positions that carry the tainted payload for this gate.
/// When the gate declares the `ALL_ARGS_PAYLOAD` sentinel, `push_node`
/// expands it to every positional index `0..arity` of the call, so this
/// vector is empty for a zero-argument call.
pub payload_args: Vec<usize>,
/// Destination-aware filter: when `Some(names)`, the sink check only
/// considers SSA values whose `var_name` matches one of `names` (object-
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
/// Filled from the first payload position for which
/// `extract_destination_field_idents` yields names.
pub destination_uses: Option<Vec<String>>,
}
/// Taint-classification and variable-flow metadata.
@ -349,7 +372,7 @@ pub struct NodeInfo {
///
/// This flag is scoped to taint-style sink suppression: it indicates
/// that no attacker-controlled data enters through the immediate
/// arguments. It does NOT mean the call is "safe" in general other
/// arguments. It does NOT mean the call is "safe" in general, other
/// detectors (resource lifecycle, structural analysis) may still
/// legitimately flag these calls.
pub all_args_literal: bool,
@ -411,7 +434,7 @@ pub struct NodeInfo {
pub is_eq_with_const: bool,
/// True when this node reads a numeric-length property on a container:
/// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()`
/// either as a pure property access or as a zero-arg method call.
/// (either as a pure property access or as a zero-arg method call).
/// Populated by inspecting the AST in `push_node` across JS/TS, Python,
/// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return
/// an integer. Consumed by the type-fact analysis (`ssa::type_facts`)
@ -419,12 +442,12 @@ pub struct NodeInfo {
/// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric
/// payloads.
pub is_numeric_length_access: bool,
/// Phase 6.3: the field name read on the RHS of an assignment whose
/// The field name read on the RHS of an assignment whose
/// RHS is a single member-access expression (e.g. `let x = dto.email`).
/// Set to `Some("email")` for that shape; left `None` otherwise.
/// Consumed by the type-fact analysis (`ssa::type_facts`) so reads
/// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick
/// up the field's declared `TypeKind`. Strictly additive when
/// up the field's declared `TypeKind`. Strictly additive, when
/// `None`, the legacy copy-prop semantics apply.
pub member_field: Option<String>,
}
@ -442,7 +465,7 @@ impl NodeInfo {
/// lines, flow-step rendering, symbolic witness extraction, debug views.
///
/// Use `ast.span` directly for **structural grain**: unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup anywhere
/// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere
/// the enclosing statement is the meaningful unit.
#[inline]
pub fn classification_span(&self) -> (usize, usize) {
@ -514,7 +537,7 @@ pub struct BodyMeta {
/// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from
/// decorators / annotations / static type text at CFG construction
/// time. Same length as `params`; positions with no recoverable
/// type info are `None`. Strictly additive when every entry is
/// type info are `None`. Strictly additive, when every entry is
/// `None`, downstream behaviour is identical to the pre-Phase-1
/// engine.
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
@ -528,7 +551,7 @@ pub struct BodyMeta {
/// `LocalFuncSummary`. `None` for the synthetic top-level body.
///
/// All intra-file maps keyed on function identity (SSA summaries, callee
/// bodies, inline cache, callback bindings) use this key never the bare
/// bodies, inline cache, callback bindings) use this key, never the bare
/// leaf `name`, which is collision-prone across (container, arity,
/// disambig, kind).
pub func_key: Option<FuncKey>,
@ -589,7 +612,7 @@ pub struct FileCfg {
/// Promisify wrapper aliases: local name → wrapped callee name.
/// Only populated for JS/TS files.
pub promisify_aliases: PromisifyAliases,
/// Phase 6: per-file class / trait / interface hierarchy edges.
/// Per-file class / trait / interface hierarchy edges.
/// Each entry is `(sub_container, super_container)` after
/// language-specific normalisation. See
/// [`crate::cfg::hierarchy`] for the per-language extraction
@ -711,14 +734,10 @@ fn extract_condition_raw<'a>(
vars.dedup();
vars.truncate(MAX_COND_VARS);
// 4. Extract text, truncated.
let text = text_of(cond, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
// 4. Extract text, truncated at a char boundary so the cut is UTF-8-safe:
// regex literals in gogs (Gurmukhi) / discourse (Cyrillic) trip raw byte slices.
let text = text_of(cond, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
(text, vars, negated)
}
@ -739,7 +758,7 @@ pub(super) fn detect_negation<'a>(
_if_ast: Node<'a>,
_lang: &str,
) -> (Node<'a>, bool) {
// Unwrap parenthesized_expression JS/Java/PHP wrap if-conditions in parens.
// Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens.
// This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`.
let cond = if cond.kind() == "parenthesized_expression" {
cond.child_by_field_name("expression")
@ -811,7 +830,7 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
"*" => Some(BinOp::Mul),
"/" => Some(BinOp::Div),
"%" => Some(BinOp::Mod),
// Bitwise (single-char tokens no conflict with && / ||)
// Bitwise (single-char tokens, no conflict with && / ||)
"&" => Some(BinOp::BitAnd),
"|" => Some(BinOp::BitOr),
"^" => Some(BinOp::BitXor),
@ -909,7 +928,7 @@ fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option<String>
/// `extract_template_prefix` for both assignment RHS and call arguments.
///
/// Also descends through `await` / `yield` wrappers and into the first
/// argument of a call expression this covers the common sink shape
/// argument of a call expression, this covers the common sink shape
/// `await axios.get(\`https://host/…${x}\`)` where the template literal lives
/// inside a call inside an `await` wrapper.
fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
@ -930,7 +949,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
}
"call_expression" | "call" | "new_expression" => {
// Descend into the first positional argument (e.g.
// `axios.get(\`https://…${x}\`)` the URL we want to lock
// `axios.get(\`https://…${x}\`)`, the URL we want to lock
// is the template-literal first argument of the call).
let args = cur
.child_by_field_name("arguments")
@ -942,7 +961,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
}
}
// Case 1: template literal `\`scheme://host/…${x}…\``.
// Case 1: template literal, `\`scheme://host/…${x}…\``.
if cur.kind() == "template_string" {
let mut w = cur.walk();
let first_child = cur.named_children(&mut w).next()?;
@ -957,7 +976,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
return None;
}
// Case 2: `"scheme://host/" + x` LHS is a string literal.
// Case 2: `"scheme://host/" + x`, LHS is a string literal.
if cur.kind() == "binary_expression" {
let mut w2 = cur.walk();
let mut ops = cur.children(&mut w2).filter(|c| !c.is_named());
@ -1028,7 +1047,7 @@ fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option<i64> {
}
}
// Try left, then right one of them should be a literal
// Try left, then right, one of them should be a literal
try_parse_number(left, code).or_else(|| try_parse_number(right, code))
}
@ -1067,7 +1086,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
.named_child(0)
.is_some_and(|c| is_boolean_eq_const_tree(c, lang)),
"unary_expression" | "not_operator" => {
// `!` / `not` operator is an anonymous child; operand is the
// `!` / `not`: the operator is an anonymous child; the operand is the
// single named child.
let mut w = node.walk();
let mut op_is_not = false;
@ -1084,7 +1103,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
.is_some_and(|c| is_boolean_eq_const_tree(c, lang))
}
"boolean_operator" => {
// Python `and`/`or` operands are named children.
// Python `and`/`or`: operands are named children.
let l = node.named_child(0);
let r = node.named_child(1);
l.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
@ -1137,9 +1156,9 @@ fn binary_operator_token(node: Node) -> Option<String> {
/// Property names whose value is provably an integer across the supported
/// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size`
/// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count`
/// (`str.count`, `list.count`, `tuple.count` all return int); Ruby `.length`
/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length`
/// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This
/// list is intentionally narrow only properties whose semantics across every
/// list is intentionally narrow, only properties whose semantics across every
/// host we scan return an integer, so the `TypeKind::Int` fact is sound.
fn is_numeric_length_property(name: &str) -> bool {
matches!(name, "length" | "size" | "byteLength" | "count" | "len")
@ -1157,7 +1176,7 @@ fn is_numeric_length_property(name: &str) -> bool {
/// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to
/// infer `TypeKind::Int` on the defined value so sink-cap suppression can
/// treat `"row " + arr.length` as a non-injectable payload.
/// Phase 6.3: when the RHS of an assignment / declaration is a single
/// When the RHS of an assignment / declaration is a single
/// member-access expression (`let x = dto.email`, `x = obj.field`,
/// `let x = obj["field"]`), return the property name. The CFG type-fact
/// analysis uses the recovered name to look up the field's declared
@ -1321,7 +1340,7 @@ fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
// Check if ast itself is a binary expression
if is_binary_expr_kind(ast_kind, lang) {
// Verify it has exactly 2 named children (left, right) no nesting
// Verify it has exactly 2 named children (left, right), no nesting
let named_count = ast.named_child_count();
if named_count == 2 {
// Ensure neither child is itself a binary expression (that would
@ -1435,7 +1454,7 @@ pub(super) fn push_node<'a>(
// (e.g. PHP `object_creation_expression` has positional children).
.or_else(|| find_constructor_type_child(ast))
.and_then(|n| {
// IIFE: `(function(x){...})(arg)` the called expression is a
// IIFE: `(function(x){...})(arg)`, the called expression is a
// function literal with no identifier. Bind the call to the
// anonymous body's synthetic name so resolve_callee can find
// the extracted BodyCfg/summary. Without this, text_of() would
@ -1512,7 +1531,7 @@ pub(super) fn push_node<'a>(
// If this is a declaration/expression wrapper or an assignment that
// *contains* a call, prefer the first inner call identifier instead of
// the whole line. Track the inner call's byte span so we can populate
// `CallMeta.callee_span` once the labels settle enabling narrow
// `CallMeta.callee_span` once the labels settle, enabling narrow
// source-location reporting when the classified call lives several lines
// below the enclosing statement (e.g. call inside a multi-line template
// literal).
@ -1546,9 +1565,9 @@ pub(super) fn push_node<'a>(
let mut labels = classify_all(lang, &text, extra);
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))` `str` is not a sink, but `eval` is.
// E.g. `str(eval(expr))`: `str` is not a sink, but `eval` is.
// When the callee is overridden, save the original for container ops
// (e.g. `parts.add(req.getParameter(...))` callee becomes
// (e.g. `parts.add(req.getParameter(...))`: callee becomes
// "req.getParameter" but outer_callee preserves "parts.add").
let mut outer_callee: Option<String> = None;
let mut inner_callee_span: Option<(usize, usize)> = None;
@ -1568,7 +1587,7 @@ pub(super) fn push_node<'a>(
// For assignments like `element.innerHTML = value`, the inner-call heuristic
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
// If that didn't produce a label, check the LHS property name it may be a
// If that didn't produce a label, check the LHS property name, it may be a
// sink like `innerHTML`.
//
// This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper`
@ -1588,7 +1607,7 @@ pub(super) fn push_node<'a>(
if let Some(assign) = assign_node
&& let Some(lhs) = assign.child_by_field_name("left")
{
// Try full member expression first (e.g. "location.href") more
// Try full member expression first (e.g. "location.href"), more
// specific and avoids false positives on `a.href`.
if let Some(full) = member_expr_text(lhs, code) {
if let Some(l) = classify(lang, &full, extra) {
@ -1612,7 +1631,7 @@ pub(super) fn push_node<'a>(
// try to classify the member expression text as a source.
// This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python),
// and similar property-access-based source patterns.
// Skip when the assignment's RHS is itself a function/lambda literal
// Skip when the assignment's RHS is itself a function/lambda literal:
// labels found by `first_member_label` would come from inside the
// closure body and shouldn't tag the outer wrapper (e.g. Go's
// `run := func() { exec.Command(...) }` would otherwise inherit
@ -1687,7 +1706,7 @@ pub(super) fn push_node<'a>(
if labels.is_empty()
&& let Some(outer) = call_ast
&& let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code)
&& classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_some()
&& !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty()
{
call_ast = Some(inner);
outer_callee = Some(text.clone());
@ -1707,13 +1726,14 @@ pub(super) fn push_node<'a>(
// the outer statement `text`, so gate matcher names like `"fetch"` hit.
let mut sink_payload_args: Option<Vec<usize>> = None;
let mut destination_uses: Option<Vec<String>> = None;
let mut gate_filters: Vec<GateFilter> = Vec::new();
if labels.is_empty() {
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(cn) = gate_call {
let gate_callee_text = if call_ast.is_some() {
text.clone()
} else {
// Inner call reached via wrapper use the call-expression's
// Inner call reached via wrapper, use the call-expression's
// function name directly. Falls back to `text` so non-call-
// expression kinds (method calls, Ruby `call` nodes, macros)
// still have a usable callee string.
@ -1723,51 +1743,84 @@ pub(super) fn push_node<'a>(
.and_then(|f| text_of(f, code))
.unwrap_or_else(|| text.clone())
};
if let Some(gm) = classify_gated_sink(
let matches = classify_gated_sink(
lang,
&gate_callee_text,
|idx| extract_const_string_arg(cn, idx, code),
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
) {
labels.push(gm.label);
let payload = gm.payload_args;
if payload == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual call
// arity so `collect_tainted_sink_values` checks each one.
let arity = extract_arg_uses(cn, code).len();
if arity > 0 {
sink_payload_args = Some((0..arity).collect());
}
} else if !payload.is_empty() {
sink_payload_args = Some(payload.to_vec());
}
);
// Destination-aware gates (outbound HTTP clients): when the
// gate declares destination-bearing object fields and the
// positional destination arg at call time is an object
// literal, narrow sink-taint checks to identifiers under
// those fields. Non-object arg forms (string / ident /
// expression) return `None` from the extractor and fall
// through to whole-arg positional filtering.
//
// We only populate destination_uses for the FIRST payload
// position that is an object literal. For outbound HTTP
// gates `payload_args` is always a single position (arg 0)
// so this is exact.
if !gm.object_destination_fields.is_empty() {
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
cn,
pos,
gm.object_destination_fields,
code,
) {
destination_uses = Some(names);
break;
if !matches.is_empty() {
// Per-gate filter accumulation. Each match contributes:
// * its label (added to `labels` so `resolve_sink_caps`
// downstream sees the union),
// * a `GateFilter` carrying that gate's specific
// `(label_caps, payload_args, destination_uses)` so
// the SSA sink scan can attribute taint per-cap.
let mut union_payload: Vec<usize> = Vec::new();
for gm in &matches {
labels.push(gm.label);
let payload_vec: Vec<usize> =
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual
// call arity so `collect_tainted_sink_values` checks
// each one.
let arity = extract_arg_uses(cn, code).len();
(0..arity).collect()
} else {
gm.payload_args.to_vec()
};
// Destination-aware gates: when the gate declares
// destination-bearing object fields and a payload-position
// arg is an object literal at call time, narrow sink-taint
// checks to identifiers under those fields. Non-object
// arg forms return `None` from the extractor and the gate
// falls back to whole-arg positional filtering.
let mut dest_uses: Option<Vec<String>> = None;
if !gm.object_destination_fields.is_empty() {
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
cn,
pos,
gm.object_destination_fields,
code,
) {
dest_uses = Some(names);
break;
}
}
}
let label_caps = match gm.label {
crate::labels::DataLabel::Sink(c) => c,
_ => crate::labels::Cap::empty(),
};
for &p in &payload_vec {
if !union_payload.contains(&p) {
union_payload.push(p);
}
}
gate_filters.push(GateFilter {
label_caps,
payload_args: payload_vec,
destination_uses: dest_uses,
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path keeps `destination_uses` populated so
// the SSA fast-path (one filter) continues to work without
// consulting `gate_filters`. When multiple gates match,
// per-position filters live in `gate_filters` and the legacy
// field is intentionally left `None`.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
@ -1778,7 +1831,7 @@ pub(super) fn push_node<'a>(
// path-traversal or HTML metacharacters. The CFG collapses the whole
// chain into a single call node, so detection must inspect the AST of
// that node directly. Only fires when no Sanitizer label already
// classifies this node existing label rules win.
// classifies this node, existing label rules win.
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
if let Some(cn) = call_ast {
if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" {
@ -1815,7 +1868,7 @@ pub(super) fn push_node<'a>(
// `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation
// form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same
// methods are intrinsically parameterised when arg 0 is a hash, symbol,
// array, or non-interpolated string Rails escapes the values. Rather
// array, or non-interpolated string, Rails escapes the values. Rather
// than dropping the sink (which would lose the genuine TPs), synthesise
// a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears
// SQL taint at the call and reflexively dominates the sink, suppressing
@ -1825,7 +1878,7 @@ pub(super) fn push_node<'a>(
// Chained calls (`Model.where(...).preload(...).to_a`) collapse into a
// single CFG node whose outer `call_ast` may be `to_a` (no args). The
// shape inspection has to walk the receiver chain to reach the AR query
// call itself `ruby_chain_arg0_for_method` does that walk.
// call itself, `ruby_chain_arg0_for_method` does that walk.
if (lang == "ruby" || lang == "rb")
&& labels
.iter()
@ -1859,7 +1912,7 @@ pub(super) fn push_node<'a>(
// and `Statement.executeQuery(String)` overloads are real injection
// sinks when given a concatenated SQL string. But the same method
// names on JPA `javax.persistence.Query` and JDBC `PreparedStatement`
// are zero-arg they execute SQL that was bound upstream by
// are zero-arg, they execute SQL that was bound upstream by
// `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`,
// and any bind values went through `setParameter` / `setString`
// (which the JDBC/JPA driver escapes). Walk the receiver chain to
@ -1894,7 +1947,7 @@ pub(super) fn push_node<'a>(
// (`createQuery` / `createNativeQuery` / `prepareStatement`)
// and require its arg 0 to be a string literal. Anything
// else (binary concat, identifier, method call) leaves
// the sink in place we cannot prove the SQL is
// the sink in place, we cannot prove the SQL is
// parameterised, so the structural finding stands.
const JPA_BIND_METHODS: &[&str] = &[
"createQuery",
@ -1914,6 +1967,89 @@ pub(super) fn push_node<'a>(
}
}
// Shape-based sanitizer synthesis for JS/TS ORM-accessor chains.
// The static label table marks `db.query` / `connection.query` /
// `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)`
// because the bare `connection.query("SELECT ..." + name)` form is a
// real SQLi sink. But the same `db.query` method on Strapi-style ORMs
// takes a model UID literal and returns a chainable model accessor:
// `strapi.db.query('admin::api-token').findOne({ where: whereParams })`.
// The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)`
// calls are intrinsically parameterised, the actual SQL is generated
// by the ORM, and the per-call values arrive through field-keyed object
// literals that the ORM driver escapes.
//
// Recognition rule: when the CFG node's classified text reaches a sink
// with `SQL_QUERY` cap, walk the receiver chain looking for an inner
// `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal
// and whose result has at least one chained method call appended whose
// name is in the ORM-accessor whitelist. If both hold, synthesise a
// same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare
// `connection.query("SELECT ...")` (no chained method) and
// `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the
// sink in place, both are genuine SQLi shapes.
if (lang == "javascript"
|| lang == "js"
|| lang == "typescript"
|| lang == "ts"
|| lang == "tsx")
&& labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& !labels
.iter()
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
{
const QUERY_TARGETS: &[&str] = &["query", "execute"];
// ORM-accessor methods that take object-literal args and return
// promises of rows / row counts. Promise methods (`then`, `catch`,
// `finally`) deliberately excluded, they don't prove ORM shape.
const ORM_CHAIN_METHODS: &[&str] = &[
"findOne",
"findMany",
"findFirst",
"findUnique",
"findById",
"find",
"create",
"createMany",
"update",
"updateMany",
"upsert",
"delete",
"deleteMany",
"count",
"aggregate",
"distinct",
"save",
];
// Fall back to a deeper walk (up to 4 levels) for await/return-
// wrapped calls (e.g. `const x = await db.query(...).findOne(...)`;
// call sits at depth 3 inside lexical_declaration > variable_declarator
// > await_expression > call_expression).
let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(call_node) = chain_call {
// Outer method must be in the ORM whitelist *and* the chain must
// have a deeper inner call to a `query`/`execute` whose arg 0 is
// a string literal. Both checks gate the synthesis.
let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code);
let outer_is_orm = outer_method
.as_deref()
.is_some_and(|m| ORM_CHAIN_METHODS.contains(&m));
if outer_is_orm
&& let Some((arg0_kind, has_interp)) =
js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code)
&& !has_interp
&& matches!(
arg0_kind.as_str(),
"string" | "string_fragment" | "template_string"
)
{
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
}
}
}
let span = (ast.start_byte(), ast.end_byte());
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
@ -2036,7 +2172,7 @@ pub(super) fn push_node<'a>(
// (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`).
//
// Two cases:
// 1. Kind::CallMethod native method call AST (Java method_invocation,
// 1. Kind::CallMethod, native method call AST (Java method_invocation,
// Rust method_call_expression, Ruby call, PHP member_call_expression).
// Receiver is exposed via "object"/"receiver"/"scope" field on the call.
// 2. Kind::CallFn whose function child is a member_expression (JS/TS) or
@ -2065,7 +2201,7 @@ pub(super) fn push_node<'a>(
// value, which is what type-qualified resolution
// anchors on. Falls back to `root_receiver_text` (which
// returns raw text like "conn.execute") only if drilling
// fails preserving prior behavior for types we can't
// fails, preserving prior behavior for types we can't
// structurally reduce.
root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code))
} else {
@ -2076,7 +2212,7 @@ pub(super) fn push_node<'a>(
// JS/TS `obj.method(x)`: call_expression.function = member_expression.
// Python `obj.method(x)`: call.function = attribute.
// Rust `obj.method(x)`: call_expression.function = field_expression
// (field on `value`, not `object` value can be another call
// (field on `value`, not `object`, value can be another call
// for chained forms like `Connection::open(p).unwrap().execute(...)`).
// Pull the receiver from the object/attribute-owner field.
let func_child = cn.child_by_field_name("function");
@ -2139,7 +2275,7 @@ pub(super) fn push_node<'a>(
// Python `with` and Java try-with-resources.
let is_raii_managed = is_raii_factory(lang, &text);
// Ruby block form auto-close: `File.open(path) { |f| f.read }`
// Ruby block form auto-close: `File.open(path) { |f| f.read }`;
// the block parameter receives the resource and Ruby guarantees close
// at block exit. If assigned (`f = File.open(p) { ... }`), the
// variable holds the block's return value, not an open resource.
@ -2156,7 +2292,7 @@ pub(super) fn push_node<'a>(
// Prefer the span of the call found by `find_classifiable_inner_call`
// (deeper, classification-driven) over the one from `first_call_ident`
// (shallower, text-override-driven). Only record `callee_span` when it
// actually narrows against `ast.span` storing a redundant copy would
// actually narrows against `ast.span`, storing a redundant copy would
// just bloat every labeled Call node.
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
@ -2174,6 +2310,7 @@ pub(super) fn push_node<'a>(
kwargs,
arg_string_literals,
destination_uses,
gate_filters,
},
taint: TaintMeta {
labels,
@ -2228,7 +2365,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
/// Pre-emit dedicated Source CFG nodes for call arguments that contain source
/// member expressions.
///
/// **Two-step API** Source nodes must be created *before* the Call node so
/// **Two-step API**, Source nodes must be created *before* the Call node so
/// they receive lower graph indices. This is critical because the If handler
/// uses `NodeIndex::new(g.node_count())` to capture the first node built in a
/// branch and wires a True/False edge to it. If the Source node has a lower
@ -2239,7 +2376,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
/// the branch body.
///
/// True when `ast` is an assignment / declaration whose RHS is a
/// function or lambda literal i.e. shapes like
/// function or lambda literal, i.e. shapes like
/// * Go `run := func() { ... }`
/// * JS/TS `var run = function() { ... }` / `const run = () => ...`
/// * Python `run = lambda x: ...`
@ -2311,7 +2448,7 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
false
}
/// Pointer-Phase 6 / W5: when `ast` is (or wraps) an assignment whose
/// when `ast` is (or wraps) an assignment whose
/// LHS is a single subscript / index expression with a plain-identifier
/// receiver, emit a synthetic `__index_set__` Call node and return its
/// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target
@ -2328,7 +2465,7 @@ fn try_lower_subscript_write(
enclosing_func: Option<&str>,
call_ordinal: &mut u32,
) -> Option<NodeIndex> {
// Locate the assignment node `ast` may be the assignment itself
// Locate the assignment node, `ast` may be the assignment itself
// (Go `assignment_statement`) or a wrapper (`expression_statement`
// containing JS `assignment_expression` / Python `assignment`).
let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
@ -2383,7 +2520,7 @@ fn try_lower_subscript_write(
/// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be
/// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`.
/// `uses_only_synth_names` carry synth names that should *only* be
/// appended to `taint.uses` used for chain-inner-arg sources where the
/// appended to `taint.uses`, used for chain-inner-arg sources where the
/// synth value is not a positional argument of the OUTER call but still
/// participates in the call's implicit dependency chain (e.g. `r.Body`
/// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver).
@ -2446,7 +2583,7 @@ fn pre_emit_arg_source_nodes(
for (pos, child) in children.iter().enumerate() {
let src_label = first_member_label(*child, lang, code, extra);
if let Some(DataLabel::Source(caps)) = src_label {
// Use the *current* node count as a unique token it equals the
// Use the *current* node count as a unique token, it equals the
// index the new Source node will receive.
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos);
let member_text = first_member_text(*child, code);
@ -2481,7 +2618,7 @@ fn pre_emit_arg_source_nodes(
continue;
}
// Pointer-Phase 6 / W5: pre-emit `__index_get__` Call nodes for
// Pre-emit `__index_get__` Call nodes for
// subscript / index-expression args when pointer analysis is
// enabled. This lets the W2/W4 container ELEM read hook fire
// on the synth call, propagating must/may/caps from the cell
@ -2489,7 +2626,7 @@ fn pre_emit_arg_source_nodes(
//
// Gated on `pointer::is_enabled()` so the env-var=0 path keeps
// CFG shapes bit-identical to today's output. Only fires when
// the array operand resolves to a plain identifier see
// the array operand resolves to a plain identifier, see
// `subscript_components` for the bail conditions.
if pointer_on
&& is_subscript_kind(child.kind())
@ -2539,7 +2676,7 @@ fn pre_emit_arg_source_nodes(
// Gated to Go and to writeback-shaped outer callees (`Decode` /
// `Unmarshal`) because the synth-source emission is only useful when
// a downstream writeback consumer reads from the chain's tainted
// receiver broader gating risks emitting synth sources whose taint
// receiver, broader gating risks emitting synth sources whose taint
// never propagates and whose presence trips Layer B AST-pattern
// suppression on unrelated sinks (see
// `tests/fixtures/real_world/go/taint/func_literal_capture.go`).
@ -2613,7 +2750,7 @@ fn pre_emit_arg_source_nodes(
/// Step 2: wire synthetic variable names from pre-emitted Source nodes into
/// the Call node's `arg_uses` and `uses`. `uses_only` synth names are
/// appended only to `taint.uses` used for chain-inner-arg sources whose
/// appended only to `taint.uses`, used for chain-inner-arg sources whose
/// synth value is not a positional outer-call argument.
fn apply_arg_source_bindings(
g: &mut Cfg,
@ -2724,7 +2861,7 @@ pub(super) fn build_sub<'a>(
.unwrap_or(false);
// Check for negation wrapping the entire condition (e.g. `!(a && b)`)
// if present, skip short-circuit decomposition (De Morgan out of scope).
// If present, skip short-circuit decomposition (De Morgan out of scope).
let has_short_circuit = has_short_circuit
&& cond_subtree.map_or(false, |c| {
let unwrapped = unwrap_parens(c);
@ -3424,7 +3561,7 @@ pub(super) fn build_sub<'a>(
// When the grammar-level name is anonymous, try to derive a binding
// name from the surrounding declaration or assignment. This lets
// `var h = function(x){...}` / `this.run = () => {...}` participate
// in callback resolution callers referencing `h` or `run` can
// in callback resolution, callers referencing `h` or `run` can
// find the body via `resolve_local_func_key` and intra-file calls
// like `h()` can resolve to the anonymous body's summary. Without
// this, the body is keyed with the synthetic anon name and there
@ -3731,7 +3868,7 @@ pub(super) fn build_sub<'a>(
// would lower the return as a plain `StmtKind::Call`, losing
// the return semantics and letting fall-through Seq edges
// survive into the SSA terminator (the OR-chain rejection-arm
// defect see `or_chain_rejection_block_terminates_with_return`).
// defect, see `or_chain_rejection_block_terminates_with_return`).
if let Some(inner) = ast.children(&mut cursor).find(|c| {
matches!(
lookup(lang, c.kind()),
@ -3788,7 +3925,7 @@ pub(super) fn build_sub<'a>(
);
}
// Pointer-Phase 6 / W5: subscript-write lowering when the
// Subscript-write lowering when the
// CallWrapper's inner expression is `arr[i] = v` (JS/TS,
// Python). See `try_lower_subscript_write` for shape +
// bail matrix.
@ -3824,7 +3961,7 @@ pub(super) fn build_sub<'a>(
// Pre-emit Source nodes for call arguments containing source
// member expressions (e.g. `req.body.returnTo` inside
// `res.redirect(req.body.returnTo)`). Created BEFORE the Call
// node so they get lower indices see doc comment on
// node so they get lower indices, see doc comment on
// `pre_emit_arg_source_nodes` for why this ordering matters.
let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call {
pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds)
@ -3984,7 +4121,7 @@ pub(super) fn build_sub<'a>(
// Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`)
Kind::Assignment => {
// JS/TS ternary-RHS split same rationale as the CallWrapper branch.
// JS/TS ternary-RHS split, same rationale as the CallWrapper branch.
if matches!(lang, "javascript" | "typescript" | "tsx")
&& let (Some(left), Some(right)) = (
ast.child_by_field_name("left"),
@ -4011,7 +4148,7 @@ pub(super) fn build_sub<'a>(
}
}
// Pointer-Phase 6 / W5: subscript-write lowering. See
// Subscript-write lowering. See
// `try_lower_subscript_write` for the per-language shape
// matrix and bail conditions.
if crate::pointer::is_enabled()
@ -4099,12 +4236,19 @@ pub(crate) fn build_cfg<'a>(
// function so thread-local state never leaks between files.
populate_fn_dfs_indices(tree, lang);
// Phase 6: harvest DTO class definitions before any param classifier
// runs. Empty for languages without a Phase 6 collector. Cleared
// harvest DTO class definitions before any param classifier
// runs. Empty for languages without a collector. Cleared
// alongside the DFS map at end-of-build_cfg.
DTO_CLASSES.with(|cell| {
*cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code);
});
// harvest same-file `type X = Map<...>` / `Set<...>` / `T[]`
// aliases so JS/TS param classifiers resolve `m: ElementsMap`
// to `LocalCollection`. Empty for non-JS/TS languages.
TYPE_ALIAS_LC.with(|cell| {
*cell.borrow_mut() =
dto::collect_type_alias_local_collections(tree.root_node(), lang, code);
});
// Create the top-level body graph (BodyId(0)).
let (mut g, entry, exit) = create_body_graph(0, code.len(), None);
@ -4143,7 +4287,7 @@ pub(crate) fn build_cfg<'a>(
connect_all(&mut g, &[e], exit, EdgeKind::Seq);
}
debug!(target: "cfg", "CFG DONE top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
if cfg!(debug_assertions) {
for idx in g.node_indices() {
@ -4231,10 +4375,11 @@ pub(crate) fn build_cfg<'a>(
// Clear the per-file DFS-index map so it does not leak to the next
// file built on this thread.
clear_fn_dfs_indices();
// Phase 6: same hygiene for the DTO map.
// same hygiene for the DTO map.
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear());
// Phase 6 (typed call-graph subtype awareness): collect every
// collect every
// declared inheritance / impl / implements relationship in the
// file. Per-language extractor in `cfg::hierarchy`; empty for
// Go and C. Each `(sub, super)` pair gets duplicated onto every
@ -4289,14 +4434,14 @@ fn apply_promisify_labels(
/// Build a `CalleeSite` carrying the richer per-call-site metadata for a
/// CFG node.
///
/// * `arity` positional argument count. `None` when `extract_arg_uses`
/// * `arity`, positional argument count. `None` when `extract_arg_uses`
/// bailed out on splats/keyword-args (length 0 does not distinguish
/// zero-arg calls from unknown; we treat 0 as a concrete zero). The
/// receiver is a separate channel via `CallMeta.receiver` and is not
/// represented in `arg_uses`, so `arity == arg_uses.len()` for calls.
/// * `receiver` forwarded verbatim from `CallMeta.receiver` (already
/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already
/// normalized to the root identifier).
/// * `qualifier` the segment(s) before the leaf identifier of the callee.
/// * `qualifier`, the segment(s) before the leaf identifier of the callee.
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g.
/// `"crate::auth::token"` for `crate::auth::token::validate`) so that
/// cross-file `use`-map resolution in `callgraph.rs` has everything it
@ -4380,7 +4525,7 @@ pub(crate) fn export_summaries(
module_path: None,
rust_use_map: None,
rust_wildcards: None,
// Phase 6 hierarchy edges live on `FileCfg`, not on the
// Hierarchy edges live on `FileCfg`, not on the
// graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root`
// attaches them after this transform returns.
hierarchy_edges: Vec::new(),

View file

@ -8,7 +8,7 @@ use petgraph::graph::NodeIndex;
use smallvec::smallvec;
use tree_sitter::Node;
/// Phase 6.2 — resolve a syntactic class / struct / interface / model
/// resolve a syntactic class / struct / interface / model
/// name against the per-file [`DTO_CLASSES`] map populated at the top
/// of `build_cfg`. Returns the [`TypeKind::Dto`] carrying the
/// per-field type map when the class is declared in the same file;
@ -21,7 +21,7 @@ fn lookup_dto_class(class_name: &str) -> Option<TypeKind> {
/// Extract parameter names + per-position [`TypeKind`] from a function
/// AST node. Each entry's second slot is `Some(TypeKind)` when the
/// parameter's decorator, attribute, or static type annotation maps to
/// a known kind, and `None` otherwise. Strictly additive when no
/// a known kind, and `None` otherwise. Strictly additive, when no
/// type info is recoverable, behaviour is identical to the names-only
/// path.
pub(super) fn extract_param_meta<'a>(
@ -109,7 +109,7 @@ pub(super) fn extract_param_meta<'a>(
// Python `typed_parameter`, `default_parameter`,
// `typed_default_parameter`): the wrapper node has no `name`
// field but contains the identifier as a child. Pick the
// *first* identifier that is the parameter name; subsequent
// *first* identifier, that is the parameter name; subsequent
// identifiers are part of the type annotation or default
// expression.
if !found {
@ -123,7 +123,7 @@ pub(super) fn extract_param_meta<'a>(
continue;
}
// Bare identifier children e.g. Rust untyped closure params `|cmd|`
// Bare identifier children, e.g. Rust untyped closure params `|cmd|`
// where the child is an `identifier` node, not a `parameter` wrapper.
if child.kind() == "identifier" {
if let Some(txt) = text_of(child, code) {
@ -137,8 +137,8 @@ pub(super) fn extract_param_meta<'a>(
/// Walk up from a function definition node and build a container path.
///
/// Records the names of enclosing classes / impls / modules / namespaces /
/// structs and, for anonymous / nested functions, the name of an enclosing
/// named function joined with `::`. Also returns a `FuncKind` guess
/// structs, and, for anonymous / nested functions, the name of an enclosing
/// named function, joined with `::`. Also returns a `FuncKind` guess
/// reflecting the structural role.
///
/// Returns `(container, kind)`.
@ -185,7 +185,7 @@ pub(super) fn compute_container_and_kind(
| "enum_item"
| "struct_specifier"
| "struct_item" => Some("name"),
// Rust impl blocks pick the type name, not the trait name.
// Rust impl blocks, pick the type name, not the trait name.
"impl_item" => Some("type"),
// Go / C++ / PHP namespaces and modules.
"namespace_definition" | "namespace_declaration" | "module_declaration" | "module" => {
@ -223,7 +223,7 @@ pub(super) fn compute_container_and_kind(
|| pk == "lambda_expression"
|| pk == "function_expression"
{
// Nested definition record the outer function's name and
// Nested definition, record the outer function's name and
// classify self as Closure even if we got a real name.
if let Some(name_node) = parent.child_by_field_name("name") {
if let Some(text) = text_of(name_node, code) {
@ -428,15 +428,15 @@ pub(super) fn inject_framework_param_sources(
/// no recognised pattern matches, returns `None` and the engine
/// behaves exactly as before.
///
/// Recognised patterns (Phase 2):
/// * Java (Spring) `@PathVariable`/`@RequestParam Long X` →
/// Recognised patterns:
/// * Java (Spring), `@PathVariable`/`@RequestParam Long X` →
/// [`TypeKind::Int`]; `@RequestBody T` → object (no kind today).
/// * TypeScript (NestJS) `@Param('id') id: number` →
/// * TypeScript (NestJS), `@Param('id') id: number` →
/// [`TypeKind::Int`]; `@Body() dto: T` / `@Query('q') q: string`.
/// * Rust (Axum / Rocket / Actix) `Path<i64>` / `Path<u32>` /
/// * Rust (Axum / Rocket / Actix), `Path<i64>` / `Path<u32>` /
/// `web::Path<i64>` → [`TypeKind::Int`]; `Path<String>` →
/// [`TypeKind::String`].
/// * Python (FastAPI) `def h(x: int)` → [`TypeKind::Int`];
/// * Python (FastAPI), `def h(x: int)` → [`TypeKind::Int`];
/// `Annotated[int, Path()]` → [`TypeKind::Int`].
pub(super) fn classify_param_type<'a>(
param: Node<'a>,
@ -453,9 +453,9 @@ pub(super) fn classify_param_type<'a>(
}
}
/// Java (Spring) recognise typed-extractor parameters via the
/// Java (Spring), recognise typed-extractor parameters via the
/// surrounding annotation. Per Hard Rule 3, plain `Long X` without a
/// known framework annotation is **not** treated as a typed extractor
/// known framework annotation is **not** treated as a typed extractor;
/// the parameter could be a regular function argument that the
/// framework never validates. Recognised annotations:
/// `@PathVariable`, `@RequestParam`, `@RequestBody`, `@RequestHeader`,
@ -473,7 +473,7 @@ fn classify_param_type_java<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
if let Some(k) = java_type_to_kind(&type_text) {
return Some(k);
}
// Phase 6.2: when the static type is a class name we don't classify
// when the static type is a class name we don't classify
// as a primitive (e.g. `@RequestBody CreateUser dto`), look up the
// class in the same-file DTO map. Strip any generics for the
// leading type so `Foo<Bar>` still resolves on `Foo`.
@ -527,7 +527,7 @@ fn has_java_framework_annotation(param: Node<'_>, code: &[u8]) -> bool {
}
/// Map a Java type-text fragment to a [`TypeKind`]. Public to the
/// `cfg` module so the Phase 6 DTO collector can reuse the same
/// `cfg` module so the DTO collector can reuse the same
/// classifier for class fields.
pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
let bare = t.trim().trim_start_matches('@').trim();
@ -546,7 +546,7 @@ pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
/// Map a TypeScript type-text fragment (already stripped of leading
/// `:` / whitespace) to a primitive [`TypeKind`]. Used by both the
/// per-parameter classifier and the Phase 6 DTO collector.
/// per-parameter classifier and the DTO collector.
pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
let head = t.split('<').next().unwrap_or(t).trim();
match head {
@ -557,13 +557,35 @@ pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
}
}
/// TypeScript (NestJS) recognise typed-extractor parameters via a
/// TypeScript (NestJS), recognise typed-extractor parameters via a
/// known NestJS decorator (`@Param`, `@Body`, `@Query`, `@Headers`,
/// `@Req`, `@Res`). Per Hard Rule 3, a bare `function h(id: number)`
/// is not a framework extractor without a NestJS decorator no
/// is not a framework extractor, without a NestJS decorator no
/// runtime gate is implied. Pipe coercions (`ParseIntPipe` /
/// `ParseBoolPipe`) override the static type.
///
/// Exception: parameters annotated as a known JS built-in collection
/// type (`Map<...>`, `Set<...>`, `WeakMap<...>`, `WeakSet<...>`,
/// `Array<...>` / `T[]` / `ReadonlyArray<...>`) resolve to
/// [`TypeKind::LocalCollection`] regardless of decorator presence.
/// `LocalCollection` is a *receiver-shape* claim, not a
/// framework-validated-input claim, it tells the auth analyser that
/// `param.get(k)` / `param.set(k, v)` / `param.find(p)` is a
/// container operation rather than a data-layer read/mutation. This
/// closes the Excalidraw FP cluster (`elementsMap: ElementsMap`,
/// `groupIdMapForOperation: Map<string, string>`) without affecting
/// any input-validation reasoning.
fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
let type_text = param
.child_by_field_name("type")
.and_then(|n| inner_ts_type_text(n, code));
if let Some(t) = type_text.as_deref()
&& let Some(k) = ts_type_to_local_collection(t.trim().trim_start_matches(':').trim())
{
return Some(k);
}
if !has_ts_decorator_argument(
param,
code,
@ -586,14 +608,12 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
if has_ts_decorator_argument(param, code, &["ParseBoolPipe"]) {
return Some(TypeKind::Bool);
}
let t = param
.child_by_field_name("type")
.and_then(|n| inner_ts_type_text(n, code))?;
let t = type_text?;
let stripped = t.trim().trim_start_matches(':').trim();
if let Some(k) = ts_type_to_kind(stripped) {
return Some(k);
}
// Phase 6.2: NestJS `@Body() dto: CreateUser` — when the static
// NestJS `@Body() dto: CreateUser`, when the static
// type is a class / interface name declared in the same file,
// resolve via the DTO map. Generic args dropped for the leading
// type so `Foo<Bar>` matches on `Foo`.
@ -601,8 +621,41 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
lookup_dto_class(head)
}
/// Classify a TypeScript / JavaScript type-text fragment as
/// [`TypeKind::LocalCollection`] when it names an in-memory container.
///
/// Accepted shapes, checked in this order after trimming whitespace
/// and any leading `readonly ` modifier:
///
/// * Array shorthand: any text ending in `[]` (`T[]`, `readonly T[]`).
/// * Built-in generics, matched on the text before the first `<`:
///   `Map`, `Set`, `WeakMap`, `WeakSet`, `Array`, `ReadonlyArray`.
/// * Any other head that is present in the per-file `TYPE_ALIAS_LC`
///   set (same-file `type X = Map<...>` style aliases).
///
/// Returns `None` for every other type text. A user-defined type that
/// shadows one of the built-in names is not distinguished from the
/// built-in; the head match is accepted as-is.
pub(super) fn ts_type_to_local_collection(t: &str) -> Option<TypeKind> {
    let unqualified = t.trim().trim_start_matches("readonly ").trim();

    // `T[]` needs no head inspection at all.
    if unqualified.ends_with("[]") {
        return Some(TypeKind::LocalCollection);
    }

    // Drop generic arguments: everything from the first `<` onwards.
    let name = match unqualified.split_once('<') {
        Some((before_generics, _)) => before_generics,
        None => unqualified,
    }
    .trim();

    let is_builtin = matches!(
        name,
        "Map" | "Set" | "WeakMap" | "WeakSet" | "Array" | "ReadonlyArray"
    );
    // The alias set is only consulted when the head is not a built-in.
    if is_builtin || super::TYPE_ALIAS_LC.with(|aliases| aliases.borrow().contains(name)) {
        Some(TypeKind::LocalCollection)
    } else {
        None
    }
}
fn inner_ts_type_text<'a>(type_anno: Node<'a>, code: &'a [u8]) -> Option<String> {
// type_annotation node text is `: T` — unwrap to T.
// type_annotation node text is `: T`, unwrap to T.
if let Some(child) = type_anno.named_child(0) {
return text_of(child, code);
}
@ -643,10 +696,10 @@ fn has_ts_decorator_argument(param: Node<'_>, code: &[u8], wanted: &[&str]) -> b
false
}
/// Rust (Axum / Rocket / Actix) read the parameter's type text and
/// Rust (Axum / Rocket / Actix), read the parameter's type text and
/// look for `Path<i64>` / `Json<T>` / `Form<T>` / `Query<T>` shapes.
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)` without an
/// extractor wrapper) are **not** treated as typed extractors only
/// extractor wrapper) are **not** treated as typed extractors, only
/// framework-wrapped types qualify.
fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
if param.kind() != "parameter" {
@ -654,9 +707,121 @@ fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
}
let type_node = param.child_by_field_name("type")?;
let type_text = text_of(type_node, code)?;
// LocalCollection is a *receiver-shape* claim, not a
// framework-validated-input claim, Hard Rule 3's "bare primitives
// don't count" gate doesn't apply (mirrors `classify_param_type_ts`
// for the same reason). Captures `unsharded: RoaringBitmap`,
// `docids: &mut RoaringBitmap`, `params: HashMap<String, String>`,
// `new_shard_docids: &'a mut hashbrown::HashMap<...>` shapes from
// meilisearch/index-scheduler's bitmap bookkeeping where the
// verb-name dispatch (`is_mutation: insert/remove`) would otherwise
// classify these as DB writes.
if let Some(k) = rust_type_to_local_collection(&type_text) {
return Some(k);
}
rust_type_to_kind(&type_text)
}
/// Peel reference sigils, lifetime labels, and `mut` keywords off the
/// front of a Rust type-text fragment, returning the remaining slice
/// so callers can match on the underlying type name.
///
/// Each pass removes either one `&` (plus an optional `'label` and an
/// optional `mut `) or one bare leading `mut `, and repeats until
/// neither applies. This covers `&T`, `&mut T`, `&'a T`, `&'a mut T`,
/// and stacked forms such as `&&mut T`. Surrounding whitespace is
/// trimmed; text after the markers is returned untouched.
fn strip_rust_ref_markers(t: &str) -> &str {
    let mut remaining = t.trim();
    loop {
        remaining = if let Some(after_amp) = remaining.strip_prefix('&') {
            let after_amp = after_amp.trim_start();
            // Skip a lifetime label (`'a`, `'static`, `'_`) if one follows.
            let after_lifetime = match after_amp.strip_prefix('\'') {
                Some(label) => {
                    let label_len = label
                        .find(|c: char| !(c.is_alphanumeric() || c == '_'))
                        .unwrap_or(label.len());
                    label[label_len..].trim_start()
                }
                None => after_amp,
            };
            // `mut` only counts as the keyword when followed by a space.
            after_lifetime
                .strip_prefix("mut ")
                .unwrap_or(after_lifetime)
                .trim_start()
        } else if let Some(after_mut) = remaining.strip_prefix("mut ") {
            after_mut.trim_start()
        } else {
            return remaining;
        };
    }
}
/// Map a Rust parameter / variable type-text to
/// [`TypeKind::LocalCollection`] when the *outermost* type names a
/// known in-memory container.
///
/// Steps:
/// 1. Strip reference / lifetime / `mut` markers via
///    `strip_rust_ref_markers`.
/// 2. Accept array / slice shorthand (`[T; N]`, `[T]`) on a leading `[`.
/// 3. Cut the text at the first `<` so generic arguments are ignored,
///    then keep the last `::` segment of what remains
///    (`std::collections::HashMap<K, V>` → `HashMap`).
/// 4. Match that head against the list below.
///
/// Recognises:
/// * Std: `Vec`, `HashMap`, `HashSet`, `BTreeMap`, `BTreeSet`,
///   `VecDeque`, `BinaryHeap`, `LinkedList`.
/// * Ecosystem: `IndexMap`, `IndexSet` (indexmap), `SmallVec`
///   (smallvec), `DashMap`, `DashSet` (dashmap), `FxHashMap`,
///   `FxHashSet` (rustc-hash / fxhash), `RoaringBitmap`,
///   `RoaringTreemap` (roaring).
///
/// Returns `None` for any other outer type, including `Database<...>`
/// (heed/sled persistent KV store, NOT a local collection; keeping
/// this `None` preserves real IDOR detection on persistent-store
/// calls), `Mutex<...>` / `RwLock<...>` (synchronisation wrappers, not
/// sink-shape claims), and bare primitives. Because generic arguments
/// are discarded before the path is examined, a collection appearing
/// only *inside* another type's arguments (`Mutex<HashMap<K, V>>`,
/// `Database<K, Codec<std::vec::Vec<u8>>>`) never causes a match, and a
/// path-qualified argument (`HashMap<String, foo::Bar>`) never hides
/// the outer collection.
pub(super) fn rust_type_to_local_collection(t: &str) -> Option<TypeKind> {
    const TYPES: &[&str] = &[
        "Vec",
        "VecDeque",
        "BinaryHeap",
        "LinkedList",
        "HashMap",
        "HashSet",
        "BTreeMap",
        "BTreeSet",
        "IndexMap",
        "IndexSet",
        "SmallVec",
        "DashMap",
        "DashSet",
        "FxHashMap",
        "FxHashSet",
        "RoaringBitmap",
        "RoaringTreemap",
    ];

    let stripped = strip_rust_ref_markers(t);
    // Array / slice shorthand: `[T; N]` or `[T]` (any leading `&` was
    // already stripped).
    if stripped.starts_with('[') {
        return Some(TypeKind::LocalCollection);
    }
    // Discard generic arguments FIRST. Splitting on `::` before this
    // point would pick up path separators that live inside the
    // arguments and select an inner type instead of the outer one.
    let outer_path = stripped.split('<').next().unwrap_or(stripped);
    // Then drop the module-path prefix, keeping only the final segment.
    let head = outer_path.rsplit("::").next().unwrap_or(outer_path).trim();
    TYPES.contains(&head).then_some(TypeKind::LocalCollection)
}
fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
let stripped = t.trim();
// Reject reference / mutability noise so `&Path<i64>` still matches
@ -666,7 +831,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
.trim_start_matches('&')
.trim_start_matches("mut ")
.trim();
// Only framework wrapper extractors qualify bare primitives like
// Only framework wrapper extractors qualify, bare primitives like
// `i64` could be regular function parameters with no framework
// validation gate.
for wrap in [
@ -684,7 +849,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(rest) = stripped.strip_prefix(&prefix) {
if let Some(inner) = rest.strip_suffix('>') {
let inner = inner.trim();
// Tuple extractor `Path<(i64, String)>` first element wins.
// Tuple extractor `Path<(i64, String)>`, first element wins.
if inner.starts_with('(') {
let inside = inner.trim_start_matches('(').trim_end_matches(')');
let first = inside.split(',').next().unwrap_or("").trim();
@ -696,16 +861,16 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(k) = rust_primitive_to_kind(inner) {
return Some(k);
}
// Phase 6.2: `Json<T>` / `Form<T>` / `Query<T>` /
// `Path<T>` with a same-file struct type resolve via
// `Json<T>` / `Form<T>` / `Query<T>` /
// `Path<T>` with a same-file struct type, resolve via
// the DTO map. Strip nested generics so `Json<Foo<i64>>`
// matches on `Foo`.
let head = inner.split('<').next().unwrap_or(inner).trim();
if let Some(k) = lookup_dto_class(head) {
return Some(k);
}
// Custom struct outside the same file leave None
// (cross-file resolution is Phase 6.4).
// Custom struct outside the same file, leave None
// (cross-file resolution is a follow-up).
return None;
}
}
@ -714,7 +879,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
}
/// Map a Rust primitive / `String` / `&str` to a [`TypeKind`]. Public
/// to the `cfg` module so the Phase 6 DTO collector can reuse it for
/// to the `cfg` module so the DTO collector can reuse it for
/// `struct` field types.
pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
let t = t.trim();
@ -728,10 +893,10 @@ pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
}
}
/// Python (FastAPI) recognise typed-extractor parameters via the
/// Python (FastAPI): recognise typed-extractor parameters via the
/// `Annotated[X, Path()/Query()/Body()/Header()/Cookie()]` shape. Per
/// Hard Rule 3, a bare `def h(id: int)` is **not** a framework
/// extractor the function may be a plain Python function and the
/// extractor; the function may be a plain Python function and the
/// type annotation provides no runtime gate.
fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
let type_node = param.child_by_field_name("type")?;
@ -741,7 +906,7 @@ fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<Typ
fn python_type_to_kind(t: &str) -> Option<TypeKind> {
let stripped = t.trim();
// `Annotated[int, Path()]` only matches when one of the generic
// `Annotated[int, Path()]`, only matches when one of the generic
// args names a recognised FastAPI binding marker. Otherwise no
// framework gate is implied.
if let Some(inner) = stripped
@ -756,8 +921,8 @@ fn python_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(k) = python_primitive_to_kind(first) {
return Some(k);
}
// Phase 6.2: `Annotated[CreateUser, Body()]` with a same-file
// Pydantic model resolve via the DTO map. Generic args are
// `Annotated[CreateUser, Body()]` with a same-file
// Pydantic model, resolve via the DTO map. Generic args are
// dropped via the same head-split as `python_primitive_to_kind`.
let head = first.split('[').next().unwrap_or(first).trim();
return lookup_dto_class(head);
@ -773,7 +938,7 @@ fn contains_fastapi_marker(s: &str) -> bool {
}
/// Map a Python type expression to a primitive [`TypeKind`]. Used by
/// both the per-parameter classifier and the Phase 6 Pydantic-model
/// both the per-parameter classifier and the DTO Pydantic-model
/// field collector.
pub(super) fn python_primitive_to_kind(t: &str) -> Option<TypeKind> {
let head = t.trim().split('[').next().unwrap_or(t).trim();
@ -806,10 +971,70 @@ pub(super) fn is_configured_terminator(
mod typed_extractor_tests {
use super::{
contains_fastapi_marker, java_type_to_kind, python_primitive_to_kind, python_type_to_kind,
rust_primitive_to_kind, rust_type_to_kind,
rust_primitive_to_kind, rust_type_to_kind, rust_type_to_local_collection,
ts_type_to_local_collection,
};
use crate::ssa::type_facts::TypeKind;
// ── TypeScript / JavaScript local-collection types ───────────────────
#[test]
fn ts_built_in_collections_map_to_local_collection() {
    // Every type expression in this test must classify as a local collection.
    let local = Some(TypeKind::LocalCollection);

    // Built-in keyed/unkeyed container generics.
    assert_eq!(ts_type_to_local_collection("Map<string, number>"), local);
    assert_eq!(ts_type_to_local_collection("Set<string>"), local);
    assert_eq!(ts_type_to_local_collection("WeakMap<object, string>"), local);
    assert_eq!(ts_type_to_local_collection("WeakSet<object>"), local);

    // Array spellings: generic, readonly generic, `T[]` shorthand and
    // `readonly T[]` shorthand.
    assert_eq!(ts_type_to_local_collection("Array<string>"), local);
    assert_eq!(ts_type_to_local_collection("ReadonlyArray<string>"), local);
    assert_eq!(ts_type_to_local_collection("string[]"), local);
    assert_eq!(ts_type_to_local_collection("readonly string[]"), local);

    // Excalidraw-style keyed map whose generic arguments contain an
    // index-type expression.
    let excalidraw_map = "Map<ExcalidrawElement[\"id\"], ExcalidrawElement>";
    assert_eq!(ts_type_to_local_collection(excalidraw_map), local);
}
#[test]
fn ts_non_collection_types_return_none() {
    /// Asserts that `ty` is not classified as a local collection.
    /// `#[track_caller]` keeps a failure pointing at the calling line.
    #[track_caller]
    fn expect_unclassified(ty: &str) {
        assert_eq!(ts_type_to_local_collection(ty), None);
    }

    // Plain primitives.
    expect_unclassified("string");
    expect_unclassified("number");
    expect_unclassified("boolean");

    // Generic wrappers such as Promise / Iterator are not LocalCollections.
    expect_unclassified("Promise<string>");
    expect_unclassified("Iterator<number>");

    // User-defined type names.
    expect_unclassified("CreateUserDto");
    expect_unclassified("ElementsMap");
}
// ── Java (Spring) ────────────────────────────────────────────────────
#[test]
@ -841,7 +1066,7 @@ mod typed_extractor_tests {
#[test]
fn java_request_body_dto_returns_none_until_phase_six() {
// @RequestBody CreateUserDto dto — no kind today; Phase 6 will
// @RequestBody CreateUserDto dto, no kind today; future passes will
// return DtoObject(name) once cross-file class resolution lands.
assert_eq!(java_type_to_kind("CreateUserDto"), None);
assert_eq!(java_type_to_kind("List<String>"), None);
@ -860,7 +1085,7 @@ mod typed_extractor_tests {
#[test]
fn rust_path_tuple_first_element_wins() {
// Path<(i64, String)> first slot is the int extractor that
// Path<(i64, String)>, first slot is the int extractor that
// matters for sink suppression.
assert_eq!(
rust_type_to_kind("Path<(i64, String)>"),
@ -876,15 +1101,15 @@ mod typed_extractor_tests {
#[test]
fn rust_json_dto_returns_none_until_phase_six() {
    // `Json<T>` / `Form<T>` / `Query<T>` wrapping a custom struct type:
    // there is no primitive to resolve, so each of these is expected to
    // classify as `None` here; future passes will lift them to DTO.
    let [json, form, query] = [
        "Json<CreateUserDto>",
        "Form<CreateUserDto>",
        "Query<Filters>",
    ]
    .map(rust_type_to_kind);

    assert_eq!(json, None);
    assert_eq!(form, None);
    assert_eq!(query, None);
}
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)`) are NOT
/// framework extractors only wrapper types (`Path<i64>` etc.)
/// framework extractors; only wrapper types (`Path<i64>` etc.)
/// imply a framework runtime gate. Bare i64 must return None.
#[test]
fn rust_bare_primitives_are_not_framework_extractors() {
@ -903,7 +1128,7 @@ mod typed_extractor_tests {
#[test]
fn python_bare_primitives_are_not_framework_extractors() {
// Per Hard Rule 3: bare `def h(id: int)` is NOT a typed
// extractor without an `Annotated[..., Path()/Query()/Body()]`
// extractor, without an `Annotated[..., Path()/Query()/Body()]`
// wrapper, no FastAPI gate is implied.
assert_eq!(python_type_to_kind("int"), None);
assert_eq!(python_type_to_kind("float"), None);
@ -936,7 +1161,7 @@ mod typed_extractor_tests {
#[test]
fn python_annotated_without_marker_returns_none() {
    // `Annotated[...]` whose extra arguments name no FastAPI binding
    // marker is only a generic type-system tag, not a framework extractor.
    let [plain_type_arg, custom_meta_arg] =
        ["Annotated[int, str]", "Annotated[int, MyMeta]"].map(python_type_to_kind);

    assert_eq!(plain_type_arg, None);
    assert_eq!(custom_meta_arg, None);
}
@ -954,4 +1179,128 @@ mod typed_extractor_tests {
assert!(contains_fastapi_marker("bytes, File()"));
assert!(!contains_fastapi_marker("int, str"));
}
// ── Rust local-collection types ──────────────────────────────────────
#[test]
fn rust_std_collections_map_to_local_collection() {
    // One generic instantiation per standard-library container that must
    // classify as a local collection.
    const STD_CONTAINERS: &[&str] = &[
        "Vec<u32>",
        "HashMap<String, u32>",
        "HashSet<u64>",
        "BTreeMap<u32, String>",
        "BTreeSet<u32>",
        "VecDeque<u8>",
        "BinaryHeap<u32>",
        "LinkedList<i32>",
    ];

    for &ty in STD_CONTAINERS {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
#[test]
fn rust_ecosystem_collections_map_to_local_collection() {
    // Ecosystem (non-std) container names, including the two Roaring
    // bitmap types that carry no generic arguments.
    const ECOSYSTEM_CONTAINERS: &[&str] = &[
        "IndexMap<String, u32>",
        "IndexSet<u64>",
        "SmallVec<[u32; 4]>",
        "DashMap<String, u32>",
        "DashSet<u64>",
        "FxHashMap<String, u32>",
        "FxHashSet<u64>",
        "RoaringBitmap",
        "RoaringTreemap",
    ];

    ECOSYSTEM_CONTAINERS.iter().copied().for_each(|ty| {
        assert_eq!(
            rust_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    });
}
#[test]
fn rust_module_qualified_collections_map_to_local_collection() {
    // A module-path prefix must not affect classification: only the last
    // path segment is used for matching.
    let [std_path, hashbrown_path, roaring_path] = [
        "std::collections::HashMap<K, V>",
        "hashbrown::HashMap<String, RoaringBitmap>",
        "roaring::RoaringBitmap",
    ]
    .map(rust_type_to_local_collection);

    let local = Some(TypeKind::LocalCollection);
    assert_eq!(std_path, local);
    assert_eq!(hashbrown_path, local);
    assert_eq!(roaring_path, local);
}
#[test]
fn rust_reference_and_lifetime_markers_stripped() {
    // Borrow spellings that must all reach the underlying type head:
    // `&T`, `&mut T`, `&'a T`, `&'a mut T`, `&'static T`, repeated `&`
    // prefixes, and a borrowed module-qualified generic.
    const BORROWED: &[&str] = &[
        "&RoaringBitmap",
        "&mut RoaringBitmap",
        "&'a RoaringBitmap",
        "&'a mut RoaringBitmap",
        "&'static RoaringBitmap",
        "&&mut RoaringBitmap",
        "&'a mut hashbrown::HashMap<String, RoaringBitmap>",
    ];

    for &ty in BORROWED {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection after ref stripping"
        );
    }
}
#[test]
fn rust_array_and_slice_shorthand_map_to_local_collection() {
    // Fixed-size arrays `[T; N]` and slices `[T]` count as local
    // containers, whether owned, shared-borrowed or mutably borrowed.
    let local = Some(TypeKind::LocalCollection);

    assert_eq!(rust_type_to_local_collection("[u32; 4]"), local);
    assert_eq!(rust_type_to_local_collection("[u8]"), local);
    assert_eq!(rust_type_to_local_collection("&[u32]"), local);
    assert_eq!(rust_type_to_local_collection("&mut [u32]"), local);
}
#[test]
fn rust_persistent_db_and_sync_wrappers_return_none() {
    /// Asserts that `ty` is not classified as a local collection.
    /// `#[track_caller]` keeps a failure pointing at the calling line.
    #[track_caller]
    fn expect_unclassified(ty: &str) {
        assert_eq!(rust_type_to_local_collection(ty), None);
    }

    // Persistent-store handles (heed / sled / rocksdb style) are NOT local
    // collections; keeping them unclassified preserves IDOR detection on
    // real DB calls.
    expect_unclassified("Database<BEU32, SerdeJson<Task>>");
    expect_unclassified("heed::Database<K, V>");
    expect_unclassified("sled::Db");

    // Sync wrappers don't claim a sink shape, even around a collection.
    expect_unclassified("Mutex<HashMap<K, V>>");
    expect_unclassified("RwLock<Vec<u32>>");

    // Bare primitives and string types.
    expect_unclassified("u32");
    expect_unclassified("&str");
    expect_unclassified("String");

    // Unrelated user-defined types.
    expect_unclassified("MyDao<User>");
    expect_unclassified("Connection");
}
}