mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-27 20:29:39 +02:00
Python FP and docs updates (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics * feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks * feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks * refactor: Simplify code formatting for better readability in multiple files * refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration * feat: Update Java and Python patterns to include new security rules * refactor: Improve comment clarity and consistency across multiple Rust files * refactor: Simplify code formatting for improved readability in integration tests and module files * refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
parent
4db0805de6
commit
a438886217
291 changed files with 9485 additions and 3851 deletions
|
|
@ -10,7 +10,7 @@ use tree_sitter::Node;
|
|||
/// at the *case-level* shape `build_switch` sees here. Rust `match`, Go
|
||||
/// `switch`, and Java arrow-switches qualify; classic Java/C/C++/JS switches
|
||||
/// with fall-through do not. The check is per-language because Java mixes
|
||||
/// arrow and classic shapes — that's handled by inspecting the case kind in
|
||||
/// arrow and classic shapes, that's handled by inspecting the case kind in
|
||||
/// [`extract_case_literal_text`].
|
||||
fn lang_has_exclusive_cases(lang: &str) -> bool {
|
||||
matches!(lang, "rust" | "go")
|
||||
|
|
@ -19,7 +19,7 @@ fn lang_has_exclusive_cases(lang: &str) -> bool {
|
|||
/// Extract the scrutinee subtree from a switch-like AST node.
|
||||
///
|
||||
/// Returns the AST node referenced by the language's scrutinee field. Only
|
||||
/// fires for Rust `match`, Go `switch`, and Java `switch` statements — other
|
||||
/// fires for Rust `match`, Go `switch`, and Java `switch` statements, other
|
||||
/// languages return `None` so [`build_switch`] keeps its legacy behavior.
|
||||
fn extract_scrutinee_node<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
||||
let field = match lang {
|
||||
|
|
@ -39,7 +39,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
|
|||
let kind = case.kind();
|
||||
match (lang, kind) {
|
||||
("rust", "match_arm") => {
|
||||
// Reject guarded arms — `match x { y if cond => ... }`.
|
||||
// Reject guarded arms, `match x { y if cond => ... }`.
|
||||
if case.child_by_field_name("guard").is_some() {
|
||||
return None;
|
||||
}
|
||||
|
|
@ -71,7 +71,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
|
|||
text_of(inner, code)
|
||||
}
|
||||
("go", "expression_case") => {
|
||||
// Go case `case v1, v2: ...` — only handle exactly one expression.
|
||||
// Go case `case v1, v2: ...`, only handle exactly one expression.
|
||||
let value = case.child_by_field_name("value")?;
|
||||
let mut named_children: Vec<Node> = Vec::new();
|
||||
let mut cursor = value.walk();
|
||||
|
|
@ -195,7 +195,7 @@ pub(super) fn extract_catch_param_name<'a>(
|
|||
// -------------------------------------------------------------------------
|
||||
|
||||
/// Builds CFG for Ruby's `begin`/`rescue`/`ensure` blocks (and `body_statement`
|
||||
/// with inline rescue). Ruby's `begin` has no `body` field — the try-body
|
||||
/// with inline rescue). Ruby's `begin` has no `body` field, the try-body
|
||||
/// statements are direct children before `rescue`/`else`/`ensure` nodes.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn build_begin_rescue<'a>(
|
||||
|
|
@ -305,7 +305,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
|
||||
vec![synth]
|
||||
} else {
|
||||
// No param name — will wire exception edges to first rescue body node
|
||||
// No param name, will wire exception edges to first rescue body node
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
|
|
@ -333,7 +333,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
current_body_id,
|
||||
)
|
||||
} else {
|
||||
// No body field — build rescue node itself as a block.
|
||||
// No body field, build rescue node itself as a block.
|
||||
// Filter out meta-children (exceptions, exception_variable) by
|
||||
// iterating and building only statement children.
|
||||
let mut rescue_cursor = rescue_node.walk();
|
||||
|
|
@ -407,7 +407,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
try_exits
|
||||
};
|
||||
|
||||
// 6. Build ensure clause (Ruby's finally — always runs)
|
||||
// 6. Build ensure clause (Ruby's finally, always runs)
|
||||
if let Some(ensure_node) = ensure_clause {
|
||||
let mut ensure_preds: Vec<NodeIndex> = Vec::new();
|
||||
ensure_preds.extend(&normal_exits);
|
||||
|
|
@ -443,7 +443,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// switch handler — multi-way dispatch with fallthrough
|
||||
// switch handler, multi-way dispatch with fallthrough
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/// True for AST kinds that wrap a single switch case body.
|
||||
|
|
@ -490,7 +490,7 @@ pub(super) fn case_has_default_label(case: Node<'_>) -> bool {
|
|||
/// Build CFG for a switch statement.
|
||||
///
|
||||
/// The dispatch is decomposed into a chain of binary `StmtKind::If` headers
|
||||
/// — one per non-default case — because the SSA terminator only models 0/1/2
|
||||
/// (one per non-default case) because the SSA terminator only models 0/1/2
|
||||
/// successors. A monolithic N-way header would otherwise be collapsed to
|
||||
/// `Goto(first)` and silently drop every other case. Each header's True edge
|
||||
/// reaches its case body; the False edge falls through to the next header (or
|
||||
|
|
@ -544,7 +544,7 @@ pub(super) fn build_switch<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Grammar didn't expose recognisable case nodes — fall back to a single
|
||||
// Grammar didn't expose recognisable case nodes, fall back to a single
|
||||
// header + Block-style walk so nodes still get linked.
|
||||
if cases.is_empty() {
|
||||
let header = push_node(
|
||||
|
|
@ -603,7 +603,7 @@ pub(super) fn build_switch<'a>(
|
|||
// arrow-switch), pre-extract the scrutinee text + idents so the synthetic
|
||||
// dispatch headers can carry a `<scrutinee> == <case_literal>` condition.
|
||||
// Falls back to `None` when the scrutinee is structurally complex (calls,
|
||||
// member chains, parenthesized expressions in Go) — the existing first-
|
||||
// member chains, parenthesized expressions in Go), the existing first-
|
||||
// reachable behavior remains correct in that case.
|
||||
let supports_exclusive_cases = lang_has_exclusive_cases(lang) || lang == "java";
|
||||
let (scrutinee_text, scrutinee_idents) = if supports_exclusive_cases {
|
||||
|
|
@ -647,7 +647,7 @@ pub(super) fn build_switch<'a>(
|
|||
for (idx, (case, is_default)) in cases.iter().copied().enumerate() {
|
||||
let is_last = idx + 1 == cases.len();
|
||||
|
||||
// Default at the chain tail doesn't get its own dispatch If — the
|
||||
// Default at the chain tail doesn't get its own dispatch If, the
|
||||
// previous header's False edge already targets it directly.
|
||||
let case_first_preds: Vec<NodeIndex> = if is_default && is_last {
|
||||
// First node of the default body becomes the False target of the
|
||||
|
|
@ -675,12 +675,13 @@ pub(super) fn build_switch<'a>(
|
|||
);
|
||||
// The dispatch header is purely structural (it stands in for the
|
||||
// discriminant comparison). It must not inherit Sink/Source labels
|
||||
// from the case body's text — push_node uses `text_of(ast)` for
|
||||
// from the case body's text, push_node uses `text_of(ast)` for
|
||||
// non-call kinds, which would let the body text drive classification.
|
||||
g[header].taint.labels.clear();
|
||||
g[header].call.callee = None;
|
||||
g[header].call.sink_payload_args = None;
|
||||
g[header].call.destination_uses = None;
|
||||
g[header].call.gate_filters.clear();
|
||||
// For mutually-exclusive switch shapes with a single-ident
|
||||
// scrutinee, synthesize a `<scrutinee> == <case_literal>`
|
||||
// structured condition on the dispatch header so SSA lowering
|
||||
|
|
@ -958,7 +959,7 @@ pub(super) fn build_try<'a>(
|
|||
|
||||
vec![synth]
|
||||
} else {
|
||||
// No param name — wire exception edges directly to first catch body node
|
||||
// No param name, wire exception edges directly to first catch body node
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ fn js_try_catch_has_exception_edges() {
|
|||
|
||||
/// When a classifiable call (here `eval`, a built-in JS sink) is nested
|
||||
/// inside a multi-line statement, the CFG node's `classification_span()`
|
||||
/// should point at the inner call, not at the outer statement's start —
|
||||
/// should point at the inner call, not at the outer statement's start,
|
||||
/// so finding display reports the line the dangerous call actually lives
|
||||
/// on. `ast.span` must still cover the whole outer statement for
|
||||
/// structural passes that need the statement grain.
|
||||
|
|
@ -86,7 +86,7 @@ fn inner_call_override_narrows_classification_span() {
|
|||
}
|
||||
|
||||
/// `classification_span()` must fall back to `ast.span` when no narrower
|
||||
/// sub-expression was recorded — so existing structural code paths keep
|
||||
/// sub-expression was recorded, so existing structural code paths keep
|
||||
/// working unchanged for nodes whose classification applies to the whole
|
||||
/// outer node.
|
||||
#[test]
|
||||
|
|
@ -125,7 +125,7 @@ fn callee_span_unset_when_no_narrowing_is_possible() {
|
|||
// A bare `eval(x);` on one line: `first_call_ident` finds the
|
||||
// call_expression whose span is nearly the whole expression_statement
|
||||
// (different by the trailing `;`). `classification_span` still
|
||||
// returns a sensible line — but the exact trimming is an
|
||||
// returns a sensible line, but the exact trimming is an
|
||||
// implementation detail. What we assert here is the invariant:
|
||||
// if callee_span *is* set, it must be contained in ast.span.
|
||||
let src = b"function f() { eval(x); }";
|
||||
|
|
@ -708,7 +708,7 @@ fn python_if_and() {
|
|||
|
||||
#[test]
|
||||
fn ruby_unless_and() {
|
||||
// `unless a && b` — chain built, branches swapped
|
||||
// `unless a && b`, chain built, branches swapped
|
||||
// Body should run when condition is false
|
||||
let src = b"def f\n unless a && b\n x\n end\nend\n";
|
||||
let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE);
|
||||
|
|
@ -848,7 +848,7 @@ fn parse_tree(src: &[u8], ts_lang: Language) -> tree_sitter::Tree {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_lambda_body() {
|
||||
// `process(lambda: eval(dangerous))` — Python-style.
|
||||
// `process(lambda: eval(dangerous))`, Python-style.
|
||||
// first_call_ident should return "process", not "eval".
|
||||
let src = b"process(lambda: eval(dangerous))";
|
||||
let ts_lang = Language::from(tree_sitter_python::LANGUAGE);
|
||||
|
|
@ -860,7 +860,7 @@ fn first_call_ident_skips_lambda_body() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_arrow_function_body() {
|
||||
// `process(() => eval(dangerous))` — JS arrow function in argument.
|
||||
// `process(() => eval(dangerous))`, JS arrow function in argument.
|
||||
let src = b"process(() => eval(dangerous))";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -871,7 +871,7 @@ fn first_call_ident_skips_arrow_function_body() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_named_function_in_arg() {
|
||||
// `process(function inner() { eval(dangerous); })` — named function expression in arg.
|
||||
// `process(function inner() { eval(dangerous); })`, named function expression in arg.
|
||||
let src = b"process(function inner() { eval(dangerous); })";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -882,7 +882,7 @@ fn first_call_ident_skips_named_function_in_arg() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_normal_nested_call() {
|
||||
// `outer(inner(x))` — inner is NOT behind a function boundary, should be reachable.
|
||||
// `outer(inner(x))`, inner is NOT behind a function boundary, should be reachable.
|
||||
let src = b"outer(inner(x))";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -895,7 +895,7 @@ fn first_call_ident_normal_nested_call() {
|
|||
#[test]
|
||||
fn first_call_ident_finds_call_not_blocked_by_function() {
|
||||
// Ensure a call at the same level as a function literal is still found.
|
||||
// `[function() {}, actual_call()]` — array with function and call.
|
||||
// `[function() {}, actual_call()]`, array with function and call.
|
||||
let src = b"[function() {}, actual_call()]";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -908,7 +908,7 @@ fn first_call_ident_finds_call_not_blocked_by_function() {
|
|||
|
||||
#[test]
|
||||
fn callee_not_resolved_from_nested_function_arg() {
|
||||
// `safe_wrapper(function() { eval(user_input); })` — the CFG for the
|
||||
// `safe_wrapper(function() { eval(user_input); })`, the CFG for the
|
||||
// outer call should resolve the callee as "safe_wrapper", never "eval".
|
||||
let src = b"function f() { safe_wrapper(function() { eval(user_input); }); }";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
|
|
@ -923,7 +923,7 @@ fn callee_not_resolved_from_nested_function_arg() {
|
|||
assert!(has_safe, "expected a node with callee 'safe_wrapper'");
|
||||
|
||||
// The outer body should NOT have a node with callee "eval" attributed
|
||||
// to the outer expression — eval lives inside the nested function body.
|
||||
// to the outer expression, eval lives inside the nested function body.
|
||||
let outer_eval = body.graph.node_weights().any(|info| {
|
||||
info.call.callee.as_deref() == Some("eval") && info.ast.enclosing_func.is_none()
|
||||
});
|
||||
|
|
@ -1117,6 +1117,7 @@ fn clone_preserves_all_sub_structs() {
|
|||
kwargs: vec![("shell".into(), vec!["True".into()])],
|
||||
arg_string_literals: vec![Some("lit".into())],
|
||||
destination_uses: None,
|
||||
gate_filters: Vec::new(),
|
||||
},
|
||||
taint: TaintMeta {
|
||||
labels: {
|
||||
|
|
@ -1399,7 +1400,7 @@ fn js_promisify_ignored_for_non_js_langs() {
|
|||
|
||||
#[test]
|
||||
fn js_promisify_non_call_value_ignored() {
|
||||
// RHS is not a promisify call — no binding should be captured.
|
||||
// RHS is not a promisify call, no binding should be captured.
|
||||
let src = b"const execAsync = child_process.exec;";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang);
|
||||
|
|
@ -1471,7 +1472,7 @@ fn cpp_function_extracts_param_names() {
|
|||
// ── callee-site metadata extraction ──────────────────────────────────
|
||||
|
||||
/// Callees collected into `LocalFuncSummary` should now carry structured
|
||||
/// arity, receiver, and qualifier fields — not just a bare name.
|
||||
/// arity, receiver, and qualifier fields, not just a bare name.
|
||||
#[test]
|
||||
fn local_summary_callees_carry_arity_and_receiver() {
|
||||
// Two calls: one is a plain function call with 2 args, the other is
|
||||
|
|
@ -1703,7 +1704,7 @@ fn local_summary_callees_have_distinct_ordinals() {
|
|||
.find(|(k, _)| k.name == "outer")
|
||||
.unwrap();
|
||||
|
||||
// Dedup key is (name, arity, receiver, qualifier, ordinal) — the two
|
||||
// Dedup key is (name, arity, receiver, qualifier, ordinal), the two
|
||||
// `a()` sites have different ordinals, so both must appear.
|
||||
let a_sites: Vec<_> = outer.callees.iter().filter(|c| c.name == "a").collect();
|
||||
assert_eq!(
|
||||
|
|
@ -1825,7 +1826,7 @@ fn anon_fn_named_from_short_var_decl_go() {
|
|||
|
||||
#[test]
|
||||
fn iife_callee_resolves_to_anon_body_js() {
|
||||
// `(function(arg){eval(arg);})(q)` — the CallFn arm must produce
|
||||
// `(function(arg){eval(arg);})(q)`, the CallFn arm must produce
|
||||
// a synthetic anon callee name so that taint can match the
|
||||
// inline body's FuncKey.
|
||||
let src = b"(function(arg){ eval(arg); })(q);";
|
||||
|
|
@ -1898,7 +1899,7 @@ fn strip_tags(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_unrecognised_literals() {
|
||||
// `.replace("foo", "bar")` contains no dangerous pattern — must NOT be
|
||||
// `.replace("foo", "bar")` contains no dangerous pattern, must NOT be
|
||||
// credited as a sanitizer. Preserves the FP→TN guard: replace calls
|
||||
// that don't strip anything dangerous must stay transparent to taint.
|
||||
let src = br#"
|
||||
|
|
@ -1916,7 +1917,7 @@ fn rewrite(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_when_replacement_reintroduces_pattern() {
|
||||
// `.replace("x", "..")` strips `x` but *reintroduces* `..` — be
|
||||
// `.replace("x", "..")` strips `x` but *reintroduces* `..`, be
|
||||
// maximally conservative and abandon all credit for this chain.
|
||||
let src = br#"
|
||||
fn evil(s: &str) -> String {
|
||||
|
|
@ -1933,7 +1934,7 @@ fn evil(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_dynamic_arg() {
|
||||
// `.replace(var, "")` — search is not a literal; pattern analysis can
|
||||
// `.replace(var, "")`, search is not a literal; pattern analysis can
|
||||
// say nothing about what was stripped. Must not earn credit.
|
||||
let src = br#"
|
||||
fn dynamic(s: &str, needle: &str) -> String {
|
||||
|
|
@ -1950,7 +1951,7 @@ fn dynamic(s: &str, needle: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_non_identifier_base() {
|
||||
// `get_s().replace("..", "")` — innermost receiver is a call, not a
|
||||
// `get_s().replace("..", "")`, innermost receiver is a call, not a
|
||||
// parameter. We have no reason to believe `get_s()` returns a value
|
||||
// that benefits the caller; refuse credit.
|
||||
let src = br#"
|
||||
|
|
@ -1976,7 +1977,7 @@ fn find_node_defining<'a>(cfg: &'a Cfg, var: &str) -> Option<&'a NodeInfo> {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_detected_on_js_property_read() {
|
||||
// `var count = items.length` — property access on a member expression
|
||||
// `var count = items.length`, property access on a member expression
|
||||
// should mark the CFG node as a numeric-length access so the
|
||||
// type-fact analysis infers TypeKind::Int for `count`.
|
||||
let src = br#"function f(items) {
|
||||
|
|
@ -1994,7 +1995,7 @@ fn numeric_length_access_detected_on_js_property_read() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_detected_on_js_zero_arg_method_call() {
|
||||
// `var n = str.length()` — zero-arg method call form (uncommon in JS
|
||||
// `var n = str.length()`, zero-arg method call form (uncommon in JS
|
||||
// but present in other languages). Detector should unwrap a
|
||||
// zero-arg call around a member expression.
|
||||
let src = br#"function f(list) {
|
||||
|
|
@ -2012,7 +2013,7 @@ fn numeric_length_access_detected_on_js_zero_arg_method_call() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_ignores_unrelated_properties() {
|
||||
// `var v = arr.foo` — arbitrary property reads must not be flagged.
|
||||
// `var v = arr.foo`, arbitrary property reads must not be flagged.
|
||||
let src = br#"function f(arr) {
|
||||
var v = arr.foo;
|
||||
return v;
|
||||
|
|
@ -2028,7 +2029,7 @@ fn numeric_length_access_ignores_unrelated_properties() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_ignores_method_calls_with_args() {
|
||||
// `var r = s.indexOf('x')` — the detector must reject any call with
|
||||
// `var r = s.indexOf('x')`, the detector must reject any call with
|
||||
// positional arguments because those aren't pure length reads.
|
||||
let src = br#"function f(s) {
|
||||
var r = s.indexOf('x');
|
||||
|
|
@ -2043,7 +2044,7 @@ fn numeric_length_access_ignores_method_calls_with_args() {
|
|||
);
|
||||
}
|
||||
|
||||
// ── Pointer-Phase 6 / W5: subscript lowering tests ────────────────────────
|
||||
// ── subscript lowering tests ────────────────────────
|
||||
|
||||
/// Scope for tests that flip `NYX_POINTER_ANALYSIS=1` so the CFG-side
|
||||
/// subscript synthesis activates. The env-var is restored afterwards
|
||||
|
|
@ -2290,7 +2291,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
);
|
||||
}
|
||||
|
||||
/// JS switch fall-through (`case 1: a(); case 2: b();`) — case 1's
|
||||
/// JS switch fall-through (`case 1: a(); case 2: b();`), case 1's
|
||||
/// exit should flow into case 2's body so taint from `first()`
|
||||
/// reaches `second()`'s sinks.
|
||||
///
|
||||
|
|
@ -2301,7 +2302,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
/// structural shape.
|
||||
/// (b) `first()` has a non-Back forward out-edge that lands inside
|
||||
/// the case-2 sub-graph (the actual fall-through wire), so we
|
||||
/// prove there *is* a fall-through edge — not just an
|
||||
/// prove there *is* a fall-through edge, not just an
|
||||
/// Entry→…→Exit path that happens to walk through both calls
|
||||
/// via the dispatch chain.
|
||||
///
|
||||
|
|
@ -2309,7 +2310,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
/// Seq passthrough nodes (one per surrounding scope), so the
|
||||
/// fall-through edge from `first()` lands on the *first wrapper
|
||||
/// Seq node* of case 2, not on `second()` itself. Asserting that
|
||||
/// `second()` has ≥2 in-edges would therefore be wrong — the True
|
||||
/// `second()` has ≥2 in-edges would therefore be wrong, the True
|
||||
/// edge from the case-2 dispatch If targets the wrapper node, and
|
||||
/// only a single Seq chain leads from there to `second()`.
|
||||
#[test]
|
||||
|
|
@ -2800,7 +2801,7 @@ fn nested_loops_two_headers_two_back_edges() {
|
|||
|
||||
#[test]
|
||||
fn loop_with_break_no_back_edge_from_break() {
|
||||
// A `break` short-circuits the loop body — its edge must NOT be a
|
||||
// A `break` short-circuits the loop body, its edge must NOT be a
|
||||
// back edge to the header (it leaves the loop entirely).
|
||||
let src = b"function f() { while (cond()) { if (done()) break; body(); } }";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
|
|
@ -2879,7 +2880,7 @@ fn chained_method_call_rebinds_to_inner_gated_sink() {
|
|||
// no longer be the recorded callee for this node.
|
||||
if callee.ends_with("https.get") {
|
||||
// The inner-gate path must have populated sink_payload_args
|
||||
// (the gate's payload arg is position 0 — the URL string).
|
||||
// (the gate's payload arg is position 0, the URL string).
|
||||
assert!(
|
||||
info.call.sink_payload_args.is_some(),
|
||||
"expected sink_payload_args to be populated for chained \
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ use super::{
|
|||
member_expr_text, push_node, text_of,
|
||||
};
|
||||
use crate::labels::{DataLabel, LangAnalysisRules, classify};
|
||||
use crate::utils::snippet::truncate_at_char_boundary;
|
||||
use petgraph::graph::NodeIndex;
|
||||
use smallvec::SmallVec;
|
||||
use tree_sitter::Node;
|
||||
|
|
@ -72,20 +73,15 @@ pub(super) fn push_condition_node<'a>(
|
|||
code: &'a [u8],
|
||||
enclosing_func: Option<&str>,
|
||||
) -> NodeIndex {
|
||||
// Pass cond_ast as both args — sub-conditions are never `unless` nodes
|
||||
// Pass cond_ast as both args, sub-conditions are never `unless` nodes
|
||||
let (inner, negated) = detect_negation(cond_ast, cond_ast, lang);
|
||||
let mut vars = Vec::new();
|
||||
collect_idents(inner, code, &mut vars);
|
||||
vars.sort();
|
||||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
let text = text_of(cond_ast, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
let text = text_of(cond_ast, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
let span = (cond_ast.start_byte(), cond_ast.end_byte());
|
||||
g.add_node(NodeInfo {
|
||||
kind: StmtKind::If,
|
||||
|
|
@ -140,7 +136,7 @@ pub(super) fn detect_rust_let_match_guard<'a>(
|
|||
/// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's
|
||||
/// condition text and vars. The let-binding name is added to `condition_vars`
|
||||
/// so `apply_branch_predicates` narrows validation to that specific variable
|
||||
/// — the variable that receives the arm's value and flows to downstream sinks.
|
||||
/// (the variable that receives the arm's value and flows to downstream sinks).
|
||||
pub(super) fn emit_rust_match_guard_if<'a>(
|
||||
g: &mut Cfg,
|
||||
guard: Node<'a>,
|
||||
|
|
@ -154,13 +150,8 @@ pub(super) fn emit_rust_match_guard_if<'a>(
|
|||
vars.sort();
|
||||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
let text = text_of(guard, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
let text = text_of(guard, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
let span = (guard.start_byte(), guard.end_byte());
|
||||
g.add_node(NodeInfo {
|
||||
kind: StmtKind::If,
|
||||
|
|
@ -181,7 +172,7 @@ pub(super) fn emit_rust_match_guard_if<'a>(
|
|||
/// `lhs_text` is then synthesised by SSA lowering at the join.
|
||||
///
|
||||
/// The condition's identifiers live on the If node's `condition_vars`, **not**
|
||||
/// on the branch `uses`. This is the whole point of the split — cond is control
|
||||
/// on the branch `uses`. This is the whole point of the split, cond is control
|
||||
/// flow, branches are data flow.
|
||||
///
|
||||
/// Returns the exit frontier for downstream statement chaining (a single-element
|
||||
|
|
@ -219,7 +210,7 @@ pub(super) fn build_ternary_diamond<'a>(
|
|||
g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang);
|
||||
connect_all(g, preds, cond_if, pred_edge);
|
||||
|
||||
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node) —
|
||||
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node);
|
||||
// a nested ternary recurses and returns its own join node.
|
||||
let true_exits = lower_ternary_branch(
|
||||
cons_ast,
|
||||
|
|
@ -332,7 +323,7 @@ pub(super) fn lower_ternary_branch<'a>(
|
|||
analysis_rules,
|
||||
);
|
||||
|
||||
// The branch expression's own `defines` (if any — typically None for a
|
||||
// The branch expression's own `defines` (if any, typically None for a
|
||||
// pure value expression) is replaced with the outer LHS so that both
|
||||
// branches agree on the target, driving phi insertion at the join.
|
||||
g[node].taint.defines = Some(lhs_text.to_string());
|
||||
|
|
@ -410,7 +401,7 @@ pub(super) fn classify_ternary_lhs(
|
|||
.unwrap_or_default();
|
||||
|
||||
// Try the full dotted path first (e.g. "document.cookie"), then fall back
|
||||
// to the property alone (e.g. "innerHTML") — mirrors the LHS classification
|
||||
// to the property alone (e.g. "innerHTML"), mirrors the LHS classification
|
||||
// already performed in `push_node` for non-split assignments.
|
||||
if let Some(l) = classify(lang, &lhs_text, extra) {
|
||||
labels.push(l);
|
||||
|
|
@ -429,7 +420,7 @@ pub(super) fn classify_ternary_lhs(
|
|||
/// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes
|
||||
/// with short-circuit edges.
|
||||
///
|
||||
/// Returns `(true_exits, false_exits)` — the sets of nodes from which True/False
|
||||
/// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False
|
||||
/// edges should connect to the then/else branches.
|
||||
pub(super) fn build_condition_chain<'a>(
|
||||
cond_ast: Node<'a>,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use tree_sitter::Node;
|
|||
///
|
||||
/// Used by decorator extraction to reduce `login_required`, `permission_required(...)`,
|
||||
/// `flask_login.login_required`, `hasRole('ADMIN')` to their first identifier
|
||||
/// name — the matcher target.
|
||||
/// name, the matcher target.
|
||||
fn leading_ident_text(node: Node<'_>, code: &[u8]) -> Option<String> {
|
||||
let mut cur = node;
|
||||
loop {
|
||||
|
|
@ -56,7 +56,7 @@ fn normalize_decorator_name(raw: &str) -> String {
|
|||
let trimmed = raw.trim();
|
||||
let trimmed = trimmed.trim_start_matches(':').trim_start_matches('@');
|
||||
// If a call syntax leaked through (e.g. `UseGuards(AuthGuard)`), keep only
|
||||
// the head — callers that want the arg handle it separately.
|
||||
// the head, callers that want the arg handle it separately.
|
||||
let head = trimmed
|
||||
.split(['(', ' ', '\t', '\n'])
|
||||
.next()
|
||||
|
|
@ -115,7 +115,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
|
|||
/// are `decorator` nodes containing an `identifier` or `call` expression.
|
||||
/// - **JS/TS**: decorators attach to `method_definition` children or appear
|
||||
/// as siblings inside `class_body`; stage-3 decorators use `decorator` nodes.
|
||||
/// `@UseGuards(AuthGuard)` — we include the call args too.
|
||||
/// `@UseGuards(AuthGuard)`, we include the call args too.
|
||||
/// - **Java**: annotations live in the `modifiers` child of `method_declaration`;
|
||||
/// kinds are `marker_annotation` / `annotation`.
|
||||
/// - **Rust**: `function_item` has `attribute_item` siblings (outer `#[..]`).
|
||||
|
|
@ -127,7 +127,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
|
|||
/// at class body scope applies to every method in the class. `only:` /
|
||||
/// `except:` hash args scope the filter to the listed action names; the
|
||||
/// filter is only recorded for the current method when the scope matches.
|
||||
/// Conditional filters (`if:` / `unless:`) are not honored — those require
|
||||
/// Conditional filters (`if:` / `unless:`) are not honored, those require
|
||||
/// predicate evaluation and are deferred.
|
||||
pub(super) fn extract_auth_decorators<'a>(
|
||||
func_node: Node<'a>,
|
||||
|
|
@ -379,12 +379,12 @@ pub(super) fn extract_auth_decorators<'a>(
|
|||
}
|
||||
|
||||
/// If a Ruby statement is `before_action :name` (or `before_filter :name`),
|
||||
/// push the normalized filter name into `out` — honoring any `only:` / `except:`
|
||||
/// push the normalized filter name into `out`, honoring any `only:` / `except:`
|
||||
/// hash arguments against `method_name`.
|
||||
///
|
||||
/// Positional symbol args (`before_action :a, :b, only: [:x]`) all share the
|
||||
/// single trailing scope. Conditional filters (`if:` / `unless:`) are not
|
||||
/// honored here — those require predicate evaluation and are deferred.
|
||||
/// honored here, those require predicate evaluation and are deferred.
|
||||
fn collect_ruby_before_action(
|
||||
node: Node<'_>,
|
||||
code: &[u8],
|
||||
|
|
@ -499,7 +499,7 @@ fn collect_ruby_before_action(
|
|||
|
||||
/// Parse a single `only:` / `except:` hash pair and append the symbol list into
|
||||
/// the corresponding out-vec. Sets the `*_present` flag when the key is seen,
|
||||
/// regardless of whether the value parses into any symbols — treating
|
||||
/// regardless of whether the value parses into any symbols, treating
|
||||
/// `only: []` as "no actions match" is safer than ignoring the scope.
|
||||
fn collect_ruby_filter_pair(
|
||||
pair_node: Node<'_>,
|
||||
|
|
|
|||
|
|
@ -1,26 +1,28 @@
|
|||
//! Phase 6.1: per-language DTO definition collectors.
|
||||
//! per-language DTO definition collectors.
|
||||
//!
|
||||
//! Walks a parsed file's AST and emits `(class_name, DtoFields)` pairs
|
||||
//! for class / interface / struct / Pydantic-model declarations whose
|
||||
//! field types resolve to a recognised [`TypeKind`].
|
||||
//!
|
||||
//! Strictly additive: classes whose fields cannot be classified produce
|
||||
//! a `DtoFields` with an empty `fields` map — the caller must decide
|
||||
//! a `DtoFields` with an empty `fields` map, the caller must decide
|
||||
//! whether to use that as a "Dto with no inferred fields" or fall back
|
||||
//! to the pre-Phase-6 Object/Unknown classification.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use tree_sitter::Node;
|
||||
|
||||
use super::helpers::text_of;
|
||||
use super::params::{java_type_to_kind, python_primitive_to_kind, ts_type_to_kind};
|
||||
use super::params::{
|
||||
java_type_to_kind, python_primitive_to_kind, ts_type_to_kind, ts_type_to_local_collection,
|
||||
};
|
||||
use crate::ssa::type_facts::{DtoFields, TypeKind};
|
||||
|
||||
/// Collect all DTO-shaped class definitions in a parsed file.
|
||||
///
|
||||
/// Dispatches per-language; returns an empty map for languages without
|
||||
/// a Phase 6 collector (Go, Ruby, PHP, C/C++ — DTOs in those ecosystems
|
||||
/// a collector (Go, Ruby, PHP, C/C++; DTOs in those ecosystems
|
||||
/// either don't follow framework conventions Nyx tracks today, or are
|
||||
/// already covered by other type-inference paths).
|
||||
pub(super) fn collect_dto_classes(
|
||||
|
|
@ -39,6 +41,55 @@ pub(super) fn collect_dto_classes(
|
|||
out
|
||||
}
|
||||
|
||||
/// Collect same-file `type X = Map<...>` / `Set<...>` / `T[]`
|
||||
/// aliases for TS / JS so the param classifier can resolve a
|
||||
/// parameter typed `m: ElementsMap` (where
|
||||
/// `type ElementsMap = Map<K, V>`) to
|
||||
/// [`TypeKind::LocalCollection`].
|
||||
///
|
||||
/// Empty for non-JS/TS languages. Cross-file aliases are not
|
||||
/// resolved here, that requires the multi-file type-resolution
|
||||
/// pipeline that doesn't yet exist for TS. Excalidraw's
|
||||
/// `type ElementsMap = Map<...>` is in
|
||||
/// `packages/element/src/types.ts`; users that import the alias
|
||||
/// without a same-file copy still see the original FP. Most
|
||||
/// real-repo aliases the FP cluster touched were declared in the
|
||||
/// same file as their consumers (see fixture).
|
||||
pub(super) fn collect_type_alias_local_collections(
|
||||
root: Node<'_>,
|
||||
lang: &str,
|
||||
code: &[u8],
|
||||
) -> HashSet<String> {
|
||||
let mut out: HashSet<String> = HashSet::new();
|
||||
if matches!(lang, "typescript" | "ts" | "javascript" | "js") {
|
||||
collect_ts_type_alias_local_collections(root, code, &mut out);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn collect_ts_type_alias_local_collections(root: Node<'_>, code: &[u8], out: &mut HashSet<String>) {
|
||||
walk(root, &mut |node| {
|
||||
if node.kind() != "type_alias_declaration" {
|
||||
return;
|
||||
}
|
||||
let Some(name_node) = node.child_by_field_name("name") else {
|
||||
return;
|
||||
};
|
||||
let Some(alias_name) = text_of(name_node, code) else {
|
||||
return;
|
||||
};
|
||||
let Some(value_node) = node.child_by_field_name("value") else {
|
||||
return;
|
||||
};
|
||||
let Some(value_text) = text_of(value_node, code) else {
|
||||
return;
|
||||
};
|
||||
if ts_type_to_local_collection(value_text.trim()).is_some() {
|
||||
out.insert(alias_name);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Java
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -163,7 +214,7 @@ fn extract_ts_property<'a>(node: Node<'a>, code: &'a [u8]) -> Option<(String, Ty
|
|||
let name_node = node.child_by_field_name("name")?;
|
||||
let field_name = text_of(name_node, code)?;
|
||||
let type_anno = node.child_by_field_name("type")?;
|
||||
// type_annotation node text is `: T` — walk to the inner type.
|
||||
// type_annotation node text is `: T`, walk to the inner type.
|
||||
let type_text = type_anno
|
||||
.named_child(0)
|
||||
.and_then(|t| text_of(t, code))
|
||||
|
|
@ -193,7 +244,7 @@ fn collect_rust(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFields
|
|||
return;
|
||||
};
|
||||
if body.kind() != "field_declaration_list" {
|
||||
// Tuple struct or unit struct — no named fields.
|
||||
// Tuple struct or unit struct, no named fields.
|
||||
return;
|
||||
}
|
||||
let mut fields = DtoFields::new(class_name.clone());
|
||||
|
|
@ -291,7 +342,7 @@ fn collect_python(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFiel
|
|||
/// Conservative supertype scan: returns true when the class definition
|
||||
/// has a superclass list whose text mentions `BaseModel` (covers both
|
||||
/// `BaseModel` and `pydantic.BaseModel`). No false positives on
|
||||
/// non-Pydantic classes named `BaseModel`-something — match is on the
|
||||
/// non-Pydantic classes named `BaseModel`-something, match is on the
|
||||
/// full token, not a substring.
|
||||
fn python_inherits_basemodel<'a>(class_node: Node<'a>, code: &'a [u8]) -> bool {
|
||||
let Some(supers) = class_node.child_by_field_name("superclasses") else {
|
||||
|
|
@ -418,7 +469,7 @@ mod tests {
|
|||
"#;
|
||||
let dtos = collect("rust", src);
|
||||
// Tuple structs have no named fields and must NOT produce a
|
||||
// DtoFields entry — Phase 6 only handles named-field DTOs.
|
||||
// DtoFields entry; this collector only handles named-field DTOs.
|
||||
assert!(!dtos.contains_key("Wrap"));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,11 +19,11 @@ pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option<String> {
|
|||
///
|
||||
/// For `Runtime.getRuntime().exec(cmd)`, the receiver of `exec` is the call
|
||||
/// `Runtime.getRuntime()`. This function drills through that to return
|
||||
/// `"Runtime"` — the outermost non-call object. This lets labels like
|
||||
/// `"Runtime"`, the outermost non-call object. This lets labels like
|
||||
/// `"Runtime.exec"` match correctly.
|
||||
pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<String> {
|
||||
match lookup(lang, n.kind()) {
|
||||
// The receiver is itself a call — drill into ITS receiver.
|
||||
// The receiver is itself a call, drill into ITS receiver.
|
||||
// e.g. for `Runtime.getRuntime()`, the object is `Runtime`.
|
||||
Kind::CallFn | Kind::CallMethod => {
|
||||
let inner = n
|
||||
|
|
@ -53,7 +53,7 @@ pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<Str
|
|||
/// identifier (e.g. call expressions, subscripts, `this`/`self`, etc.).
|
||||
pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
|
||||
let mut cur = n;
|
||||
// Bounded walk — tree-sitter can nest deeply but we only need a handful
|
||||
// Bounded walk, tree-sitter can nest deeply but we only need a handful
|
||||
// of hops for real code.
|
||||
for _ in 0..16 {
|
||||
match cur.kind() {
|
||||
|
|
@ -68,7 +68,7 @@ pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
|
|||
cur = cur.child_by_field_name("value")?;
|
||||
}
|
||||
// Drill through nested calls / method chains to find the base
|
||||
// identifier. E.g. `Connection::open(p).unwrap().execute(...)` —
|
||||
// identifier. E.g. `Connection::open(p).unwrap().execute(...)`:
|
||||
// the receiver of `.execute` is the `.unwrap()` call whose
|
||||
// object is `Connection::open(p)`; we want the leftmost plain
|
||||
// identifier the chain resolves to (for SSA var_stacks lookup).
|
||||
|
|
@ -212,7 +212,7 @@ pub(crate) fn first_call_ident_with_span<'a>(
|
|||
return ident.map(|s| (s, span));
|
||||
}
|
||||
Kind::Function => {
|
||||
// Do not descend into nested function/lambda bodies —
|
||||
// Do not descend into nested function/lambda bodies;
|
||||
// they are separate scopes and should not contribute
|
||||
// callee identifiers to the parent expression.
|
||||
continue;
|
||||
|
|
@ -240,7 +240,7 @@ pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> O
|
|||
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
|
||||
///
|
||||
/// Returns `(callee_text, label, span)` where `span` is the byte range of the
|
||||
/// inner call node itself — used to populate `CallMeta.callee_span` so that
|
||||
/// inner call node itself, used to populate `CallMeta.callee_span` so that
|
||||
/// display sites can report the actual call location rather than the enclosing
|
||||
/// statement's span.
|
||||
pub(crate) fn find_classifiable_inner_call<'a>(
|
||||
|
|
@ -251,7 +251,7 @@ pub(crate) fn find_classifiable_inner_call<'a>(
|
|||
) -> Option<(String, DataLabel, (usize, usize))> {
|
||||
let mut cursor = n.walk();
|
||||
for c in n.children(&mut cursor) {
|
||||
// Do not descend into Kind::Function nodes — they will be extracted
|
||||
// Do not descend into Kind::Function nodes, they will be extracted
|
||||
// as separate BodyCfg entries and should not contribute inner callees
|
||||
// to the parent expression.
|
||||
if lookup(lang, c.kind()) == Kind::Function {
|
||||
|
|
@ -329,7 +329,7 @@ pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"member_expression" | "attribute" | "selector_expression" => {
|
||||
// Tree-sitter exposes the receiver under `object` (JS/TS, Python),
|
||||
// `value` (Rust field_expression — handled in the matching arm
|
||||
// `value` (Rust field_expression, handled in the matching arm
|
||||
// above), or `operand` (Go selector_expression). Without the
|
||||
// `operand` fallback, Go member access like `r.Body` collapsed to
|
||||
// just the trailing field (`Body`), so source rules keyed on the
|
||||
|
|
@ -442,7 +442,7 @@ pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
|
|||
/// This finds anonymous functions / arrow functions / closures that are
|
||||
/// passed as arguments to a call and should be analysed as separate
|
||||
/// function scopes. Only direct function-argument children are collected
|
||||
/// (not functions nested inside other functions — those get handled when
|
||||
/// (not functions nested inside other functions, those get handled when
|
||||
/// the outer function is recursed into).
|
||||
pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
|
||||
let mut funcs = Vec::new();
|
||||
|
|
@ -558,7 +558,7 @@ pub(crate) fn derive_anon_fn_name_from_context<'a>(
|
|||
}
|
||||
|
||||
// Python: `h = lambda: ...` parents as `assignment`, handled above.
|
||||
// Python `default_parameter` assigning `def foo(x=lambda: 0)` — ambiguous, skip.
|
||||
// Python `default_parameter` assigning `def foo(x=lambda: 0)`, ambiguous, skip.
|
||||
_ => {
|
||||
// Some grammars wrap the RHS in an `expression`, `expression_list`,
|
||||
// or similar node between the binding site and the function literal.
|
||||
|
|
@ -709,7 +709,7 @@ pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: AST kind names for subscript / index expressions
|
||||
/// AST kind names for subscript / index expressions
|
||||
/// across the languages whose container-element flow we model.
|
||||
///
|
||||
/// JS/TS use `subscript_expression`; Python uses `subscript`; Go uses
|
||||
|
|
@ -724,7 +724,7 @@ pub(crate) fn is_subscript_kind(kind: &str) -> bool {
|
|||
)
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: when the LHS of an assignment statement is a
|
||||
/// when the LHS of an assignment statement is a
|
||||
/// subscript / index expression (or a single-element wrapper around
|
||||
/// one), return that node. Returns `None` for multi-target Go
|
||||
/// `expression_list`s, identifier LHSs, member-expression LHSs, etc.
|
||||
|
|
@ -745,10 +745,10 @@ pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option<Node<'
|
|||
None
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: extract `(array_text, index_text)` from a
|
||||
/// extract `(array_text, index_text)` from a
|
||||
/// subscript / index AST node.
|
||||
///
|
||||
/// Returns `None` when the array operand is not a plain identifier — we
|
||||
/// Returns `None` when the array operand is not a plain identifier, we
|
||||
/// only synthesise `__index_get__` / `__index_set__` calls when the
|
||||
/// receiver resolves cleanly to a SSA-renamed local, since the W2/W4
|
||||
/// container hooks need a stable receiver var_name to drive
|
||||
|
|
@ -771,7 +771,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
|
|||
n.named_children(&mut cur).nth(1)
|
||||
})?;
|
||||
let arr_kind = arr.kind();
|
||||
// Only proceed when the array is a plain identifier — otherwise
|
||||
// Only proceed when the array is a plain identifier, otherwise
|
||||
// we can't bind a stable receiver name for the synth Call.
|
||||
if !matches!(
|
||||
arr_kind,
|
||||
|
|
@ -780,7 +780,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
|
|||
return None;
|
||||
}
|
||||
let arr_text = text_of(arr, code)?;
|
||||
// PHP-style `$x` strip not needed here — Go/JS/Python don't use it.
|
||||
// PHP-style `$x` strip not needed here, Go/JS/Python don't use it.
|
||||
let idx_text = text_of(idx, code)?;
|
||||
Some((arr_text, idx_text))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
//! Phase 6: per-language class / trait / interface hierarchy extraction.
|
||||
//! per-language class / trait / interface hierarchy extraction.
|
||||
//!
|
||||
//! Walks a parsed file's AST and emits `(sub_container, super_container)`
|
||||
//! pairs for every declared inheritance / impl / implements relationship.
|
||||
|
|
@ -47,7 +47,7 @@ pub(crate) fn collect_hierarchy_edges(
|
|||
"php" => collect_php(root, code, &mut push),
|
||||
"cpp" | "c++" => collect_cpp(root, code, &mut push),
|
||||
// Go: structural / implicit interface satisfaction is intractable
|
||||
// per-file; Phase 6 deliberately skips it.
|
||||
// per-file; deliberately skipped.
|
||||
// C: no inheritance.
|
||||
_ => {}
|
||||
}
|
||||
|
|
@ -70,7 +70,7 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
|
|||
let Some(sub) = text_of(name_node, code) else {
|
||||
return;
|
||||
};
|
||||
// `superclass` field on class_declaration — singular `extends Y`.
|
||||
// `superclass` field on class_declaration, singular `extends Y`.
|
||||
if let Some(superclass) = node.child_by_field_name("superclass") {
|
||||
let mut cursor = superclass.walk();
|
||||
for c in superclass.named_children(&mut cursor) {
|
||||
|
|
@ -79,13 +79,13 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
|
|||
}
|
||||
}
|
||||
}
|
||||
// `interfaces` field on class_declaration — `implements I, J`
|
||||
// `interfaces` field on class_declaration, `implements I, J`
|
||||
// wraps a `super_interfaces` → `type_list`.
|
||||
if let Some(ifaces) = node.child_by_field_name("interfaces") {
|
||||
collect_java_type_list(ifaces, code, &sub, push);
|
||||
}
|
||||
// `extends_interfaces` is an unnamed child on
|
||||
// interface_declaration — `extends Foo, Bar` for an
|
||||
// interface_declaration, `extends Foo, Bar` for an
|
||||
// interface. Walk children directly since it's not a field.
|
||||
let mut cursor = node.walk();
|
||||
for c in node.named_children(&mut cursor) {
|
||||
|
|
@ -123,7 +123,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"type_identifier" | "identifier" => text_of(n, code),
|
||||
"generic_type" => {
|
||||
// `Foo<T>` — the leading child is the bare type identifier.
|
||||
// `Foo<T>`, the leading child is the bare type identifier.
|
||||
let mut cursor = n.walk();
|
||||
for c in n.named_children(&mut cursor) {
|
||||
if matches!(
|
||||
|
|
@ -136,7 +136,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
None
|
||||
}
|
||||
"scoped_type_identifier" => {
|
||||
// `pkg.Foo` — return last segment.
|
||||
// `pkg.Foo`, return last segment.
|
||||
text_of(n, code).map(|s| {
|
||||
let last = s.rsplit('.').next().unwrap_or(&s);
|
||||
last.to_string()
|
||||
|
|
@ -152,7 +152,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
|
||||
/// Walk for `impl_item` nodes and emit edges from the concrete type to
|
||||
/// the trait being implemented. Inherent impls (`impl Foo {}`) emit
|
||||
/// no edge — there is no super-trait relationship to record.
|
||||
/// no edge, there is no super-trait relationship to record.
|
||||
fn collect_rust<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mut F) {
|
||||
walk(root, &mut |node| {
|
||||
if node.kind() != "impl_item" {
|
||||
|
|
@ -179,7 +179,7 @@ fn rust_path_leaf(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"type_identifier" | "identifier" => text_of(n, code),
|
||||
"scoped_type_identifier" | "scoped_identifier" => {
|
||||
// `crate::foo::Bar` — last segment.
|
||||
// `crate::foo::Bar`, last segment.
|
||||
let s = text_of(n, code)?;
|
||||
Some(s.rsplit("::").next().unwrap_or(&s).to_string())
|
||||
}
|
||||
|
|
@ -286,12 +286,12 @@ fn collect_python<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &
|
|||
let Some(superclasses) = node.child_by_field_name("superclasses") else {
|
||||
return; // no parents
|
||||
};
|
||||
// `superclasses` is an `argument_list` — each non-keyword
|
||||
// `superclasses` is an `argument_list`, each non-keyword
|
||||
// argument is a base class.
|
||||
let mut cursor = superclasses.walk();
|
||||
for arg in superclasses.named_children(&mut cursor) {
|
||||
if let Some(t) = python_base_text(arg, code) {
|
||||
// Skip Python `object` — not informative.
|
||||
// Skip Python `object`, not informative.
|
||||
if t != "object" {
|
||||
push(sub.clone(), t);
|
||||
}
|
||||
|
|
@ -304,7 +304,7 @@ fn python_base_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"identifier" => text_of(n, code),
|
||||
"attribute" => {
|
||||
// `pkg.Base` — last segment.
|
||||
// `pkg.Base`, last segment.
|
||||
let s = text_of(n, code)?;
|
||||
Some(s.rsplit('.').next().unwrap_or(&s).to_string())
|
||||
}
|
||||
|
|
@ -474,7 +474,7 @@ mod tests {
|
|||
let src = "interface Mine extends Foo, Bar {}";
|
||||
let edges = collect("java", src);
|
||||
// tree-sitter-java models `extends` on interface as `extends_interfaces`
|
||||
// rooted at the same node — at least one of the parents should land.
|
||||
// rooted at the same node, at least one of the parents should land.
|
||||
assert!(
|
||||
edges.iter().any(|(s, _)| s == "Mine"),
|
||||
"interface extends should emit at least one edge; got {edges:?}"
|
||||
|
|
@ -516,8 +516,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn python_class_object_base_skipped() {
|
||||
// Inheriting from `object` is not informative — Python's
|
||||
// implicit root. Phase 6 omits these edges to keep the
|
||||
// Inheriting from `object` is not informative, Python's
|
||||
// implicit root. We omit these edges to keep the
|
||||
// hierarchy index focused on user-defined relationships.
|
||||
let src = "class Plain(object):\n pass\n";
|
||||
let edges = collect("python", src);
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ use tree_sitter::{Node, Tree};
|
|||
/// - ES6: `import { A as B } from 'mod'` → B → ImportBinding { original: A, module: mod }
|
||||
/// - CommonJS: `const { A: B } = require('mod')` → B → ImportBinding { original: A, module: mod }
|
||||
///
|
||||
/// Only aliased (renamed) bindings are recorded — same-name imports (e.g.
|
||||
/// Only aliased (renamed) bindings are recorded, same-name imports (e.g.
|
||||
/// `import { exec }`) are already resolvable by their original name.
|
||||
pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBindings {
|
||||
let mut bindings = ImportBindings::new();
|
||||
|
|
@ -149,7 +149,7 @@ pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBinding
|
|||
continue;
|
||||
}
|
||||
// The alias is accessed via the "alias" field (a `name` node).
|
||||
// The qualified name has no field — find it by kind.
|
||||
// The qualified name has no field, find it by kind.
|
||||
let alias_node = clause.child_by_field_name("alias");
|
||||
let mut c2 = clause.walk();
|
||||
let qname_node = clause
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
|||
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
|
||||
/// identifiers lifted from pair values whose key matches any entry in
|
||||
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
|
||||
/// pairs are present, returns `Some(vec![])` — the sink is effectively
|
||||
/// pairs are present, returns `Some(vec![])`, the sink is effectively
|
||||
/// silenced because no destination identifier exists.
|
||||
/// * `None` if the arg is absent, is not an object literal (plain string
|
||||
/// / ident / expression), or has splat/spread children that break static
|
||||
|
|
@ -77,7 +77,7 @@ pub(super) fn extract_destination_field_idents(
|
|||
match child.kind() {
|
||||
// `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't
|
||||
// statically attribute spread contents to specific fields, so
|
||||
// bail out — caller falls back to the whole-arg filter, matching
|
||||
// bail out, caller falls back to the whole-arg filter, matching
|
||||
// the conservative posture used by arg_uses for splats.
|
||||
"spread_element" | "dictionary_splat" => {
|
||||
return None;
|
||||
|
|
@ -107,7 +107,7 @@ pub(super) fn extract_destination_field_idents(
|
|||
}
|
||||
}),
|
||||
// Computed keys like `[someVar]` can't be statically
|
||||
// resolved — skip (conservative: not a destination field).
|
||||
// resolved, skip (conservative: not a destination field).
|
||||
"computed_property_name" => continue,
|
||||
_ => text_of(key_node, code),
|
||||
};
|
||||
|
|
@ -200,7 +200,7 @@ pub(super) fn extract_const_keyword_arg(
|
|||
continue;
|
||||
}
|
||||
let value_node = child.child_by_field_name("value")?;
|
||||
// Only return a literal — identifiers / calls / complex exprs are
|
||||
// Only return a literal, identifiers / calls / complex exprs are
|
||||
// "dynamic" and must be reported as `None` so the gate can
|
||||
// distinguish literal-safe from dynamic.
|
||||
return match value_node.kind() {
|
||||
|
|
@ -252,7 +252,7 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8])
|
|||
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
|
||||
/// `arg0`). Returns `None` when the call has no arguments.
|
||||
///
|
||||
/// Used by per-language shape-aware sink suppression — for example, Ruby
|
||||
/// Used by per-language shape-aware sink suppression, for example, Ruby
|
||||
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
|
||||
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
|
||||
/// regardless of taint reaching that argument.
|
||||
|
|
@ -268,7 +268,7 @@ pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bo
|
|||
|
||||
/// Walk a Java method-chain receiver looking for an inner `method_invocation`
|
||||
/// whose method name matches one of `target_methods` (e.g. `createQuery`,
|
||||
/// `prepareStatement`). Returns the kind of that inner call's arg 0 — used
|
||||
/// `prepareStatement`). Returns the kind of that inner call's arg 0, used
|
||||
/// to verify the SQL-bearing call up-chain was given a string literal rather
|
||||
/// than a concatenation / method call.
|
||||
///
|
||||
|
|
@ -307,7 +307,7 @@ pub(super) fn java_chain_arg0_kind_for_method(
|
|||
/// method identifier matches one of `target_methods`, then return that
|
||||
/// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node
|
||||
/// represents a chained expression like `Model.where(...).preload(...).to_a`
|
||||
/// — the outermost call (`to_a`) has no arguments, so the shape suppressor
|
||||
/// where the outermost call (`to_a`) has no arguments, so the shape suppressor
|
||||
/// must reach down the chain to inspect `where`'s arg 0.
|
||||
///
|
||||
/// Conservative: returns `None` if the chain doesn't contain a matching
|
||||
|
|
@ -353,6 +353,116 @@ fn subtree_has_interpolation(n: Node) -> bool {
|
|||
n.named_children(&mut cursor).any(subtree_has_interpolation)
|
||||
}
|
||||
|
||||
/// Walk a JS/TS method-chain receiver-side to find an inner `call_expression`
|
||||
/// whose member-property name matches one of `target_methods` (e.g. `query`,
|
||||
/// `execute`). Returns the `(kind, has_interp)` of that inner call's arg 0.
|
||||
///
|
||||
/// Used to recognise ORM-accessor chains where a labelled SQL sink sits on
|
||||
/// the receiver side of a parameterised execute method:
|
||||
/// `strapi.db.query('admin::api-token').findOne({...})`. The outer call
|
||||
/// (`findOne`) is the CFG node; the inner labelled `db.query` call carries
|
||||
/// the literal model UID that proves the chain is parameterised.
|
||||
///
|
||||
/// Conservative: returns `None` when no matching inner call is found, so
|
||||
/// callers fall through to the no-suppression path.
|
||||
pub(super) fn js_chain_arg0_kind_for_method(
|
||||
expr: Node,
|
||||
target_methods: &[&str],
|
||||
code: &[u8],
|
||||
) -> Option<(String, bool)> {
|
||||
let n = unwrap_parens(expr);
|
||||
// tree-sitter-typescript / -javascript: call_expression with fields
|
||||
// `function` (member_expression / identifier) and `arguments`.
|
||||
if n.kind() == "call_expression" {
|
||||
// Check this call's callee: if its property name (or full text) ends
|
||||
// with one of `target_methods`, this is the inner labelled call.
|
||||
if let Some(function) = n.child_by_field_name("function") {
|
||||
// Property of a member_expression; falls back to the function
|
||||
// text itself for bare-identifier calls.
|
||||
let prop_text = function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code));
|
||||
let full_text = text_of(function, code);
|
||||
let leaf_text = full_text
|
||||
.as_ref()
|
||||
.map(|s| s.rsplit('.').next().unwrap_or(s).to_string());
|
||||
let matched = target_methods.iter().any(|m| {
|
||||
prop_text.as_deref() == Some(*m)
|
||||
|| leaf_text.as_deref() == Some(*m)
|
||||
|| full_text.as_deref() == Some(*m)
|
||||
|| full_text
|
||||
.as_deref()
|
||||
.is_some_and(|s| s.ends_with(&format!(".{m}")))
|
||||
});
|
||||
if matched {
|
||||
return arg0_kind_and_interpolation(n);
|
||||
}
|
||||
// Drill down the receiver spine: function.object is the prior
|
||||
// call in the chain.
|
||||
if let Some(object) = function.child_by_field_name("object")
|
||||
&& let Some(found) = js_chain_arg0_kind_for_method(object, target_methods, code)
|
||||
{
|
||||
return Some(found);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Walk the receiver chain of a JS/TS call to count *non-execute* method
|
||||
/// calls between the outer call and an inner labelled call to
|
||||
/// `target_inner` (e.g. `query`, `execute`). Returns the immediate outer
|
||||
/// chain method name (e.g. `findOne`) when an inner-call to `target_inner`
|
||||
/// exists somewhere on the receiver spine, otherwise `None`.
|
||||
///
|
||||
/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain
|
||||
/// shape `<inner>.query(LITERAL).<orm_method>(...)`, bare
|
||||
/// `connection.query("SELECT ...")` returns `None` because there is no
|
||||
/// outer chain method.
|
||||
pub(super) fn js_chain_outer_method_for_inner<'a>(
|
||||
outer: Node<'a>,
|
||||
target_inner: &[&str],
|
||||
code: &'a [u8],
|
||||
) -> Option<String> {
|
||||
let n = unwrap_parens(outer);
|
||||
if n.kind() != "call_expression" {
|
||||
return None;
|
||||
}
|
||||
let function = n.child_by_field_name("function")?;
|
||||
let object = function.child_by_field_name("object")?;
|
||||
// If `object` itself is a call_expression whose property matches
|
||||
// `target_inner`, the immediate outer is `function.property`.
|
||||
if object.kind() == "call_expression" {
|
||||
let inner_function = object.child_by_field_name("function");
|
||||
if let Some(inner_function) = inner_function {
|
||||
let prop_text = inner_function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code));
|
||||
let full_text = text_of(inner_function, code);
|
||||
let leaf_text = full_text
|
||||
.as_ref()
|
||||
.map(|s| s.rsplit('.').next().unwrap_or(s).to_string());
|
||||
let inner_matched = target_inner.iter().any(|m| {
|
||||
prop_text.as_deref() == Some(*m)
|
||||
|| leaf_text.as_deref() == Some(*m)
|
||||
|| full_text.as_deref() == Some(*m)
|
||||
|| full_text
|
||||
.as_deref()
|
||||
.is_some_and(|s| s.ends_with(&format!(".{m}")))
|
||||
});
|
||||
if inner_matched {
|
||||
return function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code).map(|s| s.to_string()));
|
||||
}
|
||||
}
|
||||
// Recurse: outer chain may have more depth (`a.b().c().d()` ,
|
||||
// d is outermost, c is next, target may be at b or further in).
|
||||
return js_chain_outer_method_for_inner(object, target_inner, code);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// For a chained method call (`a.b().c().d()`), walk down the receiver
|
||||
/// chain (`function.object`) and return the innermost call_expression
|
||||
/// alongside its callee text (e.g. `"http.get"`).
|
||||
|
|
@ -385,7 +495,7 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
return None;
|
||||
}
|
||||
// Recurse: the inner call may itself be chained
|
||||
// (`axios.get(u).then(h).catch(h)` — innermost is `axios.get`).
|
||||
// (`axios.get(u).then(h).catch(h)`, innermost is `axios.get`).
|
||||
if let Some(inner) = find_chained_inner_call(object, lang, code) {
|
||||
return Some(inner);
|
||||
}
|
||||
|
|
@ -398,7 +508,7 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
.or_else(|| object.child_by_field_name("name"))?;
|
||||
// Multi-line dotted member expressions (`http\n .get`) include
|
||||
// formatting whitespace in the source-text slice. The labels map
|
||||
// keys are literal `"http.get"` etc. — strip whitespace so the
|
||||
// keys are literal `"http.get"` etc., strip whitespace so the
|
||||
// chained-call inner-gate rebinding fires for both single-line and
|
||||
// multi-line chain styles. Also strips `\r` for CRLF sources.
|
||||
// Motivated by upstream Parse Server CVE-2025-64430 which uses the
|
||||
|
|
@ -410,18 +520,18 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
|
||||
/// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod
|
||||
/// node) and yield each *named argument* of every inner call along the
|
||||
/// way. Outer's own arguments are NOT included — the caller already
|
||||
/// way. Outer's own arguments are NOT included, the caller already
|
||||
/// handles those via the standard `pre_emit_arg_source_nodes` pass over
|
||||
/// `outer.arguments`.
|
||||
///
|
||||
/// For `json.NewDecoder(r.Body).Decode(emoji)`:
|
||||
/// outer = `.Decode(emoji)` — caller iterates `emoji`
|
||||
/// inner = `json.NewDecoder(r.Body)` — yielded arg: `r.Body`
|
||||
/// outer = `.Decode(emoji)` , caller iterates `emoji`
|
||||
/// inner = `json.NewDecoder(r.Body)` , yielded arg: `r.Body`
|
||||
///
|
||||
/// We only pull from each inner call's `arguments` field, never from its
|
||||
/// `function`/`method`/receiver expressions. That distinction matters
|
||||
/// because chained source-receivers like `r.URL.Query()` expose a
|
||||
/// member-text path that classifies as a Source — but it's the OUTER
|
||||
/// member-text path that classifies as a Source, but it's the OUTER
|
||||
/// chain text (`r.URL.Query.Get`) that already classifies, so emitting
|
||||
/// a synth source for the inner-call's own callee would double-count.
|
||||
///
|
||||
|
|
@ -498,7 +608,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
return false;
|
||||
}
|
||||
let first_arg = named[0];
|
||||
// Extract the raw text of arg 0 — must be a string literal or
|
||||
// Extract the raw text of arg 0, must be a string literal or
|
||||
// template string without interpolation.
|
||||
let query_text = match first_arg.kind() {
|
||||
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
|
||||
|
|
@ -511,7 +621,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
.named_children(&mut c)
|
||||
.any(|ch| ch.kind() == "template_substitution")
|
||||
{
|
||||
return false; // dynamic — not safe
|
||||
return false; // dynamic, not safe
|
||||
}
|
||||
text_of(first_arg, code)
|
||||
}
|
||||
|
|
@ -534,7 +644,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
/// - `$1`, `$2`, …, `$N` (PostgreSQL positional)
|
||||
/// - `?` (MySQL / SQLite positional)
|
||||
/// - `%s` (Python DB-API / psycopg2)
|
||||
/// - `:identifier` (Oracle / named parameters) — requires the colon to be
|
||||
/// - `:identifier` (Oracle / named parameters), requires the colon to be
|
||||
/// preceded by a space or `=` (to avoid matching JS ternary / object
|
||||
/// literals).
|
||||
pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
||||
|
|
@ -559,7 +669,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
|||
&& i + 1 < len
|
||||
&& bytes[i + 1].is_ascii_alphabetic() =>
|
||||
{
|
||||
// :identifier — must be preceded by whitespace/= to avoid
|
||||
// :identifier, must be preceded by whitespace/= to avoid
|
||||
// false positives on object literals or ternary operators.
|
||||
return true;
|
||||
}
|
||||
|
|
@ -581,7 +691,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
|||
#[allow(clippy::only_used_in_recursion)]
|
||||
pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
||||
match node.kind() {
|
||||
// Scalar strings — but reject if they contain interpolation
|
||||
// Scalar strings, but reject if they contain interpolation
|
||||
// (e.g. Ruby `"hello #{name}"`, Python f-strings).
|
||||
"string"
|
||||
| "string_literal"
|
||||
|
|
@ -602,7 +712,7 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
|||
// PHP encapsed_string: safe only if no variable interpolation
|
||||
"encapsed_string" => !has_interpolation_cfg(node),
|
||||
|
||||
// Wrapper: PHP/Go wrap each arg in an `argument` node — unwrap
|
||||
// Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap
|
||||
"argument" => {
|
||||
node.named_child_count() == 1
|
||||
&& node
|
||||
|
|
@ -765,7 +875,7 @@ pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
// Zero-arg calls are not "all literal" — taint can still flow via a
|
||||
// Zero-arg calls are not "all literal", taint can still flow via a
|
||||
// non-literal receiver (e.g. `tainted.readObject()`), and the sink-
|
||||
// suppression gate (`info.all_args_literal`) must not skip these.
|
||||
if !any_arg {
|
||||
|
|
@ -781,7 +891,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
|
|||
let mut cursor = node.walk();
|
||||
for child in node.children(&mut cursor) {
|
||||
let kind = child.kind();
|
||||
// Skip argument lists — those are checked by the caller.
|
||||
// Skip argument lists, those are checked by the caller.
|
||||
if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -804,7 +914,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
|
|||
/// Returns one `Vec<String>` per argument (in parameter-position order).
|
||||
/// Returns empty if argument list can't be found or contains spread/keyword args.
|
||||
pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>> {
|
||||
// Ruby `subshell` (backticks) has no `arguments` field — its children are
|
||||
// Ruby `subshell` (backticks) has no `arguments` field, its children are
|
||||
// string fragments and `interpolation` nodes. Lift each interpolation's
|
||||
// identifiers into a positional arg so taint flows from `#{var}` into the
|
||||
// synthetic "subshell" sink.
|
||||
|
|
@ -834,7 +944,7 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
|
|||
for child in args_node.named_children(&mut cursor) {
|
||||
let kind = child.kind();
|
||||
// Named / keyword arguments are tracked separately in `CallMeta.kwargs`
|
||||
// and do not participate in positional indexing — skip them here so
|
||||
// and do not participate in positional indexing, skip them here so
|
||||
// `arg_uses` remains strictly positional. Splats (spread/dict splat)
|
||||
// still invalidate positional mapping; bail out in that case.
|
||||
if kind == "spread_element"
|
||||
|
|
@ -1058,13 +1168,13 @@ pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -
|
|||
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
|
||||
/// (non-method-chain) Go shape. The caller wires the resulting cap into
|
||||
/// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the
|
||||
/// taint engine consumes via the standard sanitizer pathway — taint flows
|
||||
/// taint engine consumes via the standard sanitizer pathway, taint flows
|
||||
/// in on `s`, the matching cap is stripped from the result.
|
||||
pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
|
||||
if call_ast.kind() != "call_expression" {
|
||||
return None;
|
||||
}
|
||||
// The call's `function` field is a `selector_expression` — `operand`
|
||||
// The call's `function` field is a `selector_expression`: `operand`
|
||||
// is the package ident (`strings`), `field` is the method ident.
|
||||
let func = call_ast.child_by_field_name("function")?;
|
||||
if func.kind() != "selector_expression" {
|
||||
|
|
@ -1085,7 +1195,7 @@ pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> O
|
|||
let new_lit = extract_const_string_arg(call_ast, 2, code)?;
|
||||
|
||||
// If the replacement itself reintroduces a dangerous sequence, don't
|
||||
// credit the strip — matches the Rust chain detector's policy.
|
||||
// credit the strip; this matches the Rust chain detector's policy.
|
||||
if !caps_stripped_by_literal_pattern(&new_lit).is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
|
@ -1106,7 +1216,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
|
|||
}
|
||||
match lookup(lang, n.kind()) {
|
||||
Kind::Function => {
|
||||
// Function/closure expression passed as argument — return the same
|
||||
// Function/closure expression passed as argument, return the same
|
||||
// synthetic anon name used by build_sub so callback_bindings and
|
||||
// source_to_callback can match it to the extracted BodyCfg.
|
||||
n.child_by_field_name("name")
|
||||
|
|
@ -1155,7 +1265,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
|
|||
/// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`].
|
||||
///
|
||||
/// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces
|
||||
/// an empty vector — positional indices past the splat are meaningless and
|
||||
/// an empty vector, positional indices past the splat are meaningless and
|
||||
/// downstream passes already treat an empty vector as "no info".
|
||||
pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<Option<String>> {
|
||||
let Some(args_node) = call_node.child_by_field_name("arguments") else {
|
||||
|
|
@ -1175,7 +1285,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
return Vec::new();
|
||||
}
|
||||
// Named / keyword arguments are tracked separately in `kwargs` and
|
||||
// don't participate in positional indexing — skip them here so this
|
||||
// don't participate in positional indexing, skip them here so this
|
||||
// vector stays aligned with `arg_uses`.
|
||||
if kind == "keyword_argument" || kind == "named_argument" {
|
||||
continue;
|
||||
|
|
@ -1198,7 +1308,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
| "raw_string_literal"
|
||||
// PHP's double-quoted form (single-quoted maps to `string`).
|
||||
// Only safe to lift when there is no `encapsed_string` /
|
||||
// `embedded_expression` interpolation child — checked below.
|
||||
// `embedded_expression` interpolation child, checked below.
|
||||
| "encapsed_string" => {
|
||||
let raw = text_of(target, code);
|
||||
raw.and_then(|s| strip_literal_quotes(&s, target, code))
|
||||
|
|
@ -1212,7 +1322,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
|
||||
/// Strip surrounding quotes from a syntactic string literal, resolving the
|
||||
/// `string_content` child for Rust-style two-level string nodes. Returns the
|
||||
/// raw inner text (no escape-sequence processing) — sufficient for whitelist
|
||||
/// raw inner text (no escape-sequence processing), sufficient for whitelist
|
||||
/// matching against shell-metachar sets.
|
||||
pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option<String> {
|
||||
// Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child.
|
||||
|
|
@ -1320,7 +1430,7 @@ pub(super) fn def_use(
|
|||
// Python/Ruby `expression_statement` → `assignment`)
|
||||
let mut cursor = ast.walk();
|
||||
for child in ast.children(&mut cursor) {
|
||||
// Only use left/right fields for actual assignment nodes — binary
|
||||
// Only use left/right fields for actual assignment nodes; binary
|
||||
// expressions also have left/right but are not definitions.
|
||||
let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
|
||||
let child_name = child
|
||||
|
|
@ -1403,7 +1513,7 @@ pub(super) fn def_use(
|
|||
(defs, uses, vec![])
|
||||
}
|
||||
|
||||
// if‑let / while‑let — the `let_condition` binds a variable from
|
||||
// if‑let / while‑let, the `let_condition` binds a variable from
|
||||
// the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
|
||||
// defines `cmd` and uses `env`, `var`, `CMD`.
|
||||
Kind::If | Kind::While => {
|
||||
|
|
@ -1418,7 +1528,7 @@ pub(super) fn def_use(
|
|||
let mut tmp = Vec::<String>::new();
|
||||
collect_idents(pat, code, &mut tmp);
|
||||
// The first plain identifier in the pattern is the binding.
|
||||
// Skip type identifiers (e.g. "Ok" in Ok(cmd)) — take the
|
||||
// Skip type identifiers (e.g. "Ok" in Ok(cmd)), take the
|
||||
// last ident which is the inner binding name.
|
||||
defs = tmp.into_iter().last();
|
||||
}
|
||||
|
|
|
|||
443
src/cfg/mod.rs
443
src/cfg/mod.rs
|
|
@ -14,6 +14,7 @@ use crate::labels::{
|
|||
};
|
||||
use crate::summary::FuncSummary;
|
||||
use crate::symbol::{FuncKey, Lang};
|
||||
use crate::utils::snippet::truncate_at_char_boundary;
|
||||
use smallvec::SmallVec;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
|
@ -54,8 +55,8 @@ use literals::{
|
|||
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
|
||||
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
|
||||
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
|
||||
is_parameterized_query_call, java_chain_arg0_kind_for_method, ruby_chain_arg0_for_method,
|
||||
walk_chain_inner_call_args,
|
||||
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
|
||||
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
|
||||
};
|
||||
use params::{
|
||||
compute_container_and_kind, extract_param_meta, inject_framework_param_sources,
|
||||
|
|
@ -74,7 +75,7 @@ pub fn extract_param_meta_for_test<'a>(
|
|||
}
|
||||
|
||||
/// Test-only helper to populate the per-file DTO class map without
|
||||
/// running `build_cfg`. Used by the Phase 6 audit harness in
|
||||
/// running `build_cfg`. Used by the DTO audit harness in
|
||||
/// `tests/typed_extractors_audit.rs` to verify that
|
||||
/// `classify_param_type_*` resolves a same-file DTO via the
|
||||
/// thread-local map.
|
||||
|
|
@ -91,30 +92,26 @@ pub fn clear_dto_classes_for_test() {
|
|||
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Structural DFS index for function bodies
|
||||
// -------------------------------------------------------------------------
|
||||
//
|
||||
// Per-file map of function-node start_byte → depth-first preorder index.
|
||||
// Populated at the start of `build_cfg`, consumed by every site that
|
||||
// previously formatted `<anon@{start_byte}>` or stored `start_byte` as
|
||||
// the disambig. The DFS index is stable against edits elsewhere in the
|
||||
// file (inserting a line above a function does not change its index).
|
||||
//
|
||||
// Thread-local is safe because `build_cfg` is not re-entrant within a
|
||||
// single rayon worker: each file is parsed and CFG-built to completion
|
||||
// before the next one starts.
|
||||
// Per-file map of function-node start_byte → DFS preorder index. Stable
|
||||
// against unrelated edits (inserting a line above a function doesn't
|
||||
// change its index). Thread-local is safe because `build_cfg` is not
|
||||
// re-entrant within a single rayon worker.
|
||||
thread_local! {
|
||||
static FN_DFS_INDICES: RefCell<HashMap<usize, u32>> = RefCell::new(HashMap::new());
|
||||
/// Phase 6: per-file DTO class definitions. Populated at the top
|
||||
/// of [`build_cfg`] by [`dto::collect_dto_classes`] so per-parameter
|
||||
/// classifiers can resolve `@RequestBody T dto` /
|
||||
/// `Json<CreateUser>` / `Annotated[CreateUser, Body()]` to a
|
||||
/// [`crate::ssa::type_facts::TypeKind::Dto`] when the DTO type is
|
||||
/// declared in the same file. Cleared at the end of `build_cfg`
|
||||
/// so thread-local state never leaks between files.
|
||||
/// Per-file DTO class definitions, populated at the top of
|
||||
/// [`build_cfg`] so per-parameter classifiers can resolve typed
|
||||
/// extractors against same-file DTOs.
|
||||
pub(crate) static DTO_CLASSES: RefCell<HashMap<String, crate::ssa::type_facts::DtoFields>>
|
||||
= RefCell::new(HashMap::new());
|
||||
/// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` /
|
||||
/// `Array<...>` / `T[]`) aliases, populated at the top of
|
||||
/// [`build_cfg`]. Lets `classify_param_type_ts` resolve a
|
||||
/// parameter typed `m: ElementsMap` to
|
||||
/// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via
|
||||
/// same-file alias lookup. Cross-file aliases are not yet
|
||||
/// resolved.
|
||||
pub(crate) static TYPE_ALIAS_LC: RefCell<std::collections::HashSet<String>>
|
||||
= RefCell::new(std::collections::HashSet::new());
|
||||
}
|
||||
|
||||
/// Populate the per-file DFS-index map from a preorder walk of the
|
||||
|
|
@ -148,11 +145,8 @@ fn fn_dfs_index(start_byte: usize) -> Option<u32> {
|
|||
FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied())
|
||||
}
|
||||
|
||||
/// Synthetic name for an anonymous function. Uses the DFS index when
|
||||
/// available (`<anon#N>`), falls back to the byte offset when the map
|
||||
/// is empty (e.g. during tests that bypass `build_cfg`). The `#`
|
||||
/// sigil is intentionally different from `@` so the two formats are
|
||||
/// distinguishable by downstream consumers.
|
||||
/// Synthetic name for an anonymous function: `<anon#N>` from the DFS
|
||||
/// index when available, `<anon@OFFSET>` as fallback.
|
||||
pub(crate) fn anon_fn_name(start_byte: usize) -> String {
|
||||
match fn_dfs_index(start_byte) {
|
||||
Some(idx) => format!("<anon#{idx}>"),
|
||||
|
|
@ -160,9 +154,7 @@ pub(crate) fn anon_fn_name(start_byte: usize) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// Prefix check that accepts both the new `<anon#...>` and legacy
|
||||
/// `<anon@...>` formats. Used by code paths that classify whether a
|
||||
/// function name came from anonymous synthesis.
|
||||
/// Reports whether `name` starts with one of the prefixes used for
/// synthesized anonymous-function names: `<anon#` or `<anon@`.
pub(crate) fn is_anon_fn_name(name: &str) -> bool {
    // Both prefix spellings are accepted; a match on either is enough.
    ["<anon#", "<anon@"]
        .iter()
        .any(|prefix| name.starts_with(*prefix))
}
|
||||
|
|
@ -235,9 +227,9 @@ pub struct CallMeta {
|
|||
///
|
||||
/// CFG construction does NOT populate this field today (callee already
|
||||
/// carries the full path). It is the canonical place to read the original
|
||||
/// textual callee for **debug/display only** — analysis code should walk
|
||||
/// SSA `FieldProj` receivers (Phase 4) or use the
|
||||
/// [`crate::labels::bare_method_name`] textual fallback (Phase 5).
|
||||
/// textual callee for **debug/display only**, analysis code should walk
|
||||
/// SSA `FieldProj` receivers or use the
|
||||
/// [`crate::labels::bare_method_name`] textual fallback.
|
||||
#[doc(hidden)]
|
||||
#[serde(default)]
|
||||
pub callee_text: Option<String>,
|
||||
|
|
@ -248,14 +240,14 @@ pub struct CallMeta {
|
|||
pub outer_callee: Option<String>,
|
||||
/// Byte span of the inner call that supplied the classification, when
|
||||
/// `find_classifiable_inner_call` overrode the outer callee. `None` when
|
||||
/// the classification came from the outer AST node directly — in that
|
||||
/// the classification came from the outer AST node directly, in that
|
||||
/// case `AstMeta.span` already points at the classified expression.
|
||||
///
|
||||
/// Consumers that want the location of the *labeled* call (sink/source/
|
||||
/// sanitizer display, flow-step rendering, taint origin attribution)
|
||||
/// should use [`NodeInfo::classification_span`] rather than reading this
|
||||
/// field directly. `AstMeta.span` remains the authoritative "whole
|
||||
/// statement" span — used by structural passes (unreachability,
|
||||
/// statement" span, used by structural passes (unreachability,
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup).
|
||||
#[serde(default)]
|
||||
pub callee_span: Option<(usize, usize)>,
|
||||
|
|
@ -283,7 +275,7 @@ pub struct CallMeta {
|
|||
/// only positional arguments.
|
||||
pub kwargs: Vec<(String, Vec<String>)>,
|
||||
/// String-literal value at each positional argument of this call, parallel
|
||||
/// to `arg_uses` — `Some(s)` when the argument is a syntactic string
|
||||
/// to `arg_uses`, `Some(s)` when the argument is a syntactic string
|
||||
/// literal, `None` otherwise. Empty for non-call nodes or when positional
|
||||
/// boundaries can't be determined. Consumed by the static-map abstract
|
||||
/// analysis (and future literal-aware passes) so they don't need the
|
||||
|
|
@ -302,10 +294,41 @@ pub struct CallMeta {
|
|||
///
|
||||
/// Takes priority over `sink_payload_args` in the SSA sink scan: when a
|
||||
/// call has an object-literal destination arg, only idents under the
|
||||
/// listed fields may contribute sink findings — not every ident in the
|
||||
/// listed fields may contribute sink findings, not every ident in the
|
||||
/// positional slot.
|
||||
///
|
||||
/// Legacy single-gate path: populated only when this call site matched
|
||||
/// exactly one gate. When a callee carries multiple gates (e.g. `fetch`
|
||||
/// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in
|
||||
/// [`Self::gate_filters`] and this field is left `None`.
|
||||
#[serde(default)]
|
||||
pub destination_uses: Option<Vec<String>>,
|
||||
/// Per-gate filters for callees that carry multiple gated-sink rules.
|
||||
///
|
||||
/// Each entry preserves one matching gate's `(label_caps, payload_args,
|
||||
/// destination_uses)` so the SSA sink scan can attribute findings
|
||||
/// per-cap. Empty when the call site matches zero or exactly one gate
|
||||
/// (the single-gate case continues to use [`Self::sink_payload_args`] +
|
||||
/// [`Self::destination_uses`]).
|
||||
#[serde(default)]
|
||||
pub gate_filters: Vec<GateFilter>,
|
||||
}
|
||||
|
||||
/// One gate's contribution at a call site whose callee matches multiple
|
||||
/// gates. The SSA taint engine processes each filter independently so a
|
||||
/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF
|
||||
/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each
|
||||
/// carrying its own cap mask rather than a conflated union.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct GateFilter {
|
||||
/// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`).
|
||||
pub label_caps: crate::labels::Cap,
|
||||
/// Argument positions that carry the tainted payload for this gate.
|
||||
pub payload_args: Vec<usize>,
|
||||
/// Destination-aware filter: when `Some(names)`, the sink check only
|
||||
/// considers SSA values whose `var_name` matches one of `names` (object-
|
||||
/// literal destination fields lifted at CFG time). `None` means no field filter: the whole argument is considered.
|
||||
pub destination_uses: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
/// Taint-classification and variable-flow metadata.
|
||||
|
|
@ -349,7 +372,7 @@ pub struct NodeInfo {
|
|||
///
|
||||
/// This flag is scoped to taint-style sink suppression: it indicates
|
||||
/// that no attacker-controlled data enters through the immediate
|
||||
/// arguments. It does NOT mean the call is "safe" in general — other
|
||||
/// arguments. It does NOT mean the call is "safe" in general, other
|
||||
/// detectors (resource lifecycle, structural analysis) may still
|
||||
/// legitimately flag these calls.
|
||||
pub all_args_literal: bool,
|
||||
|
|
@ -411,7 +434,7 @@ pub struct NodeInfo {
|
|||
pub is_eq_with_const: bool,
|
||||
/// True when this node reads a numeric-length property on a container:
|
||||
/// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()`
|
||||
/// — either as a pure property access or as a zero-arg method call.
|
||||
/// (either as a pure property access or as a zero-arg method call).
|
||||
/// Populated by inspecting the AST in `push_node` across JS/TS, Python,
|
||||
/// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return
|
||||
/// an integer. Consumed by the type-fact analysis (`ssa::type_facts`)
|
||||
|
|
@ -419,12 +442,12 @@ pub struct NodeInfo {
|
|||
/// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric
|
||||
/// payloads.
|
||||
pub is_numeric_length_access: bool,
|
||||
/// Phase 6.3: the field name read on the RHS of an assignment whose
|
||||
/// The field name read on the RHS of an assignment whose
|
||||
/// RHS is a single member-access expression (e.g. `let x = dto.email`).
|
||||
/// Set to `Some("email")` for that shape; left `None` otherwise.
|
||||
/// Consumed by the type-fact analysis (`ssa::type_facts`) so reads
|
||||
/// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick
|
||||
/// up the field's declared `TypeKind`. Strictly additive — when
|
||||
/// up the field's declared `TypeKind`. Strictly additive, when
|
||||
/// `None`, the legacy copy-prop semantics apply.
|
||||
pub member_field: Option<String>,
|
||||
}
|
||||
|
|
@ -442,7 +465,7 @@ impl NodeInfo {
|
|||
/// lines, flow-step rendering, symbolic witness extraction, debug views.
|
||||
///
|
||||
/// Use `ast.span` directly for **structural grain**: unreachability,
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup — anywhere
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere
|
||||
/// the enclosing statement is the meaningful unit.
|
||||
#[inline]
|
||||
pub fn classification_span(&self) -> (usize, usize) {
|
||||
|
|
@ -514,7 +537,7 @@ pub struct BodyMeta {
|
|||
/// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from
|
||||
/// decorators / annotations / static type text at CFG construction
|
||||
/// time. Same length as `params`; positions with no recoverable
|
||||
/// type info are `None`. Strictly additive — when every entry is
|
||||
/// type info are `None`. Strictly additive, when every entry is
|
||||
/// `None`, downstream behaviour is identical to the pre-Phase-1
|
||||
/// engine.
|
||||
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
|
||||
|
|
@ -528,7 +551,7 @@ pub struct BodyMeta {
|
|||
/// `LocalFuncSummary`. `None` for the synthetic top-level body.
|
||||
///
|
||||
/// All intra-file maps keyed on function identity (SSA summaries, callee
|
||||
/// bodies, inline cache, callback bindings) use this key — never the bare
|
||||
/// bodies, inline cache, callback bindings) use this key, never the bare
|
||||
/// leaf `name`, which is collision-prone across (container, arity,
|
||||
/// disambig, kind).
|
||||
pub func_key: Option<FuncKey>,
|
||||
|
|
@ -589,7 +612,7 @@ pub struct FileCfg {
|
|||
/// Promisify wrapper aliases: local name → wrapped callee name.
|
||||
/// Only populated for JS/TS files.
|
||||
pub promisify_aliases: PromisifyAliases,
|
||||
/// Phase 6: per-file class / trait / interface hierarchy edges.
|
||||
/// Per-file class / trait / interface hierarchy edges.
|
||||
/// Each entry is `(sub_container, super_container)` after
|
||||
/// language-specific normalisation. See
|
||||
/// [`crate::cfg::hierarchy`] for the per-language extraction
|
||||
|
|
@ -711,14 +734,10 @@ fn extract_condition_raw<'a>(
|
|||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
|
||||
// 4. Extract text, truncated.
|
||||
let text = text_of(cond, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
// 4. Extract text, truncated on a UTF-8 char boundary: non-ASCII regex
|
||||
// literals in gogs (Gurmukhi) / discourse (Cyrillic) broke raw byte slicing.
|
||||
let text = text_of(cond, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
|
||||
(text, vars, negated)
|
||||
}
|
||||
|
|
@ -739,7 +758,7 @@ pub(super) fn detect_negation<'a>(
|
|||
_if_ast: Node<'a>,
|
||||
_lang: &str,
|
||||
) -> (Node<'a>, bool) {
|
||||
// Unwrap parenthesized_expression — JS/Java/PHP wrap if-conditions in parens.
|
||||
// Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens.
|
||||
// This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`.
|
||||
let cond = if cond.kind() == "parenthesized_expression" {
|
||||
cond.child_by_field_name("expression")
|
||||
|
|
@ -811,7 +830,7 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
|
|||
"*" => Some(BinOp::Mul),
|
||||
"/" => Some(BinOp::Div),
|
||||
"%" => Some(BinOp::Mod),
|
||||
// Bitwise (single-char tokens — no conflict with && / ||)
|
||||
// Bitwise (single-char tokens, no conflict with && / ||)
|
||||
"&" => Some(BinOp::BitAnd),
|
||||
"|" => Some(BinOp::BitOr),
|
||||
"^" => Some(BinOp::BitXor),
|
||||
|
|
@ -909,7 +928,7 @@ fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option<String>
|
|||
/// `extract_template_prefix` for both assignment RHS and call arguments.
|
||||
///
|
||||
/// Also descends through `await` / `yield` wrappers and into the first
|
||||
/// argument of a call expression — this covers the common sink shape
|
||||
/// argument of a call expression, this covers the common sink shape
|
||||
/// `await axios.get(\`https://host/…${x}\`)` where the template literal lives
|
||||
/// inside a call inside an `await` wrapper.
|
||||
fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
||||
|
|
@ -930,7 +949,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
}
|
||||
"call_expression" | "call" | "new_expression" => {
|
||||
// Descend into the first positional argument (e.g.
|
||||
// `axios.get(\`https://…${x}\`)` — the URL we want to lock
|
||||
// `axios.get(\`https://…${x}\`)`, the URL we want to lock
|
||||
// is the template-literal first argument of the call).
|
||||
let args = cur
|
||||
.child_by_field_name("arguments")
|
||||
|
|
@ -942,7 +961,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
}
|
||||
}
|
||||
|
||||
// Case 1: template literal — `\`scheme://host/…${x}…\``.
|
||||
// Case 1: template literal, `\`scheme://host/…${x}…\``.
|
||||
if cur.kind() == "template_string" {
|
||||
let mut w = cur.walk();
|
||||
let first_child = cur.named_children(&mut w).next()?;
|
||||
|
|
@ -957,7 +976,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
return None;
|
||||
}
|
||||
|
||||
// Case 2: `"scheme://host/" + x` — LHS is a string literal.
|
||||
// Case 2: `"scheme://host/" + x`, LHS is a string literal.
|
||||
if cur.kind() == "binary_expression" {
|
||||
let mut w2 = cur.walk();
|
||||
let mut ops = cur.children(&mut w2).filter(|c| !c.is_named());
|
||||
|
|
@ -1028,7 +1047,7 @@ fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option<i64> {
|
|||
}
|
||||
}
|
||||
|
||||
// Try left, then right — one of them should be a literal
|
||||
// Try left, then right; one of them should be a literal
|
||||
try_parse_number(left, code).or_else(|| try_parse_number(right, code))
|
||||
}
|
||||
|
||||
|
|
@ -1067,7 +1086,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
|
|||
.named_child(0)
|
||||
.is_some_and(|c| is_boolean_eq_const_tree(c, lang)),
|
||||
"unary_expression" | "not_operator" => {
|
||||
// `!` / `not` — operator is an anonymous child; operand is the
|
||||
// `!` / `not`: the operator is an anonymous child; operand is the
|
||||
// single named child.
|
||||
let mut w = node.walk();
|
||||
let mut op_is_not = false;
|
||||
|
|
@ -1084,7 +1103,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
|
|||
.is_some_and(|c| is_boolean_eq_const_tree(c, lang))
|
||||
}
|
||||
"boolean_operator" => {
|
||||
// Python `and`/`or` — operands are named children.
|
||||
// Python `and`/`or`: operands are named children.
|
||||
let l = node.named_child(0);
|
||||
let r = node.named_child(1);
|
||||
l.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
|
||||
|
|
@ -1137,9 +1156,9 @@ fn binary_operator_token(node: Node) -> Option<String> {
|
|||
/// Property names whose value is provably an integer across the supported
|
||||
/// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size`
|
||||
/// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count`
|
||||
/// (`str.count`, `list.count`, `tuple.count` — all return int); Ruby `.length`
|
||||
/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length`
|
||||
/// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This
|
||||
/// list is intentionally narrow — only properties whose semantics across every
|
||||
/// list is intentionally narrow, only properties whose semantics across every
|
||||
/// host we scan return an integer, so the `TypeKind::Int` fact is sound.
|
||||
fn is_numeric_length_property(name: &str) -> bool {
|
||||
matches!(name, "length" | "size" | "byteLength" | "count" | "len")
|
||||
|
|
@ -1157,7 +1176,7 @@ fn is_numeric_length_property(name: &str) -> bool {
|
|||
/// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to
|
||||
/// infer `TypeKind::Int` on the defined value so sink-cap suppression can
|
||||
/// treat `"row " + arr.length` as a non-injectable payload.
|
||||
/// Phase 6.3: when the RHS of an assignment / declaration is a single
|
||||
/// When the RHS of an assignment / declaration is a single
|
||||
/// member-access expression (`let x = dto.email`, `x = obj.field`,
|
||||
/// `let x = obj["field"]`), return the property name. The CFG type-fact
|
||||
/// analysis uses the recovered name to look up the field's declared
|
||||
|
|
@ -1321,7 +1340,7 @@ fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
|||
|
||||
// Check if ast itself is a binary expression
|
||||
if is_binary_expr_kind(ast_kind, lang) {
|
||||
// Verify it has exactly 2 named children (left, right) — no nesting
|
||||
// Verify it has exactly 2 named children (left, right), no nesting
|
||||
let named_count = ast.named_child_count();
|
||||
if named_count == 2 {
|
||||
// Ensure neither child is itself a binary expression (that would
|
||||
|
|
@ -1435,7 +1454,7 @@ pub(super) fn push_node<'a>(
|
|||
// (e.g. PHP `object_creation_expression` has positional children).
|
||||
.or_else(|| find_constructor_type_child(ast))
|
||||
.and_then(|n| {
|
||||
// IIFE: `(function(x){...})(arg)` — the called expression is a
|
||||
// IIFE: `(function(x){...})(arg)`, the called expression is a
|
||||
// function literal with no identifier. Bind the call to the
|
||||
// anonymous body's synthetic name so resolve_callee can find
|
||||
// the extracted BodyCfg/summary. Without this, text_of() would
|
||||
|
|
@ -1512,7 +1531,7 @@ pub(super) fn push_node<'a>(
|
|||
// If this is a declaration/expression wrapper or an assignment that
|
||||
// *contains* a call, prefer the first inner call identifier instead of
|
||||
// the whole line. Track the inner call's byte span so we can populate
|
||||
// `CallMeta.callee_span` once the labels settle — enabling narrow
|
||||
// `CallMeta.callee_span` once the labels settle, enabling narrow
|
||||
// source-location reporting when the classified call lives several lines
|
||||
// below the enclosing statement (e.g. call inside a multi-line template
|
||||
// literal).
|
||||
|
|
@ -1546,9 +1565,9 @@ pub(super) fn push_node<'a>(
|
|||
let mut labels = classify_all(lang, &text, extra);
|
||||
|
||||
// If the outermost call didn't classify, try inner/nested calls.
|
||||
// E.g. `str(eval(expr))` — `str` is not a sink, but `eval` is.
|
||||
// E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is.
|
||||
// When the callee is overridden, save the original for container ops
|
||||
// (e.g. `parts.add(req.getParameter(...))` — callee becomes
|
||||
// (e.g. `parts.add(req.getParameter(...))`, callee becomes
|
||||
// "req.getParameter" but outer_callee preserves "parts.add").
|
||||
let mut outer_callee: Option<String> = None;
|
||||
let mut inner_callee_span: Option<(usize, usize)> = None;
|
||||
|
|
@ -1568,7 +1587,7 @@ pub(super) fn push_node<'a>(
|
|||
|
||||
// For assignments like `element.innerHTML = value`, the inner-call heuristic
|
||||
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
|
||||
// If that didn't produce a label, check the LHS property name — it may be a
|
||||
// If that didn't produce a label, check the LHS property name, it may be a
|
||||
// sink like `innerHTML`.
|
||||
//
|
||||
// This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper`
|
||||
|
|
@ -1588,7 +1607,7 @@ pub(super) fn push_node<'a>(
|
|||
if let Some(assign) = assign_node
|
||||
&& let Some(lhs) = assign.child_by_field_name("left")
|
||||
{
|
||||
// Try full member expression first (e.g. "location.href") — more
|
||||
// Try full member expression first (e.g. "location.href"), more
|
||||
// specific and avoids false positives on `a.href`.
|
||||
if let Some(full) = member_expr_text(lhs, code) {
|
||||
if let Some(l) = classify(lang, &full, extra) {
|
||||
|
|
@ -1612,7 +1631,7 @@ pub(super) fn push_node<'a>(
|
|||
// try to classify the member expression text as a source.
|
||||
// This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python),
|
||||
// and similar property-access-based source patterns.
|
||||
// Skip when the assignment's RHS is itself a function/lambda literal —
|
||||
// Skip when the assignment's RHS is itself a function/lambda literal;
|
||||
// labels found by `first_member_label` would come from inside the
|
||||
// closure body and shouldn't tag the outer wrapper (e.g. Go's
|
||||
// `run := func() { exec.Command(...) }` would otherwise inherit
|
||||
|
|
@ -1687,7 +1706,7 @@ pub(super) fn push_node<'a>(
|
|||
if labels.is_empty()
|
||||
&& let Some(outer) = call_ast
|
||||
&& let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code)
|
||||
&& classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_some()
|
||||
&& !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty()
|
||||
{
|
||||
call_ast = Some(inner);
|
||||
outer_callee = Some(text.clone());
|
||||
|
|
@ -1707,13 +1726,14 @@ pub(super) fn push_node<'a>(
|
|||
// the outer statement `text`, so gate matcher names like `"fetch"` hit.
|
||||
let mut sink_payload_args: Option<Vec<usize>> = None;
|
||||
let mut destination_uses: Option<Vec<String>> = None;
|
||||
let mut gate_filters: Vec<GateFilter> = Vec::new();
|
||||
if labels.is_empty() {
|
||||
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
|
||||
if let Some(cn) = gate_call {
|
||||
let gate_callee_text = if call_ast.is_some() {
|
||||
text.clone()
|
||||
} else {
|
||||
// Inner call reached via wrapper — use the call-expression's
|
||||
// Inner call reached via wrapper, use the call-expression's
|
||||
// function name directly. Falls back to `text` so non-call-
|
||||
// expression kinds (method calls, Ruby `call` nodes, macros)
|
||||
// still have a usable callee string.
|
||||
|
|
@ -1723,51 +1743,84 @@ pub(super) fn push_node<'a>(
|
|||
.and_then(|f| text_of(f, code))
|
||||
.unwrap_or_else(|| text.clone())
|
||||
};
|
||||
if let Some(gm) = classify_gated_sink(
|
||||
let matches = classify_gated_sink(
|
||||
lang,
|
||||
&gate_callee_text,
|
||||
|idx| extract_const_string_arg(cn, idx, code),
|
||||
|kw| extract_const_keyword_arg(cn, kw, code),
|
||||
|kw| has_keyword_arg(cn, kw, code),
|
||||
) {
|
||||
labels.push(gm.label);
|
||||
let payload = gm.payload_args;
|
||||
if payload == crate::labels::ALL_ARGS_PAYLOAD {
|
||||
// Dynamic-activation sentinel: every positional arg is
|
||||
// conservatively a payload. Expand using the actual call
|
||||
// arity so `collect_tainted_sink_values` checks each one.
|
||||
let arity = extract_arg_uses(cn, code).len();
|
||||
if arity > 0 {
|
||||
sink_payload_args = Some((0..arity).collect());
|
||||
}
|
||||
} else if !payload.is_empty() {
|
||||
sink_payload_args = Some(payload.to_vec());
|
||||
}
|
||||
);
|
||||
|
||||
// Destination-aware gates (outbound HTTP clients): when the
|
||||
// gate declares destination-bearing object fields and the
|
||||
// positional destination arg at call time is an object
|
||||
// literal, narrow sink-taint checks to identifiers under
|
||||
// those fields. Non-object arg forms (string / ident /
|
||||
// expression) return `None` from the extractor and fall
|
||||
// through to whole-arg positional filtering.
|
||||
//
|
||||
// We only populate destination_uses for the FIRST payload
|
||||
// position that is an object literal. For outbound HTTP
|
||||
// gates `payload_args` is always a single position (arg 0)
|
||||
// so this is exact.
|
||||
if !gm.object_destination_fields.is_empty() {
|
||||
for &pos in gm.payload_args {
|
||||
if let Some(names) = extract_destination_field_idents(
|
||||
cn,
|
||||
pos,
|
||||
gm.object_destination_fields,
|
||||
code,
|
||||
) {
|
||||
destination_uses = Some(names);
|
||||
break;
|
||||
if !matches.is_empty() {
|
||||
// Per-gate filter accumulation. Each match contributes:
|
||||
// * its label (added to `labels` so `resolve_sink_caps`
|
||||
// downstream sees the union),
|
||||
// * a `GateFilter` carrying that gate's specific
|
||||
// `(label_caps, payload_args, destination_uses)` so
|
||||
// the SSA sink scan can attribute taint per-cap.
|
||||
let mut union_payload: Vec<usize> = Vec::new();
|
||||
for gm in &matches {
|
||||
labels.push(gm.label);
|
||||
|
||||
let payload_vec: Vec<usize> =
|
||||
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
|
||||
// Dynamic-activation sentinel: every positional arg is
|
||||
// conservatively a payload. Expand using the actual
|
||||
// call arity so `collect_tainted_sink_values` checks
|
||||
// each one.
|
||||
let arity = extract_arg_uses(cn, code).len();
|
||||
(0..arity).collect()
|
||||
} else {
|
||||
gm.payload_args.to_vec()
|
||||
};
|
||||
|
||||
// Destination-aware gates: when the gate declares
|
||||
// destination-bearing object fields and a payload-position
|
||||
// arg is an object literal at call time, narrow sink-taint
|
||||
// checks to identifiers under those fields. Non-object
|
||||
// arg forms return `None` from the extractor and the gate
|
||||
// falls back to whole-arg positional filtering.
|
||||
let mut dest_uses: Option<Vec<String>> = None;
|
||||
if !gm.object_destination_fields.is_empty() {
|
||||
for &pos in gm.payload_args {
|
||||
if let Some(names) = extract_destination_field_idents(
|
||||
cn,
|
||||
pos,
|
||||
gm.object_destination_fields,
|
||||
code,
|
||||
) {
|
||||
dest_uses = Some(names);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let label_caps = match gm.label {
|
||||
crate::labels::DataLabel::Sink(c) => c,
|
||||
_ => crate::labels::Cap::empty(),
|
||||
};
|
||||
|
||||
for &p in &payload_vec {
|
||||
if !union_payload.contains(&p) {
|
||||
union_payload.push(p);
|
||||
}
|
||||
}
|
||||
gate_filters.push(GateFilter {
|
||||
label_caps,
|
||||
payload_args: payload_vec,
|
||||
destination_uses: dest_uses,
|
||||
});
|
||||
}
|
||||
if !union_payload.is_empty() {
|
||||
sink_payload_args = Some(union_payload);
|
||||
}
|
||||
// Legacy single-gate path keeps `destination_uses` populated so
|
||||
// the SSA fast-path (one filter) continues to work without
|
||||
// consulting `gate_filters`. When multiple gates match,
|
||||
// per-position filters live in `gate_filters` and the legacy
|
||||
// field is intentionally left `None`.
|
||||
if gate_filters.len() == 1 {
|
||||
destination_uses = gate_filters[0].destination_uses.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1778,7 +1831,7 @@ pub(super) fn push_node<'a>(
|
|||
// path-traversal or HTML metacharacters. The CFG collapses the whole
|
||||
// chain into a single call node, so detection must inspect the AST of
|
||||
// that node directly. Only fires when no Sanitizer label already
|
||||
// classifies this node — existing label rules win.
|
||||
// classifies this node, existing label rules win.
|
||||
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
|
||||
if let Some(cn) = call_ast {
|
||||
if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" {
|
||||
|
|
@ -1815,7 +1868,7 @@ pub(super) fn push_node<'a>(
|
|||
// `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation
|
||||
// form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same
|
||||
// methods are intrinsically parameterised when arg 0 is a hash, symbol,
|
||||
// array, or non-interpolated string — Rails escapes the values. Rather
|
||||
// array, or non-interpolated string, Rails escapes the values. Rather
|
||||
// than dropping the sink (which would lose the genuine TPs), synthesise
|
||||
// a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears
|
||||
// SQL taint at the call and reflexively dominates the sink, suppressing
|
||||
|
|
@ -1825,7 +1878,7 @@ pub(super) fn push_node<'a>(
|
|||
// Chained calls (`Model.where(...).preload(...).to_a`) collapse into a
|
||||
// single CFG node whose outer `call_ast` may be `to_a` (no args). The
|
||||
// shape inspection has to walk the receiver chain to reach the AR query
|
||||
// call itself — `ruby_chain_arg0_for_method` does that walk.
|
||||
// call itself, `ruby_chain_arg0_for_method` does that walk.
|
||||
if (lang == "ruby" || lang == "rb")
|
||||
&& labels
|
||||
.iter()
|
||||
|
|
@ -1859,7 +1912,7 @@ pub(super) fn push_node<'a>(
|
|||
// and `Statement.executeQuery(String)` overloads are real injection
|
||||
// sinks when given a concatenated SQL string. But the same method
|
||||
// names on JPA `javax.persistence.Query` and JDBC `PreparedStatement`
|
||||
// are zero-arg — they execute SQL that was bound upstream by
|
||||
// are zero-arg, they execute SQL that was bound upstream by
|
||||
// `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`,
|
||||
// and any bind values went through `setParameter` / `setString`
|
||||
// (which the JDBC/JPA driver escapes). Walk the receiver chain to
|
||||
|
|
@ -1894,7 +1947,7 @@ pub(super) fn push_node<'a>(
|
|||
// (`createQuery` / `createNativeQuery` / `prepareStatement`)
|
||||
// and require its arg 0 to be a string literal. Anything
|
||||
// else (binary concat, identifier, method call) leaves
|
||||
// the sink in place — we cannot prove the SQL is
|
||||
// the sink in place, we cannot prove the SQL is
|
||||
// parameterised, so the structural finding stands.
|
||||
const JPA_BIND_METHODS: &[&str] = &[
|
||||
"createQuery",
|
||||
|
|
@ -1914,6 +1967,89 @@ pub(super) fn push_node<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Shape-based sanitizer synthesis for JS/TS ORM-accessor chains.
|
||||
// The static label table marks `db.query` / `connection.query` /
|
||||
// `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)`
|
||||
// because the bare `connection.query("SELECT ..." + name)` form is a
|
||||
// real SQLi sink. But the same `db.query` method on Strapi-style ORMs
|
||||
// takes a model UID literal and returns a chainable model accessor:
|
||||
// `strapi.db.query('admin::api-token').findOne({ where: whereParams })`.
|
||||
// The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)`
|
||||
// calls are intrinsically parameterised, the actual SQL is generated
|
||||
// by the ORM, and the per-call values arrive through field-keyed object
|
||||
// literals that the ORM driver escapes.
|
||||
//
|
||||
// Recognition rule: when the CFG node's classified text reaches a sink
|
||||
// with `SQL_QUERY` cap, walk the receiver chain looking for an inner
|
||||
// `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal
|
||||
// and whose result has at least one chained method call appended whose
|
||||
// name is in the ORM-accessor whitelist. If both hold, synthesise a
|
||||
// same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare
|
||||
// `connection.query("SELECT ...")` (no chained method) and
|
||||
// `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the
|
||||
// sink in place, both are genuine SQLi shapes.
|
||||
if (lang == "javascript"
|
||||
|| lang == "js"
|
||||
|| lang == "typescript"
|
||||
|| lang == "ts"
|
||||
|| lang == "tsx")
|
||||
&& labels
|
||||
.iter()
|
||||
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
|
||||
&& !labels
|
||||
.iter()
|
||||
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
|
||||
{
|
||||
const QUERY_TARGETS: &[&str] = &["query", "execute"];
|
||||
// ORM-accessor methods that take object-literal args and return
|
||||
// promises of rows / row counts. Promise methods (`then`, `catch`,
|
||||
// `finally`) deliberately excluded, they don't prove ORM shape.
|
||||
const ORM_CHAIN_METHODS: &[&str] = &[
|
||||
"findOne",
|
||||
"findMany",
|
||||
"findFirst",
|
||||
"findUnique",
|
||||
"findById",
|
||||
"find",
|
||||
"create",
|
||||
"createMany",
|
||||
"update",
|
||||
"updateMany",
|
||||
"upsert",
|
||||
"delete",
|
||||
"deleteMany",
|
||||
"count",
|
||||
"aggregate",
|
||||
"distinct",
|
||||
"save",
|
||||
];
|
||||
// Fall back to a deeper walk (up to 4 levels) for await/return-
|
||||
// wrapped calls (e.g. `const x = await db.query(...).findOne(...)`, where the
|
||||
// call sits at depth 3 inside lexical_declaration > variable_declarator
|
||||
// > await_expression > call_expression).
|
||||
let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
|
||||
if let Some(call_node) = chain_call {
|
||||
// Outer method must be in the ORM whitelist *and* the chain must
|
||||
// have a deeper inner call to a `query`/`execute` whose arg 0 is
|
||||
// a string literal. Both checks gate the synthesis.
|
||||
let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code);
|
||||
let outer_is_orm = outer_method
|
||||
.as_deref()
|
||||
.is_some_and(|m| ORM_CHAIN_METHODS.contains(&m));
|
||||
if outer_is_orm
|
||||
&& let Some((arg0_kind, has_interp)) =
|
||||
js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code)
|
||||
&& !has_interp
|
||||
&& matches!(
|
||||
arg0_kind.as_str(),
|
||||
"string" | "string_fragment" | "template_string"
|
||||
)
|
||||
{
|
||||
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let span = (ast.start_byte(), ast.end_byte());
|
||||
|
||||
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
|
||||
|
|
@ -2036,7 +2172,7 @@ pub(super) fn push_node<'a>(
|
|||
// (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`).
|
||||
//
|
||||
// Two cases:
|
||||
// 1. Kind::CallMethod — native method call AST (Java method_invocation,
|
||||
// 1. Kind::CallMethod: native method call AST (Java method_invocation,
|
||||
// Rust method_call_expression, Ruby call, PHP member_call_expression).
|
||||
// Receiver is exposed via "object"/"receiver"/"scope" field on the call.
|
||||
// 2. Kind::CallFn whose function child is a member_expression (JS/TS) or
|
||||
|
|
@ -2065,7 +2201,7 @@ pub(super) fn push_node<'a>(
|
|||
// value, which is what type-qualified resolution
|
||||
// anchors on. Falls back to `root_receiver_text` (which
|
||||
// returns raw text like "conn.execute") only if drilling
|
||||
// fails — preserving prior behavior for types we can't
|
||||
// fails, preserving prior behavior for types we can't
|
||||
// structurally reduce.
|
||||
root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code))
|
||||
} else {
|
||||
|
|
@ -2076,7 +2212,7 @@ pub(super) fn push_node<'a>(
|
|||
// JS/TS `obj.method(x)`: call_expression.function = member_expression.
|
||||
// Python `obj.method(x)`: call.function = attribute.
|
||||
// Rust `obj.method(x)`: call_expression.function = field_expression
|
||||
// (field on `value`, not `object` — value can be another call
|
||||
// (field on `value`, not `object`, value can be another call
|
||||
// for chained forms like `Connection::open(p).unwrap().execute(...)`).
|
||||
// Pull the receiver from the object/attribute-owner field.
|
||||
let func_child = cn.child_by_field_name("function");
|
||||
|
|
@ -2139,7 +2275,7 @@ pub(super) fn push_node<'a>(
|
|||
// Python `with` and Java try-with-resources.
|
||||
let is_raii_managed = is_raii_factory(lang, &text);
|
||||
|
||||
// Ruby block form auto-close: `File.open(path) { |f| f.read }` —
|
||||
// Ruby block form auto-close: `File.open(path) { |f| f.read }`;
|
||||
// the block parameter receives the resource and Ruby guarantees close
|
||||
// at block exit. If assigned (`f = File.open(p) { ... }`), the
|
||||
// variable holds the block's return value, not an open resource.
|
||||
|
|
@ -2156,7 +2292,7 @@ pub(super) fn push_node<'a>(
|
|||
// Prefer the span of the call found by `find_classifiable_inner_call`
|
||||
// (deeper, classification-driven) over the one from `first_call_ident`
|
||||
// (shallower, text-override-driven). Only record `callee_span` when it
|
||||
// actually narrows against `ast.span` — storing a redundant copy would
|
||||
// actually narrows against `ast.span`, storing a redundant copy would
|
||||
// just bloat every labeled Call node.
|
||||
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
|
||||
|
||||
|
|
@ -2174,6 +2310,7 @@ pub(super) fn push_node<'a>(
|
|||
kwargs,
|
||||
arg_string_literals,
|
||||
destination_uses,
|
||||
gate_filters,
|
||||
},
|
||||
taint: TaintMeta {
|
||||
labels,
|
||||
|
|
@ -2228,7 +2365,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
|
|||
/// Pre-emit dedicated Source CFG nodes for call arguments that contain source
|
||||
/// member expressions.
|
||||
///
|
||||
/// **Two-step API** — Source nodes must be created *before* the Call node so
|
||||
/// **Two-step API**: Source nodes must be created *before* the Call node so
|
||||
/// they receive lower graph indices. This is critical because the If handler
|
||||
/// uses `NodeIndex::new(g.node_count())` to capture the first node built in a
|
||||
/// branch and wires a True/False edge to it. If the Source node has a lower
|
||||
|
|
@ -2239,7 +2376,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
|
|||
/// the branch body.
|
||||
///
|
||||
/// True when `ast` is an assignment / declaration whose RHS is a
|
||||
/// function or lambda literal — i.e. shapes like
|
||||
/// function or lambda literal, i.e. shapes like
|
||||
/// * Go `run := func() { ... }`
|
||||
/// * JS/TS `var run = function() { ... }` / `const run = () => ...`
|
||||
/// * Python `run = lambda x: ...`
|
||||
|
|
@ -2311,7 +2448,7 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
|
|||
false
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: when `ast` is (or wraps) an assignment whose
|
||||
/// When `ast` is (or wraps) an assignment whose
|
||||
/// LHS is a single subscript / index expression with a plain-identifier
|
||||
/// receiver, emit a synthetic `__index_set__` Call node and return its
|
||||
/// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target
|
||||
|
|
@ -2328,7 +2465,7 @@ fn try_lower_subscript_write(
|
|||
enclosing_func: Option<&str>,
|
||||
call_ordinal: &mut u32,
|
||||
) -> Option<NodeIndex> {
|
||||
// Locate the assignment node — `ast` may be the assignment itself
|
||||
// Locate the assignment node, `ast` may be the assignment itself
|
||||
// (Go `assignment_statement`) or a wrapper (`expression_statement`
|
||||
// containing JS `assignment_expression` / Python `assignment`).
|
||||
let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
|
||||
|
|
@ -2383,7 +2520,7 @@ fn try_lower_subscript_write(
|
|||
/// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be
|
||||
/// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`.
|
||||
/// `uses_only_synth_names` carry synth names that should *only* be
|
||||
/// appended to `taint.uses` — used for chain-inner-arg sources where the
|
||||
/// appended to `taint.uses`, used for chain-inner-arg sources where the
|
||||
/// synth value is not a positional argument of the OUTER call but still
|
||||
/// participates in the call's implicit dependency chain (e.g. `r.Body`
|
||||
/// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver).
|
||||
|
|
@ -2446,7 +2583,7 @@ fn pre_emit_arg_source_nodes(
|
|||
for (pos, child) in children.iter().enumerate() {
|
||||
let src_label = first_member_label(*child, lang, code, extra);
|
||||
if let Some(DataLabel::Source(caps)) = src_label {
|
||||
// Use the *current* node count as a unique token — it equals the
|
||||
// Use the *current* node count as a unique token, it equals the
|
||||
// index the new Source node will receive.
|
||||
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos);
|
||||
let member_text = first_member_text(*child, code);
|
||||
|
|
@ -2481,7 +2618,7 @@ fn pre_emit_arg_source_nodes(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: pre-emit `__index_get__` Call nodes for
|
||||
// Pre-emit `__index_get__` Call nodes for
|
||||
// subscript / index-expression args when pointer analysis is
|
||||
// enabled. This lets the W2/W4 container ELEM read hook fire
|
||||
// on the synth call, propagating must/may/caps from the cell
|
||||
|
|
@ -2489,7 +2626,7 @@ fn pre_emit_arg_source_nodes(
|
|||
//
|
||||
// Gated on `pointer::is_enabled()` so the env-var=0 path keeps
|
||||
// CFG shapes bit-identical to today's output. Only fires when
|
||||
// the array operand resolves to a plain identifier — see
|
||||
// the array operand resolves to a plain identifier, see
|
||||
// `subscript_components` for the bail conditions.
|
||||
if pointer_on
|
||||
&& is_subscript_kind(child.kind())
|
||||
|
|
@ -2539,7 +2676,7 @@ fn pre_emit_arg_source_nodes(
|
|||
// Gated to Go and to writeback-shaped outer callees (`Decode` /
|
||||
// `Unmarshal`) because the synth-source emission is only useful when
|
||||
// a downstream writeback consumer reads from the chain's tainted
|
||||
// receiver — broader gating risks emitting synth sources whose taint
|
||||
// receiver, broader gating risks emitting synth sources whose taint
|
||||
// never propagates and whose presence trips Layer B AST-pattern
|
||||
// suppression on unrelated sinks (see
|
||||
// `tests/fixtures/real_world/go/taint/func_literal_capture.go`).
|
||||
|
|
@ -2613,7 +2750,7 @@ fn pre_emit_arg_source_nodes(
|
|||
|
||||
/// Step 2: wire synthetic variable names from pre-emitted Source nodes into
|
||||
/// the Call node's `arg_uses` and `uses`. `uses_only` synth names are
|
||||
/// appended only to `taint.uses` — used for chain-inner-arg sources whose
|
||||
/// appended only to `taint.uses`, used for chain-inner-arg sources whose
|
||||
/// synth value is not a positional outer-call argument.
|
||||
fn apply_arg_source_bindings(
|
||||
g: &mut Cfg,
|
||||
|
|
@ -2724,7 +2861,7 @@ pub(super) fn build_sub<'a>(
|
|||
.unwrap_or(false);
|
||||
|
||||
// Check for negation wrapping the entire condition (e.g. `!(a && b)`)
|
||||
// — if present, skip short-circuit decomposition (De Morgan out of scope).
|
||||
// If present, skip short-circuit decomposition (De Morgan out of scope).
|
||||
let has_short_circuit = has_short_circuit
|
||||
&& cond_subtree.map_or(false, |c| {
|
||||
let unwrapped = unwrap_parens(c);
|
||||
|
|
@ -3424,7 +3561,7 @@ pub(super) fn build_sub<'a>(
|
|||
// When the grammar-level name is anonymous, try to derive a binding
|
||||
// name from the surrounding declaration or assignment. This lets
|
||||
// `var h = function(x){...}` / `this.run = () => {...}` participate
|
||||
// in callback resolution — callers referencing `h` or `run` can
|
||||
// in callback resolution, callers referencing `h` or `run` can
|
||||
// find the body via `resolve_local_func_key` and intra-file calls
|
||||
// like `h()` can resolve to the anonymous body's summary. Without
|
||||
// this, the body is keyed with the synthetic anon name and there
|
||||
|
|
@ -3731,7 +3868,7 @@ pub(super) fn build_sub<'a>(
|
|||
// would lower the return as a plain `StmtKind::Call`, losing
|
||||
// the return semantics and letting fall-through Seq edges
|
||||
// survive into the SSA terminator (the OR-chain rejection-arm
|
||||
// defect — see `or_chain_rejection_block_terminates_with_return`).
|
||||
// defect, see `or_chain_rejection_block_terminates_with_return`).
|
||||
if let Some(inner) = ast.children(&mut cursor).find(|c| {
|
||||
matches!(
|
||||
lookup(lang, c.kind()),
|
||||
|
|
@ -3788,7 +3925,7 @@ pub(super) fn build_sub<'a>(
|
|||
);
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: subscript-write lowering when the
|
||||
// Subscript-write lowering when the
|
||||
// CallWrapper's inner expression is `arr[i] = v` (JS/TS,
|
||||
// Python). See `try_lower_subscript_write` for shape +
|
||||
// bail matrix.
|
||||
|
|
@ -3824,7 +3961,7 @@ pub(super) fn build_sub<'a>(
|
|||
// Pre-emit Source nodes for call arguments containing source
|
||||
// member expressions (e.g. `req.body.returnTo` inside
|
||||
// `res.redirect(req.body.returnTo)`). Created BEFORE the Call
|
||||
// node so they get lower indices — see doc comment on
|
||||
// node so they get lower indices, see doc comment on
|
||||
// `pre_emit_arg_source_nodes` for why this ordering matters.
|
||||
let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call {
|
||||
pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds)
|
||||
|
|
@ -3984,7 +4121,7 @@ pub(super) fn build_sub<'a>(
|
|||
|
||||
// Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`)
|
||||
Kind::Assignment => {
|
||||
// JS/TS ternary-RHS split — same rationale as the CallWrapper branch.
|
||||
// JS/TS ternary-RHS split, same rationale as the CallWrapper branch.
|
||||
if matches!(lang, "javascript" | "typescript" | "tsx")
|
||||
&& let (Some(left), Some(right)) = (
|
||||
ast.child_by_field_name("left"),
|
||||
|
|
@ -4011,7 +4148,7 @@ pub(super) fn build_sub<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: subscript-write lowering. See
|
||||
// Subscript-write lowering. See
|
||||
// `try_lower_subscript_write` for the per-language shape
|
||||
// matrix and bail conditions.
|
||||
if crate::pointer::is_enabled()
|
||||
|
|
@ -4099,12 +4236,19 @@ pub(crate) fn build_cfg<'a>(
|
|||
// function so thread-local state never leaks between files.
|
||||
populate_fn_dfs_indices(tree, lang);
|
||||
|
||||
// Phase 6: harvest DTO class definitions before any param classifier
|
||||
// runs. Empty for languages without a Phase 6 collector. Cleared
|
||||
// Harvest DTO class definitions before any param classifier
|
||||
// runs. Empty for languages without a collector. Cleared
|
||||
// alongside the DFS map at end-of-build_cfg.
|
||||
DTO_CLASSES.with(|cell| {
|
||||
*cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code);
|
||||
});
|
||||
// Harvest same-file `type X = Map<...>` / `Set<...>` / `T[]`
|
||||
// aliases so JS/TS param classifiers resolve `m: ElementsMap`
|
||||
// to `LocalCollection`. Empty for non-JS/TS languages.
|
||||
TYPE_ALIAS_LC.with(|cell| {
|
||||
*cell.borrow_mut() =
|
||||
dto::collect_type_alias_local_collections(tree.root_node(), lang, code);
|
||||
});
|
||||
|
||||
// Create the top-level body graph (BodyId(0)).
|
||||
let (mut g, entry, exit) = create_body_graph(0, code.len(), None);
|
||||
|
|
@ -4143,7 +4287,7 @@ pub(crate) fn build_cfg<'a>(
|
|||
connect_all(&mut g, &[e], exit, EdgeKind::Seq);
|
||||
}
|
||||
|
||||
debug!(target: "cfg", "CFG DONE — top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
|
||||
debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
for idx in g.node_indices() {
|
||||
|
|
@ -4231,10 +4375,11 @@ pub(crate) fn build_cfg<'a>(
|
|||
// Clear the per-file DFS-index map so it does not leak to the next
|
||||
// file built on this thread.
|
||||
clear_fn_dfs_indices();
|
||||
// Phase 6: same hygiene for the DTO map.
|
||||
// Same hygiene for the DTO and type-alias maps.
|
||||
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
|
||||
TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear());
|
||||
|
||||
// Phase 6 (typed call-graph subtype awareness): collect every
|
||||
// Collect every
|
||||
// declared inheritance / impl / implements relationship in the
|
||||
// file. Per-language extractor in `cfg::hierarchy`; empty for
|
||||
// Go and C. Each `(sub, super)` pair gets duplicated onto every
|
||||
|
|
@ -4289,14 +4434,14 @@ fn apply_promisify_labels(
|
|||
/// Build a `CalleeSite` carrying the richer per-call-site metadata for a
|
||||
/// CFG node.
|
||||
///
|
||||
/// * `arity` — positional argument count. `None` when `extract_arg_uses`
|
||||
/// * `arity`, positional argument count. `None` when `extract_arg_uses`
|
||||
/// bailed out on splats/keyword-args (length 0 does not distinguish
|
||||
/// zero-arg calls from unknown; we treat 0 as a concrete zero). The
|
||||
/// receiver is a separate channel via `CallMeta.receiver` and is not
|
||||
/// represented in `arg_uses`, so `arity == arg_uses.len()` for calls.
|
||||
/// * `receiver` — forwarded verbatim from `CallMeta.receiver` (already
|
||||
/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already
|
||||
/// normalized to the root identifier).
|
||||
/// * `qualifier` — the segment(s) before the leaf identifier of the callee.
|
||||
/// * `qualifier`, the segment(s) before the leaf identifier of the callee.
|
||||
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g.
|
||||
/// `"crate::auth::token"` for `crate::auth::token::validate`) so that
|
||||
/// cross-file `use`-map resolution in `callgraph.rs` has everything it
|
||||
|
|
@ -4380,7 +4525,7 @@ pub(crate) fn export_summaries(
|
|||
module_path: None,
|
||||
rust_use_map: None,
|
||||
rust_wildcards: None,
|
||||
// Phase 6 hierarchy edges live on `FileCfg`, not on the
|
||||
// Hierarchy edges live on `FileCfg`, not on the
|
||||
// graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root`
|
||||
// attaches them after this transform returns.
|
||||
hierarchy_edges: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use petgraph::graph::NodeIndex;
|
|||
use smallvec::smallvec;
|
||||
use tree_sitter::Node;
|
||||
|
||||
/// Phase 6.2 — resolve a syntactic class / struct / interface / model
|
||||
/// Resolve a syntactic class / struct / interface / model
|
||||
/// name against the per-file [`DTO_CLASSES`] map populated at the top
|
||||
/// of `build_cfg`. Returns the [`TypeKind::Dto`] carrying the
|
||||
/// per-field type map when the class is declared in the same file;
|
||||
|
|
@ -21,7 +21,7 @@ fn lookup_dto_class(class_name: &str) -> Option<TypeKind> {
|
|||
/// Extract parameter names + per-position [`TypeKind`] from a function
|
||||
/// AST node. Each entry's second slot is `Some(TypeKind)` when the
|
||||
/// parameter's decorator, attribute, or static type annotation maps to
|
||||
/// a known kind, and `None` otherwise. Strictly additive — when no
|
||||
/// a known kind, and `None` otherwise. Strictly additive, when no
|
||||
/// type info is recoverable, behaviour is identical to the names-only
|
||||
/// path.
|
||||
pub(super) fn extract_param_meta<'a>(
|
||||
|
|
@ -109,7 +109,7 @@ pub(super) fn extract_param_meta<'a>(
|
|||
// Python `typed_parameter`, `default_parameter`,
|
||||
// `typed_default_parameter`): the wrapper node has no `name`
|
||||
// field but contains the identifier as a child. Pick the
|
||||
// *first* identifier — that is the parameter name; subsequent
|
||||
// *first* identifier, that is the parameter name; subsequent
|
||||
// identifiers are part of the type annotation or default
|
||||
// expression.
|
||||
if !found {
|
||||
|
|
@ -123,7 +123,7 @@ pub(super) fn extract_param_meta<'a>(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Bare identifier children — e.g. Rust untyped closure params `|cmd|`
|
||||
// Bare identifier children, e.g. Rust untyped closure params `|cmd|`
|
||||
// where the child is an `identifier` node, not a `parameter` wrapper.
|
||||
if child.kind() == "identifier" {
|
||||
if let Some(txt) = text_of(child, code) {
|
||||
|
|
@ -137,8 +137,8 @@ pub(super) fn extract_param_meta<'a>(
|
|||
/// Walk up from a function definition node and build a container path.
|
||||
///
|
||||
/// Records the names of enclosing classes / impls / modules / namespaces /
|
||||
/// structs — and, for anonymous / nested functions, the name of an enclosing
|
||||
/// named function — joined with `::`. Also returns a `FuncKind` guess
|
||||
/// structs, and, for anonymous / nested functions, the name of an enclosing
|
||||
/// named function, joined with `::`. Also returns a `FuncKind` guess
|
||||
/// reflecting the structural role.
|
||||
///
|
||||
/// Returns `(container, kind)`.
|
||||
|
|
@ -185,7 +185,7 @@ pub(super) fn compute_container_and_kind(
|
|||
| "enum_item"
|
||||
| "struct_specifier"
|
||||
| "struct_item" => Some("name"),
|
||||
// Rust impl blocks — pick the type name, not the trait name.
|
||||
// Rust impl blocks, pick the type name, not the trait name.
|
||||
"impl_item" => Some("type"),
|
||||
// Go / C++ / PHP namespaces and modules.
|
||||
"namespace_definition" | "namespace_declaration" | "module_declaration" | "module" => {
|
||||
|
|
@ -223,7 +223,7 @@ pub(super) fn compute_container_and_kind(
|
|||
|| pk == "lambda_expression"
|
||||
|| pk == "function_expression"
|
||||
{
|
||||
// Nested definition — record the outer function's name and
|
||||
// Nested definition, record the outer function's name and
|
||||
// classify self as Closure even if we got a real name.
|
||||
if let Some(name_node) = parent.child_by_field_name("name") {
|
||||
if let Some(text) = text_of(name_node, code) {
|
||||
|
|
@ -428,15 +428,15 @@ pub(super) fn inject_framework_param_sources(
|
|||
/// no recognised pattern matches, returns `None` and the engine
|
||||
/// behaves exactly as before.
|
||||
///
|
||||
/// Recognised patterns (Phase 2):
|
||||
/// * Java (Spring) — `@PathVariable`/`@RequestParam Long X` →
|
||||
/// Recognised patterns:
|
||||
/// * Java (Spring), `@PathVariable`/`@RequestParam Long X` →
|
||||
/// [`TypeKind::Int`]; `@RequestBody T` → object (no kind today).
|
||||
/// * TypeScript (NestJS) — `@Param('id') id: number` →
|
||||
/// * TypeScript (NestJS), `@Param('id') id: number` →
|
||||
/// [`TypeKind::Int`]; `@Body() dto: T` / `@Query('q') q: string`.
|
||||
/// * Rust (Axum / Rocket / Actix) — `Path<i64>` / `Path<u32>` /
|
||||
/// * Rust (Axum / Rocket / Actix), `Path<i64>` / `Path<u32>` /
|
||||
/// `web::Path<i64>` → [`TypeKind::Int`]; `Path<String>` →
|
||||
/// [`TypeKind::String`].
|
||||
/// * Python (FastAPI) — `def h(x: int)` → [`TypeKind::Int`];
|
||||
/// * Python (FastAPI), `def h(x: int)` → [`TypeKind::Int`];
|
||||
/// `Annotated[int, Path()]` → [`TypeKind::Int`].
|
||||
pub(super) fn classify_param_type<'a>(
|
||||
param: Node<'a>,
|
||||
|
|
@ -453,9 +453,9 @@ pub(super) fn classify_param_type<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
/// Java (Spring) — recognise typed-extractor parameters via the
|
||||
/// Java (Spring), recognise typed-extractor parameters via the
|
||||
/// surrounding annotation. Per Hard Rule 3, plain `Long X` without a
|
||||
/// known framework annotation is **not** treated as a typed extractor —
|
||||
/// known framework annotation is **not** treated as a typed extractor;
|
||||
/// the parameter could be a regular function argument that the
|
||||
/// framework never validates. Recognised annotations:
|
||||
/// `@PathVariable`, `@RequestParam`, `@RequestBody`, `@RequestHeader`,
|
||||
|
|
@ -473,7 +473,7 @@ fn classify_param_type_java<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
|
|||
if let Some(k) = java_type_to_kind(&type_text) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: when the static type is a class name we don't classify
|
||||
// When the static type is a class name we don't classify
|
||||
// as a primitive (e.g. `@RequestBody CreateUser dto`), look up the
|
||||
// class in the same-file DTO map. Strip any generics for the
|
||||
// leading type so `Foo<Bar>` still resolves on `Foo`.
|
||||
|
|
@ -527,7 +527,7 @@ fn has_java_framework_annotation(param: Node<'_>, code: &[u8]) -> bool {
|
|||
}
|
||||
|
||||
/// Map a Java type-text fragment to a [`TypeKind`]. Public to the
|
||||
/// `cfg` module so the Phase 6 DTO collector can reuse the same
|
||||
/// `cfg` module so the DTO collector can reuse the same
|
||||
/// classifier for class fields.
|
||||
pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let bare = t.trim().trim_start_matches('@').trim();
|
||||
|
|
@ -546,7 +546,7 @@ pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
|
||||
/// Map a TypeScript type-text fragment (already stripped of leading
|
||||
/// `:` / whitespace) to a primitive [`TypeKind`]. Used by both the
|
||||
/// per-parameter classifier and the Phase 6 DTO collector.
|
||||
/// per-parameter classifier and the DTO collector.
|
||||
pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let head = t.split('<').next().unwrap_or(t).trim();
|
||||
match head {
|
||||
|
|
@ -557,13 +557,35 @@ pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
}
|
||||
|
||||
/// TypeScript (NestJS) — recognise typed-extractor parameters via a
|
||||
/// TypeScript (NestJS), recognise typed-extractor parameters via a
|
||||
/// known NestJS decorator (`@Param`, `@Body`, `@Query`, `@Headers`,
|
||||
/// `@Req`, `@Res`). Per Hard Rule 3, a bare `function h(id: number)`
|
||||
/// is not a framework extractor — without a NestJS decorator no
|
||||
/// is not a framework extractor, without a NestJS decorator no
|
||||
/// runtime gate is implied. Pipe coercions (`ParseIntPipe` /
|
||||
/// `ParseBoolPipe`) override the static type.
|
||||
///
|
||||
/// Exception: parameters annotated as a known JS built-in collection
|
||||
/// type (`Map<...>`, `Set<...>`, `WeakMap<...>`, `WeakSet<...>`,
|
||||
/// `Array<...>` / `T[]` / `ReadonlyArray<...>`) resolve to
|
||||
/// [`TypeKind::LocalCollection`] regardless of decorator presence.
|
||||
/// `LocalCollection` is a *receiver-shape* claim, not a
|
||||
/// framework-validated-input claim, it tells the auth analyser that
|
||||
/// `param.get(k)` / `param.set(k, v)` / `param.find(p)` is a
|
||||
/// container operation rather than a data-layer read/mutation. This
|
||||
/// closes the Excalidraw FP cluster (`elementsMap: ElementsMap`,
|
||||
/// `groupIdMapForOperation: Map<string, string>`) without affecting
|
||||
/// any input-validation reasoning.
|
||||
fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
let type_text = param
|
||||
.child_by_field_name("type")
|
||||
.and_then(|n| inner_ts_type_text(n, code));
|
||||
|
||||
if let Some(t) = type_text.as_deref()
|
||||
&& let Some(k) = ts_type_to_local_collection(t.trim().trim_start_matches(':').trim())
|
||||
{
|
||||
return Some(k);
|
||||
}
|
||||
|
||||
if !has_ts_decorator_argument(
|
||||
param,
|
||||
code,
|
||||
|
|
@ -586,14 +608,12 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
|
|||
if has_ts_decorator_argument(param, code, &["ParseBoolPipe"]) {
|
||||
return Some(TypeKind::Bool);
|
||||
}
|
||||
let t = param
|
||||
.child_by_field_name("type")
|
||||
.and_then(|n| inner_ts_type_text(n, code))?;
|
||||
let t = type_text?;
|
||||
let stripped = t.trim().trim_start_matches(':').trim();
|
||||
if let Some(k) = ts_type_to_kind(stripped) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: NestJS `@Body() dto: CreateUser` — when the static
|
||||
// NestJS `@Body() dto: CreateUser`, when the static
|
||||
// type is a class / interface name declared in the same file,
|
||||
// resolve via the DTO map. Generic args dropped for the leading
|
||||
// type so `Foo<Bar>` matches on `Foo`.
|
||||
|
|
@ -601,8 +621,41 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
|
|||
lookup_dto_class(head)
|
||||
}
|
||||
|
||||
/// Map a TypeScript / JavaScript type-text fragment to
|
||||
/// [`TypeKind::LocalCollection`] when the head is a JS built-in
|
||||
/// container type. Recognises:
|
||||
///
|
||||
/// * `Map<K, V>`, `Set<T>`, `WeakMap<K, V>`, `WeakSet<T>`, the four
|
||||
/// built-in keyed/unkeyed collection types.
|
||||
/// * `Array<T>`, `ReadonlyArray<T>`, the named array generics.
|
||||
/// * `T[]`, `readonly T[]`, the array shorthand syntax.
|
||||
/// * Same-file `type X = Map<...>` aliases (resolved via the
|
||||
/// per-file `TYPE_ALIAS_LC` map populated at the top of
|
||||
/// [`build_cfg`]).
|
||||
///
|
||||
/// Same-file user types named `Map` / `Set` / etc. (which would
|
||||
/// shadow the built-ins) are vanishingly rare in TS codebases that
|
||||
/// also define the methods (`get`, `set`, `has`, `find`); the
|
||||
/// classifier accepts the head match.
|
||||
pub(super) fn ts_type_to_local_collection(t: &str) -> Option<TypeKind> {
|
||||
let head_text = t.trim().trim_start_matches("readonly ").trim();
|
||||
// Array shorthand: `T[]` or `readonly T[]`.
|
||||
if head_text.ends_with("[]") {
|
||||
return Some(TypeKind::LocalCollection);
|
||||
}
|
||||
let head = head_text.split('<').next().unwrap_or(head_text).trim();
|
||||
match head {
|
||||
"Map" | "Set" | "WeakMap" | "WeakSet" | "Array" | "ReadonlyArray" => {
|
||||
Some(TypeKind::LocalCollection)
|
||||
}
|
||||
_ => super::TYPE_ALIAS_LC
|
||||
.with(|cell| cell.borrow().contains(head))
|
||||
.then_some(TypeKind::LocalCollection),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner_ts_type_text<'a>(type_anno: Node<'a>, code: &'a [u8]) -> Option<String> {
|
||||
// type_annotation node text is `: T` — unwrap to T.
|
||||
// type_annotation node text is `: T`, unwrap to T.
|
||||
if let Some(child) = type_anno.named_child(0) {
|
||||
return text_of(child, code);
|
||||
}
|
||||
|
|
@ -643,10 +696,10 @@ fn has_ts_decorator_argument(param: Node<'_>, code: &[u8], wanted: &[&str]) -> b
|
|||
false
|
||||
}
|
||||
|
||||
/// Rust (Axum / Rocket / Actix) — read the parameter's type text and
|
||||
/// Rust (Axum / Rocket / Actix), read the parameter's type text and
|
||||
/// look for `Path<i64>` / `Json<T>` / `Form<T>` / `Query<T>` shapes.
|
||||
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)` without an
|
||||
/// extractor wrapper) are **not** treated as typed extractors — only
|
||||
/// extractor wrapper) are **not** treated as typed extractors, only
|
||||
/// framework-wrapped types qualify.
|
||||
fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
if param.kind() != "parameter" {
|
||||
|
|
@ -654,9 +707,121 @@ fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
|
|||
}
|
||||
let type_node = param.child_by_field_name("type")?;
|
||||
let type_text = text_of(type_node, code)?;
|
||||
|
||||
// LocalCollection is a *receiver-shape* claim, not a
|
||||
// framework-validated-input claim, Hard Rule 3's "bare primitives
|
||||
// don't count" gate doesn't apply (mirrors `classify_param_type_ts`
|
||||
// for the same reason). Captures `unsharded: RoaringBitmap`,
|
||||
// `docids: &mut RoaringBitmap`, `params: HashMap<String, String>`,
|
||||
// `new_shard_docids: &'a mut hashbrown::HashMap<...>` shapes from
|
||||
// meilisearch/index-scheduler's bitmap bookkeeping where the
|
||||
// verb-name dispatch (`is_mutation: insert/remove`) would otherwise
|
||||
// classify these as DB writes.
|
||||
if let Some(k) = rust_type_to_local_collection(&type_text) {
|
||||
return Some(k);
|
||||
}
|
||||
|
||||
rust_type_to_kind(&type_text)
|
||||
}
|
||||
|
||||
/// Peel reference sigils, lifetime labels and `mut` keywords off the
/// front of a Rust type-text fragment, returning the remaining slice so
/// the underlying type name can be matched.
///
/// Handles `&T`, `&mut T`, `&'a T`, `&'a mut T`, stacked references such
/// as `&&mut T`, and a bare leading `mut `. The input is trimmed on both
/// sides first; text that carries none of these markers is returned
/// trimmed but otherwise untouched.
fn strip_rust_ref_markers(t: &str) -> &str {
    let mut cur = t.trim();
    loop {
        cur = if let Some(after_amp) = cur.strip_prefix('&') {
            let mut tail = after_amp.trim_start();
            // Optional lifetime label (`'a`, `'static`, `'_`): skip the
            // apostrophe and the identifier characters that follow it.
            if let Some(label) = tail.strip_prefix('\'') {
                tail = label
                    .trim_start_matches(|c: char| c.is_alphanumeric() || c == '_')
                    .trim_start();
            }
            // Optional `mut` keyword directly after the reference.
            tail.strip_prefix("mut ").unwrap_or(tail).trim_start()
        } else if let Some(after_mut) = cur.strip_prefix("mut ") {
            after_mut.trim_start()
        } else {
            // Nothing left to peel.
            return cur;
        };
    }
}
|
||||
|
||||
/// Map a Rust parameter / variable type-text to
|
||||
/// [`TypeKind::LocalCollection`] when the head names a known
|
||||
/// in-memory container. Strips reference / lifetime / `mut` markers,
|
||||
/// drops module-path prefixes (`std::collections::`, `hashbrown::`,
|
||||
/// `roaring::`), then matches the head against std and ecosystem
|
||||
/// collection types.
|
||||
///
|
||||
/// Recognises:
|
||||
/// * Std: `Vec`, `HashMap`, `HashSet`, `BTreeMap`, `BTreeSet`,
|
||||
/// `VecDeque`, `BinaryHeap`, `LinkedList`.
|
||||
/// * Ecosystem: `IndexMap`, `IndexSet` (indexmap), `SmallVec`
|
||||
/// (smallvec), `DashMap`, `DashSet` (dashmap), `FxHashMap`,
|
||||
/// `FxHashSet` (rustc-hash / fxhash), `RoaringBitmap`,
|
||||
/// `RoaringTreemap` (roaring).
|
||||
/// * Array / slice shorthand: `[T; N]`, `[T]` (covered by the
|
||||
/// leading-`[` check after ref-stripping).
|
||||
///
|
||||
/// Returns `None` for `Database<...>` (heed/sled, persistent KV
|
||||
/// store, NOT a local collection, keeping this `None` preserves
|
||||
/// real IDOR detection on persistent-store calls), `Mutex<...>` /
|
||||
/// `RwLock<...>` (synchronisation wrappers, not sink-shape claims),
|
||||
/// and bare primitives.
|
||||
pub(super) fn rust_type_to_local_collection(t: &str) -> Option<TypeKind> {
|
||||
let stripped = strip_rust_ref_markers(t);
|
||||
|
||||
// Array / slice shorthand: `[T; N]` or `[T]` (the `&` was
|
||||
// already stripped).
|
||||
if stripped.starts_with('[') {
|
||||
return Some(TypeKind::LocalCollection);
|
||||
}
|
||||
|
||||
// Drop module-path prefix: keep only the last segment before `<`
|
||||
// or end (`std::collections::HashMap<K, V>` → `HashMap`).
|
||||
let head_with_generics = stripped.rsplit("::").next().unwrap_or(stripped);
|
||||
let head = head_with_generics
|
||||
.split('<')
|
||||
.next()
|
||||
.unwrap_or(head_with_generics)
|
||||
.trim();
|
||||
|
||||
const TYPES: &[&str] = &[
|
||||
"Vec",
|
||||
"VecDeque",
|
||||
"BinaryHeap",
|
||||
"LinkedList",
|
||||
"HashMap",
|
||||
"HashSet",
|
||||
"BTreeMap",
|
||||
"BTreeSet",
|
||||
"IndexMap",
|
||||
"IndexSet",
|
||||
"SmallVec",
|
||||
"DashMap",
|
||||
"DashSet",
|
||||
"FxHashMap",
|
||||
"FxHashSet",
|
||||
"RoaringBitmap",
|
||||
"RoaringTreemap",
|
||||
];
|
||||
if TYPES.contains(&head) {
|
||||
Some(TypeKind::LocalCollection)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let stripped = t.trim();
|
||||
// Reject reference / mutability noise so `&Path<i64>` still matches
|
||||
|
|
@ -666,7 +831,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
.trim_start_matches('&')
|
||||
.trim_start_matches("mut ")
|
||||
.trim();
|
||||
// Only framework wrapper extractors qualify — bare primitives like
|
||||
// Only framework wrapper extractors qualify, bare primitives like
|
||||
// `i64` could be regular function parameters with no framework
|
||||
// validation gate.
|
||||
for wrap in [
|
||||
|
|
@ -684,7 +849,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(rest) = stripped.strip_prefix(&prefix) {
|
||||
if let Some(inner) = rest.strip_suffix('>') {
|
||||
let inner = inner.trim();
|
||||
// Tuple extractor `Path<(i64, String)>` — first element wins.
|
||||
// Tuple extractor `Path<(i64, String)>`, first element wins.
|
||||
if inner.starts_with('(') {
|
||||
let inside = inner.trim_start_matches('(').trim_end_matches(')');
|
||||
let first = inside.split(',').next().unwrap_or("").trim();
|
||||
|
|
@ -696,16 +861,16 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(k) = rust_primitive_to_kind(inner) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: `Json<T>` / `Form<T>` / `Query<T>` /
|
||||
// `Path<T>` with a same-file struct type — resolve via
|
||||
// `Json<T>` / `Form<T>` / `Query<T>` /
|
||||
// `Path<T>` with a same-file struct type, resolve via
|
||||
// the DTO map. Strip nested generics so `Json<Foo<i64>>`
|
||||
// matches on `Foo`.
|
||||
let head = inner.split('<').next().unwrap_or(inner).trim();
|
||||
if let Some(k) = lookup_dto_class(head) {
|
||||
return Some(k);
|
||||
}
|
||||
// Custom struct outside the same file — leave None
|
||||
// (cross-file resolution is Phase 6.4).
|
||||
// Custom struct outside the same file, leave None
|
||||
// (cross-file resolution is a follow-up).
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
|
@ -714,7 +879,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
|
||||
/// Map a Rust primitive / `String` / `&str` to a [`TypeKind`]. Public
|
||||
/// to the `cfg` module so the Phase 6 DTO collector can reuse it for
|
||||
/// to the `cfg` module so the DTO collector can reuse it for
|
||||
/// `struct` field types.
|
||||
pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let t = t.trim();
|
||||
|
|
@ -728,10 +893,10 @@ pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Python (FastAPI) — recognise typed-extractor parameters via the
|
||||
/// Python (FastAPI), recognise typed-extractor parameters via the
|
||||
/// `Annotated[X, Path()/Query()/Body()/Header()/Cookie()]` shape. Per
|
||||
/// Hard Rule 3, a bare `def h(id: int)` is **not** a framework
|
||||
/// extractor — the function may be a plain Python function and the
|
||||
/// extractor, the function may be a plain Python function and the
|
||||
/// type annotation provides no runtime gate.
|
||||
fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
let type_node = param.child_by_field_name("type")?;
|
||||
|
|
@ -741,7 +906,7 @@ fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<Typ
|
|||
|
||||
fn python_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let stripped = t.trim();
|
||||
// `Annotated[int, Path()]` — only matches when one of the generic
|
||||
// `Annotated[int, Path()]`, only matches when one of the generic
|
||||
// args names a recognised FastAPI binding marker. Otherwise no
|
||||
// framework gate is implied.
|
||||
if let Some(inner) = stripped
|
||||
|
|
@ -756,8 +921,8 @@ fn python_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(k) = python_primitive_to_kind(first) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: `Annotated[CreateUser, Body()]` with a same-file
|
||||
// Pydantic model — resolve via the DTO map. Generic args are
|
||||
// `Annotated[CreateUser, Body()]` with a same-file
|
||||
// Pydantic model, resolve via the DTO map. Generic args are
|
||||
// dropped via the same head-split as `python_primitive_to_kind`.
|
||||
let head = first.split('[').next().unwrap_or(first).trim();
|
||||
return lookup_dto_class(head);
|
||||
|
|
@ -773,7 +938,7 @@ fn contains_fastapi_marker(s: &str) -> bool {
|
|||
}
|
||||
|
||||
/// Map a Python type expression to a primitive [`TypeKind`]. Used by
|
||||
/// both the per-parameter classifier and the Phase 6 Pydantic-model
|
||||
/// both the per-parameter classifier and the DTO Pydantic-model
|
||||
/// field collector.
|
||||
pub(super) fn python_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let head = t.trim().split('[').next().unwrap_or(t).trim();
|
||||
|
|
@ -806,10 +971,70 @@ pub(super) fn is_configured_terminator(
|
|||
mod typed_extractor_tests {
|
||||
use super::{
|
||||
contains_fastapi_marker, java_type_to_kind, python_primitive_to_kind, python_type_to_kind,
|
||||
rust_primitive_to_kind, rust_type_to_kind,
|
||||
rust_primitive_to_kind, rust_type_to_kind, rust_type_to_local_collection,
|
||||
ts_type_to_local_collection,
|
||||
};
|
||||
use crate::ssa::type_facts::TypeKind;
|
||||
|
||||
// ── TypeScript / JavaScript local-collection types ───────────────────
|
||||
|
||||
#[test]
fn ts_built_in_collections_map_to_local_collection() {
    // Table-driven so a failure names the offending type text.
    for ty in [
        // The four keyed/unkeyed built-in container generics.
        "Map<string, number>",
        "Set<string>",
        "WeakMap<object, string>",
        "WeakSet<object>",
        // Array forms.
        "Array<string>",
        "ReadonlyArray<string>",
        "string[]",
        "readonly string[]",
        // Excalidraw-style keyed map with index-type generic args.
        "Map<ExcalidrawElement[\"id\"], ExcalidrawElement>",
    ] {
        assert_eq!(
            ts_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn ts_non_collection_types_return_none() {
    // Table-driven so a failure names the offending type text.
    for ty in [
        // Plain primitives.
        "string",
        "number",
        "boolean",
        // Promise / Iterator / etc. are not LocalCollections.
        "Promise<string>",
        "Iterator<number>",
        // User types. NOTE(review): `ElementsMap` can only resolve via the
        // per-file alias set, which is presumably empty on the test
        // thread because nothing has populated it here.
        "CreateUserDto",
        "ElementsMap",
    ] {
        assert_eq!(
            ts_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
|
||||
|
||||
// ── Java (Spring) ────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
|
@ -841,7 +1066,7 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn java_request_body_dto_returns_none_until_phase_six() {
|
||||
// @RequestBody CreateUserDto dto — no kind today; Phase 6 will
|
||||
// @RequestBody CreateUserDto dto, no kind today; future passes will
|
||||
// return DtoObject(name) once cross-file class resolution lands.
|
||||
assert_eq!(java_type_to_kind("CreateUserDto"), None);
|
||||
assert_eq!(java_type_to_kind("List<String>"), None);
|
||||
|
|
@ -860,7 +1085,7 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn rust_path_tuple_first_element_wins() {
|
||||
// Path<(i64, String)> — first slot is the int extractor that
|
||||
// Path<(i64, String)>, first slot is the int extractor that
|
||||
// matters for sink suppression.
|
||||
assert_eq!(
|
||||
rust_type_to_kind("Path<(i64, String)>"),
|
||||
|
|
@ -876,15 +1101,15 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn rust_json_dto_returns_none_until_phase_six() {
|
||||
// Json<T> / Form<T> / Query<T> with a custom struct type — no
|
||||
// primitive resolution available; Phase 6 lifts to DTO.
|
||||
// Json<T> / Form<T> / Query<T> with a custom struct type, no
|
||||
// primitive resolution available; future passes will lift to DTO.
|
||||
assert_eq!(rust_type_to_kind("Json<CreateUserDto>"), None);
|
||||
assert_eq!(rust_type_to_kind("Form<CreateUserDto>"), None);
|
||||
assert_eq!(rust_type_to_kind("Query<Filters>"), None);
|
||||
}
|
||||
|
||||
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)`) are NOT
|
||||
/// framework extractors — only wrapper types (`Path<i64>` etc.)
|
||||
/// framework extractors, only wrapper types (`Path<i64>` etc.)
|
||||
/// imply a framework runtime gate. Bare i64 must return None.
|
||||
#[test]
|
||||
fn rust_bare_primitives_are_not_framework_extractors() {
|
||||
|
|
@ -903,7 +1128,7 @@ mod typed_extractor_tests {
|
|||
#[test]
|
||||
fn python_bare_primitives_are_not_framework_extractors() {
|
||||
// Per Hard Rule 3: bare `def h(id: int)` is NOT a typed
|
||||
// extractor — without an `Annotated[..., Path()/Query()/Body()]`
|
||||
// extractor, without an `Annotated[..., Path()/Query()/Body()]`
|
||||
// wrapper, no FastAPI gate is implied.
|
||||
assert_eq!(python_type_to_kind("int"), None);
|
||||
assert_eq!(python_type_to_kind("float"), None);
|
||||
|
|
@ -936,7 +1161,7 @@ mod typed_extractor_tests {
|
|||
#[test]
|
||||
fn python_annotated_without_marker_returns_none() {
|
||||
// Annotated without a FastAPI binding marker is a generic
|
||||
// type-system tag — not a framework extractor.
|
||||
// type-system tag, not a framework extractor.
|
||||
assert_eq!(python_type_to_kind("Annotated[int, str]"), None);
|
||||
assert_eq!(python_type_to_kind("Annotated[int, MyMeta]"), None);
|
||||
}
|
||||
|
|
@ -954,4 +1179,128 @@ mod typed_extractor_tests {
|
|||
assert!(contains_fastapi_marker("bytes, File()"));
|
||||
assert!(!contains_fastapi_marker("int, str"));
|
||||
}
|
||||
|
||||
// ── Rust local-collection types ──────────────────────────────────────
|
||||
|
||||
#[test]
fn rust_std_collections_map_to_local_collection() {
    // One representative spelling for each std container.
    const STD_TYPES: [&str; 8] = [
        "Vec<u32>",
        "HashMap<String, u32>",
        "HashSet<u64>",
        "BTreeMap<u32, String>",
        "BTreeSet<u32>",
        "VecDeque<u8>",
        "BinaryHeap<u32>",
        "LinkedList<i32>",
    ];

    STD_TYPES.iter().for_each(|&ty| {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    });
}
|
||||
|
||||
#[test]
fn rust_ecosystem_collections_map_to_local_collection() {
    // Third-party containers, with and without generic arguments.
    const ECOSYSTEM_TYPES: [&str; 9] = [
        "IndexMap<String, u32>",
        "IndexSet<u64>",
        "SmallVec<[u32; 4]>",
        "DashMap<String, u32>",
        "DashSet<u64>",
        "FxHashMap<String, u32>",
        "FxHashSet<u64>",
        "RoaringBitmap",
        "RoaringTreemap",
    ];

    ECOSYSTEM_TYPES.iter().for_each(|&ty| {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    });
}
|
||||
|
||||
#[test]
fn rust_module_qualified_collections_map_to_local_collection() {
    // Module-path prefixes: only the last segment is used for matching.
    // Table-driven so a failure names the offending type text.
    for ty in [
        "std::collections::HashMap<K, V>",
        "hashbrown::HashMap<String, RoaringBitmap>",
        "roaring::RoaringBitmap",
    ] {
        assert_eq!(
            rust_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_reference_and_lifetime_markers_stripped() {
    // Every reference shape (`&T`, `&mut T`, `&'a T`, `&'a mut T`,
    // `&'static T`, stacked `&`) must expose the underlying type head.
    const REF_TYPES: [&str; 7] = [
        "&RoaringBitmap",
        "&mut RoaringBitmap",
        "&'a RoaringBitmap",
        "&'a mut RoaringBitmap",
        "&'static RoaringBitmap",
        "&&mut RoaringBitmap",
        "&'a mut hashbrown::HashMap<String, RoaringBitmap>",
    ];

    REF_TYPES.iter().for_each(|&ty| {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection after ref stripping"
        );
    });
}
|
||||
|
||||
#[test]
fn rust_array_and_slice_shorthand_map_to_local_collection() {
    // `[T; N]` arrays and `[T]` slices are local containers, with or
    // without a leading reference. Table-driven so a failure names the
    // offending type text.
    for ty in ["[u32; 4]", "[u8]", "&[u32]", "&mut [u32]"] {
        assert_eq!(
            rust_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_persistent_db_and_sync_wrappers_return_none() {
    // Table-driven so a failure names the offending type text.
    for ty in [
        // heed / sled / rocksdb persistent-store handles are NOT local
        // collections; returning None preserves IDOR detection on real
        // DB calls.
        "Database<BEU32, SerdeJson<Task>>",
        "heed::Database<K, V>",
        "sled::Db",
        // Sync wrappers don't claim a sink shape.
        "Mutex<HashMap<K, V>>",
        "RwLock<Vec<u32>>",
        // Bare primitives.
        "u32",
        "&str",
        "String",
        // Unrelated user types.
        "MyDao<User>",
        "Connection",
    ] {
        assert_eq!(
            rust_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue