Python fp and docs updtes (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -127,12 +127,12 @@ use crate::utils::snippet::line_snippet as extract_line_snippet;
/// [`normalize_namespace`] convention) back to the absolute path the
/// diagnostic pipeline expects.
///
/// * Empty `file_rel` single-file scans normalize every namespace to
/// * Empty `file_rel`, single-file scans normalize every namespace to
/// `""`; treat that as "the file under analysis" and return
/// `fallback.to_string_lossy()`.
/// * `scan_root` absent we have no workspace root to resolve against;
/// * `scan_root` absent, we have no workspace root to resolve against;
/// return `file_rel` verbatim (it may already be absolute).
/// * Otherwise join `scan_root` with `file_rel`.
/// * Otherwise, join `scan_root` with `file_rel`.
fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String {
if file_rel.is_empty() {
return fallback.to_string_lossy().into_owned();
@ -163,7 +163,7 @@ fn build_taint_diag(
let source_info = cfg_graph.node_weight(finding.source);
// The reconstructed flow path is the authoritative view of where the
// taint started *in this body*. When present, prefer its first step's
// CFG span over `finding.source_span` which can be stale across
// CFG span over `finding.source_span`, which can be stale across
// multi-hop cross-body remaps (e.g. JS two-level solve where a
// callee-interior source gets its span rewritten to the enclosing
// body's entry node). Fall back to `source_span`, then to the source
@ -183,7 +183,7 @@ fn build_taint_diag(
// Prefer the source CFG node's callee string when it's a call expression
// (e.g. `os.getenv("X")`). For property-access sources like
// `navigator.userAgent` there is no callee fall back to the first flow
// `navigator.userAgent` there is no callee, fall back to the first flow
// step's `variable` (the SSA var name, e.g. "userAgent"), then to the
// source node's `taint.defines` / first `taint.uses` entry, before
// finally giving up and rendering "(unknown)".
@ -289,7 +289,7 @@ fn build_taint_diag(
// Convert raw flow steps to display FlowSteps. When the finding has a
// primary_location distinct from the call site, the last raw step is
// really the Call reclassify it and append a synthetic Sink step
// really the Call, reclassify it and append a synthetic Sink step
// pointing at the callee-internal dangerous instruction so analysts
// see both the call site and the final sink in the trace.
let mut flow_steps: Vec<FlowStep> = finding
@ -348,7 +348,7 @@ fn build_taint_diag(
.clone()
.or_else(|| Some(short_call_site.clone()));
// Resolved sink capability bits used by deduplication to distinguish
// Resolved sink capability bits, used by deduplication to distinguish
// sinks with different cap types on the same source line (e.g.
// `sink_sql(x); sink_shell(x);`).
let sink_caps_bits: u16 = cfg_graph[finding.sink]
@ -361,13 +361,33 @@ fn build_taint_diag(
})
.fold(0u16, |acc, b| acc | b);
// Phase C: when the sink's required caps include UNAUTHORIZED_ID — and
// the finding actually reached that sink via the taint engine — use a
// dedicated auth rule id so the finding is namespaced alongside the
// standalone `auth_analysis` subsystem's output instead of being folded
// into the generic `taint-unsanitised-flow` bucket.
let diag_id = if sink_caps_bits & crate::labels::Cap::UNAUTHORIZED_ID.bits() != 0 {
// Cap-specific rule-id routing.
//
// 1. `UNAUTHORIZED_ID`: namespace alongside the standalone `auth_analysis`
// subsystem's output so cross-tool aggregation lines up.
// 2. `DATA_EXFIL`: route to `taint-data-exfiltration` so SARIF surfaces a
// distinct rule id from SSRF, the two share callees (e.g. `fetch`)
// but represent different vulnerability classes.
//
// Prefer the per-finding `effective_sink_caps` (set by the multi-gate
// SSA dispatch) when populated; fall back to the union of all sink-label
// caps on the CFG node so legacy paths that build findings without
// setting `effective_sink_caps` still pick the right rule id.
let effective_caps = if finding.effective_sink_caps.is_empty() {
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
} else {
finding.effective_sink_caps
};
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
"rs.auth.missing_ownership_check.taint".to_string()
} else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::SSRF)
{
format!(
"taint-data-exfiltration (source {}:{})",
source_point.row + 1,
source_point.column + 1
)
} else {
format!(
"taint-unsanitised-flow (source {}:{})",
@ -452,7 +472,7 @@ fn build_taint_diag(
/// Resolve a file extension to a language slug (e.g. `"rust"`,
/// `"javascript"`). Public façade over [`lang_for_path`] for callers
/// that only need the slug used by the debug API to look up
/// that only need the slug, used by the debug API to look up
/// per-language rule enablement without re-parsing the file.
pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> {
lang_for_path(path).map(|(_, slug)| slug)
@ -467,7 +487,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
// use `.cc` / `.cxx` / `.hpp` / `.hh` / `.h++` rather than the
// `.cpp` synthetic-fixture extension. Without these mappings,
// the scanner silently skipped them. Headers (`.h` is omitted
// intentionally it's also valid C and disambiguating without a
// intentionally, it's also valid C and disambiguating without a
// build system is brittle).
Some("cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++") => {
Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp"))
@ -481,7 +501,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
"typescript",
)),
// TSX grammar is a superset of TypeScript plus JSX element/attribute
// nodes all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
// nodes, all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
// and JSX-specific sinks (e.g. `dangerouslySetInnerHTML`) layer on top
// via the same `typescript` slug.
Some("tsx") => Some((
@ -493,7 +513,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
"javascript",
)),
// JSX uses the same JavaScript grammar (tree-sitter-javascript handles
// JSX natively) slug "javascript" so all JS rules apply.
// JSX natively), slug "javascript" so all JS rules apply.
Some("jsx") => Some((
Language::from(tree_sitter_javascript::LANGUAGE),
"javascript",
@ -739,7 +759,7 @@ impl<'a> ParsedSource<'a> {
continue;
}
// Layer C: PHP `unserialize($x, ['allowed_classes' => [...]])`
// or `unserialize($x, ['allowed_classes' => false])`
// or `unserialize($x, ['allowed_classes' => false])` ,
// PHP 7+ structural mitigation against object injection.
// When the call passes an `allowed_classes` option set to
// either `false` (no class instantiation) or an array
@ -762,7 +782,7 @@ impl<'a> ParsedSource<'a> {
// format-string contributes attacker-controlled length.
// When the source argument is a string literal (or a
// ternary of two string literals), the contributed length
// is statically bounded there is no overflow vector
// is statically bounded, there is no overflow vector
// for an attacker even if the destination buffer is
// mis-sized. Same principle for `sprintf` when the
// format string is a literal containing no bare `%s`
@ -818,7 +838,7 @@ impl<'a> ParsedSource<'a> {
/// Sort, dedup, and optionally downgrade severity for non-production paths.
///
/// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id,
/// line, col)` severity is NOT part of the key. Two diags that agree
/// line, col)`, severity is NOT part of the key. Two diags that agree
/// on (line, col, id) but differ in severity (e.g. a pattern-rule finding
/// plus a taint-pipeline finding on the same call) would otherwise survive
/// dedup here and crash the indexer with a UNIQUE constraint violation.
@ -854,7 +874,7 @@ impl<'a> ParsedFile<'a> {
// project-level `FrameworkContext` misses frameworks the file
// obviously imports. Augment the per-file rule set with any
// framework-conditional rules keyed off in-file import specifiers
// (e.g. `import fastify from 'fastify'`). Idempotent skips
// (e.g. `import fastify from 'fastify'`). Idempotent, skips
// frameworks already active from the manifest pass.
let in_file_fws =
crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug);
@ -931,13 +951,13 @@ impl<'a> ParsedFile<'a> {
self.source.lang_slug,
);
// Phase 6 (typed call-graph subtype awareness): every
// every
// `FuncSummary` exported from this file carries a copy of the
// file's `hierarchy_edges` so the inheritance / impl /
// implements relationships persist through SQLite round-trips
// and re-merge into `crate::callgraph::TypeHierarchyIndex` at
// call-graph build time. Cheap (one clone per summary) and
// strictly additive `merge_summaries` deduplicates downstream.
// strictly additive, `merge_summaries` deduplicates downstream.
if !self.file_cfg.hierarchy_edges.is_empty() {
let edges = self.file_cfg.hierarchy_edges.clone();
for s in &mut out {
@ -982,7 +1002,7 @@ impl<'a> ParsedFile<'a> {
///
/// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`].
/// The `FuncKey` identity preserves `(lang, namespace, container, name,
/// arity, disambig, kind)` so two same-name definitions in this file
/// arity, disambig, kind)`, so two same-name definitions in this file
/// (e.g. a free `process` and a `Worker::process`, or overloads with
/// different arities) land on distinct entries instead of the later one
/// shadowing the earlier one.
@ -1003,7 +1023,7 @@ impl<'a> ParsedFile<'a> {
// Use the FileCfg path (same one `analyse_file` uses at taint time) so
// the SSA summaries stored cross-file match exactly what pass 2 will
// resolve against no NodeIndex-space or entry-detection drift.
// resolve against, no NodeIndex-space or entry-detection drift.
let locator = crate::summary::SinkSiteLocator {
tree: &self.source.tree,
bytes: self.source.bytes,
@ -1024,7 +1044,7 @@ impl<'a> ParsedFile<'a> {
/// Lower every function body in this file to SSA exactly once. Used by
/// [`analyse_file_fused`] to share the result between the taint engine
/// ([`run_cfg_analyses_with_lowered`]) and the SSA artifact filter
/// ([`build_eligible_bodies_from_lowered`]) the prior code path lowered
/// ([`build_eligible_bodies_from_lowered`]), the prior code path lowered
/// twice (once inside `analyse_file`, once inside
/// `extract_ssa_artifacts_from_file_cfg`) and accounted for ~24% of the
/// pass-2 wall-clock on the bench corpus.
@ -1038,7 +1058,7 @@ impl<'a> ParsedFile<'a> {
/// here populates `param_to_sink` with concrete coordinates that the
/// emission path then promotes into `Finding.primary_location`,
/// causing the same-file summary-resolved sink to be reported at the
/// callee-internal sink line instead of the call site which both
/// callee-internal sink line instead of the call site, which both
/// duplicates the intraprocedural finding the taint engine already
/// emits at that exact line and re-attributes the flow finding away
/// from the user-visible call site. Closure-capture, lambda, and
@ -1263,13 +1283,11 @@ impl<'a> ParsedFile<'a> {
state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang);
let mut all_state_findings = Vec::new();
for body in &self.file_cfg.bodies {
// Phase 2 of the pointer-analysis rollout: when
// `NYX_POINTER_ANALYSIS=1` is set, derive a `var_name →
// PtrProxyHint` map from the body's points-to facts so
// the proxy-acquire transfer can suppress SymbolId
// attribution on field-aliased receivers (e.g. `m :=
// c.mu; m.Lock()`). Strict-additive — `None` when the
// env-var is unset and behaviour matches today exactly.
// When `NYX_POINTER_ANALYSIS=1` is set, derive a
// `var_name → PtrProxyHint` map from the body's
// points-to facts so the proxy-acquire transfer can
// suppress SymbolId attribution on field-aliased
// receivers (e.g. `m := c.mu; m.Lock()`).
let body_pointer_hints = cfg_analysis::build_body_const_facts(body, caller_lang)
.as_ref()
.and_then(|f| {
@ -1379,15 +1397,11 @@ impl<'a> ParsedFile<'a> {
)
}
/// Build a per-file `var_name → TypeKind` map by running SSA + type
/// facts on each body and copying type facts for SSA values whose
/// definition recorded a source-level variable name. When the same
/// name resolves to different non-`Unknown` types across bodies the
/// entry is dropped — absence is safe because the auth analysis
/// sink gate simply falls back to its syntactic heuristics. Returns
/// `None` when no body produces any typed variable (non-Rust files
/// currently emit few `LocalCollection` / security-typed facts, but
/// this path is language-agnostic).
/// Build a per-file `var_name → TypeKind` map from SSA + type facts.
/// Conflicting non-`Unknown` types across bodies drop the entry ,
/// absence is safe because the auth sink gate falls back to
/// syntactic heuristics. Returns `None` when no body produces a
/// typed variable.
fn collect_file_var_types(&self) -> Option<auth_analysis::VarTypes> {
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
let mut merged: std::collections::HashMap<String, crate::ssa::type_facts::TypeKind> =
@ -1492,7 +1506,7 @@ pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult<Option<(FileCf
/// Parse a file and return its `AuthorizationModel` for debug inspection.
///
/// Runs only the auth-extraction pipeline no taint, no CFG construction.
/// Runs only the auth-extraction pipeline, no taint, no CFG construction.
/// Returns `None` for binary files or unsupported languages. Used by the
/// `/api/debug/auth` route to surface the structured authorization model
/// (routes, units, sensitive operations, auth checks) in the debug UI.
@ -1607,7 +1621,7 @@ pub fn perf_stage_breakdown_fused(
/// Diagnostic stage-timing helper for the perf audit.
///
/// Times each stage of pass 2 internally and returns µs counts. Returns
/// `None` for unsupported languages. Not used in production just for
/// `None` for unsupported languages. Not used in production, just for
/// `tests/perf_breakdown.rs` to attribute time inside `run_rules_on_bytes`
/// without touching the hot path.
#[doc(hidden)]
@ -1651,7 +1665,7 @@ pub fn perf_stage_breakdown(
///
/// This is the shared pass-1 pipeline for indexed scans: parses once, builds
/// CFG once, and returns both summary types. Uses the same `ParsedFile`
/// pipeline as `analyse_file_fused` no divergent extraction path.
/// pipeline as `analyse_file_fused`, no divergent extraction path.
pub fn extract_all_summaries_from_bytes(
bytes: &[u8],
path: &Path,
@ -1727,7 +1741,7 @@ fn is_call_all_args_literal(node: tree_sitter::Node, bytes: &[u8]) -> bool {
}
// If the argument list is empty (no args), we conservatively do NOT
// suppress the danger may come from side effects, not arguments.
// suppress, the danger may come from side effects, not arguments.
has_any_arg
}
@ -1745,7 +1759,7 @@ fn find_enclosing_call(mut node: tree_sitter::Node) -> Option<tree_sitter::Node>
if kind == "function_call_expression" {
return Some(node);
}
// Stop at scope/statement boundaries don't cross into outer calls
// Stop at scope/statement boundaries, don't cross into outer calls
if kind.contains("block")
|| kind.contains("body")
|| kind == "program"
@ -1780,13 +1794,20 @@ fn find_arg_list(call: tree_sitter::Node) -> Option<tree_sitter::Node> {
fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool {
let kind = node.kind();
match kind {
// String literals (most languages)
// String literals, but Python's `string` node also covers
// f-strings, which carry `interpolation` children. An f-string
// with interpolation is *not* a literal: it embeds arbitrary
// expressions, so a sink call like `cursor.execute(f"…{x}")`
// must not be suppressed under Layer A's "all-literal args"
// shortcut. Same shape applies to any tree-sitter grammar
// that nests an `interpolation` (or `string_interpolation`)
// child inside a string node.
"string"
| "string_literal"
| "interpreted_string_literal"
| "raw_string_literal"
| "string_content"
| "string_fragment" => true,
| "string_fragment" => !has_interpolation(node),
// Numeric literals
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
@ -1901,7 +1922,7 @@ fn is_php_include_param_passthrough(include_node: tree_sitter::Node, bytes: &[u8
}
return true;
}
// Stop at class/program scope without a matching function bare
// Stop at class/program scope without a matching function, bare
// top-level `include $var` does not benefit from this guard.
"program" | "class_declaration" | "trait_declaration" | "interface_declaration" => {
return false;
@ -2011,7 +2032,7 @@ fn is_var_reassigned_before(
/// PHP-only: returns `true` when the captured `function_call_expression`
/// node is `unserialize($x, [..., 'allowed_classes' => <ARRAY|false>, ...])`.
/// This is the canonical PHP 7+ structural mitigation against object
/// injection explicitly restricting which classes the deserialiser may
/// injection, explicitly restricting which classes the deserialiser may
/// instantiate. Only suppress when the option is either:
///
/// - `'allowed_classes' => false` (no class instantiation), or
@ -2091,9 +2112,9 @@ fn is_php_unserialize_allowed_classes_restricted(
// Accept structural mitigation forms. The intent signal is
// "developer explicitly set allowed_classes to something other than
// `true`":
// - boolean `false` no class instantiation at all
// - array literal explicit allow-list
// - class-constant reference `self::ALLOWED_CLASSES` /
// - boolean `false` , no class instantiation at all
// - array literal , explicit allow-list
// - class-constant reference , `self::ALLOWED_CLASSES` /
// `Foo::CONSTANTS` resolved to
// a const array; engine cannot
// statically inspect, but the
@ -2126,7 +2147,7 @@ fn is_php_unserialize_allowed_classes_restricted(
/// `cpp.memory.*` mirrors) when the source argument can carry
/// attacker-controlled length. Calls whose source is a string literal
/// have a compile-time bound and cannot overflow due to attacker input
/// a too-small destination is a fixed developer bug (caught by
///, a too-small destination is a fixed developer bug (caught by
/// compiler warnings / `-fstack-protector` / clang-tidy / ASan), not an
/// exploitable channel. Suppressing these literal-source calls is a
/// deliberate noise / false-positive reduction aligned with Nyx's scope
@ -2141,14 +2162,14 @@ fn is_php_unserialize_allowed_classes_restricted(
/// - `tests/fixtures/real_world/c/state/malloc_lifecycle.expect.json`
/// - `tests/fixtures/real_world/cpp/state/new_delete.expect.json`
/// - `tests/fixtures/real_world/cpp/state/malloc_branches.expect.json`
/// - Positive cases (suppression must NOT fire source is a parameter
/// - Positive cases (suppression must NOT fire, source is a parameter
/// or other attacker-reachable value) live as hard expectations
/// (`must_match: true`) in the taint fixtures:
/// - `tests/fixtures/real_world/c/taint/buffer_overflow.c`
/// - `tests/fixtures/real_world/cpp/taint/gets_strcpy.cpp`
///
/// Removing this function or weakening its predicate would be caught by
/// neither it would be caught by the unit tests below.
/// neither, it would be caught by the unit tests below.
///
/// Pattern rules `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf`
/// (and the `cpp.memory.*` mirrors) flag the call syntactically; their
@ -2173,7 +2194,7 @@ fn is_php_unserialize_allowed_classes_restricted(
/// - source / format is an identifier (could be tainted, e.g.
/// `sprintf(buf, fmt, …)`) → keep firing
/// - format is `concatenated_string` containing identifier macros (e.g.
/// `"%" PRId64`) we cannot statically expand the macro, so refuse
/// `"%" PRId64`), we cannot statically expand the macro, so refuse
/// - bare `%s` in format → keep firing (could read unbounded length)
fn is_c_buffer_call_literal_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
let kind = match rule_id {
@ -2226,7 +2247,7 @@ enum CBufferRule {
/// True for: a C/C++ string literal, OR a `conditional_expression` whose
/// consequence + alternative are both either string literals or ALL_CAPS
/// identifiers (the canonical preprocessor-macro naming convention for
/// string-constant `#define`s `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
/// string-constant `#define`s, `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
/// pervasively in postgres' `formatting.c::DCH_a_m`). Parenthesised forms
/// are unwrapped.
///
@ -2348,7 +2369,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
}
i += 1;
if i >= bytes.len() {
// trailing `%` malformed, refuse to suppress
// trailing `%`, malformed, refuse to suppress
return false;
}
if bytes[i] == b'%' {
@ -2391,7 +2412,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
let conv = bytes[i];
i += 1;
match conv {
// Numeric / char / pointer specifiers bounded output for any input
// Numeric / char / pointer specifiers, bounded output for any input
b'd' | b'i' | b'u' | b'o' | b'x' | b'X' | b'c' | b'e' | b'E' | b'f' | b'F' | b'g'
| b'G' | b'a' | b'A' | b'p' | b'n' => continue,
// String specifier: only safe when precision-bounded
@ -2494,7 +2515,7 @@ struct TaintSuppressionCtx {
/// distinguish "taint proved safe" from "taint failed to track".
taint_finding_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
/// Functions where the SSA engine emitted at least one
/// `all_validated` event every tainted input to *some* sink in
/// `all_validated` event, every tainted input to *some* sink in
/// the function passed through a recognised validation/
/// sanitisation predicate. Drained from
/// `take_all_validated_spans`; positive evidence that the engine
@ -2502,14 +2523,14 @@ struct TaintSuppressionCtx {
/// `taint-unsanitised-flow` finding fired and no Sanitizer label
/// is present. Covers validation, dominator-based pruning,
/// early-return guards, type-check predicates, and interprocedural
/// sanitiser wrappers all of which legitimately clear taint via
/// sanitiser wrappers, all of which legitimately clear taint via
/// SSA branch-narrowing rather than a labelled sanitiser node.
engine_validated_funcs: HashSet<Option<String>>,
/// Functions where some Source's defining variable is later
/// rebound to a literal RHS (carries `TaintMeta.const_text`) in
/// the same scope, with no Source label on the rebinding node.
/// Positive evidence that the engine's SSA renaming structurally
/// kills the source's taint before any sink can read it covers
/// kills the source's taint before any sink can read it, covers
/// `cmd = getenv(); cmd = "echo hello"; system(cmd)` patterns
/// where the rebind is what makes the code safe but the engine
/// has no `Sanitizer` label or `taint-unsanitised-flow` finding to
@ -2520,7 +2541,7 @@ struct TaintSuppressionCtx {
/// interprocedural analysis cleared the flow through a
/// user-defined wrapper (e.g. `def sanitize(s): return
/// shlex.quote(s)`). The current per-function `Sanitizer` check
/// only sees direct sanitisers in the *caller's* scope without
/// only sees direct sanitisers in the *caller's* scope, without
/// this signal, every helper-wrapped sanitiser fires as an
/// AST-pattern FP because the engine cleared the value via Phase
/// 11 inline analysis but the sink's enclosing scope has no
@ -2687,7 +2708,7 @@ impl TaintSuppressionCtx {
// an "interproc sanitiser caller" when its body invokes any
// helper whose own body contains a labelled Sanitizer. This
// handles wrappers like `def sanitize(s): return
// shlex.quote(s)` — the engine clears taint via Phase 11
// shlex.quote(s)`, the engine clears taint via
// inline analysis, but the caller's scope has no labelled
// Sanitizer of its own to satisfy Condition 4(b).
let mut interproc_sanitizer_callers: HashSet<Option<String>> = HashSet::new();
@ -2703,7 +2724,7 @@ impl TaintSuppressionCtx {
// each to its enclosing function via `sink_func_at_line`, and
// record the function as "engine-validated". The set was
// populated by `ssa_events_to_findings` whenever the engine
// emitted an `SsaTaintEvent { all_validated: true, .. }`
// emitted an `SsaTaintEvent { all_validated: true, .. }` ,
// i.e. the engine reached a sink and proved every tainted
// input passed validation. This is the broadest form of
// engine-success evidence, covering predicate validation
@ -2762,7 +2783,7 @@ impl TaintSuppressionCtx {
// sink, since taint couldn't have evaluated a flow that doesn't exist.
let func = match self.sink_func_at_line.get(&line) {
Some(f) => f,
None => return false, // No CFG sink at this line taint had no opportunity to evaluate
None => return false, // No CFG sink at this line, taint had no opportunity to evaluate
};
match self.source_lines_by_func.get(func) {
Some(source_lines) => {
@ -2788,7 +2809,7 @@ impl TaintSuppressionCtx {
// OR
// (c) the SSA engine emitted at least one `all_validated`
// event in this function (engine reached *some* sink and
// proved every tainted input was validated covers
// proved every tainted input was validated, covers
// predicate validation, dominator early-return,
// type-check predicates, and interprocedural sanitiser
// wrappers that don't carry an explicit Sanitizer
@ -2796,18 +2817,18 @@ impl TaintSuppressionCtx {
// OR
// (d) the function rebinds a Source's defining variable to
// a literal RHS at a later line (engine's SSA renaming
// structurally kills taint before any sink reads it
// structurally kills taint before any sink reads it ,
// covers `cmd = getenv(); cmd = "echo"; system(cmd)`),
// OR
// (e) the function calls a same-file helper whose body
// contains a labelled Sanitizer (interprocedural
// sanitiser wrapper covers `def sanitize(s): return
// sanitiser wrapper, covers `def sanitize(s): return
// shlex.quote(s)` patterns where the engine clears
// taint via Phase 11 inline analysis but the caller's
// taint via inline analysis but the caller's
// scope has no Sanitizer label of its own).
//
// When none hold, we can't distinguish silent engine failure
// from real safety e.g. Go points-to limitation on `&local`
// from real safety, e.g. Go points-to limitation on `&local`
// Decode destinations leaves the chain writeback fired but the
// field-cell propagation dead, suppressing legitimate
// AST-pattern findings on every Go CRUD handler whose Decode
@ -2854,7 +2875,7 @@ pub fn run_rules_on_bytes(
maybe_inject_test_panic(path);
let Some(source) = ParsedSource::try_new(bytes, path)? else {
// Not a recognized tree-sitter language try text-based patterns,
// Not a recognized tree-sitter language, try text-based patterns,
// but first surface a parse-timeout synthetic diag if that's what
// caused try_new to return None.
let mut out = scan_text_based_patterns(bytes, path, cfg);
@ -2964,7 +2985,7 @@ pub fn analyse_file_fused(
maybe_inject_test_panic(path);
let Some(source) = ParsedSource::try_new(bytes, path)? else {
// Not a recognized tree-sitter language try text-based patterns,
// Not a recognized tree-sitter language, try text-based patterns,
// and surface a parse-timeout synthetic diag if that's what caused
// try_new to return None.
let mut diags = scan_text_based_patterns(bytes, path, cfg);
@ -2995,7 +3016,7 @@ pub fn analyse_file_fused(
let (ssa_summaries, ssa_bodies) = if needs_cfg {
// Lower SSA exactly once and feed both the taint engine and the
// SSA-artifact extractor. Pre-fix, both consumers re-lowered the
// same `FileCfg` independently `lower_all_functions_from_bodies`
// same `FileCfg` independently, `lower_all_functions_from_bodies`
// accounted for ~20% of `analyse_file_fused` wall-clock on the
// bench corpus.
//
@ -3294,7 +3315,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"method param pass-through should be recognised"
);
// Local variable assigned from concat NOT a pass-through.
// Local variable assigned from concat, NOT a pass-through.
let code = b"<?php\nclass C { function f(string $base): void { $f = $base . '/x.php'; include $f; } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3303,7 +3324,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"concat-built local should NOT be treated as pass-through"
);
// Param reassigned before include NOT a pass-through.
// Param reassigned before include, NOT a pass-through.
let code = b"<?php\nfunction f($file) { $file = $_GET['x']; include $file; }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3312,7 +3333,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"reassigned param should NOT be treated as pass-through"
);
// Top-level (no enclosing function) NOT a pass-through.
// Top-level (no enclosing function), NOT a pass-through.
let code = b"<?php\n$file = $_GET['x'];\ninclude $file;\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3357,7 +3378,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"allowed_classes => self::CONST should be recognised as safe"
);
// allowed_classes => true unsafe default, must NOT be suppressed
// allowed_classes => true, unsafe default, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d, ['allowed_classes' => true]);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3366,7 +3387,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"allowed_classes => true is the unsafe default, should NOT be suppressed"
);
// No second arg must NOT be suppressed
// No second arg, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3375,7 +3396,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"single-arg unserialize should NOT be suppressed"
);
// Dynamic options variable must NOT be suppressed
// Dynamic options variable, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d, $opts);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3387,7 +3408,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
#[test]
fn sprintf_format_safety_classifier() {
// Numeric / char / pointer specifiers bounded by definition.
// Numeric / char / pointer specifiers, bounded by definition.
assert!(sprintf_format_is_safe(""));
assert!(sprintf_format_is_safe("hello world"));
assert!(sprintf_format_is_safe("%d"));
@ -3396,11 +3417,11 @@ fn sprintf_format_safety_classifier() {
assert!(sprintf_format_is_safe("%5d %x %llo"));
assert!(sprintf_format_is_safe("%%literal-percent"));
assert!(sprintf_format_is_safe("%p"));
// Precision-bounded `%s` / `%.*s` output capped at precision.
// Precision-bounded `%s` / `%.*s`, output capped at precision.
assert!(sprintf_format_is_safe(" %.*s"));
assert!(sprintf_format_is_safe("%.5s"));
assert!(sprintf_format_is_safe("[%-.10s]"));
// Bare `%s` / width-only `%5s` width is a *minimum*, length is
// Bare `%s` / width-only `%5s`, width is a *minimum*, length is
// unbounded. Must NOT be suppressed.
assert!(!sprintf_format_is_safe("%s"));
assert!(!sprintf_format_is_safe("hello %s world"));
@ -3441,7 +3462,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
let q_strcat = r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#;
let q_sprintf = r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#;
// strcpy(dst, "literal") postgres autoprewarm shape.
// strcpy(dst, "literal"), postgres autoprewarm shape.
let code = b"void f(char *d) { strcpy(d, \"pg_prewarm\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3450,7 +3471,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with string-literal source must be suppressed"
);
// strcpy(dst, cond ? "a" : "b") string-literal ternary.
// strcpy(dst, cond ? "a" : "b"), string-literal ternary.
let code = b"void f(char *s, int h) { strcpy(s, (h >= 12) ? \"p.m.\" : \"a.m.\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3459,7 +3480,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-literals source must be suppressed"
);
// strcpy(dst, cond ? P_M_STR : A_M_STR) postgres formatting.c
// strcpy(dst, cond ? P_M_STR : A_M_STR), postgres formatting.c
// shape with #define'd ALL_CAPS string-constant macros.
let code = b"#define P_M_STR \"p.m.\"\n#define A_M_STR \"a.m.\"\nvoid f(char *s, int h) { strcpy(s, (h >= 12) ? P_M_STR : A_M_STR); }\n";
let tree = parser.parse(code, None).unwrap();
@ -3469,7 +3490,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-ALL_CAPS-macros must be suppressed"
);
// strcpy(dst, cond ? var_a : var_b) lowercase variables, NOT a
// strcpy(dst, cond ? var_a : var_b), lowercase variables, NOT a
// recognisable preprocessor macro shape. Must NOT suppress.
let code = b"void f(char *s, int h, char *a, char *b) { strcpy(s, (h >= 12) ? a : b); }\n";
let tree = parser.parse(code, None).unwrap();
@ -3479,7 +3500,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-lowercase-vars must NOT be suppressed"
);
// strcat(dst, "literal") same principle as strcpy.
// strcat(dst, "literal"), same principle as strcpy.
let code = b"void f(char *d) { strcat(d, \" (done)\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcat);
@ -3488,7 +3509,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcat with string-literal source must be suppressed"
);
// sprintf(dst, "%lld%c", ...) numeric format string.
// sprintf(dst, "%lld%c", ...), numeric format string.
let code = b"void f(char *cp, long long v, char u) { sprintf(cp, \"%lld%c\", v, u); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3497,7 +3518,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"sprintf with numeric-only format must be suppressed"
);
// sprintf(str, " %.*s", N, x) precision-bounded `%s`.
// sprintf(str, " %.*s", N, x), precision-bounded `%s`.
let code = b"void f(char *str, int n, const char *x) { sprintf(str, \" %.*s\", n, x); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3506,7 +3527,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"sprintf with precision-bounded `%.*s` must be suppressed"
);
// strcpy(dst, src) where src is a non-literal must NOT suppress.
// strcpy(dst, src) where src is a non-literal, must NOT suppress.
let code = b"void f(char *d, char **a) { strcpy(d, a[1]); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3515,7 +3536,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with non-literal source must NOT be suppressed"
);
// sprintf with bare `%s` must NOT suppress.
// sprintf with bare `%s`, must NOT suppress.
let code = b"void f(char *b, const char *u) { sprintf(b, \"%s\", u); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3525,7 +3546,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
);
// sprintf with non-literal format (concatenated_string with PRI* macro)
// must NOT suppress (engine cannot statically expand the macro).
//, must NOT suppress (engine cannot statically expand the macro).
let code = b"void f(char *b, long long v) { sprintf(b, \"%\" PRId64, v); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3543,3 +3564,51 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"Layer D should only fire for buffer-overflow rule ids"
);
}
/// Regression: `is_literal_node` must NOT classify a Python f-string
/// (a `string` node containing `interpolation` children) as literal.
/// Layer A's "all-args-literal → suppress Security finding" shortcut
/// otherwise hides every CVE that injects via `cursor.execute(f"…{x}…")`
/// or `text(f"…{x}…")`. Motivated by CVE-2025-69662 (geopandas SQLi
/// via `text(f"SELECT … '{geom_name}' …")`) and CVE-2025-24793
/// (snowflake-connector-python f-string-built CREATE STAGE / DROP).
#[test]
fn is_literal_node_rejects_python_fstring_with_interpolation() {
let mut parser = tree_sitter::Parser::new();
let lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
parser.set_language(&lang).unwrap();
// f-string with one interpolation segment, must be non-literal.
let code = b"x = f\"SELECT * WHERE y = '{u}'\"\n";
let tree = parser.parse(code, None).unwrap();
let assignment = tree
.root_node()
.child(0)
.and_then(|s| s.child(0))
.expect("assignment node");
let rhs = assignment
.child_by_field_name("right")
.expect("RHS of assignment");
assert_eq!(rhs.kind(), "string");
assert!(
!is_literal_node(rhs, code),
"f-string with interpolation must not be classified as literal"
);
// Plain string literal, must remain literal.
let code = b"x = \"plain literal\"\n";
let tree = parser.parse(code, None).unwrap();
let assignment = tree
.root_node()
.child(0)
.and_then(|s| s.child(0))
.expect("assignment node");
let rhs = assignment
.child_by_field_name("right")
.expect("RHS of assignment");
assert_eq!(rhs.kind(), "string");
assert!(
is_literal_node(rhs, code),
"plain string literal must be classified as literal"
);
}