diff --git a/.gitignore b/.gitignore index 37ea5d83..e259fd8f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ /.idea /frontend/node_modules /src/server/assets/dist +/marketing /.nyx /logs /book diff --git a/Cargo.toml b/Cargo.toml index c5c3ae7b..efdd74ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,13 @@ pkg-url = "{ repo }/releases/download/v{ version }/nyx-{ target }{ archive-suffi pkg-fmt = "zip" bin-dir = "target/{ target }/release/{ bin }{ binary-ext }" +# docs.rs builds the `serve` feature (default) so the server module renders. +# `smt` is left off — bundled Z3 takes too long on docs.rs builders, and +# `smt-system-z3` needs a system library that isn't available there. +[package.metadata.docs.rs] +features = ["serve"] +rustdoc-args = ["--cfg", "docsrs"] + [features] default = ["serve"] serve = ["dep:axum", "dep:tokio", "dep:tokio-stream", "dep:tower-http"] diff --git a/README.md b/README.md index 61887c5a..1d371835 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,12 @@ The corpus also holds a small set of vulnerable/patched pairs extracted from pub Fixtures live under [`tests/benchmark/cve_corpus/`](tests/benchmark/cve_corpus/) with upstream attribution headers. 
+ + --- ## How it works diff --git a/assets/screenshots/demo.gif b/assets/screenshots/demo.gif index 689c7b64..0c6ea09f 100644 Binary files a/assets/screenshots/demo.gif and b/assets/screenshots/demo.gif differ diff --git a/build.rs b/build.rs index 074b0730..37fadd66 100644 --- a/build.rs +++ b/build.rs @@ -1,7 +1,9 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::Command; fn main() { + render_docs_for_rustdoc(); + // Only relevant when the serve feature is active if std::env::var("CARGO_FEATURE_SERVE").is_err() { return; @@ -14,11 +16,11 @@ fn main() { println!("cargo:rerun-if-changed=src/server/assets/dist/index.html"); if index_html.exists() { - // Dist already built — nothing to do + // Dist already built, nothing to do return; } - // Dist missing — try to build frontend + // Dist missing, try to build frontend let frontend_dir = Path::new("frontend"); if !frontend_dir.join("package.json").exists() { emit_placeholder_and_warn(dist_dir); @@ -56,6 +58,293 @@ fn main() { } } +// --------------------------------------------------------------------------- +// Rustdoc / docs.rs: render docs/*.md into $OUT_DIR with relative .md links +// rewritten to absolute github.com/elicpeter/nyx URLs so they resolve when the +// markdown is embedded in rustdoc via #![doc = include_str!(...)]. +// +// Source of truth stays in docs/. Files that don't exist (published-crate +// builds where docs/ wasn't packaged) fall back to a one-line stub so rustdoc +// still compiles. +// --------------------------------------------------------------------------- + +const GH_DOCS_BASE: &str = "https://github.com/elicpeter/nyx/blob/master/docs"; + +struct DocSpec { + /// Path under docs/, e.g. "how-it-works.md" or "detectors/taint.md". + src: &'static str, + /// Output filename in $OUT_DIR. 
+ out: &'static str, +} + +const DOC_SPECS: &[DocSpec] = &[ + DocSpec { + src: "how-it-works.md", + out: "lib_intro.md", + }, + DocSpec { + src: "detectors/taint.md", + out: "taint.md", + }, + DocSpec { + src: "detectors/cfg.md", + out: "cfg_analysis.md", + }, + DocSpec { + src: "detectors/state.md", + out: "state.md", + }, + DocSpec { + src: "detectors/patterns.md", + out: "patterns.md", + }, + DocSpec { + src: "auth.md", + out: "auth_analysis.md", + }, +]; + +fn render_docs_for_rustdoc() { + let Ok(out_dir) = std::env::var("OUT_DIR") else { + return; + }; + let out_dir = PathBuf::from(out_dir); + let docs_dir = Path::new("docs"); + + for spec in DOC_SPECS { + let src_path = docs_dir.join(spec.src); + println!("cargo:rerun-if-changed=docs/{}", spec.src); + let out_path = out_dir.join(spec.out); + let rendered = match std::fs::read_to_string(&src_path) { + Ok(raw) => rewrite_doc_links(&raw, spec.src), + Err(_) => format!( + "See [`{base}/{src}`]({base}/{src}).\n", + base = GH_DOCS_BASE, + src = spec.src, + ), + }; + if let Err(e) = std::fs::write(&out_path, rendered) { + println!( + "cargo:warning=failed to write rendered doc {}: {}", + out_path.display(), + e + ); + } + } +} + +/// Render markdown for embedding in rustdoc. +/// +/// 1. Rewrites relative `.md` links to absolute github.com URLs: +/// - inline links: `](path.md)` and `](path.md#anchor)` +/// - reference defs: `[id]: path.md` +/// 2. Labels unmarked fenced code blocks as `text` so rustdoc does not try +/// to compile them as Rust (and choke on Unicode like `→`). +/// 3. Annotates `rust` fences with `,ignore` so rustdoc doesn't try to +/// compile or run prose-level snippets as doctests. GitHub still +/// highlights them as Rust because it keys off the first token. +/// +/// Skips link rewriting inside code fences. Skips link rewriting for URLs +/// that are already absolute (have a scheme), pure anchors (`#section`), +/// or non-`.md` paths. 
+fn rewrite_doc_links(content: &str, source_rel: &str) -> String { + let source_dir = Path::new(source_rel) + .parent() + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or_default(); + + let mut out = String::with_capacity(content.len() + 256); + let mut in_fence = false; + + for line in content.split_inclusive('\n') { + let body = line.strip_suffix('\n').unwrap_or(line); + let trimmed = body.trim_start(); + if trimmed.starts_with("```") { + let lang = trimmed.trim_start_matches('`').trim(); + if in_fence { + in_fence = false; + out.push_str(line); + } else { + in_fence = true; + let indent_len = body.len() - trimmed.len(); + if lang.is_empty() { + out.push_str(&body[..indent_len]); + out.push_str("```text"); + if line.ends_with('\n') { + out.push('\n'); + } + } else if is_rust_fence_needing_ignore(lang) { + out.push_str(&body[..indent_len]); + out.push_str("```rust,ignore"); + if line.ends_with('\n') { + out.push('\n'); + } + } else { + out.push_str(line); + } + } + continue; + } + if in_fence { + out.push_str(line); + } else { + rewrite_links_in_line(body, &source_dir, &mut out); + if line.ends_with('\n') { + out.push('\n'); + } + } + } + + out +} + +fn rewrite_links_in_line(line: &str, source_dir: &str, out: &mut String) { + let bytes = line.as_bytes(); + let mut i = 0; + while i < bytes.len() { + // Inline link: `](URL)`, markdown URLs do not contain a raw `)`. + if i + 1 < bytes.len() && bytes[i] == b']' && bytes[i + 1] == b'(' { + out.push_str("]("); + i += 2; + let url_start = i; + while i < bytes.len() && bytes[i] != b')' { + i += 1; + } + let url = &line[url_start..i]; + out.push_str(&maybe_rewrite_url(url, source_dir)); + } + // Reference def: `]: URL`. 
+ else if i + 2 < bytes.len() + && bytes[i] == b']' + && bytes[i + 1] == b':' + && bytes[i + 2] == b' ' + { + out.push_str("]: "); + i += 3; + let url_start = i; + while i < bytes.len() && bytes[i] != b' ' { + i += 1; + } + let url = &line[url_start..i]; + out.push_str(&maybe_rewrite_url(url, source_dir)); + } else { + // `]` (0x5D) is ASCII; UTF-8 continuation bytes are 0x80-0xBF + // and start bytes are 0xC0+, so byte-level scanning of `]` is + // safe. For non-ASCII bytes, copy the full codepoint at once. + let b = bytes[i]; + if b < 0x80 { + out.push(b as char); + i += 1; + } else { + let len = utf8_seq_len(b); + let end = (i + len).min(bytes.len()); + out.push_str(&line[i..end]); + i = end; + } + } + } +} + +/// True for `rust` / `rust,...` fences that don't already opt out of +/// doctest execution. We rewrite these to `rust,ignore` because the prose +/// snippets in docs/ are illustrative, not standalone-compilable. +fn is_rust_fence_needing_ignore(lang: &str) -> bool { + let mut parts = lang.split(',').map(|p| p.trim()); + let Some(first) = parts.next() else { + return false; + }; + if !first.eq_ignore_ascii_case("rust") { + return false; + } + for tag in parts { + let t = tag.to_ascii_lowercase(); + if t == "ignore" || t == "no_run" || t == "compile_fail" || t == "should_panic" { + return false; + } + } + true +} + +fn utf8_seq_len(lead: u8) -> usize { + // lead < 0xC0 covers ASCII and unexpected continuation bytes; treat both as + // single-byte to make progress. + if lead < 0xC0 { + 1 + } else if lead < 0xE0 { + 2 + } else if lead < 0xF0 { + 3 + } else { + 4 + } +} + +fn maybe_rewrite_url(url: &str, source_dir: &str) -> String { + if url.is_empty() { + return url.to_string(); + } + // Already absolute (scheme://, mailto:, ssh://, etc.), leave alone. + if has_scheme(url) { + return url.to_string(); + } + // Pure anchor, leave alone. + if url.starts_with('#') { + return url.to_string(); + } + // Split off optional anchor. 
+ let (path, anchor) = match url.find('#') { + Some(p) => (&url[..p], &url[p..]), + None => (url, ""), + }; + // Only rewrite if the path looks like a markdown file. + if !path.ends_with(".md") { + return url.to_string(); + } + // Resolve relative to source_dir. + let combined = if source_dir.is_empty() { + path.to_string() + } else { + format!("{}/{}", source_dir, path) + }; + let normalised = normalise_path(&combined); + format!("{}/{}{}", GH_DOCS_BASE, normalised, anchor) +} + +fn has_scheme(url: &str) -> bool { + // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":" + let mut chars = url.chars(); + let first = match chars.next() { + Some(c) => c, + None => return false, + }; + if !first.is_ascii_alphabetic() { + return false; + } + for c in chars { + if c == ':' { + return true; + } + if !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) { + return false; + } + } + false +} + +fn normalise_path(path: &str) -> String { + let mut stack: Vec<&str> = Vec::new(); + for seg in path.split('/') { + match seg { + "" | "." => {} + ".." => { + stack.pop(); + } + other => stack.push(other), + } + } + stack.join("/") +} + fn emit_placeholder_and_warn(dist_dir: &Path) { // Create minimal placeholder files so compilation succeeds std::fs::create_dir_all(dist_dir).ok(); diff --git a/docs/detectors.md b/docs/detectors.md index f3f95fdf..1400df8e 100644 --- a/docs/detectors.md +++ b/docs/detectors.md @@ -9,6 +9,16 @@ Nyx ships four independent detector families. 
They run together in `--mode full` | [State model](detectors/state.md) | `state-*` | Per-function state lattice | Use-after-close, double-close, leaks, unauthenticated access | | [AST patterns](detectors/patterns.md) | `..` | Tree-sitter structural match | Banned APIs, weak crypto, dangerous constructs | +The taint family is split into cap-specific rule classes when a sink callee carries multiple vulnerability classes: + +| Rule id | Cap | Surface | +|---|---|---| +| `taint-unsanitised-flow` | every cap except `data_exfil` and `unauthorized_id` | Default taint flow class | +| `taint-data-exfiltration` | `data_exfil` | Sensitive data flowing into the payload of an outbound network request (body / headers / json on `fetch`, body on `XMLHttpRequest.send`). Distinct from SSRF: the destination is fixed but attacker-influenced bytes leave the process. | +| `rs.auth.missing_ownership_check.taint` | `unauthorized_id` | Rust auth subsystem fold-in; see [auth.md](auth.md). | + +A single call site can fire several of these at once when it carries multiple gates — `fetch(taintedUrl, {body: tainted})` produces both an SSRF finding (URL flow) and a `taint-data-exfiltration` finding (body flow), each with its own cap mask rather than a conflated union. + For Rust auth-specific rules (`rs.auth.*`), see [auth.md](auth.md). ## How they combine diff --git a/docs/detectors/taint.md b/docs/detectors/taint.md index 2c436e05..7002a3a6 100644 --- a/docs/detectors/taint.md +++ b/docs/detectors/taint.md @@ -134,7 +134,8 @@ Sources, sanitizers, and sinks are linked by named capabilities. 
A sanitizer onl | `fmt_string` | | | `printf(var)` | | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` with concatenation | | `deserialize` | | | `pickle.loads`, `yaml.load`, `Marshal.load` | -| `ssrf` | | URL-prefix locks | `requests.get`, `fetch`, `HttpClient.send` | +| `ssrf` | | URL-prefix locks | `requests.get`, `fetch` URL arg, outbound HTTP destination | +| `data_exfil` | | | `fetch` body / headers / json, `XMLHttpRequest.send` body | | `code_exec` | | | `eval`, `exec`, `Function` | | `crypto` | | | weak-algorithm constructors | | `unauthorized_id` | request-bound scoped IDs (Rust auth analysis) | ownership check | row-level write | diff --git a/docs/rules.md b/docs/rules.md index 23457261..ea5ea07c 100644 --- a/docs/rules.md +++ b/docs/rules.md @@ -112,12 +112,14 @@ The tables below are generated from `src/patterns/.rs` by [`tools/docgen`] | `go.crypto.md5` | Low | A | Medium | | `go.crypto.sha1` | Low | A | Medium | -### Java: 8 patterns +### Java: 10 patterns | Rule ID | Severity | Tier | Confidence | |---|---|---|---| | `java.cmdi.runtime_exec` | High | A | High | +| `java.code_exec.text4shell_interpolator` | High | A | High | | `java.deser.readobject` | High | A | High | +| `java.deser.snakeyaml_unsafe_constructor` | High | A | High | | `java.reflection.class_forname` | Medium | A | High | | `java.reflection.method_invoke` | Medium | A | High | | `java.sqli.execute_concat` | Medium | B | Medium | @@ -168,7 +170,7 @@ The tables below are generated from `src/patterns/.rs` by [`tools/docgen`] | `php.crypto.rand` | Low | A | Medium | | `php.crypto.sha1` | Low | A | Medium | -### Python: 13 patterns +### Python: 14 patterns | Rule ID | Severity | Tier | Confidence | |---|---|---|---| @@ -182,6 +184,7 @@ The tables below are generated from `src/patterns/.rs` by [`tools/docgen`] | `py.code_exec.compile` | Medium | A | High | | `py.deser.shelve_open` | Medium | A | High | | `py.sqli.execute_format` | Medium | B | Medium | +| 
`py.sqli.text_format` | Medium | B | Medium | | `py.xss.jinja_from_string` | Medium | A | High | | `py.crypto.md5` | Low | A | Medium | | `py.crypto.sha1` | Low | A | Medium | diff --git a/src/abstract_interp/bit_domain.rs b/src/abstract_interp/bit_domain.rs index b5b9f8ef..d7b7c313 100644 --- a/src/abstract_interp/bit_domain.rs +++ b/src/abstract_interp/bit_domain.rs @@ -19,8 +19,8 @@ use serde::{Deserialize, Serialize}; /// Bit-level abstract fact: known-zero and known-one masks. /// -/// - `top()` = `{known_zero: 0, known_one: 0}` — no bits known -/// - `bottom()` = `{known_zero: MAX, known_one: MAX}` — contradictory +/// - `top()` = `{known_zero: 0, known_one: 0}`, no bits known +/// - `bottom()` = `{known_zero: MAX, known_one: MAX}`, contradictory /// - `from_const(n)` = all 64 bits known /// /// Invariant: `known_zero & known_one == 0` for non-bottom values. @@ -253,7 +253,7 @@ impl AbstractDomain for BitFact { } } - /// Widen: same as join (finite lattice height — 64 bits × 3 states). + /// Widen: same as join (finite lattice height, 64 bits × 3 states). fn widen(&self, other: &Self) -> Self { self.join(other) } @@ -511,7 +511,7 @@ mod tests { #[test] fn right_shift_unknown_sign() { - // Sign bit unknown — high bits after shift should be unknown + // Sign bit unknown, high bits after shift should be unknown let a = BitFact { known_zero: 0x0F, known_one: 0, @@ -687,7 +687,7 @@ mod tests { } } - /// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b` — meet is the greatest lower bound. + /// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b`, meet is the greatest lower bound. #[test] fn meet_is_lower_bound_bit() { let xs = sample_bits(); @@ -700,7 +700,7 @@ mod tests { } } - /// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b` — join is the least upper bound. + /// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b`, join is the least upper bound. 
#[test] fn join_is_upper_bound_bit() { let xs = sample_bits(); diff --git a/src/abstract_interp/interval.rs b/src/abstract_interp/interval.rs index 865718e3..dff86d89 100644 --- a/src/abstract_interp/interval.rs +++ b/src/abstract_interp/interval.rs @@ -10,9 +10,9 @@ use serde::{Deserialize, Serialize}; /// Numeric interval: `[lo, hi]` inclusive bounds. /// -/// - `top()` = `[None, None]` — any integer -/// - `bottom()` = `[1, 0]` — empty / unsatisfiable (lo > hi) -/// - `exact(n)` = `[n, n]` — singleton +/// - `top()` = `[None, None]`, any integer +/// - `bottom()` = `[1, 0]`, empty / unsatisfiable (lo > hi) +/// - `exact(n)` = `[n, n]`, singleton #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntervalFact { pub lo: Option, @@ -278,7 +278,7 @@ impl IntervalFact { /// - One non-negative singleton mask `m`: `[0, m]` regardless of other /// operand's sign (two's complement AND with a non-negative mask always /// produces a non-negative result bounded by the mask). - /// - Both non-negative: `[0, min(a.hi, b.hi)]` — AND can only clear bits. + /// - Both non-negative: `[0, min(a.hi, b.hi)]`, AND can only clear bits. pub fn bit_and(&self, other: &Self) -> Self { if self.is_bottom() || other.is_bottom() { return Self::bottom(); @@ -330,7 +330,7 @@ impl IntervalFact { /// - Singletons: exact computation. /// - `x | 0` → `x`, `0 | x` → `x`. /// - Both non-negative with known upper bounds: `[max(a.lo, b.lo), - /// next_pow2_minus1(max(a.hi, b.hi))]` — OR can set any bit below + /// next_pow2_minus1(max(a.hi, b.hi))]`, OR can set any bit below /// the highest set bit of either operand. 
pub fn bit_or(&self, other: &Self) -> Self { if self.is_bottom() || other.is_bottom() { @@ -1054,7 +1054,7 @@ mod tests { let a = IntervalFact::exact(i64::MIN); let b = IntervalFact::exact(-1); let r = a.div(&b); - // Either bound becomes None (graceful) — exact representation + // Either bound becomes None (graceful), exact representation // depends on the impl, but we mainly assert no panic occurred // and the result is a valid interval. assert!( @@ -1078,7 +1078,7 @@ mod tests { assert_eq!(r.hi, Some(2)); } - /// Modulo by an interval that *contains* zero must escape to Top — + /// Modulo by an interval that *contains* zero must escape to Top, /// modulo-by-zero is undefined and we cannot precise-narrow it. #[test] fn modulo_divisor_spans_zero_is_top() { @@ -1096,7 +1096,7 @@ mod tests { /// `[i64::MIN, i64::MAX]` is the maximal interval. Any join with /// any other interval must remain `[i64::MIN, i64::MAX]` (or Top - /// equivalent) — this guards against accidental narrowing on join. + /// equivalent), this guards against accidental narrowing on join. #[test] fn full_range_is_join_absorbing() { let full = IntervalFact { @@ -1347,7 +1347,7 @@ mod tests { ); } - /// Modulo with exact-zero divisor — must escape to Top. + /// Modulo with exact-zero divisor, must escape to Top. #[test] fn modulo_by_exact_zero_is_top() { let a = IntervalFact { diff --git a/src/abstract_interp/mod.rs b/src/abstract_interp/mod.rs index d68b4c02..46438de3 100644 --- a/src/abstract_interp/mod.rs +++ b/src/abstract_interp/mod.rs @@ -45,7 +45,7 @@ pub fn is_enabled() -> bool { /// Per-SSA-value abstract element: product of all subdomains. /// -/// Each subdomain is independent — join, meet, widen, and leq are applied +/// Each subdomain is independent: join, meet, widen, and leq are applied /// component-wise. Adding a new subdomain requires adding a field here /// and updating the component-wise implementations. 
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -182,15 +182,15 @@ pub const MAX_LITERAL_PREFIX_LEN: usize = 64; /// restricted so the summary size stays constant regardless of callee body /// complexity: /// -/// * [`IntervalTransfer::Top`] — no interval knowledge crosses (default). -/// * [`IntervalTransfer::Identity`] — return = param (pass-through). -/// * [`IntervalTransfer::Affine`] — return = param * `mul` + `add` with +/// * [`IntervalTransfer::Top`], no interval knowledge crosses (default). +/// * [`IntervalTransfer::Identity`], return = param (pass-through). +/// * [`IntervalTransfer::Affine`], return = param * `mul` + `add` with /// `i64` constants; overflow defaults to Top at apply time. -/// * [`IntervalTransfer::Clamped`] — return is always in `[lo, hi]` regardless +/// * [`IntervalTransfer::Clamped`], return is always in `[lo, hi]` regardless /// of input. Captures callee-intrinsic bounds (e.g. `saturating` ops). /// /// No unbounded expression trees, no nesting. A callee whose behaviour does -/// not fit one of these forms falls back to `Top` — we never try to encode +/// not fit one of these forms falls back to `Top`, we never try to encode /// richer algebra in the summary. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum IntervalTransfer { @@ -247,9 +247,9 @@ impl IntervalTransfer { /// Mirrors [`IntervalTransfer`] for the string subdomain. Bounded by /// [`MAX_LITERAL_PREFIX_LEN`] to keep summary size constant. /// -/// * [`StringTransfer::Unknown`] — default. -/// * [`StringTransfer::Identity`] — return = param. -/// * [`StringTransfer::LiteralPrefix`] — return has this literal prefix +/// * [`StringTransfer::Unknown`], default. +/// * [`StringTransfer::Identity`], return = param. +/// * [`StringTransfer::LiteralPrefix`], return has this literal prefix /// regardless of input (callee-intrinsic). 
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum StringTransfer { @@ -325,7 +325,7 @@ impl StringTransfer { /// caller's knowledge of each argument, without having to re-run the callee. /// /// Composition rule: `apply(input) = (interval.apply, string.apply, -/// bits=top)`. The bit domain is always Top — we do not track cross-file +/// bits=top)`. The bit domain is always Top, we do not track cross-file /// bit transfers. #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] pub struct AbstractTransfer { @@ -351,7 +351,7 @@ impl AbstractTransfer { Self::default() } - /// True when neither subdomain carries any information — equivalent to + /// True when neither subdomain carries any information, equivalent to /// "omit this entry entirely". pub fn is_top(&self) -> bool { is_interval_top(&self.interval) && is_string_unknown(&self.string) @@ -410,7 +410,7 @@ impl AbstractState { /// Set abstract value for an SSA value. Drops Top values to save space. pub fn set(&mut self, v: SsaValue, val: AbstractValue) { if val.is_top() { - // Don't store Top — it's the default + // Don't store Top, it's the default if let Ok(idx) = self.values.binary_search_by_key(&v, |(id, _)| *id) { self.values.remove(idx); } @@ -422,7 +422,7 @@ impl AbstractState { if self.values.len() < MAX_ABSTRACT_VALUES { self.values.insert(idx, (v, val)); } - // Over budget: silently drop (conservative — defaults to Top) + // Over budget: silently drop (conservative, defaults to Top) } } } diff --git a/src/abstract_interp/path_domain.rs b/src/abstract_interp/path_domain.rs index f2e7f84d..3888df1e 100644 --- a/src/abstract_interp/path_domain.rs +++ b/src/abstract_interp/path_domain.rs @@ -15,7 +15,7 @@ //! Each axis is a three-value lattice [`Tri::No`] / [`Tri::Yes`] / [`Tri::Maybe`] //! where `Maybe` is Top (unknown) and `No` / `Yes` are the two definite //! refinements. A value is path-safe for a FILE_IO sink iff -//! 
`dotdot == No && absolute == No` — i.e. we have proof that *no* `..` +//! `dotdot == No && absolute == No`, i.e. we have proof that *no* `..` //! component and *no* absolute root can leak through. `normalized == Yes` //! alone is not sufficient (canonicalising an absolute input still produces //! an absolute path); prefix_lock is used separately to certify containment @@ -52,7 +52,7 @@ pub enum Tri { No, /// Proven present. Yes, - /// Unknown — no transfer or guard has proved the axis yet. + /// Unknown, no transfer or guard has proved the axis yet. Maybe, } @@ -367,12 +367,12 @@ impl AbstractDomain for PathFact { /// narrowed axis can be proved safe. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum PathRejection { - /// `x.contains("..")` — false branch proves `dotdot = No` on the receiver. + /// `x.contains("..")`, false branch proves `dotdot = No` on the receiver. DotDot, - /// `x.starts_with("/")` / `x.starts_with('\\')` — false branch proves + /// `x.starts_with("/")` / `x.starts_with('\\')`, false branch proves /// `absolute = No` on the receiver. AbsoluteSlash, - /// `x.is_absolute()` / `Path::new(x).is_absolute()` — false branch proves + /// `x.is_absolute()` / `Path::new(x).is_absolute()`, false branch proves /// `absolute = No` on the argument/receiver. IsAbsolute, /// Not a path-rejection idiom. @@ -384,7 +384,7 @@ pub enum PathRejection { /// the listed axis is refined. #[derive(Debug, Clone, PartialEq, Eq)] pub enum PathAssertion { - /// `x.starts_with("")` — true branch attaches + /// `x.starts_with("")`, true branch attaches /// `prefix_lock = Some("")` to the receiver. PrefixLock(String), /// Not a path-assertion idiom. @@ -426,7 +426,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec let clause = clause.trim(); // Multi-axis special case: `!filepath.IsLocal(p)` (Go). 
// `filepath.IsLocal` returns true iff the path stays within the - // current directory — no leading `/`, no `..` segments, no Windows + // current directory, no leading `/`, no `..` segments, no Windows // drive root. Idiomatic Go path-traversal guard: // `if !filepath.IsLocal(p) { return }` // The TRUE branch terminates; the FALSE branch (where IsLocal is @@ -449,7 +449,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec out } -/// Detect `!filepath.IsLocal()` — Go's idiomatic path-traversal +/// Detect `!filepath.IsLocal()`, Go's idiomatic path-traversal /// guard. Whitespace-tolerant: `! filepath.IsLocal(`, `!filepath . IsLocal(`, /// etc. Used by [`classify_path_rejection_axes`] to inject both /// [`PathRejection::DotDot`] and [`PathRejection::IsAbsolute`] on the false @@ -475,7 +475,7 @@ fn has_negated_filepath_is_local(clause: &str) -> bool { fn classify_path_rejection_atom(clause: &str) -> PathRejection { // `.contains("..")` (Rust, Java) / `.includes("..")` (JS/TS) / // `.include?("..")` (Ruby) / `strings.Contains(s, "..")` (Go) / - // `strstr(s, "..")` (C/C++) — every form recognised by + // `strstr(s, "..")` (C/C++), every form recognised by // `extract_contains_arg` returns `..` if the needle is the dotdot // segment. if let Some(needle) = extract_contains_arg(clause) @@ -483,7 +483,7 @@ fn classify_path_rejection_atom(clause: &str) -> PathRejection { { return PathRejection::DotDot; } - // Python `".." in s` — operator form. Look for `".." in ` + // Python `".." in s`, operator form. Look for `".." in ` // anywhere in the clause text. Conservative: requires the literal // `".." in ` substring (whitespace-tolerant). if has_python_dotdot_in(clause) { @@ -681,7 +681,7 @@ pub fn classify_path_assertion(text: &str) -> PathAssertion { /// * Must be non-empty. /// * The leaf segment must begin with an ASCII uppercase letter /// (Rust's variant / struct / type grammar). 
-/// * The leaf segment must be ASCII alphanumeric / underscore — no +/// * The leaf segment must be ASCII alphanumeric / underscore, no /// method call noise (parentheses, argument lists) survives here /// because callees arrive in their normalised scoped-identifier /// form. @@ -700,7 +700,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool { // upper-camel-case names an enum variant or tuple struct (`Some`, // `Ok`, `MyResult`). A scoped identifier whose *penultimate* // segment is upper-camel-case names an associated constructor on - // that type — `Box::new`, `Cell::from`, `PathBuf::with_capacity`, + // that type, `Box::new`, `Cell::from`, `PathBuf::with_capacity`, // etc. The latter is the lower-leaf-case shape we want to admit // alongside the bare-variant shape. let segments: smallvec::SmallVec<[&str; 4]> = @@ -731,7 +731,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool { /// PathFact of the receiver/first argument (the value being sanitised); /// it is used as the baseline to which the call's effect is applied. /// -/// Returned [`None`] means the callee is not a recognised path primitive — +/// Returned [`None`] means the callee is not a recognised path primitive, /// the caller should leave the result at its pre-existing PathFact (Top). /// /// Backwards-compatible wrapper around [`classify_path_primitive_rust`]. @@ -743,7 +743,7 @@ pub fn classify_path_primitive(callee: &str, input_fact: &PathFact) -> Option Option { // Accept both path-qualified (`std::fs::canonicalize`, `fs::canonicalize`) @@ -826,7 +826,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti // `Path::new(s)` / `PathBuf::from(s)`: // pass-through of the input's PathFact so downstream `starts_with` // checks against a Path/PathBuf value still see the underlying - // string's narrowed axes. No axis is forced — wrapping does not + // string's narrowed axes. No axis is forced, wrapping does not // sanitize on its own. 
"new" | "from" => { if callee_contains_segment(callee, "Path") || callee_contains_segment(callee, "PathBuf") @@ -837,8 +837,8 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti } } // Identity conversions on strings/paths. Each one re-binds the - // same logical value — the converted String / PathBuf / OsString - // still describes the exact same filesystem path — so the PathFact + // same logical value, the converted String / PathBuf / OsString + // still describes the exact same filesystem path, so the PathFact // flows through unchanged. Without this, a sanitised `s: &str` // would lose its narrowed axes the moment the helper returns // `s.to_string()` / `s.to_owned()` / `String::from(s)`. @@ -849,7 +849,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti } } -/// Python path-primitive classifier — `os.path.normpath`, `os.path.realpath`, +/// Python path-primitive classifier, `os.path.normpath`, `os.path.realpath`, /// `pathlib.Path.resolve`, `os.path.abspath`. /// /// Pattern conventions: tree-sitter-python emits dotted attribute access as @@ -893,7 +893,7 @@ pub fn classify_path_primitive_python(callee: &str, input_fact: &PathFact) -> Op } } -/// JavaScript / TypeScript path-primitive classifier — Node's `path` module: +/// JavaScript / TypeScript path-primitive classifier, Node's `path` module: /// `path.normalize`, `path.resolve`, `path.join`. pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); @@ -920,7 +920,7 @@ pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option } } -/// Go path-primitive classifier — `path/filepath` package: +/// Go path-primitive classifier, `path/filepath` package: /// `filepath.Clean`, `filepath.Abs`. 
pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); @@ -947,7 +947,7 @@ pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option } } -/// Java path-primitive classifier — `java.nio.file.Path.normalize` / +/// Java path-primitive classifier, `java.nio.file.Path.normalize` / /// `Paths.get(s).normalize().toAbsolutePath()`. pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); @@ -980,7 +980,7 @@ pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Opti } } -/// Ruby path-primitive classifier — `File.expand_path` / `Pathname#cleanpath`. +/// Ruby path-primitive classifier, `File.expand_path` / `Pathname#cleanpath`. pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); match leaf { @@ -1005,13 +1005,13 @@ pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Opti } } -/// PHP path-primitive classifier — `realpath`, `basename`. +/// PHP path-primitive classifier, `realpath`, `basename`. pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); match leaf { // `realpath($s)`: // Resolves symlinks and `..`, returns absolute path. Returns - // `false` if the file doesn't exist — but on the success path + // `false` if the file doesn't exist, but on the success path // (which is what reaches a sink), it produces a clean absolute path. "realpath" => { let mut f = input_fact.clone(); @@ -1021,7 +1021,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio Some(f) } // `basename($s)`: - // Strips directory components — guaranteed to contain no `..` + // Strips directory components, guaranteed to contain no `..` // (basename of `..` is `..`, but basename of any traversal- // prefixed path is just the leaf). 
Conservative: clear dotdot. "basename" => { @@ -1034,7 +1034,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio } } -/// C / C++ path-primitive classifier — POSIX `realpath`, +/// C / C++ path-primitive classifier, POSIX `realpath`, /// `std::filesystem::canonical`. pub fn classify_path_primitive_c_cpp(callee: &str, input_fact: &PathFact) -> Option { let leaf = rightmost_segment(callee); @@ -1089,7 +1089,7 @@ fn extract_contains_arg(text: &str) -> Option { "strstr(", ] { if let Some(idx) = text.find(prefix) { - // Skip past the first argument (receiver) — the literal needle + // Skip past the first argument (receiver), the literal needle // is the second arg, separated by a comma. Find the comma at // top level inside this call. let inner = &text[idx + prefix.len()..]; @@ -1123,7 +1123,7 @@ fn extract_starts_with_arg(text: &str) -> Option { return Some(s); } } - // Go free-function form `strings.HasPrefix(r, "/")` — second arg. + // Go free-function form `strings.HasPrefix(r, "/")`, second arg. if let Some(idx) = text.find("strings.HasPrefix(") { let inner = &text[idx + "strings.HasPrefix(".len()..]; if let Some(comma_idx) = top_level_comma(inner) { @@ -1762,7 +1762,7 @@ mod tests { assert!(is_structural_variant_ctor("Box::new")); assert!(is_structural_variant_ctor("std::option::Option::Some")); // User-defined upper-camel-case variant name participates the - // same way — name list is not part of the contract. + // same way, name list is not part of the contract. assert!(is_structural_variant_ctor("MyResult::Ok")); assert!(is_structural_variant_ctor("Wrapper")); } diff --git a/src/abstract_interp/string_domain.rs b/src/abstract_interp/string_domain.rs index 190baa7d..3220da0a 100644 --- a/src/abstract_interp/string_domain.rs +++ b/src/abstract_interp/string_domain.rs @@ -1,6 +1,6 @@ //! String abstract domain for abstract interpretation. //! -//! Tracks known prefix, suffix, and — when provably bounded — the finite set +//! 
Tracks known prefix, suffix, and, when provably bounded, the finite set //! of possible concrete string values. Used for SSRF suppression (URL prefix //! proves host is locked), command-injection suppression (lookup result //! bounded to a safe set of literals), and general string analysis. @@ -78,7 +78,7 @@ impl StringFact { /// the finite domain is `{s}`. /// /// Empty prefix/suffix are normalised to `None` because "starts/ends with - /// the empty string" carries no constraint — keeping `Some("")` would + /// the empty string" carries no constraint, keeping `Some("")` would /// break join idempotence (`Some("")` ⊔ `Some("")` collapses to `None`). pub fn exact(s: &str) -> Self { let prefix = truncate_prefix(s); @@ -134,7 +134,7 @@ impl StringFact { /// Inputs are sorted and deduped. If the cardinality exceeds /// [`MAX_DOMAIN_SIZE`] or the input is empty, the domain collapses to /// `None` (Top on this sub-field). The prefix/suffix sub-fields remain - /// unset — callers can combine with [`Self::exact`] for single-element + /// unset, callers can combine with [`Self::exact`] for single-element /// sets if tighter facts are desired. pub fn finite_set(values: Vec) -> Self { let mut v = values; @@ -411,7 +411,7 @@ fn truncate_suffix(s: &str) -> String { /// Longest common prefix of two strings, char-aligned. /// /// Iterates by `char` rather than `byte` so multi-byte UTF-8 code points are -/// either kept whole or dropped — a byte-wise comparison would slice into the +/// either kept whole or dropped, a byte-wise comparison would slice into the /// middle of a code point and produce mojibake (`x as char` on a UTF-8 /// continuation byte yields a garbage Latin-1 character). pub fn longest_common_prefix(a: &str, b: &str) -> String { @@ -746,7 +746,7 @@ mod tests { let a = StringFact::from_prefix("https://api.example.com/"); let b = StringFact::from_prefix("https://db.example.com/"); let r = a.join(&b); - // Common prefix is "https://" — anything past that diverges. 
+ // Common prefix is "https://", anything past that diverges. assert_eq!( r.prefix.as_deref(), Some("https://"), @@ -781,7 +781,7 @@ mod tests { ] } - /// `x ⊔ x = x` — join is idempotent across all sample shapes. + /// `x ⊔ x = x`, join is idempotent across all sample shapes. #[test] fn join_idempotent_string() { for a in sample_strings() { @@ -789,7 +789,7 @@ mod tests { } } - /// `x ⊔ y = y ⊔ x` — join is commutative. + /// `x ⊔ y = y ⊔ x`, join is commutative. #[test] fn join_commutative_string() { let xs = sample_strings(); @@ -806,7 +806,7 @@ mod tests { } } - /// `x ⊓ x = x` — meet is idempotent. + /// `x ⊓ x = x`, meet is idempotent. #[test] fn meet_idempotent_string() { for a in sample_strings() { @@ -814,7 +814,7 @@ mod tests { } } - /// `x ⊓ y = y ⊓ x` — meet is commutative. + /// `x ⊓ y = y ⊓ x`, meet is commutative. #[test] fn meet_commutative_string() { let xs = sample_strings(); @@ -844,7 +844,7 @@ mod tests { } } - /// `x ⊑ x` — leq is reflexive. + /// `x ⊑ x`, leq is reflexive. #[test] fn leq_reflexive_string() { for a in sample_strings() { @@ -852,7 +852,7 @@ mod tests { } } - /// **Soundness**: `widen(a, b) ⊒ join(a, b)` — widening must + /// **Soundness**: `widen(a, b) ⊒ join(a, b)`, widening must /// over-approximate join, otherwise dataflow loses information. #[test] fn widen_over_approximates_join_string() { @@ -905,7 +905,7 @@ mod tests { } } - /// Empty-string exact value must distinguish from Top — it is a + /// Empty-string exact value must distinguish from Top, it is a /// singleton (`{""}`), not unconstrained. After the empty-prefix /// normalisation, prefix/suffix are `None` (carry no extra info) /// but the `domain` field still pins the value to exactly `""`. 
diff --git a/src/ast.rs b/src/ast.rs index 3ece8f99..269f16e1 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -127,12 +127,12 @@ use crate::utils::snippet::line_snippet as extract_line_snippet; /// [`normalize_namespace`] convention) back to the absolute path the /// diagnostic pipeline expects. /// -/// * Empty `file_rel` — single-file scans normalize every namespace to +/// * Empty `file_rel`, single-file scans normalize every namespace to /// `""`; treat that as "the file under analysis" and return /// `fallback.to_string_lossy()`. -/// * `scan_root` absent — we have no workspace root to resolve against; +/// * `scan_root` absent, we have no workspace root to resolve against; /// return `file_rel` verbatim (it may already be absolute). -/// * Otherwise — join `scan_root` with `file_rel`. +/// * Otherwise, join `scan_root` with `file_rel`. fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String { if file_rel.is_empty() { return fallback.to_string_lossy().into_owned(); @@ -163,7 +163,7 @@ fn build_taint_diag( let source_info = cfg_graph.node_weight(finding.source); // The reconstructed flow path is the authoritative view of where the // taint started *in this body*. When present, prefer its first step's - // CFG span over `finding.source_span` — which can be stale across + // CFG span over `finding.source_span`, which can be stale across // multi-hop cross-body remaps (e.g. JS two-level solve where a // callee-interior source gets its span rewritten to the enclosing // body's entry node). Fall back to `source_span`, then to the source @@ -183,7 +183,7 @@ fn build_taint_diag( // Prefer the source CFG node's callee string when it's a call expression // (e.g. `os.getenv("X")`). For property-access sources like - // `navigator.userAgent` there is no callee — fall back to the first flow + // `navigator.userAgent` there is no callee, fall back to the first flow // step's `variable` (the SSA var name, e.g. 
"userAgent"), then to the // source node's `taint.defines` / first `taint.uses` entry, before // finally giving up and rendering "(unknown)". @@ -289,7 +289,7 @@ fn build_taint_diag( // Convert raw flow steps to display FlowSteps. When the finding has a // primary_location distinct from the call site, the last raw step is - // really the Call — reclassify it and append a synthetic Sink step + // really the Call, reclassify it and append a synthetic Sink step // pointing at the callee-internal dangerous instruction so analysts // see both the call site and the final sink in the trace. let mut flow_steps: Vec = finding @@ -348,7 +348,7 @@ fn build_taint_diag( .clone() .or_else(|| Some(short_call_site.clone())); - // Resolved sink capability bits — used by deduplication to distinguish + // Resolved sink capability bits, used by deduplication to distinguish // sinks with different cap types on the same source line (e.g. // `sink_sql(x); sink_shell(x);`). let sink_caps_bits: u16 = cfg_graph[finding.sink] @@ -361,13 +361,33 @@ fn build_taint_diag( }) .fold(0u16, |acc, b| acc | b); - // Phase C: when the sink's required caps include UNAUTHORIZED_ID — and - // the finding actually reached that sink via the taint engine — use a - // dedicated auth rule id so the finding is namespaced alongside the - // standalone `auth_analysis` subsystem's output instead of being folded - // into the generic `taint-unsanitised-flow` bucket. - let diag_id = if sink_caps_bits & crate::labels::Cap::UNAUTHORIZED_ID.bits() != 0 { + // Cap-specific rule-id routing. + // + // 1. `UNAUTHORIZED_ID`: namespace alongside the standalone `auth_analysis` + // subsystem's output so cross-tool aggregation lines up. + // 2. `DATA_EXFIL`: route to `taint-data-exfiltration` so SARIF surfaces a + // distinct rule id from SSRF, the two share callees (e.g. `fetch`) + // but represent different vulnerability classes. 
+ // + // Prefer the per-finding `effective_sink_caps` (set by the multi-gate + // SSA dispatch) when populated; fall back to the union of all sink-label + // caps on the CFG node so legacy paths that build findings without + // setting `effective_sink_caps` still pick the right rule id. + let effective_caps = if finding.effective_sink_caps.is_empty() { + crate::labels::Cap::from_bits_truncate(sink_caps_bits) + } else { + finding.effective_sink_caps + }; + let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) { "rs.auth.missing_ownership_check.taint".to_string() + } else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL) + && !effective_caps.contains(crate::labels::Cap::SSRF) + { + format!( + "taint-data-exfiltration (source {}:{})", + source_point.row + 1, + source_point.column + 1 + ) } else { format!( "taint-unsanitised-flow (source {}:{})", @@ -452,7 +472,7 @@ fn build_taint_diag( /// Resolve a file extension to a language slug (e.g. `"rust"`, /// `"javascript"`). Public façade over [`lang_for_path`] for callers -/// that only need the slug — used by the debug API to look up +/// that only need the slug, used by the debug API to look up /// per-language rule enablement without re-parsing the file. pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> { lang_for_path(path).map(|(_, slug)| slug) @@ -467,7 +487,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> { // use `.cc` / `.cxx` / `.hpp` / `.hh` / `.h++` rather than the // `.cpp` synthetic-fixture extension. Without these mappings, // the scanner silently skipped them. Headers (`.h` is omitted - // intentionally — it's also valid C and disambiguating without a + // intentionally, it's also valid C and disambiguating without a // build system is brittle). 
Some("cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++") => { Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp")) @@ -481,7 +501,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> { "typescript", )), // TSX grammar is a superset of TypeScript plus JSX element/attribute - // nodes — all TypeScript KINDS / RULES / PARAM_CONFIG entries apply, + // nodes, all TypeScript KINDS / RULES / PARAM_CONFIG entries apply, // and JSX-specific sinks (e.g. `dangerouslySetInnerHTML`) layer on top // via the same `typescript` slug. Some("tsx") => Some(( @@ -493,7 +513,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> { "javascript", )), // JSX uses the same JavaScript grammar (tree-sitter-javascript handles - // JSX natively) — slug "javascript" so all JS rules apply. + // JSX natively), slug "javascript" so all JS rules apply. Some("jsx") => Some(( Language::from(tree_sitter_javascript::LANGUAGE), "javascript", @@ -739,7 +759,7 @@ impl<'a> ParsedSource<'a> { continue; } // Layer C: PHP `unserialize($x, ['allowed_classes' => [...]])` - // or `unserialize($x, ['allowed_classes' => false])` — + // or `unserialize($x, ['allowed_classes' => false])` , // PHP 7+ structural mitigation against object injection. // When the call passes an `allowed_classes` option set to // either `false` (no class instantiation) or an array @@ -762,7 +782,7 @@ impl<'a> ParsedSource<'a> { // format-string contributes attacker-controlled length. // When the source argument is a string literal (or a // ternary of two string literals), the contributed length - // is statically bounded — there is no overflow vector + // is statically bounded, there is no overflow vector // for an attacker even if the destination buffer is // mis-sized. Same principle for `sprintf` when the // format string is a literal containing no bare `%s` @@ -818,7 +838,7 @@ impl<'a> ParsedSource<'a> { /// Sort, dedup, and optionally downgrade severity for non-production paths. 
/// /// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id, - /// line, col)` — severity is NOT part of the key. Two diags that agree + /// line, col)`, severity is NOT part of the key. Two diags that agree /// on (line, col, id) but differ in severity (e.g. a pattern-rule finding /// plus a taint-pipeline finding on the same call) would otherwise survive /// dedup here and crash the indexer with a UNIQUE constraint violation. @@ -854,7 +874,7 @@ impl<'a> ParsedFile<'a> { // project-level `FrameworkContext` misses frameworks the file // obviously imports. Augment the per-file rule set with any // framework-conditional rules keyed off in-file import specifiers - // (e.g. `import fastify from 'fastify'`). Idempotent — skips + // (e.g. `import fastify from 'fastify'`). Idempotent, skips // frameworks already active from the manifest pass. let in_file_fws = crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug); @@ -931,13 +951,13 @@ impl<'a> ParsedFile<'a> { self.source.lang_slug, ); - // Phase 6 (typed call-graph subtype awareness): every + // every // `FuncSummary` exported from this file carries a copy of the // file's `hierarchy_edges` so the inheritance / impl / // implements relationships persist through SQLite round-trips // and re-merge into `crate::callgraph::TypeHierarchyIndex` at // call-graph build time. Cheap (one clone per summary) and - // strictly additive — `merge_summaries` deduplicates downstream. + // strictly additive, `merge_summaries` deduplicates downstream. if !self.file_cfg.hierarchy_edges.is_empty() { let edges = self.file_cfg.hierarchy_edges.clone(); for s in &mut out { @@ -982,7 +1002,7 @@ impl<'a> ParsedFile<'a> { /// /// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`]. 
/// The `FuncKey` identity preserves `(lang, namespace, container, name, - /// arity, disambig, kind)` — so two same-name definitions in this file + /// arity, disambig, kind)`, so two same-name definitions in this file /// (e.g. a free `process` and a `Worker::process`, or overloads with /// different arities) land on distinct entries instead of the later one /// shadowing the earlier one. @@ -1003,7 +1023,7 @@ impl<'a> ParsedFile<'a> { // Use the FileCfg path (same one `analyse_file` uses at taint time) so // the SSA summaries stored cross-file match exactly what pass 2 will - // resolve against — no NodeIndex-space or entry-detection drift. + // resolve against, no NodeIndex-space or entry-detection drift. let locator = crate::summary::SinkSiteLocator { tree: &self.source.tree, bytes: self.source.bytes, @@ -1024,7 +1044,7 @@ impl<'a> ParsedFile<'a> { /// Lower every function body in this file to SSA exactly once. Used by /// [`analyse_file_fused`] to share the result between the taint engine /// ([`run_cfg_analyses_with_lowered`]) and the SSA artifact filter - /// ([`build_eligible_bodies_from_lowered`]) — the prior code path lowered + /// ([`build_eligible_bodies_from_lowered`]), the prior code path lowered /// twice (once inside `analyse_file`, once inside /// `extract_ssa_artifacts_from_file_cfg`) and accounted for ~24% of the /// pass-2 wall-clock on the bench corpus. @@ -1038,7 +1058,7 @@ impl<'a> ParsedFile<'a> { /// here populates `param_to_sink` with concrete coordinates that the /// emission path then promotes into `Finding.primary_location`, /// causing the same-file summary-resolved sink to be reported at the - /// callee-internal sink line instead of the call site — which both + /// callee-internal sink line instead of the call site, which both /// duplicates the intraprocedural finding the taint engine already /// emits at that exact line and re-attributes the flow finding away /// from the user-visible call site. 
Closure-capture, lambda, and @@ -1263,13 +1283,11 @@ impl<'a> ParsedFile<'a> { state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang); let mut all_state_findings = Vec::new(); for body in &self.file_cfg.bodies { - // Phase 2 of the pointer-analysis rollout: when - // `NYX_POINTER_ANALYSIS=1` is set, derive a `var_name → - // PtrProxyHint` map from the body's points-to facts so - // the proxy-acquire transfer can suppress SymbolId - // attribution on field-aliased receivers (e.g. `m := - // c.mu; m.Lock()`). Strict-additive — `None` when the - // env-var is unset and behaviour matches today exactly. + // When `NYX_POINTER_ANALYSIS=1` is set, derive a + // `var_name → PtrProxyHint` map from the body's + // points-to facts so the proxy-acquire transfer can + // suppress SymbolId attribution on field-aliased + // receivers (e.g. `m := c.mu; m.Lock()`). let body_pointer_hints = cfg_analysis::build_body_const_facts(body, caller_lang) .as_ref() .and_then(|f| { @@ -1379,15 +1397,11 @@ impl<'a> ParsedFile<'a> { ) } - /// Build a per-file `var_name → TypeKind` map by running SSA + type - /// facts on each body and copying type facts for SSA values whose - /// definition recorded a source-level variable name. When the same - /// name resolves to different non-`Unknown` types across bodies the - /// entry is dropped — absence is safe because the auth analysis - /// sink gate simply falls back to its syntactic heuristics. Returns - /// `None` when no body produces any typed variable (non-Rust files - /// currently emit few `LocalCollection` / security-typed facts, but - /// this path is language-agnostic). + /// Build a per-file `var_name → TypeKind` map from SSA + type facts. + /// Conflicting non-`Unknown` types across bodies drop the entry , + /// absence is safe because the auth sink gate falls back to + /// syntactic heuristics. Returns `None` when no body produces a + /// typed variable. 
fn collect_file_var_types(&self) -> Option { let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust); let mut merged: std::collections::HashMap = @@ -1492,7 +1506,7 @@ pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult bool { } // If the argument list is empty (no args), we conservatively do NOT - // suppress — the danger may come from side effects, not arguments. + // suppress, the danger may come from side effects, not arguments. has_any_arg } @@ -1745,7 +1759,7 @@ fn find_enclosing_call(mut node: tree_sitter::Node) -> Option if kind == "function_call_expression" { return Some(node); } - // Stop at scope/statement boundaries — don't cross into outer calls + // Stop at scope/statement boundaries, don't cross into outer calls if kind.contains("block") || kind.contains("body") || kind == "program" @@ -1780,13 +1794,20 @@ fn find_arg_list(call: tree_sitter::Node) -> Option { fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool { let kind = node.kind(); match kind { - // String literals (most languages) + // String literals, but Python's `string` node also covers + // f-strings, which carry `interpolation` children. An f-string + // with interpolation is *not* a literal: it embeds arbitrary + // expressions, so a sink call like `cursor.execute(f"…{x}")` + // must not be suppressed under Layer A's "all-literal args" + // shortcut. Same shape applies to any tree-sitter grammar + // that nests an `interpolation` (or `string_interpolation`) + // child inside a string node. 
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" | "string_content" - | "string_fragment" => true, + | "string_fragment" => !has_interpolation(node), // Numeric literals "integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => { @@ -1901,7 +1922,7 @@ fn is_php_include_param_passthrough(include_node: tree_sitter::Node, bytes: &[u8 } return true; } - // Stop at class/program scope without a matching function — bare + // Stop at class/program scope without a matching function, bare // top-level `include $var` does not benefit from this guard. "program" | "class_declaration" | "trait_declaration" | "interface_declaration" => { return false; @@ -2011,7 +2032,7 @@ fn is_var_reassigned_before( /// PHP-only: returns `true` when the captured `function_call_expression` /// node is `unserialize($x, [..., 'allowed_classes' => , ...])`. /// This is the canonical PHP 7+ structural mitigation against object -/// injection — explicitly restricting which classes the deserialiser may +/// injection, explicitly restricting which classes the deserialiser may /// instantiate. Only suppress when the option is either: /// /// - `'allowed_classes' => false` (no class instantiation), or @@ -2091,9 +2112,9 @@ fn is_php_unserialize_allowed_classes_restricted( // Accept structural mitigation forms. 
The intent signal is // "developer explicitly set allowed_classes to something other than // `true`": - // - boolean `false` — no class instantiation at all - // - array literal — explicit allow-list - // - class-constant reference — `self::ALLOWED_CLASSES` / + // - boolean `false` , no class instantiation at all + // - array literal , explicit allow-list + // - class-constant reference , `self::ALLOWED_CLASSES` / // `Foo::CONSTANTS` resolved to // a const array; engine cannot // statically inspect, but the @@ -2126,7 +2147,7 @@ fn is_php_unserialize_allowed_classes_restricted( /// `cpp.memory.*` mirrors) when the source argument can carry /// attacker-controlled length. Calls whose source is a string literal /// have a compile-time bound and cannot overflow due to attacker input -/// — a too-small destination is a fixed developer bug (caught by +///, a too-small destination is a fixed developer bug (caught by /// compiler warnings / `-fstack-protector` / clang-tidy / ASan), not an /// exploitable channel. Suppressing these literal-source calls is a /// deliberate noise / false-positive reduction aligned with Nyx's scope @@ -2141,14 +2162,14 @@ fn is_php_unserialize_allowed_classes_restricted( /// - `tests/fixtures/real_world/c/state/malloc_lifecycle.expect.json` /// - `tests/fixtures/real_world/cpp/state/new_delete.expect.json` /// - `tests/fixtures/real_world/cpp/state/malloc_branches.expect.json` -/// - Positive cases (suppression must NOT fire — source is a parameter +/// - Positive cases (suppression must NOT fire, source is a parameter /// or other attacker-reachable value) live as hard expectations /// (`must_match: true`) in the taint fixtures: /// - `tests/fixtures/real_world/c/taint/buffer_overflow.c` /// - `tests/fixtures/real_world/cpp/taint/gets_strcpy.cpp` /// /// Removing this function or weakening its predicate would be caught by -/// neither — it would be caught by the unit tests below. +/// neither, it would be caught by the unit tests below. 
/// /// Pattern rules `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf` /// (and the `cpp.memory.*` mirrors) flag the call syntactically; their @@ -2173,7 +2194,7 @@ fn is_php_unserialize_allowed_classes_restricted( /// - source / format is an identifier (could be tainted, e.g. /// `sprintf(buf, fmt, …)`) → keep firing /// - format is `concatenated_string` containing identifier macros (e.g. -/// `"%" PRId64`) — we cannot statically expand the macro, so refuse +/// `"%" PRId64`), we cannot statically expand the macro, so refuse /// - bare `%s` in format → keep firing (could read unbounded length) fn is_c_buffer_call_literal_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool { let kind = match rule_id { @@ -2226,7 +2247,7 @@ enum CBufferRule { /// True for: a C/C++ string literal, OR a `conditional_expression` whose /// consequence + alternative are both either string literals or ALL_CAPS /// identifiers (the canonical preprocessor-macro naming convention for -/// string-constant `#define`s — `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used +/// string-constant `#define`s, `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used /// pervasively in postgres' `formatting.c::DCH_a_m`). Parenthesised forms /// are unwrapped. 
/// @@ -2348,7 +2369,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool { } i += 1; if i >= bytes.len() { - // trailing `%` — malformed, refuse to suppress + // trailing `%`, malformed, refuse to suppress return false; } if bytes[i] == b'%' { @@ -2391,7 +2412,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool { let conv = bytes[i]; i += 1; match conv { - // Numeric / char / pointer specifiers — bounded output for any input + // Numeric / char / pointer specifiers, bounded output for any input b'd' | b'i' | b'u' | b'o' | b'x' | b'X' | b'c' | b'e' | b'E' | b'f' | b'F' | b'g' | b'G' | b'a' | b'A' | b'p' | b'n' => continue, // String specifier: only safe when precision-bounded @@ -2494,7 +2515,7 @@ struct TaintSuppressionCtx { /// distinguish "taint proved safe" from "taint failed to track". taint_finding_lines_by_func: HashMap, HashSet>, /// Functions where the SSA engine emitted at least one - /// `all_validated` event — every tainted input to *some* sink in + /// `all_validated` event, every tainted input to *some* sink in /// the function passed through a recognised validation/ /// sanitisation predicate. Drained from /// `take_all_validated_spans`; positive evidence that the engine @@ -2502,14 +2523,14 @@ struct TaintSuppressionCtx { /// `taint-unsanitised-flow` finding fired and no Sanitizer label /// is present. Covers validation, dominator-based pruning, /// early-return guards, type-check predicates, and interprocedural - /// sanitiser wrappers — all of which legitimately clear taint via + /// sanitiser wrappers, all of which legitimately clear taint via /// SSA branch-narrowing rather than a labelled sanitiser node. engine_validated_funcs: HashSet>, /// Functions where some Source's defining variable is later /// rebound to a literal RHS (carries `TaintMeta.const_text`) in /// the same scope, with no Source label on the rebinding node. 
/// Positive evidence that the engine's SSA renaming structurally - /// kills the source's taint before any sink can read it — covers + /// kills the source's taint before any sink can read it, covers /// `cmd = getenv(); cmd = "echo hello"; system(cmd)` patterns /// where the rebind is what makes the code safe but the engine /// has no `Sanitizer` label or `taint-unsanitised-flow` finding to @@ -2520,7 +2541,7 @@ struct TaintSuppressionCtx { /// interprocedural analysis cleared the flow through a /// user-defined wrapper (e.g. `def sanitize(s): return /// shlex.quote(s)`). The current per-function `Sanitizer` check - /// only sees direct sanitisers in the *caller's* scope — without + /// only sees direct sanitisers in the *caller's* scope, without /// this signal, every helper-wrapped sanitiser fires as an /// AST-pattern FP because the engine cleared the value via Phase /// 11 inline analysis but the sink's enclosing scope has no @@ -2687,7 +2708,7 @@ impl TaintSuppressionCtx { // an "interproc sanitiser caller" when its body invokes any // helper whose own body contains a labelled Sanitizer. This // handles wrappers like `def sanitize(s): return - // shlex.quote(s)` — the engine clears taint via Phase 11 + // shlex.quote(s)`, the engine clears taint via // inline analysis, but the caller's scope has no labelled // Sanitizer of its own to satisfy Condition 4(b). let mut interproc_sanitizer_callers: HashSet> = HashSet::new(); @@ -2703,7 +2724,7 @@ impl TaintSuppressionCtx { // each to its enclosing function via `sink_func_at_line`, and // record the function as "engine-validated". The set was // populated by `ssa_events_to_findings` whenever the engine - // emitted an `SsaTaintEvent { all_validated: true, .. }` — + // emitted an `SsaTaintEvent { all_validated: true, .. }` , // i.e. the engine reached a sink and proved every tainted // input passed validation. 
This is the broadest form of // engine-success evidence, covering predicate validation @@ -2762,7 +2783,7 @@ impl TaintSuppressionCtx { // sink, since taint couldn't have evaluated a flow that doesn't exist. let func = match self.sink_func_at_line.get(&line) { Some(f) => f, - None => return false, // No CFG sink at this line — taint had no opportunity to evaluate + None => return false, // No CFG sink at this line, taint had no opportunity to evaluate }; match self.source_lines_by_func.get(func) { Some(source_lines) => { @@ -2788,7 +2809,7 @@ impl TaintSuppressionCtx { // OR // (c) the SSA engine emitted at least one `all_validated` // event in this function (engine reached *some* sink and - // proved every tainted input was validated — covers + // proved every tainted input was validated, covers // predicate validation, dominator early-return, // type-check predicates, and interprocedural sanitiser // wrappers that don't carry an explicit Sanitizer @@ -2796,18 +2817,18 @@ impl TaintSuppressionCtx { // OR // (d) the function rebinds a Source's defining variable to // a literal RHS at a later line (engine's SSA renaming - // structurally kills taint before any sink reads it — + // structurally kills taint before any sink reads it , // covers `cmd = getenv(); cmd = "echo"; system(cmd)`), // OR // (e) the function calls a same-file helper whose body // contains a labelled Sanitizer (interprocedural - // sanitiser wrapper — covers `def sanitize(s): return + // sanitiser wrapper, covers `def sanitize(s): return // shlex.quote(s)` patterns where the engine clears - // taint via Phase 11 inline analysis but the caller's + // taint via inline analysis but the caller's // scope has no Sanitizer label of its own). // // When none hold, we can't distinguish silent engine failure - // from real safety — e.g. Go points-to limitation on `&local` + // from real safety, e.g. 
Go points-to limitation on `&local` // Decode destinations leaves the chain writeback fired but the // field-cell propagation dead, suppressing legitimate // AST-pattern findings on every Go CRUD handler whose Decode @@ -2854,7 +2875,7 @@ pub fn run_rules_on_bytes( maybe_inject_test_panic(path); let Some(source) = ParsedSource::try_new(bytes, path)? else { - // Not a recognized tree-sitter language — try text-based patterns, + // Not a recognized tree-sitter language, try text-based patterns, // but first surface a parse-timeout synthetic diag if that's what // caused try_new to return None. let mut out = scan_text_based_patterns(bytes, path, cfg); @@ -2964,7 +2985,7 @@ pub fn analyse_file_fused( maybe_inject_test_panic(path); let Some(source) = ParsedSource::try_new(bytes, path)? else { - // Not a recognized tree-sitter language — try text-based patterns, + // Not a recognized tree-sitter language, try text-based patterns, // and surface a parse-timeout synthetic diag if that's what caused // try_new to return None. let mut diags = scan_text_based_patterns(bytes, path, cfg); @@ -2995,7 +3016,7 @@ pub fn analyse_file_fused( let (ssa_summaries, ssa_bodies) = if needs_cfg { // Lower SSA exactly once and feed both the taint engine and the // SSA-artifact extractor. Pre-fix, both consumers re-lowered the - // same `FileCfg` independently — `lower_all_functions_from_bodies` + // same `FileCfg` independently, `lower_all_functions_from_bodies` // accounted for ~20% of `analyse_file_fused` wall-clock on the // bench corpus. // @@ -3294,7 +3315,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() { "method param pass-through should be recognised" ); - // Local variable assigned from concat — NOT a pass-through. + // Local variable assigned from concat, NOT a pass-through. 
let code = b" self::CONST should be recognised as safe" ); - // allowed_classes => true — unsafe default, must NOT be suppressed + // allowed_classes => true, unsafe default, must NOT be suppressed let code = b" true]);\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_php_capture(&tree, code, q); @@ -3366,7 +3387,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() { "allowed_classes => true is the unsafe default, should NOT be suppressed" ); - // No second arg — must NOT be suppressed + // No second arg, must NOT be suppressed let code = b"= 12) ? \"p.m.\" : \"a.m.\"); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_strcpy); @@ -3459,7 +3480,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "strcpy with ternary-of-literals source must be suppressed" ); - // strcpy(dst, cond ? P_M_STR : A_M_STR) — postgres formatting.c + // strcpy(dst, cond ? P_M_STR : A_M_STR), postgres formatting.c // shape with #define'd ALL_CAPS string-constant macros. let code = b"#define P_M_STR \"p.m.\"\n#define A_M_STR \"a.m.\"\nvoid f(char *s, int h) { strcpy(s, (h >= 12) ? P_M_STR : A_M_STR); }\n"; let tree = parser.parse(code, None).unwrap(); @@ -3469,7 +3490,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "strcpy with ternary-of-ALL_CAPS-macros must be suppressed" ); - // strcpy(dst, cond ? var_a : var_b) — lowercase variables, NOT a + // strcpy(dst, cond ? var_a : var_b), lowercase variables, NOT a // recognisable preprocessor macro shape. Must NOT suppress. let code = b"void f(char *s, int h, char *a, char *b) { strcpy(s, (h >= 12) ? a : b); }\n"; let tree = parser.parse(code, None).unwrap(); @@ -3479,7 +3500,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "strcpy with ternary-of-lowercase-vars must NOT be suppressed" ); - // strcat(dst, "literal") — same principle as strcpy. + // strcat(dst, "literal"), same principle as strcpy. 
let code = b"void f(char *d) { strcat(d, \" (done)\"); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_strcat); @@ -3488,7 +3509,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "strcat with string-literal source must be suppressed" ); - // sprintf(dst, "%lld%c", ...) — numeric format string. + // sprintf(dst, "%lld%c", ...), numeric format string. let code = b"void f(char *cp, long long v, char u) { sprintf(cp, \"%lld%c\", v, u); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_sprintf); @@ -3497,7 +3518,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "sprintf with numeric-only format must be suppressed" ); - // sprintf(str, " %.*s", N, x) — precision-bounded `%s`. + // sprintf(str, " %.*s", N, x), precision-bounded `%s`. let code = b"void f(char *str, int n, const char *x) { sprintf(str, \" %.*s\", n, x); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_sprintf); @@ -3506,7 +3527,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "sprintf with precision-bounded `%.*s` must be suppressed" ); - // strcpy(dst, src) where src is a non-literal — must NOT suppress. + // strcpy(dst, src) where src is a non-literal, must NOT suppress. let code = b"void f(char *d, char **a) { strcpy(d, a[1]); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_strcpy); @@ -3515,7 +3536,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "strcpy with non-literal source must NOT be suppressed" ); - // sprintf with bare `%s` — must NOT suppress. + // sprintf with bare `%s`, must NOT suppress. 
let code = b"void f(char *b, const char *u) { sprintf(b, \"%s\", u); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_sprintf); @@ -3525,7 +3546,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { ); // sprintf with non-literal format (concatenated_string with PRI* macro) - // — must NOT suppress (engine cannot statically expand the macro). + //, must NOT suppress (engine cannot statically expand the macro). let code = b"void f(char *b, long long v) { sprintf(b, \"%\" PRId64, v); }\n"; let tree = parser.parse(code, None).unwrap(); let cap = first_c_capture(&tree, code, q_sprintf); @@ -3543,3 +3564,51 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() { "Layer D should only fire for buffer-overflow rule ids" ); } + +/// Regression: `is_literal_node` must NOT classify a Python f-string +/// (a `string` node containing `interpolation` children) as literal. +/// Layer A's "all-args-literal → suppress Security finding" shortcut +/// otherwise hides every CVE that injects via `cursor.execute(f"…{x}…")` +/// or `text(f"…{x}…")`. Motivated by CVE-2025-69662 (geopandas SQLi +/// via `text(f"SELECT … '{geom_name}' …")`) and CVE-2025-24793 +/// (snowflake-connector-python f-string-built CREATE STAGE / DROP). +#[test] +fn is_literal_node_rejects_python_fstring_with_interpolation() { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); + parser.set_language(&lang).unwrap(); + + // f-string with one interpolation segment, must be non-literal. 
+ let code = b"x = f\"SELECT * WHERE y = '{u}'\"\n"; + let tree = parser.parse(code, None).unwrap(); + let assignment = tree + .root_node() + .child(0) + .and_then(|s| s.child(0)) + .expect("assignment node"); + let rhs = assignment + .child_by_field_name("right") + .expect("RHS of assignment"); + assert_eq!(rhs.kind(), "string"); + assert!( + !is_literal_node(rhs, code), + "f-string with interpolation must not be classified as literal" + ); + + // Plain string literal, must remain literal. + let code = b"x = \"plain literal\"\n"; + let tree = parser.parse(code, None).unwrap(); + let assignment = tree + .root_node() + .child(0) + .and_then(|s| s.child(0)) + .expect("assignment node"); + let rhs = assignment + .child_by_field_name("right") + .expect("RHS of assignment"); + assert_eq!(rhs.kind(), "string"); + assert!( + is_literal_node(rhs, code), + "plain string literal must be classified as literal" + ); +} diff --git a/src/auth_analysis/checks.rs b/src/auth_analysis/checks.rs index c137450b..1529c8b0 100644 --- a/src/auth_analysis/checks.rs +++ b/src/auth_analysis/checks.rs @@ -220,7 +220,7 @@ fn check_token_override_without_validation( let mut findings = Vec::new(); for unit in &model.units { - // The rule reasons about "Token acceptance flow" — by + // The rule reasons about "Token acceptance flow", by // construction, that is a user-facing handler that receives a // token from the client and writes through token-bound state. // Internal helpers, Celery / cron tasks, Django migrations, @@ -335,15 +335,12 @@ fn has_prior_subject_auth( }) } -/// Phase A4 row-fetch exemption. +/// Row-fetch exemption. /// -/// Recognises the canonical "fetch-then-authorize" idiom in row-level -/// authz code: a route handler fetches a row by id (`let community = -/// Community::read(pool, data.community_id)?`), then calls a named -/// authorization function on the fetched row (`check_community_user_action( -/// &user, &community, ...)`). 
The authorization check appears -/// textually after the fetch, so the existing `check.line <= op.line` -/// rule cannot cover the fetch. +/// Recognises the "fetch-then-authorize" idiom: a handler fetches a +/// row by id then calls a named authorization function on it. The +/// check appears textually after the fetch, so the +/// `check.line <= op.line` rule cannot cover the fetch. /// /// The exemption fires only when: /// 1. `op` is the row-fetch operation itself (line == row let-line). @@ -353,7 +350,7 @@ fn has_prior_subject_auth( /// Coverage is intentionally narrow: only the row-fetch operation is /// exempted. Any sink that runs *between* the fetch and the check /// (e.g. `delete(community)` before `check_*`) still flags, because -/// its subject is `community` itself — not a fetch arg — and we +/// its subject is `community` itself, not a fetch arg, and we /// require the operation to be a row-fetch site to apply the /// exemption. fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool { @@ -374,8 +371,8 @@ fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool // Look for any non-login auth check whose subjects mention the row. // Match against the *root* of the subject's chain (`a.b.c` → `a`) - // so an auth check on a row's nested field — e.g. - // `is_mod_or_admin(pool, &user, comment_view.community.id)` — + // so an auth check on a row's nested field, e.g. + // `is_mod_or_admin(pool, &user, comment_view.community.id)` , // still names the row var. 
unit.auth_checks.iter().any(|check| { if matches!( @@ -425,6 +422,32 @@ fn has_prior_collection_auth( } fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &AnalysisUnit) -> bool { + // **Route-level guard short-circuit.** + // + // A check declared at the route boundary (Flask `@requires_role`, + // FastAPI `dependencies=[Depends(requires_access_dag(method= + // "POST", access_entity=DagAccessEntity.RUN))]`, Django + // `@permission_required`, Spring `@PreAuthorize`, Rails + // `before_action :authorize`, axum `RequireAuthorizationLayer`) + // gates the entire handler. The decorator / dependency call is + // opaque to the engine, the inner `requires_access_dag` carries + // no per-arg `ValueRef` pointing back into the handler body, so + // the per-name subject coverage walk below cannot match it. The + // structural shape, however, is unambiguous: every value the + // handler receives, every row it fetches, and every sink it + // calls runs after the route-level check has decided + // authorization. + // + // `has_prior_subject_auth` already filters out + // `LoginGuard` / `TokenExpiry` / `TokenRecipient` kinds before + // calling this helper (login alone proves identity, not + // authorization), so by the time we land here the kind is + // `Other` / `Membership` / `Ownership` / `AdminGuard`, i.e. an + // authorization-bearing decorator-level check. Returning `true` + // unconditionally for those is the correct semantics. + if check.is_route_level { + return true; + } let subject_key = canonical_subject_name(subject); let subject_related_base = related_subject_base(subject); // A2 + B3: walk the row-binding chain from this subject so a @@ -447,7 +470,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy // check authorizes the resulting row (e.g. 
`check_community_user_action( // &user, &community, ..)` after `let community = Community::read( // pool, data.community_id)`), the check materially covers - // `data.community_id` too — it gated access to the row that was + // `data.community_id` too, it gated access to the row that was // fetched using that id, so any subsequent operation re-using the // same id (read of a related view, mutation on the row itself) is // within the scope of that authorization. @@ -527,7 +550,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy /// to recover every ancestor row binding name. Cycle-safe via a /// visited set; depth-bounded at 16 hops to keep the worst case /// trivial. Returns a vec containing `start` followed by each -/// ancestor — empty when `start` is empty. +/// ancestor, empty when `start` is empty. fn row_binding_chain(unit: &AnalysisUnit, start: &str) -> Vec { let mut chain: Vec = Vec::new(); if start.is_empty() { @@ -583,7 +606,7 @@ fn is_relevant_target_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { /// it to a literal constant (`id := "id"`, `let userId = 1`, etc.). /// Such bindings cannot be user-controlled and so must not be /// classified as scoped-identifier subjects. Only matches plain -/// `Identifier`-kind subjects (no base/field) — member chains like +/// `Identifier`-kind subjects (no base/field), member chains like /// `req.params.id` still pass through to the regular checks. fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { if subject.base.is_some() || subject.field.is_some() { @@ -594,22 +617,22 @@ fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { /// True iff `subject` is a plain identifier that resolves to a /// function parameter whose static type is a payload-incompatible -/// scalar (numeric or boolean — see [`super::apply_typed_bounded_params`]). +/// scalar (numeric or boolean, see [`super::apply_typed_bounded_params`]). 
/// Spring `@PathVariable Long userId`, Axum `Path`, NestJS /// `@Param('id') id: number`, and FastAPI `user_id: int` all qualify. /// -/// Phase 6: also matches member-access subjects like `dto.userId` +/// also matches member-access subjects like `dto.userId` /// when `dto` is a typed-extractor parameter recognised by a Phase /// 1-2 matcher AND the field's declared TypeKind is Int/Bool. fn is_typed_bounded_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { if subject.base.is_none() && subject.field.is_none() { return unit.typed_bounded_vars.contains(&subject.name); } - // Phase 6: member-access shape `base.field` whose `base` is a + // member-access shape `base.field` whose `base` is a // typed-extractor parameter and whose field is declared as an // Int/Bool in the same-file DTO definition. Per Hard Rule 3, // only fires when the base param itself was recognised by a - // Phase 1-2 matcher — bare `dto.age` without a framework gate + // typed-extractor matcher, bare `dto.age` without a framework gate // never lifts. let Some(base) = subject.base.as_deref() else { return false; @@ -645,7 +668,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { // A3: `V.id`-shape subjects where `V` is bound from a login-guard / // auth-check call (or from a typed self-actor extractor parameter) // are the caller's own id. `V.group_id` / `V.workspace_id` stay - // relevant — only self-identifier fields trip this branch, so + // relevant, only self-identifier fields trip this branch, so // foreign scoped ids on the same actor binding still flag. 
if let Some(base) = subject.base.as_deref() { let root = base.split('.').next().unwrap_or(base); @@ -657,7 +680,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { } // Transitive copy of `V.id`: `let uid = user.id; query(.., &[uid])` - // — the subject `uid` is a plain identifier with no base/field, but + //, the subject `uid` is a plain identifier with no base/field, but // was recorded as a self-actor id copy at extract time. Treat it // as actor context. if unit.self_actor_id_vars.contains(&subject.name) { @@ -810,15 +833,15 @@ fn is_id_like_name(name: &str) -> bool { } /// True when the analysis unit shows positive evidence of receiving -/// user-controlled input — the precondition for any auth rule that +/// user-controlled input, the precondition for any auth rule that /// reasons about "scoped identifier" or "token-acceptance flow" /// shapes. /// /// A unit qualifies if any of the following hold: -/// * It is a recognised framework route handler (`RouteHandler` — +/// * It is a recognised framework route handler (`RouteHandler` , /// the strongest signal: registered with a router). /// * It accesses a request-shaped value (`request.body`, `req.params`, -/// `c.Query(..)`, etc.) — populated as `context_inputs`. +/// `c.Query(..)`, etc.), populated as `context_inputs`. /// * It declares at least one parameter whose name signals an /// externally-supplied value (id-like, token-like, request-like). /// Internal helpers that take only typed objects @@ -826,7 +849,7 @@ fn is_id_like_name(name: &str) -> bool { /// `items`) are excluded. /// /// Migrations, Celery tasks, pytest fixtures, conftest hooks, and -/// pure utility helpers fail all three conditions and are skipped — +/// pure utility helpers fail all three conditions and are skipped , /// they cannot, by construction, be the entry point of an /// authentication-bearing flow. 
fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool { @@ -843,7 +866,7 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool { /// as part of its calling contract? Captures three classes of name: /// * id-like (`*_id`, `*Id`, `id`, `*Ids`), /// * token-like (`token`, `*_token`, `accessToken`), -/// * framework-request objects (`request`, `req`, `ctx` — the +/// * framework-request objects (`request`, `req`, `ctx`, the /// standard names used by Express/Django/Flask/Gin/Axum/NestJS /// handlers as the parameter that carries the HTTP request). /// @@ -851,12 +874,26 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool { /// functions that, while not registered as route handlers, are /// clearly invoked with caller-supplied identifiers or request data. fn is_external_input_param_name(name: &str) -> bool { + // Pytest / unittest.mock convention: parameters injected by + // `@mock.patch(...)` decorators are universally named + // `mock_` (`mock_project_id`, `mock_session`, + // `mock_user_id`). Their values are MagicMock instances created + // by the test framework, not user-supplied input, even when the + // suffix carries an id-shaped tail. Refusing the entire `mock_` + // prefix is structural (mirrors pytest's documented convention) + // and closes the airflow `tests/unit/google/cloud/hooks/` + // cluster where every test method takes + // `(self, get_conn, mock_project_id)` and the suffix tripped the + // id-like heuristic. + if name.starts_with("mock_") || name.starts_with("mocked_") { + return false; + } if is_id_like_name(name) { return true; } let lower = name.to_ascii_lowercase(); // Token-shaped: bare `token` or any `*_token` / `*Token` / - // `accessToken` / `refreshToken`-style suffix. Conservative — + // `accessToken` / `refreshToken`-style suffix. Conservative , // only fires on explicit token-naming, not on incidental // substrings. 
if lower == "token" || lower.ends_with("_token") || lower.ends_with("token") { @@ -951,7 +988,7 @@ mod tests { assert!(is_actor_context_subject(&member("user", "uid"), &unit)); // Pitfall guard: `user.group_id` / `user.workspace_id` stay - // relevant — only self-identifier fields trip the widening. + // relevant, only self-identifier fields trip the widening. assert!(!is_actor_context_subject( &member("user", "group_id"), &unit @@ -962,7 +999,7 @@ mod tests { )); // Variables not in self_actor_vars fall back to the existing - // identity-key match — `target.id` still flags. + // identity-key match, `target.id` still flags. assert!(!is_actor_context_subject(&member("target", "id"), &unit)); } @@ -1036,7 +1073,7 @@ mod tests { assert!(!is_relevant_target_subject(&plain("id"), &unit)); // Plain `id` NOT in the const-bound set still flags as - // relevant — regression guard for the user-controlled case. + // relevant, regression guard for the user-controlled case. let unit2 = empty_unit(); assert!(is_relevant_target_subject(&plain("id"), &unit2)); @@ -1046,12 +1083,12 @@ mod tests { assert!(is_relevant_target_subject(&member("req", "id"), &unit)); } - /// Phase 5 typed-bounded subject exclusion: a parameter whose + /// Hierarchy: a parameter whose /// static type was recovered as `Int`/`Bool` (Spring `Long userId`, /// Axum `Path`, FastAPI `user_id: int`) has its name added to /// `unit.typed_bounded_vars` by `apply_typed_bounded_params`. The /// subject `userId` then must not be classified as a scoped - /// identifier — the framework guarantees the value is numeric and + /// identifier, the framework guarantees the value is numeric and /// cannot drive ownership-bypass. 
#[test] fn typed_bounded_plain_subjects_are_not_relevant() { @@ -1066,7 +1103,7 @@ mod tests { assert!(is_relevant_target_subject(&plain("user_id"), &unit2)); // Member access `req.user_id` is unaffected (only plain - // identifiers are exempted — fields/base remain regular + // identifiers are exempted, fields/base remain regular // subjects so DTO-shape leaks still flag). unit.typed_bounded_vars.insert("req".into()); assert!(is_relevant_target_subject(&member("req", "user_id"), &unit)); @@ -1080,17 +1117,17 @@ mod tests { #[test] fn unit_user_input_evidence_recognises_external_inputs() { // Function with no params and no context_inputs (Celery task - // shape) — must NOT count as user-input-bearing. + // shape), must NOT count as user-input-bearing. let mut unit = empty_unit(); assert!(!unit_has_user_input_evidence(&unit)); - // Adding internal-typed params (apps, schema_editor — Django + // Adding internal-typed params (apps, schema_editor, Django // migration RunPython callback shape) keeps the gate closed. unit.params.push("apps".into()); unit.params.push("schema_editor".into()); assert!(!unit_has_user_input_evidence(&unit)); - // pytest hook shape: (config, items) — gate stays closed. + // pytest hook shape: (config, items), gate stays closed. let mut unit = empty_unit(); unit.params.push("config".into()); unit.params.push("items".into()); @@ -1161,14 +1198,22 @@ mod tests { assert!(!is_external_input_param_name("manager")); // `c` alone is too common as a local variable to count. assert!(!is_external_input_param_name("c")); + // Pytest / unittest.mock fixture-injected mocks: `mock_` / + // `mocked_` names are MagicMock instances, not user input, + // even when the suffix (`mock_project_id`) is id-shaped. 
+ assert!(!is_external_input_param_name("mock_project_id")); + assert!(!is_external_input_param_name("mock_session")); + assert!(!is_external_input_param_name("mock_user_id")); + assert!(!is_external_input_param_name("mocked_request")); + assert!(!is_external_input_param_name("mocked_token")); } - /// Phase A4 row-fetch exemption. + /// Row-fetch exemption. /// /// Row var declared at line 10; auth check naming the row appears /// at line 20. An operation at line 10 (the fetch) is exempted /// because the auth check authorises the resulting row. Coverage - /// is intentionally narrow — operations between fetch (10) and + /// is intentionally narrow, operations between fetch (10) and /// check (20) that are NOT row-fetch sites must still flag. #[test] fn row_fetch_exemption_covers_fetch_when_check_names_row() { @@ -1192,6 +1237,7 @@ mod tests { line: 20, args: Vec::new(), condition_text: None, + is_route_level: false, }); let fetch_op = SensitiveOperation { @@ -1206,7 +1252,7 @@ mod tests { assert!(has_row_fetch_exemption(&unit, &fetch_op)); // Operation at a different line (between fetch and check) is - // NOT a row-fetch site — exemption does not apply. + // NOT a row-fetch site, exemption does not apply. let mid_op = SensitiveOperation { kind: OperationKind::Mutation, sink_class: None, @@ -1229,7 +1275,7 @@ mod tests { "community".to_string(), (10, vec![member("data", "community_id")]), ); - // No auth check pushed — exemption must NOT apply. + // No auth check pushed, exemption must NOT apply. let fetch_op = SensitiveOperation { kind: OperationKind::Read, @@ -1256,7 +1302,7 @@ mod tests { (10, vec![member("data", "community_id")]), ); // Login-only check on the row should NOT exempt the row-fetch - // — login proves identity, not authorization. + //, login proves identity, not authorization. 
unit.auth_checks.push(AuthCheck { kind: AuthCheckKind::LoginGuard, callee: "require_login".into(), @@ -1265,6 +1311,7 @@ mod tests { line: 20, args: Vec::new(), condition_text: None, + is_route_level: false, }); let fetch_op = SensitiveOperation { @@ -1305,10 +1352,11 @@ mod tests { line: 20, args: Vec::new(), condition_text: None, + is_route_level: false, }; // Direct member subject `data.community_id` (the original - // request field) — covered via reverse-walk. + // request field), covered via reverse-walk. assert!(auth_check_covers_subject( &check, &member("data", "community_id"), @@ -1334,7 +1382,7 @@ mod tests { /// Subject as plain identifier copied from the request /// (`let community_id = data.community_id; let community = /// Community::read(pool, community_id);`) must also benefit from - /// the reverse-walk — `row_population_data["community"]` then + /// the reverse-walk, `row_population_data["community"]` then /// records `[community_id]` (a plain identifier, not the /// member-access shape). #[test] @@ -1352,6 +1400,7 @@ mod tests { line: 20, args: Vec::new(), condition_text: None, + is_route_level: false, }; assert!(auth_check_covers_subject( @@ -1392,9 +1441,10 @@ mod tests { line: 20, args: Vec::new(), condition_text: None, + is_route_level: false, }; - // Sink subject is the bare alias — covered via the chain. + // Sink subject is the bare alias, covered via the chain. assert!(auth_check_covers_subject( &check, &plain("community_id"), @@ -1412,4 +1462,73 @@ mod tests { // Plain identifier with no alias entry must NOT be covered. assert!(!auth_check_covers_subject(&check, &plain("post_id"), &unit)); } + + /// Route-level guard short-circuit (FastAPI / Flask / + /// Django / Spring / Rails / axum decorator-level auth). + /// + /// The decorator-level `@requires_role` / + /// `dependencies=[Depends(requires_access_dag(...))]` / + /// `before_action :authorize` runs before the handler body and + /// authorizes every value the handler receives. 
The check has + /// no per-arg `ValueRef` pointing back into the body, so the + /// per-name subject coverage walk cannot model the semantics. + /// `auth_check_covers_subject` short-circuits `true` for any + /// authorization-bearing route-level check (LoginGuard etc. are + /// already filtered out by `has_prior_subject_auth`). + #[test] + fn auth_check_covers_subject_route_level_short_circuits() { + use crate::auth_analysis::model::{AuthCheck, AuthCheckKind}; + + let unit = empty_unit(); + let route_check = AuthCheck { + kind: AuthCheckKind::Other, + callee: "requires_access_dag".into(), + subjects: Vec::new(), // route-level checks carry no body subjects + span: (0, 0), + line: 0, + args: Vec::new(), + condition_text: None, + is_route_level: true, + }; + + // Any subject is covered when the check is route-level , + // path param, request body field, row-fetch receiver, all of + // them. The per-name walk would have rejected each. + assert!(auth_check_covers_subject( + &route_check, + &plain("dag_id"), + &unit + )); + assert!(auth_check_covers_subject( + &route_check, + &member("req", "dag_run_id"), + &unit + )); + assert!(auth_check_covers_subject( + &route_check, + &plain("dag"), + &unit + )); + + // Sanity check: an in-body check with no subjects (the prior + // shape) does NOT cover arbitrary subjects. Without the + // route-level flag, the empty subjects vec means the + // `check.subjects.iter().any(...)` walk fails for every + // candidate. 
+ let in_body_check = AuthCheck { + kind: AuthCheckKind::Other, + callee: "requires_access_dag".into(), + subjects: Vec::new(), + span: (0, 0), + line: 0, + args: Vec::new(), + condition_text: None, + is_route_level: false, + }; + assert!(!auth_check_covers_subject( + &in_body_check, + &plain("dag_id"), + &unit + )); + } } diff --git a/src/auth_analysis/config.rs b/src/auth_analysis/config.rs index 351b4490..075ff66e 100644 --- a/src/auth_analysis/config.rs +++ b/src/auth_analysis/config.rs @@ -173,7 +173,7 @@ impl AuthAnalysisRules { /// Does the LAST segment of the callee match a configured non-sink /// method name (case-sensitive exact)? Used to recognise DOM-API /// methods like `addEventListener` / `appendChild` regardless of - /// receiver — `someElement.addEventListener` is just as + /// receiver, `someElement.addEventListener` is just as /// categorically client-side as `document.addEventListener`. pub fn callee_has_non_sink_method(&self, callee: &str) -> bool { let last = bare_method_name(callee); @@ -200,19 +200,19 @@ impl AuthAnalysisRules { /// Classify a call into a [`SinkClass`]. /// /// Dispatch order (first match wins): - /// 1. `InMemoryLocal` — receiver is a known non-sink collection + /// 1. `InMemoryLocal`, receiver is a known non-sink collection /// (tracked in `non_sink_vars` or matches a configured /// non-sink prefix). - /// 2. `RealtimePublish` — receiver first-segment matches a + /// 2. `RealtimePublish`, receiver first-segment matches a /// configured realtime prefix (e.g. `realtime`, `pubsub`). - /// 3. `OutboundNetwork` — receiver first-segment matches a + /// 3. `OutboundNetwork`, receiver first-segment matches a /// configured outbound-network prefix (e.g. `http`, `reqwest`). - /// 4. `CacheCrossTenant` — receiver first-segment matches a + /// 4. `CacheCrossTenant`, receiver first-segment matches a /// configured cache prefix (e.g. `cache`, `redis`). - /// 5. `DbMutation` — callee name matches `mutation_indicator_names`. - /// 6. 
`DbCrossTenantRead` — callee name matches `read_indicator_names`. + /// 5. `DbMutation`, callee name matches `mutation_indicator_names`. + /// 6. `DbCrossTenantRead`, callee name matches `read_indicator_names`. /// - /// Returns `None` when the callee matches none of the above — the + /// Returns `None` when the callee matches none of the above, the /// call site is ignored by ownership-gap checks. pub fn classify_sink_class( &self, @@ -227,8 +227,8 @@ impl AuthAnalysisRules { // (`el.addEventListener`, `parent.appendChild`) are categorically // not data-layer auth-relevant operations. These shapes would // otherwise prefix-match read/mutation indicators (`get`, `add`, - // `remove`) — `getElementById` canonicalises to `getelementbyid` - // which `starts_with("get")` — and falsely classify as + // `remove`), `getElementById` canonicalises to `getelementbyid` + // which `starts_with("get")`, and falsely classify as // `DbCrossTenantRead` / `DbMutation`. if self.callee_has_non_sink_global_receiver(callee) || self.callee_has_non_sink_method(callee) @@ -251,7 +251,7 @@ impl AuthAnalysisRules { // receiver. When the receiver chain itself contains a call // expression (`w.Header().Get(..)`, `r.URL.Query().Get(..)`, // `db.Tx(..).Query(..)`), the receiver is the *return value of - // another call* — its type is opaque to the auth analyser and + // another call*, its type is opaque to the auth analyser and // the bare verb match is too speculative to assume a data-layer // sink. The realtime/outbound/cache prefix dispatches above // already match by the chain root; if none of them claimed the @@ -501,6 +501,13 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "user_passes_test".into(), "verify_access".into(), "authorize".into(), + // FastAPI dependency-injection auth idiom: airflow uses + // `Depends(requires_access_dag(method="GET"))`, + // `requires_access_connection(...)`, etc. 
The unwrapped + // inner call name is `requires_access_`; the + // `requires_access` prefix matches all variants via + // `matches_name`. + "requires_access".into(), ], mutation_indicator_names: vec![ "update".into(), @@ -615,7 +622,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "verify_access!".into(), "can_access?".into(), "can?".into(), - // Rails per-record permission predicates — the canonical + // Rails per-record permission predicates, the canonical // "load by id, then check on the loaded record" idiom // (see redmine `app/controllers/issues_controller.rb`, // mastodon controllers, diaspora ApplicationController). @@ -961,7 +968,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "can_access".into(), "can_manage".into(), // Common project-specific helpers seen in real Axum/Rocket - // codebases — kept as defaults so user code that names + // codebases, kept as defaults so user code that names // its membership helper after the resource still gets // recognised. Users can extend via `nyx.toml`. "require_group_member".into(), @@ -1045,7 +1052,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "FxHashSet".into(), "DashMap".into(), "DashSet".into(), - // `serde_json::Map` (last-segment `Map`) — common JSON + // `serde_json::Map` (last-segment `Map`), common JSON // body builder where `m.insert("k", v)` is a string-key // assignment on an in-memory object, not a DB write. "Map".into(), @@ -1161,7 +1168,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { ], non_sink_receiver_types: Vec::new(), non_sink_receiver_name_prefixes: Vec::new(), - // Browser/DOM globals — calls on these receivers are + // Browser/DOM globals, calls on these receivers are // categorically client-side (no server-side authorization // semantics). 
Without this list, `document.getElementById` // would prefix-match the read-indicator `get`, @@ -1196,7 +1203,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "WeakMap".into(), "WeakSet".into(), ], - // DOM-API methods — when the LAST segment of the callee + // DOM-API methods, when the LAST segment of the callee // matches, the call is non-data-layer regardless of receiver // (`el.addEventListener`, `parent.appendChild`). These // methods would otherwise prefix-match `add`, `remove`, @@ -1345,7 +1352,7 @@ pub fn first_receiver_segment(callee: &str) -> &str { callee.split('.').next().unwrap_or(callee) } -/// True when the callee's receiver chain contains a call expression — +/// True when the callee's receiver chain contains a call expression , /// i.e. the LAST segment is being invoked on the *return value* of an /// earlier call (`w.Header().Get`, `r.URL.Query().Get`, /// `db.Tx(opts).Query`). Detected as: the substring before the last @@ -1366,7 +1373,7 @@ pub fn receiver_is_chained_call(callee: &str) -> bool { /// (`member`, `owner`, `admin`, `access`, `permission`, `manager`, /// `editor`, `viewer`, `user`, `mod`). The resource segment is /// project-specific (`trip`, `doc`, `project`, `community`, …) and -/// cannot be enumerated in the static defaults — but the +/// cannot be enumerated in the static defaults, but the /// prefix+role pattern is unambiguous enough that recognising it as /// an authorization check is safe. Also accepts `is_` / /// `is__(or|and)_...` predicate forms (`is_admin`, @@ -1398,7 +1405,7 @@ fn is_require_resource_role_call(name: &str) -> bool { } // Pattern 2: `is_` and `is__(or|and)_...`. - // Conservative role list — excludes `user` / `staff` to avoid + // Conservative role list, excludes `user` / `staff` to avoid // matching ambiguous predicates like `is_user`. 
if let Some(rest) = lower.strip_prefix("is_") && !rest.is_empty() @@ -1682,7 +1689,7 @@ mod tests { assert!(receiver_is_chained_call("r.URL.Query().Get")); assert!(receiver_is_chained_call("db.Tx(opts).Query")); assert!(receiver_is_chained_call("client.WithToken(t).Get")); - // Pure field/identifier chain — no `(` anywhere. + // Pure field/identifier chain, no `(` anywhere. assert!(!receiver_is_chained_call("repo.Find")); assert!(!receiver_is_chained_call("c.Fs.Create")); assert!(!receiver_is_chained_call("globalBatchJobsMetrics.save")); @@ -1701,7 +1708,7 @@ mod tests { let empty: HashSet = HashSet::new(); // Chained-call receiver: verb-name fallback is suppressed. - // The minio `w.Header().Get(constName)` cluster — `Get` would + // The minio `w.Header().Get(constName)` cluster, `Get` would // match the `Get` read indicator on a bare receiver but the // chained-call shape masks the receiver type. assert_eq!(rules.classify_sink_class("w.Header().Get", &empty), None); @@ -1742,7 +1749,7 @@ mod tests { let rules = build_auth_rules(&cfg, "javascript"); let empty: HashSet = HashSet::new(); - // Globals — receiver-first-segment match. + // Globals, receiver-first-segment match. assert_eq!( rules.classify_sink_class("document.getElementById", &empty), Some(SinkClass::InMemoryLocal) @@ -1760,7 +1767,7 @@ mod tests { Some(SinkClass::InMemoryLocal) ); - // Method allowlist — last-segment match regardless of receiver. + // Method allowlist, last-segment match regardless of receiver. assert_eq!( rules.classify_sink_class("input.addEventListener", &empty), Some(SinkClass::InMemoryLocal) @@ -1801,22 +1808,22 @@ mod tests { assert!(rules.is_authorization_check("authz::require_trip_member")); assert!(rules.is_authorization_check("self.require_album_editor")); - // Negatives — random `require_*` calls without a known role + // Negatives, random `require_*` calls without a known role // suffix do NOT count as authorization. 
assert!(!rules.is_authorization_check("require_db")); assert!(!rules.is_authorization_check("require_user")); assert!(!rules.is_authorization_check("require_login")); // Bare `require_member` / `require_owner` (no resource segment) - // aren't enough — the resource segment is what makes the helper + // aren't enough, the resource segment is what makes the helper // unambiguous. assert!(!rules.is_authorization_check("require_member")); assert!(!rules.is_authorization_check("require_owner")); } - /// Phase A4 — broader verb / role / context-suffix shapes seen in - /// real-world Rust apps. `check___action` is the - /// canonical lemmy idiom; verifying the `is_` predicate - /// recogniser closes `is_mod_or_admin` style checks. + /// Broader verb / role / context-suffix shapes seen in real-world + /// Rust apps. `check___action` is the canonical + /// lemmy idiom; the `is_` predicate recogniser closes + /// `is_mod_or_admin` style checks. #[test] fn is_authorization_check_recognises_check_action_and_predicate_shapes() { let cfg = Config::default(); @@ -1847,7 +1854,7 @@ mod tests { assert!(rules.is_authorization_check("is_admin_or_moderator")); assert!(rules.is_authorization_check("is_member_and_owner")); - // Negatives — predicates whose tokens are NOT known auth roles. + // Negatives, predicates whose tokens are NOT known auth roles. 
assert!(!rules.is_authorization_check("is_user")); assert!(!rules.is_authorization_check("is_logged_in")); assert!(!rules.is_authorization_check("is_active")); diff --git a/src/auth_analysis/extract/axum.rs b/src/auth_analysis/extract/axum.rs index 163e41ed..4578787e 100644 --- a/src/auth_analysis/extract/axum.rs +++ b/src/auth_analysis/extract/axum.rs @@ -384,8 +384,8 @@ fn classify_rocket_param( /// /// **Looser than [`super::common::is_self_actor_type_text`] by /// design.** This recogniser runs only on the type of a route-bound -/// parameter — appearing in a route handler signature is itself a -/// strong signal — and a false positive here just over-credits the +/// parameter, appearing in a route handler signature is itself a +/// strong signal, and a false positive here just over-credits the /// route with a login guard, which is conservative w.r.t. flagging. /// `is_self_actor_type_text` runs on every parameter, including in /// non-route functions, and a false positive there suppresses @@ -625,6 +625,11 @@ pub(crate) fn inject_guard_checks( line, args: call.args.clone(), condition_text: None, + // Route-level guard injected from a tower / axum layer + // (`RequireAuthorizationLayer`, `axum_login::login_required!`, + // …). Tells `auth_check_covers_subject` to short-circuit + // for any non-login-guard match. 
+ is_route_level: true, }); } } diff --git a/src/auth_analysis/extract/common.rs b/src/auth_analysis/extract/common.rs index f5779bb8..a11e2af0 100644 --- a/src/auth_analysis/extract/common.rs +++ b/src/auth_analysis/extract/common.rs @@ -51,6 +51,27 @@ fn collect_top_level_from_node( if decorated_definition_child(node) .is_some_and(|definition| definition.kind() == "function_definition") => { + // Celery / Airflow / DRF background-task decorators + // (`@instrumented_task`, `@shared_task`, `@app.task`, + // `@celery.task`, `@beat.shared_task`, `@periodic_task`, + // `@receiver`) mark a function as an internal scheduled + // job, not a user-reachable handler. Any id-shaped + // parameter name (`uuid: str`, `release_id: int`, + // `voucher_code_ids: list[int]`) refers to an + // internally-generated identifier, by construction the + // task is invoked from `task.delay(...)` in already-auth- + // checked code, never from an HTTP request directly. + // + // Skipping the unit at extract time stops the ownership / + // token-override / partial-batch-authorization rules from + // examining its operations. Real route handlers go + // through the framework extractors (Flask / + // FastAPI / Django / DRF) and re-add a `RouteHandler` + // unit with auth_checks injected from the route + // decorator, so this skip never hides a real handler. + if python_decorated_definition_is_background_task(node, bytes) { + return; + } model.units.push(build_function_unit_with_meta( node, AnalysisUnitKind::Function, @@ -143,13 +164,54 @@ pub fn attach_route_handler( model: &mut AuthorizationModel, ) -> Option { let handler_node = resolve_handler_node(root, handler_expr, bytes)?; - let unit_idx = model.units.len(); // `attach_route_handler` is called by route-aware extractors (express, // koa, fastify, axum, …) which already hold the file root. Build // the FileMeta once here so the JS/TS TRPC pre-scan only walks the // top-level decl set per file (instead of per route). 
let file_meta = FileMeta::scan(root, bytes); - let unit = build_function_unit_with_meta( + let line = handler_node.start_position().row + 1; + let handler_span = span(handler_node); + let definition = function_definition_node(handler_node); + // Route-handler-aware param list: includes id-like Python typed + // params (`dag_id: str`, `dag_run_id: str`) that + // `collect_param_names`'s default branch filters out for internal + // helpers. `inject_middleware_auth` clones this list into the + // synthetic-subject set on each middleware-injected auth check so + // `auth_check_covers_subject` matches the operation subjects + // produced by the handler body (e.g. `filter_by(dag_id=dag_id, + // run_id=dag_run_id)`). + let route_handler_params = function_params_route_handler(definition, bytes); + + // **Promote-or-create.** Most route-aware extractors invoke + // `collect_top_level_units` first, which already produced a + // [`AnalysisUnitKind::Function`] unit covering this same span. + // Pushing a brand-new RouteHandler unit duplicates the analysis + // surface, `check_ownership_gaps` then evaluates the operation + // twice and emits the FP from the (un-injected) Function unit even + // when the RouteHandler unit's middleware-derived auth check + // suppresses it. Promoting the existing unit keeps the model + // single-tenanted per handler so downstream auth-check injection + // (FastAPI `dependencies=[Depends(...)]`, Express middleware, ...) + // lands on the unit that's evaluated. 
+ if let Some((idx, existing)) = model + .units + .iter_mut() + .enumerate() + .find(|(_, u)| u.kind == AnalysisUnitKind::Function && u.span == handler_span) + { + existing.kind = AnalysisUnitKind::RouteHandler; + existing.name = Some(route_name); + existing.params = route_handler_params.clone(); + return Some(ResolvedHandler { + unit_idx: idx, + span: handler_span, + params: route_handler_params, + line, + }); + } + + let unit_idx = model.units.len(); + let mut unit = build_function_unit_with_meta( handler_node, AnalysisUnitKind::RouteHandler, Some(route_name), @@ -157,14 +219,12 @@ pub fn attach_route_handler( rules, Some(&file_meta), ); - let params = unit.params.clone(); - let line = handler_node.start_position().row + 1; - let span = span(handler_node); + unit.params = route_handler_params.clone(); model.units.push(unit); Some(ResolvedHandler { unit_idx, - span, - params, + span: handler_span, + params: route_handler_params, line, }) } @@ -362,6 +422,19 @@ pub fn build_function_unit_with_meta( ) -> AnalysisUnit { let definition = function_definition_node(node); let params = function_params(definition, bytes); + // Structurally-typed bounded params: walk the parameter list and + // mark any param whose type annotation resolves to an integer or + // boolean scalar (`int`, `bool`, `Optional[int]`, `list[int]`, + // `Iterable[int]`, …) as typed-bounded. Mirrors the SSA-derived + // `apply_typed_bounded_params` lift but runs even when the SSA + // var_types map isn't supplied (internal helpers analysed without + // a CFG, ad-hoc unit lookups, …). Without this, a Python helper + // signature like `get_release_project_new_group_count(environment_ids: + // list[int], project_ids: list[int])` would drop into the + // ownership rule because the param names match `is_id_like` even + // though the static type proves the values are bounded numerics + // that can't carry a SQL/file/shell payload. 
+ let preseeded_bounded = python_int_bounded_typed_params(definition, bytes); let line = node.start_position().row + 1; let mut state = UnitState::default(); // Seed Go's method-receiver name (`func (c *Cache) ...` → `c`) into @@ -372,7 +445,7 @@ pub fn build_function_unit_with_meta( // a `*Cache` field-call from a `*sql.DB` call by name alone, so we // err on the safe side per the deferred memo // (`project_realrepo_hugo.md`). Only Go's `method_declaration` - // exposes a `receiver` field — Rust/Java instance methods route + // exposes a `receiver` field, Rust/Java instance methods route // through `self`/`this` keywords and are unaffected. if let Some(receiver_name) = method_receiver_name(definition, bytes) { state.non_sink_vars.insert(receiver_name); @@ -416,7 +489,7 @@ pub fn build_function_unit_with_meta( self_actor_id_vars: state.self_actor_id_vars, authorized_sql_vars: state.authorized_sql_vars, const_bound_vars: state.const_bound_vars, - typed_bounded_vars: HashSet::new(), + typed_bounded_vars: preseeded_bounded, typed_bounded_dto_fields: std::collections::HashMap::new(), self_scoped_session_bases: state.self_scoped_session_bases, } @@ -470,7 +543,7 @@ struct UnitState { /// `let X = V.user_id` / `V.uid`). Populated by /// `collect_self_actor_id_binding`. Copied onto /// `AnalysisUnit.self_actor_id_vars` so subjects whose name appears - /// here count as actor context — closes the FP where a route + /// here count as actor context, closes the FP where a route /// handler does `let uid = user.id; query_all(.., &[uid])` and the /// engine sees `uid` only as a plain scoped id. self_actor_id_vars: HashSet, @@ -481,7 +554,7 @@ struct UnitState { /// `collect_row_field_binding` and `collect_for_row_binding`. authorized_sql_vars: HashSet, /// Local variables whose declaration binds them to a string, - /// numeric, or boolean literal — `id := "id"` / `let id = "1"` / + /// numeric, or boolean literal, `id := "id"` / `let id = "1"` / /// `String id = "id";`. 
These cannot be user-controlled and so /// must not be treated as scoped-identifier subjects by /// `is_relevant_target_subject`. Closes the gin/context_test.go @@ -503,7 +576,7 @@ struct UnitState { /// [`collect_trpc_ctx_param`] to decide whether a parameter's /// type annotation (often just an alias name like `GetOptions`) /// resolves to a TRPC handler signature. Empty for non-TS - /// languages — the scanner only matches TS-grammar node kinds. + /// languages, the scanner only matches TS-grammar node kinds. trpc_alias_names: HashSet, } @@ -543,7 +616,7 @@ fn collect_unit_state( collect_const_string_binding(node, bytes, state); } // JS/TS `variable_declarator` inside `lexical_declaration` - // (`const X = ...`, `let X = ...`) — exposes `name` + `value` + // (`const X = ...`, `let X = ...`), exposes `name` + `value` // fields. Run the same self-actor / self-actor-id binding // recognition as the Rust `let_declaration` arm above so the // session-self-actor copy chain (`const session = await @@ -555,8 +628,8 @@ fn collect_unit_state( collect_const_string_binding(node, bytes, state); } // Go `id := "id"` / Python `id = "id"` / Java `String id = "id";` / - // Ruby `id = "id"` — language-specific binding nodes that the - // let_declaration arm above doesn't catch. Const-only — never + // Ruby `id = "id"`, language-specific binding nodes that the + // let_declaration arm above doesn't catch. Const-only, never // marks self_actor / row_field / sql vars (those need richer // right-hand-side analysis already provided by the // let_declaration arm). @@ -589,7 +662,7 @@ fn collect_unit_state( "parameter" => { collect_typed_extractor_self_actor(node, bytes, state); } - // TS `required_parameter` / `optional_parameter` — the analogous + // TS `required_parameter` / `optional_parameter`, the analogous // arm to Rust's `parameter`. 
Recognise TRPC-shaped Options // params (`{ ctx, input }: GetOptions`) and add the destructured // ctx-base to `self_scoped_session_bases` so downstream @@ -651,6 +724,7 @@ fn collect_call(node: Node<'_>, bytes: &[u8], rules: &AuthAnalysisRules, state: line, args: string_args, condition_text: None, + is_route_level: false, }); } @@ -672,7 +746,7 @@ fn collect_call(node: Node<'_>, bytes: &[u8], rules: &AuthAnalysisRules, state: OperationKind::Read } } - // Publish / outbound / cache / DB mutation — treat as + // Publish / outbound / cache / DB mutation, treat as // write-shaped by default unless the callee name is a // read verb (e.g. `cache.get(tenant_id)`). _ => { @@ -725,6 +799,7 @@ fn collect_condition( line, args: Vec::new(), condition_text: Some(condition_text.clone()), + is_route_level: false, }); } @@ -737,6 +812,7 @@ fn collect_condition( line, args: Vec::new(), condition_text: Some(condition_text), + is_route_level: false, }); } } @@ -789,13 +865,13 @@ fn first_identifier_name(node: Node<'_>, bytes: &[u8]) -> Option { // Ruby `@foo` instance vars and `@@foo` class vars: // Rails controllers populate the row via `@issue = // Issue.find(...)`, so the row var is the *full* `@issue` - // text — chain_root in checks.rs strips on `.` only, so an + // text, chain_root in checks.rs strips on `.` only, so an // auth check on `@issue.visible?` resolves to root `@issue`, // matching the row var. | "instance_variable" | "class_variable" // Ruby globals `$foo` are unusual but match the same - // handler-state idiom — kept symmetric with @-vars. + // handler-state idiom, kept symmetric with @-vars. 
| "global_variable" ) { let value = text(node, bytes); @@ -874,10 +950,10 @@ fn collect_row_field_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitState /// /// Only fires when the value resolves to a member-access node and the /// resulting chain has at least two segments (`req.community_id`, -/// `data.user.id`, …) — single-ident receivers are uninteresting and a +/// `data.user.id`, …), single-ident receivers are uninteresting and a /// chain of length one would just duplicate the binding's own name. /// -/// Defensive: never overwrites an existing entry — first writer wins. +/// Defensive: never overwrites an existing entry, first writer wins. /// Re-binding the same local name (rare in idiomatic Rust) is treated /// as a separate variable scope; the rest of the analysis already /// works on the first binding seen during a top-down walk. @@ -929,7 +1005,7 @@ fn collect_member_alias_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitSt /// `has_row_fetch_exemption` looks for a row var "declared at this /// op's line", where `op.line` is the call site. Recording the /// let-line caused the multi-line shape to fall through the exemption -/// — surfaced on lemmy's `comment/lock.rs:31`, where every fetch-then- +///, surfaced on lemmy's `comment/lock.rs:31`, where every fetch-then- /// check route handler that wraps the read across two lines was /// flagged despite a textual auth check on the resulting row. fn collect_row_population(node: Node<'_>, bytes: &[u8], state: &mut UnitState) { @@ -979,7 +1055,7 @@ fn collect_row_population(node: Node<'_>, bytes: &[u8], state: &mut UnitState) { /// A3: record `let V = CALL(..)` (or `.await?` / `?` / reference /// chains wrapping such a call) where `CALL` matches a configured /// login-guard or authorization-check name. 
`V` is then treated as the -/// authenticated actor — `V.id`-shaped subjects are actor context and +/// authenticated actor, `V.id`-shaped subjects are actor context and /// shouldn't be flagged as foreign scoped IDs. fn collect_self_actor_binding( node: Node<'_>, @@ -1028,14 +1104,14 @@ fn collect_self_actor_binding( /// register as: /// /// * `const { user } = ctx.session` / `const { user } = await -/// getServerSession()` — RHS is a session container, so a +/// getServerSession()`, RHS is a session container, so a /// destructured `user` (or `currentUser`) becomes the unit's /// self-actor binding. -/// * `const { id } = req.user` / `const { userId } = session.user` — +/// * `const { id } = req.user` / `const { userId } = session.user` , /// RHS is the canonical authed-user base from /// `is_self_scoped_session_base_text`, so a destructured `id` / /// `userId` / `user_id` / `uid` becomes a self-actor-id binding. -/// * `const { user } = await loginGuardCall()` — also accepted +/// * `const { user } = await loginGuardCall()`, also accepted /// because `value_is_self_actor_call` already covers the /// `let user = require_auth(..)` shape; we lift that recognition /// into the destructure case so callers can extract the actor in a @@ -1072,12 +1148,12 @@ fn collect_destructured_self_actor_binding( continue; }; let (key, local) = match child.kind() { - // `{ user }` — key and local are the same identifier. + // `{ user }`, key and local are the same identifier. "shorthand_property_identifier_pattern" => { let name = text(child, bytes); (name.clone(), name) } - // `{ user = default }` — left is the shorthand key/local. + // `{ user = default }`, left is the shorthand key/local. 
"object_assignment_pattern" => { let Some(left) = child.child_by_field_name("left") else { continue; @@ -1092,7 +1168,7 @@ fn collect_destructured_self_actor_binding( }; (name.clone(), name) } - // `{ user: localName }` — `key` and `value` fields are + // `{ user: localName }`, `key` and `value` fields are // distinct (key from RHS source, local in our scope). "pair_pattern" => { let key_node = child.child_by_field_name("key"); @@ -1127,7 +1203,7 @@ fn collect_destructured_self_actor_binding( #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum DestructureRhsKind { - /// RHS is a session container — the destructured `user` field + /// RHS is a session container, the destructured `user` field /// resolves to the authenticated actor. Examples: `ctx.session`, /// `req.session`, `session`, `await getServerSession()`, /// `getSession()`. @@ -1136,7 +1212,7 @@ enum DestructureRhsKind { /// `ctx.session.user`). A destructured `id` field is the actor's /// own id. SelfActorBase, - /// RHS is not a session/actor source — destructure is irrelevant + /// RHS is not a session/actor source, destructure is irrelevant /// for self-actor recognition. None, } @@ -1146,7 +1222,7 @@ enum DestructureRhsKind { /// to `state.self_scoped_session_bases` by an earlier /// `collect_trpc_ctx_param` call. Used to mark the destructured /// `user` shorthand as a self-actor binding when extracting it from a -/// TRPC ctx param's local — `({ ctx }: Options) => { const { user } +/// TRPC ctx param's local, `({ ctx }: Options) => { const { user } /// = ctx; }`. 
fn lookup_trpc_ctx_destructure_match( node: Node<'_>, @@ -1258,7 +1334,7 @@ fn process_destructure_entry( } /// True when `node` (after walking through `await`/parens/non-null -/// wrappers) is a session-container expression — a chain ending in +/// wrappers) is a session-container expression, a chain ending in /// `.session` / `.state.session` / a bare `session` identifier, or a /// call to a known session-getter (`getServerSession()`, /// `getSession()`). Distinct from `value_is_self_actor_call` which @@ -1272,7 +1348,7 @@ fn value_is_session_provider_chain(node: Node<'_>, bytes: &[u8]) -> bool { return false; } let joined = chain.join("."); - // Bare session containers — `ctx.session`, `req.session`, + // Bare session containers, `ctx.session`, `req.session`, // `request.session`, plus the Koa `ctx.state` shape. matches!( joined.as_str(), @@ -1283,7 +1359,7 @@ fn value_is_session_provider_chain(node: Node<'_>, bytes: &[u8]) -> bool { let name = text(node, bytes); matches!(name.as_str(), "session") } - // Known session-getter calls. Conservative list — only + // Known session-getter calls. Conservative list, only // recogniser shapes that are unambiguously session-providing // in the JS/TS ecosystem (NextAuth's `getServerSession` is the // dominant one). `auth()` and `useSession()` are deliberately @@ -1374,7 +1450,7 @@ fn value_is_self_actor_base_chain(node: Node<'_>, bytes: &[u8]) -> bool { } /// Recognise variable bindings whose right-hand side is a literal -/// constant — string, integer, float, or boolean. A subject backed +/// constant, string, integer, float, or boolean. A subject backed /// by a constant binding cannot be user-controlled and so must not /// trigger `.auth.missing_ownership_check` even when the /// variable name happens to match `is_id_like` (e.g. 
@@ -1384,7 +1460,7 @@ fn value_is_self_actor_base_chain(node: Node<'_>, bytes: &[u8]) -> bool { /// (`parenthesized_expression`, `type_cast_expression`, /// reference/borrow expressions) before checking for a leaf literal /// kind. Conservative: any non-literal subexpression on the RHS -/// (a call, identifier, field-access) skips the binding — that var +/// (a call, identifier, field-access) skips the binding, that var /// might still hold attacker-controlled data. /// /// Handles the per-language declaration kinds wired in @@ -1393,7 +1469,7 @@ fn value_is_self_actor_base_chain(node: Node<'_>, bytes: &[u8]) -> bool { /// `local_variable_declaration`, Rust `let_declaration`, and bare /// `assignment_expression`. fn collect_const_string_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitState) { - // `assignment` / `assignment_expression`: `x = "foo"` — populate + // `assignment` / `assignment_expression`: `x = "foo"`, populate // the LHS (`name` / `left`) when the RHS is a literal. if matches!( node.kind(), @@ -1433,7 +1509,7 @@ fn collect_const_string_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitSt .or_else(|| node.child_by_field_name("default")) }); if let (Some(left), Some(right)) = (left, right) { - // expression_list parallel — pair LHS idents with RHS exprs. + // expression_list parallel, pair LHS idents with RHS exprs. let lhs_idents = collect_lhs_idents(left, bytes); let rhs_exprs: Vec> = if right.kind() == "expression_list" { let mut cursor = right.walk(); @@ -1468,7 +1544,7 @@ fn collect_const_string_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitSt // Rust `let_declaration` / Python `expression_statement` wrapping a // top-level assignment / JS `lexical_declaration` / Java - // `local_variable_declaration` — all expose the binding via + // `local_variable_declaration`, all expose the binding via // `pattern`/`name` + `value`. 
let pattern = node .child_by_field_name("pattern") @@ -1484,7 +1560,7 @@ fn collect_const_string_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitSt } // JS `lexical_declaration` / Java `local_variable_declaration` / - // Python `expression_statement` — the binding child is a wrapper + // Python `expression_statement`, the binding child is a wrapper // (`variable_declarator`). Recurse into wrappers; the // `variable_declarator` arm in `collect_unit_state` handles them. for idx in 0..node.named_child_count() { @@ -1506,7 +1582,7 @@ fn collect_const_string_binding(node: Node<'_>, bytes: &[u8], state: &mut UnitSt } /// Returns true if `node` (after unwrapping common wrappers) is a -/// pure literal — string, integer, float, boolean, or null. Returns +/// pure literal, string, integer, float, boolean, or null. Returns /// false for any expression that could carry attacker-controlled data /// (calls, identifiers, field access, template strings with /// interpolations). @@ -1684,20 +1760,20 @@ fn value_is_self_actor_id_field( } false } - // `(v.id as i64).into()` / `v.id.to_string()` / `v.id.clone()` — + // `(v.id as i64).into()` / `v.id.to_string()` / `v.id.clone()` , // call on a self-actor id field still propagates self-actor-id. "call_expression" | "call" | "method_invocation" | "method_call_expression" => { let receiver = node .child_by_field_name("function") .or_else(|| node.child_by_field_name("object")); if let Some(r) = receiver { - // Function field of a method call is `receiver.method` — + // Function field of a method call is `receiver.method` , // walk the receiver subtree for the self-actor id field. if value_is_self_actor_id_field(r, bytes, actor_vars) { return true; } // Also check the receiver of a method-style chain: - // `(v.id as i64).into()` — `function` is the + // `(v.id as i64).into()`, `function` is the // `field_expression` `(...).into`, whose `value` child // is the cast expression. 
if let Some(inner) = r @@ -1724,7 +1800,7 @@ fn is_self_actor_id_field_name(field: &str) -> bool { /// Recognise `let X = session.user.id` (or /// `req.session.user.id` / `ctx.session.user.id` / `req.user.id` / -/// `request.user.id`, etc.) — a copy of the authenticated actor's +/// `request.user.id`, etc.), a copy of the authenticated actor's /// own id field through one of the canonical session-context chains /// (the same set `is_self_scoped_session_subject` accepts at use /// time). Walks through wrappers (`await`, `?.`, parens, casts, @@ -1847,10 +1923,10 @@ fn value_is_self_actor_call(node: Node<'_>, bytes: &[u8], rules: &AuthAnalysisRu | "parenthesized_expression" | "match_expression" => { // For `match SCRUTINEE { ... }`, the scrutinee is the - // call we care about — if `require_auth().await` is being + // call we care about, if `require_auth().await` is being // matched, the `Ok(u) => u` arm gives us a self-actor // binding even when `?` isn't usable. Walk all named - // children — tree-sitter exposes both the scrutinee and + // children, tree-sitter exposes both the scrutinee and // the arms. for idx in 0..node.named_child_count() { let Some(child) = node.named_child(idx as u32) else { @@ -1943,10 +2019,11 @@ fn collect_sql_authorized_binding( line, args: Vec::new(), condition_text: None, + is_route_level: false, }); } -/// Always true — the direct-user-id-predicate path in +/// Always true, the direct-user-id-predicate path in /// `sql_semantics::classify_sql_query` doesn't depend on the ACL /// table list, so we still want to walk `let X = …query(LIT)…` /// chains even when the user hasn't configured any ACL tables. @@ -1977,7 +2054,7 @@ fn find_authorized_sql_call_in_chain<'tree>( ) { return None; } - // Collect any non-literal arg value-refs from this call — + // Collect any non-literal arg value-refs from this call , // these typically include the bound user id (e.g. // `.bind(user.id)` → adds `user.id` as a subject). 
if let Some(args_node) = cur.child_by_field_name("arguments") { @@ -2012,7 +2089,7 @@ fn find_authorized_sql_call_in_chain<'tree>( return Some((cur, bind_arg_refs)); } // Method matched but arg isn't a literal we recognise - // as authorized — bail. + // as authorized, bail. return None; } @@ -2036,7 +2113,7 @@ fn find_authorized_sql_call_in_chain<'tree>( } /// Recognised SQL prepare/query method names. Matched against the -/// last segment of the callee. String comparison only — we don't +/// last segment of the callee. String comparison only, we don't /// constrain the receiver to a specific type; known DB connection /// receivers are classified by the sink-class type gate, and this /// list is the orthogonal verb axis. @@ -2088,7 +2165,7 @@ fn collect_string_literal_text(node: Node<'_>, bytes: &[u8]) -> Option { } } -/// B3: `for ROW in X { … }` — when `X` (the iterator value) names a +/// B3: `for ROW in X { … }`, when `X` (the iterator value) names a /// SQL-authorized variable, mark `ROW` authorized too AND record /// `row_field_vars[ROW] = X` so transitive subject coverage works /// for column reads inside the loop body. @@ -2167,7 +2244,7 @@ fn single_iter_source_name(node: Node<'_>, bytes: &[u8]) -> Option { } /// B3: `let Y = ROW.method(..)` / `let Y = ROW.field` where `ROW` is -/// SQL-authorized — propagate authorized status to `Y` so any +/// SQL-authorized, propagate authorized status to `Y` so any /// downstream use (e.g. as a sink subject) is treated as covered. /// `row_field_vars[Y] = ROW` is already populated by /// `collect_row_field_binding`; this helper just propagates the @@ -2203,7 +2280,7 @@ fn propagate_sql_authorized_through_field_read( /// that downstream `V.id`-shaped subjects on a parameter of one of /// these types count as actor context, not foreign scoped IDs. /// -/// The recogniser is intentionally type-only — no name heuristic on +/// The recogniser is intentionally type-only, no name heuristic on /// the variable. 
A handler signature /// `pub async fn handler(.., local_user_view: LocalUserView)` is /// recognised because the type name matches, not because the @@ -2211,13 +2288,13 @@ fn propagate_sql_authorized_through_field_read( /// /// **Two acceptance forms:** /// -/// 1. *Tight exact set* — names whose entire identity is "auth +/// 1. *Tight exact set*, names whose entire identity is "auth /// subject": `Authenticated`, `Identity`, `Principal`. Adding new /// bare names to this set should be done sparingly; framework /// types that include `User` should go through the structural /// form instead. /// -/// 2. *Structural form* — a CamelCase identifier of the shape +/// 2. *Structural form*, a CamelCase identifier of the shape /// `User?` where `PREFIX` is one of `Local`, /// `Current`, `Session`, `Auth`, `Authenticated`, `LoggedIn`, /// `Admin`, and `SUFFIX` (optional) is one of `View`, `Info`, @@ -2226,9 +2303,9 @@ fn propagate_sql_authorized_through_field_read( /// `AuthenticatedUserContext`, etc. /// /// **Deliberately *not* matched:** -/// * Bare `User` — too loose; `User` parameters are very often +/// * Bare `User`, too loose; `User` parameters are very often /// deserialised payloads, not actor extractors. -/// * `UserView`, `UserPreferences` — same reason; the prefix is what +/// * `UserView`, `UserPreferences`, same reason; the prefix is what /// carries the auth signal, not the bare `User` segment. fn is_self_actor_type_text(ty: &str) -> bool { let trimmed = ty @@ -2255,7 +2332,7 @@ fn is_self_actor_type_text(ty: &str) -> bool { /// Implementation: strip a leading PREFIX, require the remainder to /// start with `User`, and accept either an exact `User` match or a /// `User`+SUFFIX match. Case-sensitive on the segment boundaries -/// because we want CamelCase types only — `localuser` wouldn't be a +/// because we want CamelCase types only, `localuser` wouldn't be a /// real Rust type name and matching it would create ambiguity with /// payload identifiers. 
fn matches_self_actor_user_form(base: &str) -> bool { @@ -2363,8 +2440,8 @@ fn unwrap_try_like(node: Node<'_>) -> Node<'_> { /// Detect the `if OWNER != SELF { return ... }` (or `==` with `else` /// early-exit) row-level ownership-equality pattern and emit a /// synthetic `AuthCheck { kind: Ownership }`. The AuthCheck is -/// back-dated to the row's `let` line — and populated with the row's -/// original fetch arguments as subjects — so the row-fetching call +/// back-dated to the row's `let` line, and populated with the row's +/// original fetch arguments as subjects, so the row-fetching call /// (e.g. `db.query_one(.., &[doc_id])`) is also covered. fn detect_ownership_equality_check(if_node: Node<'_>, bytes: &[u8], state: &mut UnitState) { let Some(condition_raw) = if_node.child_by_field_name("condition") else { @@ -2465,6 +2542,7 @@ fn detect_ownership_equality_check(if_node: Node<'_>, bytes: &[u8], state: &mut line: check_line, args: Vec::new(), condition_text: Some(condition_text), + is_route_level: false, }); } @@ -2657,15 +2735,210 @@ pub fn function_name(node: Node<'_>, bytes: &[u8]) -> Option { .filter(|name| !name.is_empty()) } +/// True when a Python `decorated_definition` node carries a +/// background-task / event-handler decorator. Recognised markers +/// (matched against the bare callee name, last segment of any +/// dotted/qualified form): +/// +/// * Celery: `task`, `shared_task`, `periodic_task`, +/// `app.task`, `celery.task`, `beat.shared_task`. +/// * Airflow: `instrumented_task`. +/// * Django: `receiver` (signal receiver, invoked by the framework, +/// not by an HTTP request). +/// +/// Used by `collect_top_level_from_node` to skip pushing a +/// `Function` unit for functions that cannot, by construction, be +/// the entry point of a user-input flow. 
Real route handlers are +/// added by the framework-specific route extractors (Flask / +/// Django / Spring / FastAPI / …) which re-build the unit with +/// `RouteHandler` kind and route-decorator-derived auth checks. +fn python_decorated_definition_is_background_task(node: Node<'_>, bytes: &[u8]) -> bool { + for idx in 0..node.named_child_count() { + let Some(child) = node.named_child(idx as u32) else { + continue; + }; + if child.kind() != "decorator" { + continue; + } + let Some(inner) = child.named_child(0) else { + continue; + }; + let callee_text = match inner.kind() { + "call" => { + let Some(function) = inner.child_by_field_name("function") else { + continue; + }; + text(function, bytes) + } + "identifier" | "attribute" | "scoped_identifier" => text(inner, bytes), + _ => continue, + }; + let last = callee_text.rsplit('.').next().unwrap_or(&callee_text); + if matches!( + last, + "task" | "shared_task" | "periodic_task" | "instrumented_task" | "receiver" + ) { + return true; + } + } + false +} + fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec { let Some(params_node) = node.child_by_field_name("parameters") else { return Vec::new(); }; let mut params = Vec::new(); - collect_param_names(params_node, bytes, &mut params); + collect_param_names(params_node, bytes, false, &mut params); params } +/// Variant of [`function_params`] that always includes id-like typed +/// Python params (`dag_id: str`, `dag_run_id: str`). Used by +/// `attach_route_handler` to populate `unit.params` for RouteHandler +/// units so middleware-injected auth checks (FastAPI +/// `dependencies=[Depends(...)]`, Flask `@requires_role(...)`, etc.) +/// can synthesise subjects that cover every handler input, including +/// the id-shaped ones that are *the* primary user-controlled data on +/// REST routes. 
+/// +/// The id-like filter in [`collect_param_names`] exists to keep +/// internal helper signatures (`def f(release_id: int, project: +/// Project)`) from passing `unit_has_user_input_evidence`'s param +/// heuristic, which would over-fire `missing_ownership_check`. Route +/// handlers don't need that filter, they pass the precondition gate +/// via `kind == RouteHandler`, and missing the id-like params from +/// `unit.params` actively breaks the middleware-injection coverage +/// path. +pub fn function_params_route_handler(node: Node<'_>, bytes: &[u8]) -> Vec { + let Some(params_node) = node.child_by_field_name("parameters") else { + return Vec::new(); + }; + let mut params = Vec::new(); + collect_param_names(params_node, bytes, true, &mut params); + params +} + +/// Walk a Python function-definition node's parameter list and +/// collect every parameter whose static type annotation resolves to +/// an integer or boolean scalar (or a generic-wrapped int such as +/// `Optional[int]`, `list[int]`, `Iterable[int]`). These names are +/// used to seed `AnalysisUnit::typed_bounded_vars` so the ownership +/// rule's `is_typed_bounded_subject` filter recognises the bounded +/// type without requiring an SSA-derived `VarTypes` map. +/// +/// No-op for non-Python `function_definition` nodes, only +/// tree-sitter-python exposes the `typed_parameter` / +/// `typed_default_parameter` shapes inspected here. Conservative: +/// only int/bool/float scalars and known integer-list wrappers +/// qualify; bare `str`, `bytes`, `Path`, custom DTO types, and +/// `Annotated[int, Body()]` wrappers are NOT lifted because the +/// presence of an HTTP-binding marker indicates the value is +/// caller-controlled (the SSA pipeline handles those). 
+fn python_int_bounded_typed_params(node: Node<'_>, bytes: &[u8]) -> HashSet { + let mut out: HashSet = HashSet::new(); + let Some(params_node) = node.child_by_field_name("parameters") else { + return out; + }; + for idx in 0..params_node.named_child_count() { + let Some(child) = params_node.named_child(idx as u32) else { + continue; + }; + if !matches!(child.kind(), "typed_parameter" | "typed_default_parameter") { + continue; + } + let mut name: Option = None; + let mut type_text: Option = None; + for inner_idx in 0..child.named_child_count() { + let Some(inner) = child.named_child(inner_idx as u32) else { + continue; + }; + if inner.kind() == "identifier" && name.is_none() { + let n = text(inner, bytes); + if !n.is_empty() { + name = Some(n); + } + } else if inner.kind() == "type" { + type_text = Some(text(inner, bytes)); + } + } + if let (Some(n), Some(t)) = (name, type_text) + && python_type_text_is_integer_bounded(&t) + { + out.insert(n); + } + } + out +} + +/// Conservative recogniser for Python type annotations that bound a +/// value to an integer or boolean scalar. Accepts: +/// * Bare `int`, `bool`, `float`. +/// * Common generic wrappers whose element type is one of those: +/// `Optional[int]`, `Union[int, None]`, `list[int]`, `List[int]`, +/// `tuple[int, ...]`, `Sequence[int]`, `Iterable[int]`, +/// `set[int]`, `frozenset[int]`, `dict[int, ...]` (key only). +/// +/// `Annotated[int, ...]` is intentionally rejected, the FastAPI / +/// Pydantic binding marker indicates the value is caller-controlled. +fn python_type_text_is_integer_bounded(text: &str) -> bool { + let trimmed = text.trim(); + // Accept `T | None` (PEP 604) by recursing on each branch. 
+ if trimmed.contains('|') { + return trimmed + .split('|') + .map(str::trim) + .all(|alt| alt == "None" || python_type_text_is_integer_bounded(alt)); + } + if matches!(trimmed, "int" | "bool" | "float") { + return true; + } + let Some((head, rest)) = trimmed.split_once('[') else { + return false; + }; + if !rest.ends_with(']') { + return false; + } + let inner = &rest[..rest.len() - 1]; + let head_trim = head.trim(); + // `Annotated[int, Body()]` etc. is a binding marker, refuse. + if matches!(head_trim, "Annotated" | "typing.Annotated") { + return false; + } + let inner_first = inner.split(',').next().unwrap_or(inner).trim(); + matches!( + head_trim, + "Optional" + | "typing.Optional" + | "Union" + | "typing.Union" + | "list" + | "List" + | "typing.List" + | "tuple" + | "Tuple" + | "typing.Tuple" + | "set" + | "Set" + | "typing.Set" + | "frozenset" + | "Frozenset" + | "Sequence" + | "typing.Sequence" + | "Iterable" + | "typing.Iterable" + | "Iterator" + | "typing.Iterator" + | "Collection" + | "typing.Collection" + | "dict" + | "Dict" + | "typing.Dict" + | "Mapping" + | "typing.Mapping" + ) && python_type_text_is_integer_bounded(inner_first) +} + /// Walk the tree starting at `node` and gather TS type-alias / /// interface names whose body references a TRPC-marker type /// (`TrpcSessionUser`, `TRPCContext`, …). Recurses only through @@ -2674,7 +2947,7 @@ fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec { /// stops at function or class bodies to avoid an O(units × tree) /// blowup on files with many small functions. /// -/// No-op for non-TS files — the matched node kinds only exist in +/// No-op for non-TS files, the matched node kinds only exist in /// the TS grammar. Used by [`FileMeta::scan`] (called once per file /// in `collect_top_level_units` / `attach_route_handler`) to amortise /// the alias scan across all units in the same source file. 
@@ -2748,7 +3021,7 @@ fn body_text_references_trpc_marker(body_text: &str) -> bool { /// IS one of the file-level TRPC aliases (`state.trpc_alias_names`, /// populated by [`scan_trpc_aliases_from_node_root`]) or its annotation /// text inlines `TrpcSessionUser` directly. Bare `ctx.user` is never -/// added to the static session-base list — that would over-suppress +/// added to the static session-base list, that would over-suppress /// in non-TRPC code. Instead, the dynamic per-unit set /// `self_scoped_session_bases` carries the lift. fn collect_trpc_ctx_param(node: Node<'_>, bytes: &[u8], state: &mut UnitState) { @@ -2842,7 +3115,7 @@ fn type_text_is_trpc_options(ty_text: &str, trpc_alias_names: &HashSet) return true; } // Also accept the bare alias name appearing anywhere in the - // annotation text — handles `Promise` and other + // annotation text, handles `Promise` and other // wrappers without enumerating every shape. Word-boundary check // avoids matching aliases that are substrings of longer // identifiers. @@ -2898,7 +3171,12 @@ fn extract_receiver_param_name(node: Node<'_>, bytes: &[u8]) -> Option { None } -fn collect_param_names(node: Node<'_>, bytes: &[u8], out: &mut Vec) { +fn collect_param_names( + node: Node<'_>, + bytes: &[u8], + include_id_like_typed: bool, + out: &mut Vec, +) { match node.kind() { "identifier" | "property_identifier" | "shorthand_property_identifier_pattern" => { let name = text(node, bytes); @@ -2906,9 +3184,103 @@ fn collect_param_names(node: Node<'_>, bytes: &[u8], out: &mut Vec) { out.push(name); } } + // Rust `parameter` node: descend ONLY into the `pattern` field so + // type-segment identifiers don't pollute the param-name set. 
+ // Without this scope, `dst: &std::path::Path` contributes `std`, + // `path`, and `Path` to `unit.params`, and `path` then matches + // the framework-request-name allow-list in + // `is_external_input_param_name`, gating + // `unit_has_user_input_evidence` open on internal helpers whose + // real params (`dst`, `tasks`, `index_base_map_size`) carry no + // user-facing shape. Cluster surfaced from + // meilisearch/index-scheduler/src/scheduler/process_snapshot_creation.rs::remove_tasks + // where `dst: &std::path::Path` made every `db.delete(task.uid)` + // call inside the snapshot cleanup loop fire + // `missing_ownership_check`. Same shape would over-fire for + // `req: &Request<...>` / `ctx: &Context` / similar typed + // helpers. + "parameter" => { + if let Some(pattern) = node.child_by_field_name("pattern") { + collect_param_names(pattern, bytes, include_id_like_typed, out); + return; + } + // Fallback (no `pattern` field): descend into named children + // generically, mirroring the default arm. + for idx in 0..node.named_child_count() { + let Some(child) = node.named_child(idx as u32) else { + continue; + }; + collect_param_names(child, bytes, include_id_like_typed, out); + } + } "default_parameter" | "typed_parameter" | "typed_default_parameter" => { + // tree-sitter-python's `typed_parameter` rule does not + // expose a `name` field (the identifier is the wrapper's + // first child, with the type expression as a sibling). We + // fall back to the first `identifier` child when + // `child_by_field_name("name")` returns None so typed + // Python params (`connection_id: str`, + // `organization_id: int`, …) actually flow into + // `unit.params` instead of being silently dropped. Without + // this, route-aware extractors (Flask + FastAPI) couldn't + // see a typed handler's path params and the FastAPI + // dependency-injection recogniser had no subject to + // synthesise its auth check against. 
Languages whose + // grammar carries a `name` field (TypeScript + // `required_parameter`, …) still take the explicit field + // path. + // + // Note: Restricting this fallback to non-id-like names + // (so internal helpers with `release_id: int`, + // `organization_id: int`, etc. don't pass + // `unit_has_user_input_evidence`) would avoid the helper + // FP regression observed on sentry. The principled + // long-term fix is cross-file type-flow so subjects like + // `project.id` (where `project: Project`) are recognised + // as typed-bounded everywhere they're used. Until that + // lands, we accept the cluster, handlers go through the + // route extractors, and route-decorator-derived auth + // checks suppress them. if let Some(name) = node.child_by_field_name("name") { - collect_param_names(name, bytes, out); + collect_param_names(name, bytes, include_id_like_typed, out); + return; + } + for idx in 0..node.named_child_count() { + let Some(child) = node.named_child(idx as u32) else { + continue; + }; + if child.kind() == "identifier" { + let name_text = text(child, bytes); + // Conservative for non-route-handler units: only + // push the name when it is NOT id-like. This is a + // stopgap until cross-file type-flow lets us + // suppress `obj.id` subjects on typed-object args; + // without it, exposing typed helpers like + // `def f(release_id: int, project: Project) -> ...` + // over-fires `missing_ownership_check` because the + // engine sees `project.id` as a foreign scoped id. 
+ // Route handlers (`include_id_like_typed = true`) + // bypass this filter, id-like params on a REST + // route are *the* primary user input, and the + // RouteHandler kind already passes + // `unit_has_user_input_evidence` unconditionally, + // so including them in `unit.params` doesn't + // affect that gate but does let + // `inject_middleware_auth` synthesise auth-check + // subjects that match the operation subjects (the + // FastAPI `dependencies=[Depends(...)]` coverage + // path that was previously empty for handlers like + // `def get_dag_run(dag_id: str, dag_run_id: str, + // session)`). + let is_id_like = is_python_id_like_typed_param(&name_text); + if !name_text.is_empty() + && !out.contains(&name_text) + && (include_id_like_typed || !is_id_like) + { + out.push(name_text); + } + return; + } } } _ => { @@ -2916,12 +3288,23 @@ fn collect_param_names(node: Node<'_>, bytes: &[u8], out: &mut Vec) { let Some(child) = node.named_child(idx as u32) else { continue; }; - collect_param_names(child, bytes, out); + collect_param_names(child, bytes, include_id_like_typed, out); } } } } +/// Ascii-lowered id-shape predicate used by the Python typed-param +/// fallback in `collect_param_names`. Mirrors +/// `auth_analysis::checks::is_id_like_name` (cannot share that fn +/// directly without a cross-module dep), both must move in lockstep +/// so the precondition gate and the param-extraction filter agree on +/// what counts as id-like. 
+fn is_python_id_like_typed_param(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + lower == "id" || lower.ends_with("id") || lower.ends_with("_id") || lower.ends_with("ids") +} + pub fn is_function_like(node: Node<'_>) -> bool { matches!( node.kind(), @@ -3028,6 +3411,7 @@ pub fn auth_check_from_call_site( line, args: call.args.clone(), condition_text: None, + is_route_level: false, }) } @@ -3200,20 +3584,46 @@ fn matches_request_query(chain: &[String]) -> bool { fn matches_session_context(chain: &[String]) -> bool { let lower = lower_segments(chain); - (lower.first().is_some_and(|segment| { + // Bare `session` is overloaded: in JS/TS it routinely means + // NextAuth/express-session and `session.user.id` is auth context; + // in Python `session.commit()`, `session.add(..)`, `session.scalar(..)` + // are SQLAlchemy ORM calls which have nothing to do with + // authentication. When the chain starts with bare `session`, + // refuse to classify it as auth context if the next segment is a + // canonical SQLAlchemy / SQLAlchemy-style ORM method name; + // those are read/write verbs and never identity accessors. Any + // other field-style accessor (`session.user`, `session.user_id`, + // `session.workspace_id`, `session.role`) stays a Session-context + // chain so the stale-authorization / ownership rules still see + // session-backed foreign ids. Bare `session` with no following + // segment is ambiguous but still accepted. + // Chain length 1 (`session` alone, as the receiver of a subscript + // like `session[:user_id]`) stays auth context, the session + // ambiguity only kicks in when there's a follow-up segment that + // can be inspected. Length 2 with a known ORM verb (`session.commit`, + // `session.add`) is denylisted; any other follow-up segment + // (`session.user`, `session.workspace_id`, `session.role`) keeps + // its Session classification.
Length 3+ chains with `session` at + // the root always stay auth (they describe a session-stored + // member or sub-member). + let bare_session_chain_is_auth = lower.first().is_some_and(|segment| segment == "session") + && (lower.len() == 1 || lower.len() >= 3 || !is_orm_session_verb(&lower[1])); + let unambiguous_chain_root = lower.first().is_some_and(|segment| { matches!( segment.as_str(), - "session" - | "current_user" + "current_user" | "current_account" | "current_member" | "securitycontext" | "principal" | "authentication" ) - })) || (lower.len() >= 2 - && matches!(lower[0].as_str(), "req" | "request") - && matches!(lower[1].as_str(), "session" | "user" | "currentuser")) + }); + bare_session_chain_is_auth + || unambiguous_chain_root + || (lower.len() >= 2 + && matches!(lower[0].as_str(), "req" | "request") + && matches!(lower[1].as_str(), "session" | "user" | "currentuser")) || (lower.len() >= 3 && lower[0] == "self" && matches!(lower[1].as_str(), "request" | "session" | "current_user") @@ -3223,6 +3633,46 @@ fn matches_session_context(chain: &[String]) -> bool { && matches!(lower[1].as_str(), "session" | "state")) } +/// Denylist of SQLAlchemy / generic ORM session verbs. The Python +/// pytest-fixture idiom (`session: Session = sqlalchemy_session()`) +/// drives every test method through `session.commit()` / +/// `session.add(...)` / `session.scalar(...)`; classifying any of +/// those calls as auth Session context would falsely qualify +/// thousands of test methods as receiving user input. Only verbs +/// that name a SQL/transaction operation are listed, identity- +/// looking field accessors (`user`, `user_id`, `role`, +/// `workspace_id`, `project_id`, ...) all pass through and remain +/// auth Session. 
+fn is_orm_session_verb(segment: &str) -> bool { + matches!( + segment, + "commit" + | "rollback" + | "flush" + | "refresh" + | "merge" + | "expunge" + | "expunge_all" + | "close" + | "begin" + | "begin_nested" + | "query" + | "scalar" + | "scalars" + | "execute" + | "exec" + | "exec_driver_sql" + | "add" + | "add_all" + | "delete" + | "bulk_save_objects" + | "bulk_insert_mappings" + | "bulk_update_mappings" + | "configure" + | "info" + ) +} + fn subscript_value_ref(node: Node<'_>, bytes: &[u8]) -> Option { let object = node .child_by_field_name("object") @@ -3679,13 +4129,13 @@ mod tests { assert!(is_self_actor_type_text("LocalUserView")); // Non-matches. - // Bare `User` — too loose; commonly a deserialised payload type. + // Bare `User`, too loose; commonly a deserialised payload type. assert!(!is_self_actor_type_text("User")); assert!(!is_self_actor_type_text("UserPreferences")); // `UserView` lacks an authority-prefix segment and stays a // payload-shaped name. assert!(!is_self_actor_type_text("UserView")); - // No prefix vocabulary match — still rejected. + // No prefix vocabulary match, still rejected. assert!(!is_self_actor_type_text("PaymentUser")); // Wrong suffix vocabulary. assert!(!is_self_actor_type_text("CurrentUserPreferences")); @@ -3695,7 +4145,7 @@ mod tests { assert!(!is_self_actor_type_text("Json")); // `RequireAuth` / `RequireLogin` were dropped from the exact // set: they aren't `User`-bearing types and aren't - // semantically the auth subject — they're guard markers. The + // semantically the auth subject, they're guard markers. The // route-aware `axum::classify_guard_type` still treats them // as a login guard via the looser substring match. assert!(!is_self_actor_type_text("RequireAuth")); @@ -3769,7 +4219,7 @@ mod tests { aliases.insert("GetOptions".to_string()); aliases.insert("UpdateOptions".to_string()); - // Inline `TrpcSessionUser` marker — accepted regardless of alias set. 
+ // Inline `TrpcSessionUser` marker, accepted regardless of alias set. assert!(type_text_is_trpc_options( ": { ctx: { user: NonNullable } }", &aliases @@ -3830,11 +4280,148 @@ mod tests { // Koa ctx.state / ctx.session. assert!(bt("ctx.session.user")); assert!(bt("ctx.state.user")); - // Negatives — bases that are NOT canonical authed-user roots. + // Negatives, bases that are NOT canonical authed-user roots. assert!(!bt("req.body")); assert!(!bt("req.params")); assert!(!bt("ctx.user")); assert!(!bt("data.user")); assert!(!bt("user")); } + + /// Pins the bare-`session` chain narrowing: ORM session verbs + /// (`commit` / `add` / `scalar` / `execute` / ...) are denylisted; + /// they do not contribute auth Session evidence even though the + /// chain root is the literal name `session`. Any other field- + /// shaped second segment (`user`, `user_id`, `workspace_id`, + /// `project_id`, `role`) keeps its Session classification so the + /// stale-authorization / missing-ownership rules still see + /// session-backed foreign ids. Closes the airflow pytest cluster + /// where `session.commit()` made `unit_has_user_input_evidence` + /// return true on test methods with no actual user input, while + /// preserving the gin/rails/rocket stale-session fixtures whose + /// session chains use foreign-id field accessors. + #[test] + fn matches_session_context_denylists_orm_session_verbs() { + use super::matches_session_context as msc; + let v = |chain: &[&str]| chain.iter().map(|s| s.to_string()).collect::>(); + // Bare `session.`, auth context. + assert!(msc(&v(&["session", "user"]))); + assert!(msc(&v(&["session", "user_id"]))); + assert!(msc(&v(&["session", "id"]))); + assert!(msc(&v(&["session", "uid"]))); + assert!(msc(&v(&["session", "email"]))); + assert!(msc(&v(&["session", "currentUser"]))); + // Foreign-id fields stored on the session, must remain auth + // Session for the stale-authorization rule (gin/rails/rocket + // fixtures).
+ assert!(msc(&v(&["session", "workspace_id"]))); + assert!(msc(&v(&["session", "project_id"]))); + assert!(msc(&v(&["session", "role"]))); + assert!(msc(&v(&["session", "currentWorkspaceID"]))); + // SQLAlchemy verbs, NOT auth context. + assert!(!msc(&v(&["session", "commit"]))); + assert!(!msc(&v(&["session", "rollback"]))); + assert!(!msc(&v(&["session", "scalar"]))); + assert!(!msc(&v(&["session", "scalars"]))); + assert!(!msc(&v(&["session", "add"]))); + assert!(!msc(&v(&["session", "delete"]))); + assert!(!msc(&v(&["session", "execute"]))); + assert!(!msc(&v(&["session", "flush"]))); + assert!(!msc(&v(&["session", "query"]))); + assert!(!msc(&v(&["session", "merge"]))); + assert!(!msc(&v(&["session", "refresh"]))); + assert!(!msc(&v(&["session", "close"]))); + // Bare `session` alone (length 1) stays auth, covers + // subscript shapes like `session[:workspace_id]` whose object + // is just the bare `session` identifier. + assert!(msc(&v(&["session"]))); + // `req.session.user`, unchanged: explicit auth-session base. + assert!(msc(&v(&["req", "session", "user"]))); + // `request.session`, unchanged: req/request-prefixed arm + // recognises `session` regardless of any subsequent segment. + assert!(msc(&v(&["request", "session"]))); + // `current_user.`, unambiguous chain root, fires regardless. + assert!(msc(&v(&["current_user", "id"]))); + assert!(msc(&v(&["current_user", "preferences"]))); + } + + /// Rust `parameter` nodes carry both a `pattern` field (the + /// binding) and a `type` field (the annotation). Before the + /// `parameter` arm was added to `collect_param_names`, the recursive default + /// arm collected identifiers from the `type` subtree as well, + /// turning `dst: &std::path::Path` into the param name set + /// `["dst", "std", "path", "Path"]`.
`path` then matched the + /// framework-request-name allow-list in `is_external_input_param_name`, + /// gating `unit_has_user_input_evidence` open on internal helpers + /// that take a filesystem-path argument and re-firing + /// `missing_ownership_check` at every id-shaped operation + /// downstream. The arm restricts descent to the `pattern` field + /// for Rust parameters so only true binding names reach + /// `unit.params`. Real-repo motivation: + /// meilisearch/index-scheduler/src/scheduler/process_snapshot_creation.rs::remove_tasks + /// (`dst: &std::path::Path` made every `db.delete(task.uid)` call + /// fire missing-ownership-check). Same shape would also fire for + /// Rust functions taking `req: &Request<...>`, + /// `ctx: &Context`, etc., where the type tail matches the + /// framework name list but the binding is unrelated. + #[test] + fn collect_param_names_rust_skips_type_segment_idents() { + use super::function_params; + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter::Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let src = b"unsafe fn remove_tasks(tasks: &[Task], dst: &std::path::Path, sz: usize) {}"; + let tree = parser.parse(src.as_slice(), None).unwrap(); + let func = tree + .root_node() + .child(0) + .expect("source_file should have a function"); + let params = function_params(func, src); + assert_eq!( + params, + vec!["tasks".to_string(), "dst".to_string(), "sz".to_string()], + "type-segment idents (`std`, `path`, `Path`) must NOT pollute the param-name set" + ); + } + + #[test] + fn collect_param_names_rust_handles_request_typed_params() { + // `req: &Request`, `Request` and `Body` lowercase to + // `request` and `body`, both in the framework name list. The + // binding `req` is the only legitimate param name. 
+ use super::function_params; + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter::Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let src = b"fn handle(req: &Request, state: AppState) -> Response { todo!() }"; + let tree = parser.parse(src.as_slice(), None).unwrap(); + let func = tree.root_node().child(0).expect("function"); + let params = function_params(func, src); + assert_eq!( + params, + vec!["req".to_string(), "state".to_string()], + "type idents `Request`/`Body`/`Response`/`AppState` must not leak as params" + ); + } + + #[test] + fn collect_param_names_rust_destructured_pattern_picks_up_bindings() { + // Tuple-pattern binding: `((a, b)): (u32, u32)` should yield + // both bound names from the pattern subtree, but NOT the type + // segment `u32`. + use super::function_params; + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter::Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let src = b"fn split((a, b): (u32, u32)) {}"; + let tree = parser.parse(src.as_slice(), None).unwrap(); + let func = tree.root_node().child(0).expect("function"); + let params = function_params(func, src); + assert!(params.contains(&"a".to_string()), "got {:?}", params); + assert!(params.contains(&"b".to_string()), "got {:?}", params); + assert!(!params.contains(&"u32".to_string()), "got {:?}", params); + } } diff --git a/src/auth_analysis/extract/django.rs b/src/auth_analysis/extract/django.rs index abed6f27..1b7dfcc2 100644 --- a/src/auth_analysis/extract/django.rs +++ b/src/auth_analysis/extract/django.rs @@ -209,7 +209,12 @@ fn collect_class_based_routes( } let line = method_node.start_position().row + 1; for call in &middleware_calls { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Django class-based-view decorators (`@method_decorator(login_required)`, + // `@permission_required(...)`) and DRF 
`permission_classes` + // are declared at the route boundary; mark route-level + // so coverage applies to the action body's operations. + check.is_route_level = true; unit.auth_checks.push(check); } } @@ -443,7 +448,14 @@ fn inject_middleware_auth( return; }; for call in middleware_calls { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Django decorators (`@login_required`, `@permission_required`, + // `@user_passes_test`, etc.) and DRF `permission_classes` are + // declared at the route boundary; mark route-level so + // `auth_check_covers_subject` short-circuits `true` for any + // non-login-guard match. See flask.rs / model.rs for the + // full rationale. + check.is_route_level = true; unit.auth_checks.push(check); } } diff --git a/src/auth_analysis/extract/flask.rs b/src/auth_analysis/extract/flask.rs index e26d3a9a..59076eb5 100644 --- a/src/auth_analysis/extract/flask.rs +++ b/src/auth_analysis/extract/flask.rs @@ -67,6 +67,15 @@ fn maybe_collect_flask_route( for decorator in decorator_expressions(node) { if let Some(mut specs) = parse_flask_route_decorator(decorator, bytes) { route_specs.append(&mut specs); + // FastAPI puts route-level dependencies (auth checks + + // logging hooks) inside the route decorator's + // `dependencies=[Depends(...)]` keyword argument, instead + // of as separate `@decorator` lines like Flask. Walk the + // route decorator's keyword args for that shape and lift + // each `Depends(call(...))` element into the + // middleware_calls list, so the same `inject_middleware_auth` + // path that Flask uses also picks up FastAPI auth deps. 
+ middleware_calls.extend(extract_fastapi_dependencies(decorator, bytes)); } else { middleware_calls.extend(expand_decorator_calls(decorator, bytes)); } @@ -220,6 +229,75 @@ fn expand_decorator_calls(node: Node<'_>, bytes: &[u8]) -> Vec { vec![call_site_from_node(node, bytes)] } +/// Walk the route-decorator call's keyword args looking for the FastAPI +/// `dependencies=[Depends(call(...)), Depends(call), ...]` shape. For +/// each `Depends(...)` list element, extract the inner callable as a +/// `CallSite` so it can flow through `inject_middleware_auth` and be +/// matched against the per-language authorization-check / login-guard +/// name lists. Refuses non-call elements and `Depends(...)` without a +/// recognised inner call shape. +/// +/// The function is decoupled from Flask semantics (Flask routes never +/// use `dependencies=`); the lookup is purely structural and matches +/// FastAPI's documented dependency-injection convention. Lives in the +/// flask module because Flask's route-decorator parser already targets +/// the `@.(, ...)` shape that FastAPI shares. +fn extract_fastapi_dependencies(decorator_expr: Node<'_>, bytes: &[u8]) -> Vec { + if decorator_expr.kind() != "call" { + return Vec::new(); + } + let Some(arguments) = decorator_expr.child_by_field_name("arguments") else { + return Vec::new(); + }; + let Some(value) = keyword_argument_value(arguments, bytes, "dependencies") else { + return Vec::new(); + }; + let mut out = Vec::new(); + for element in named_children(value) { + if let Some(call) = unwrap_depends_call(element, bytes) { + out.push(call); + } + } + out +} + +/// Unwrap one `Depends(...)` list element from a FastAPI `dependencies` +/// list and return the inner callable as a `CallSite`. Three shapes +/// are accepted: +/// * `Depends(callee(arg1, arg2))`, most common, the inner call is +/// the callable factory invocation; record `callee` as the auth +/// check. +/// * `Depends(callee)`, bare reference; record `callee` itself. 
+/// * `Depends()` / non-`Depends` items, skipped. +fn unwrap_depends_call(node: Node<'_>, bytes: &[u8]) -> Option { + if node.kind() != "call" { + return None; + } + let function = node.child_by_field_name("function")?; + let function_text = text(function, bytes); + if !is_depends_callee(&function_text) { + return None; + } + let arguments = node.child_by_field_name("arguments")?; + let first = named_children(arguments).into_iter().next()?; + match first.kind() { + "call" => Some(call_site_from_node(first, bytes)), + "identifier" | "attribute" | "scoped_identifier" => Some(call_site_from_node(first, bytes)), + _ => None, + } +} + +/// True for the FastAPI `Depends` marker, including the +/// fully-qualified `fastapi.Depends` form. Conservative: only literal +/// matches, no canonicalisation. +fn is_depends_callee(callee: &str) -> bool { + let trimmed = callee.trim(); + matches!( + trimmed, + "Depends" | "fastapi.Depends" | "fastapi.params.Depends" + ) +} + fn inject_middleware_auth( model: &mut AuthorizationModel, unit_idx: usize, @@ -231,8 +309,48 @@ fn inject_middleware_auth( return; }; for call in middleware_calls { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Mark as route-level: the check is declared at the route + // boundary (Flask `@requires_role(...)` decorator, FastAPI + // `dependencies=[Depends(...)]`, or any custom-router + // equivalent) and semantically authorizes every value the + // handler receives, path param, body, query, downstream + // row fetches, the lot. `auth_check_covers_subject` reads + // `is_route_level` and short-circuits `true` for any + // non-login-guard match, which is the correct shape for a + // decorator-level guard whose inner call carries no + // per-arg subject ref pointing back into the handler body. 
+ // LoginGuard / TokenExpiry / TokenRecipient kinds are + // already excluded by `has_prior_subject_auth`'s filter + // before they reach `auth_check_covers_subject`, so the + // flag is safe to set unconditionally here, it has no + // effect on those kinds. + check.is_route_level = true; unit.auth_checks.push(check); } } } + +#[cfg(test)] +mod fastapi_dependencies_tests { + use super::is_depends_callee; + + /// `is_depends_callee` only matches the FastAPI `Depends` marker. + /// Any other wrapper call inside `dependencies=[...]` is ignored , + /// extracting an inner callee from the wrong wrapper would + /// misclassify logging hooks or filter callables as auth checks. + #[test] + fn is_depends_callee_recognises_canonical_forms() { + assert!(is_depends_callee("Depends")); + assert!(is_depends_callee("fastapi.Depends")); + assert!(is_depends_callee("fastapi.params.Depends")); + // Whitespace tolerance. + assert!(is_depends_callee(" Depends ")); + // Negatives. + assert!(!is_depends_callee("Annotated")); + assert!(!is_depends_callee("Body")); + assert!(!is_depends_callee("Depends.something")); + assert!(!is_depends_callee("RequiresAuth")); + assert!(!is_depends_callee("")); + } +} diff --git a/src/auth_analysis/extract/mod.rs b/src/auth_analysis/extract/mod.rs index 34c4f712..d3d8546f 100644 --- a/src/auth_analysis/extract/mod.rs +++ b/src/auth_analysis/extract/mod.rs @@ -61,5 +61,104 @@ pub fn extract_authorization_model( } } + // **Dedup units by span across extractors.** Multiple extractors + // (e.g. Flask + Django on a Python file) each call + // `collect_top_level_units`, producing one unit per top-level + // function. When one extractor also recognises a route on that + // function and promotes its copy to `RouteHandler` (with injected + // middleware auth checks), the *other* extractor's untouched + // `Function` copy still runs through `check_ownership_gaps` and + // emits the FP from a unit that never saw the middleware-derived + // auth check. 
+ // + // This step keeps a single canonical unit per source span, + // preferring `RouteHandler` over `Function`, merging auth_checks + // and folding operation lists conservatively. Route registrations + // are remapped to the surviving unit index. + deduplicate_units_by_span(&mut model); + model } + +fn deduplicate_units_by_span(model: &mut AuthorizationModel) { + use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind}; + use std::collections::HashMap; + + // First pass: choose a winner for each span, prefer the + // first-seen `RouteHandler` over any `Function` copy. + let mut winner_by_span: HashMap<(usize, usize), usize> = HashMap::new(); + for (idx, unit) in model.units.iter().enumerate() { + let key = unit.span; + match winner_by_span.get(&key) { + None => { + winner_by_span.insert(key, idx); + } + Some(&existing) => { + let prev_kind = model.units[existing].kind; + if prev_kind != AnalysisUnitKind::RouteHandler + && unit.kind == AnalysisUnitKind::RouteHandler + { + winner_by_span.insert(key, idx); + } + } + } + } + + // Second pass: drain auth_checks from losers so we can append them + // to the winners after the layout collapses. + let mut moved_checks: Vec> = + Vec::with_capacity(model.units.len()); + for old_idx in 0..model.units.len() { + let span = model.units[old_idx].span; + let winner = *winner_by_span.get(&span).unwrap_or(&old_idx); + if winner == old_idx { + moved_checks.push(Vec::new()); + } else { + moved_checks.push(std::mem::take(&mut model.units[old_idx].auth_checks)); + } + } + + // Third pass: emit surviving units (clone the winners) and build + // the old-idx → new-idx remap. 
+ let mut new_idx_for_old: HashMap = HashMap::new(); + let mut surviving: Vec = Vec::with_capacity(winner_by_span.len()); + for old_idx in 0..model.units.len() { + let span = model.units[old_idx].span; + let winner = *winner_by_span.get(&span).unwrap_or(&old_idx); + if winner == old_idx { + new_idx_for_old.insert(old_idx, surviving.len()); + surviving.push(model.units[old_idx].clone()); + } + } + + // Fourth pass: drain loser auth_checks into their winners, deduping + // by (span, callee). Operations are not merged: both extractor + // passes recompute the same operation list from the AST, so the + // winner already carries the canonical set. + for (old_idx, checks) in moved_checks.iter_mut().enumerate() { + let span = model.units[old_idx].span; + let winner = *winner_by_span.get(&span).unwrap_or(&old_idx); + if winner == old_idx { + continue; + } + let Some(&new_winner_idx) = new_idx_for_old.get(&winner) else { + continue; + }; + for check in checks.drain(..) { + let already_present = surviving[new_winner_idx] + .auth_checks + .iter() + .any(|existing| existing.span == check.span && existing.callee == check.callee); + if !already_present { + surviving[new_winner_idx].auth_checks.push(check); + } + } + } + + model.units = surviving; + for route in &mut model.routes { + if let Some(&new_idx) = new_idx_for_old.get(&route.unit_idx) { + route.unit_idx = new_idx; + } + } +} diff --git a/src/auth_analysis/extract/rails.rs b/src/auth_analysis/extract/rails.rs index d5e46cd8..30c5153e 100644 --- a/src/auth_analysis/extract/rails.rs +++ b/src/auth_analysis/extract/rails.rs @@ -137,7 +137,14 @@ fn maybe_collect_controller( let line = child.start_position().row + 1; let middleware_calls = applicable_filters(&filter_directives, &action_name); for call in &middleware_calls { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Rails `before_action :authorize_user`-style filter + // 
callbacks run before the action and authorize the + // entire request, same shape as FastAPI / Flask + // `dependencies=[Depends(...)]`. Mark route-level so + // `auth_check_covers_subject` covers the row-fetches + // and downstream sinks the action body performs. + check.is_route_level = true; unit.auth_checks.push(check); } } diff --git a/src/auth_analysis/extract/sinatra.rs b/src/auth_analysis/extract/sinatra.rs index e30b32b1..2cd82441 100644 --- a/src/auth_analysis/extract/sinatra.rs +++ b/src/auth_analysis/extract/sinatra.rs @@ -114,7 +114,13 @@ fn maybe_collect_route( ); let line = block.start_position().row + 1; for call in before_filters { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Sinatra `before` filters run before the route handler + // body and authorize the request as a whole, same shape + // as Rails `before_action`. Route-level so coverage + // applies to the handler's row fetches and downstream + // sinks. + check.is_route_level = true; unit.auth_checks.push(check); } } diff --git a/src/auth_analysis/extract/spring.rs b/src/auth_analysis/extract/spring.rs index ce7a3913..e9e84c3b 100644 --- a/src/auth_analysis/extract/spring.rs +++ b/src/auth_analysis/extract/spring.rs @@ -111,7 +111,15 @@ fn maybe_collect_controller( rules, ); for call in &middleware_calls { - if let Some(check) = auth_check_from_call_site(call, line, rules) { + if let Some(mut check) = auth_check_from_call_site(call, line, rules) { + // Spring `@PreAuthorize` / `@Secured` / + // `@RolesAllowed` annotations are declared at the + // method or class boundary and authorize the entire + // request, same shape as FastAPI / Flask + // `dependencies=[Depends(...)]`. Mark route-level + // so `auth_check_covers_subject` covers row fetches + // and downstream sinks in the handler body. 
+ check.is_route_level = true; unit.auth_checks.push(check); } } diff --git a/src/auth_analysis/mod.rs b/src/auth_analysis/mod.rs index 31b33fef..f8dcff39 100644 --- a/src/auth_analysis/mod.rs +++ b/src/auth_analysis/mod.rs @@ -1,3 +1,5 @@ +#![doc = include_str!(concat!(env!("OUT_DIR"), "/auth_analysis.md"))] + pub mod checks; pub mod config; pub mod extract; @@ -26,7 +28,7 @@ fn byte_offset_to_point(tree: &Tree, byte: usize) -> tree_sitter::Point { /// source-level variable name. Built at `run_auth_analysis` call sites /// by merging type facts across all bodies in the file; a variable name /// with conflicting types in different bodies is dropped (absence is -/// safe — the sink gate just falls back to name-based classification). +/// safe, the sink gate just falls back to name-based classification). pub type VarTypes = HashMap; #[allow(clippy::too_many_arguments)] @@ -87,7 +89,7 @@ pub fn run_auth_analysis( /// Used by pass 1 to persist per-file auth summaries for cross-file /// helper lifting. Only returns summaries for units whose body /// already proves at least one positional parameter under ownership / -/// membership / admin / authorization check — i.e. the exact +/// membership / admin / authorization check, i.e. the exact /// single-file lift set, so the cross-file variant does not widen what /// counts as a helper. pub fn extract_auth_summaries_by_key( @@ -198,7 +200,7 @@ fn build_unit_summary(unit: &model::AnalysisUnit) -> Option]` /// so member-access subjects like `dto.age` are recognised as /// payload-incompatible. Only fires when the base param itself was -/// recognised as a typed extractor by a Phase 1-2 matcher — bare +/// recognised as a typed extractor by a typed-extractor matcher, bare /// parameters with no framework gate never lift their fields. 
fn apply_typed_bounded_params(model: &mut model::AuthorizationModel, var_types: &VarTypes) { for unit in &mut model.units { @@ -310,7 +312,7 @@ fn sink_class_for_type( /// /// When `global_summaries` is `Some`, cross-file helpers are looked up /// via [`GlobalSummaries::get_auth`] after the same-file summary -/// gather — this recovers the handler-in-file-A calling +/// gather, this recovers the handler-in-file-A calling /// `require_owner`-in-file-B case that single-file lifting cannot see. fn apply_helper_lifting( model: &mut model::AuthorizationModel, @@ -408,7 +410,7 @@ fn build_helper_summaries( let mut summary = AuthCheckSummary::default(); for check in &unit.auth_checks { // We only lift checks that actively prove ownership / - // membership / admin-rights / authorize-helper — login + // membership / admin-rights / authorize-helper, login // and token-validity checks don't justify foreign-id // mutations and we want to keep parity with // `has_prior_subject_auth`'s filter. @@ -435,7 +437,7 @@ fn build_helper_summaries( } } if !summary.param_auth_kinds.is_empty() { - // Deduplicate by last segment of the function name — the + // Deduplicate by last segment of the function name, the // lifting site matches the call's last segment too. let last = name.rsplit('.').next().unwrap_or(name).to_string(); summaries @@ -492,7 +494,7 @@ fn stronger_check_kind(a: model::AuthCheckKind, b: model::AuthCheckKind) -> mode /// For one unit, synthesise an `AuthCheck` at every call site that /// targets a helper with a non-trivial summary. Subjects are taken /// from `call_site.args_value_refs[K]` for each auth-checked param -/// position K — these are the caller's concrete subjects passed at +/// position K, these are the caller's concrete subjects passed at /// that arg slot, exactly what `auth_check_covers_subject` needs. 
fn synthesise_checks_for_unit( unit: &model::AnalysisUnit, @@ -501,7 +503,7 @@ fn synthesise_checks_for_unit( let line_of = |span: (usize, usize)| -> usize { // Span is byte offsets; we don't have direct access to a Tree // here. Caller assigns line via `line` field on call_site - // through CallSite metadata absence — fall back to the unit's + // through CallSite metadata absence, fall back to the unit's // line since covers_subject uses `check.line <= op.line` and // helper calls are typically near the unit start. let _ = span; @@ -541,6 +543,7 @@ fn synthesise_checks_for_unit( line, args: call.args.clone(), condition_text: None, + is_route_level: false, }); } out @@ -563,7 +566,7 @@ fn call_site_line(unit: &model::AnalysisUnit, call: &model::CallSite) -> Option< None } -/// Cross-file variant of [`synthesise_checks_for_unit`] — for each +/// Cross-file variant of [`synthesise_checks_for_unit`], for each /// call site in `unit`, resolve the callee against `GlobalSummaries` /// and look up an `AuthCheckSummary` that was persisted by some other /// file's pass-1 extraction. Skips call sites already handled by the @@ -589,7 +592,7 @@ fn synthesise_cross_file_checks_for_unit( if unit.name.as_deref() == Some(last) { continue; } - // Skip if the single-file map already handled this callee — + // Skip if the single-file map already handled this callee; // that path has richer same-file context (existing // summaries from sibling units in this model) and its // synthesised check is strictly more precise. @@ -636,6 +639,7 @@ fn synthesise_cross_file_checks_for_unit( line, args: call.args.clone(), condition_text: None, + is_route_level: false, }); } out @@ -767,7 +771,7 @@ mod tests { Some(SinkClass::DbCrossTenantRead) ); // DatabaseConnection: unrecognized verb (`execute`) → DbMutation - // (conservative default — treat as write-shaped). + // (conservative default, treat as write-shaped).
assert_eq!( sink_class_for_type(&TypeKind::DatabaseConnection, "conn.execute", &rules), Some(SinkClass::DbMutation) @@ -819,7 +823,7 @@ mod tests { ))); let var_types: VarTypes = HashMap::new(); apply_var_types_to_model(&mut model, &rules, &var_types); - // Unchanged — no entry in var_types for `db`. + // Unchanged, no entry in var_types for `db`. assert_eq!( model.units[0].operations[0].sink_class, Some(SinkClass::DbMutation) diff --git a/src/auth_analysis/model.rs b/src/auth_analysis/model.rs index 7cbf1c17..adb52c4d 100644 --- a/src/auth_analysis/model.rs +++ b/src/auth_analysis/model.rs @@ -55,7 +55,7 @@ pub enum OperationKind { } /// Classification of a sensitive operation by the resource it targets. -/// `check_ownership_gaps` only fires on the first five classes — +/// `check_ownership_gaps` only fires on the first five classes; /// `InMemoryLocal` is never authorization-relevant. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SinkClass { @@ -76,7 +76,7 @@ pub enum SinkClass { /// (Redis / memcache / distributed cache client). CacheCrossTenant, /// A method call against a local, in-memory collection (HashMap, - /// HashSet, Vec, …) — never authorization-relevant. + /// HashSet, Vec, …), never authorization-relevant. InMemoryLocal, } @@ -133,6 +133,33 @@ pub struct AuthCheck { pub line: usize, pub args: Vec, pub condition_text: Option, + /// True when the check was declared at the route boundary + /// (decorator / middleware / dependency-injection list) rather + /// than as a per-call check inside the handler body. + /// + /// Route-level non-login-guard checks authorize the *entire* + /// handler: they gate every value the handler receives, every + /// row the handler fetches, and every operation downstream.
An + /// in-body `auth_check_covers_subject` walk that requires a + /// per-name subject match cannot model those semantics: a + /// FastAPI `dependencies=[Depends(requires_access_dag(method= + /// "POST", access_entity=DagAccessEntity.RUN))]` is opaque to + /// the engine; the inner `requires_access_dag` call carries no + /// per-arg subject ref pointing to `dag_id` or `dag.id`. The + /// flag tells `auth_check_covers_subject` to short-circuit + /// `true` for any non-login-guard route-level check, leaving + /// only the LoginGuard / TokenExpiry / TokenRecipient kinds + /// (already excluded upstream by `has_prior_subject_auth`'s + /// filter) to be ignored. + /// + /// Set wherever an extractor injects route-boundary checks (Django, + /// Flask/FastAPI, Rails, Sinatra, Spring). Default `false` for + /// in-body checks (`require_membership(user, group_id)`, + /// `is_admin(user)`, etc.); those still flow through the + /// per-subject coverage logic so a check on + /// `community.creator_id` doesn't blanket-suppress every other + /// subject in the unit. + pub is_route_level: bool, } #[derive(Debug, Clone)] @@ -140,7 +167,7 @@ pub struct SensitiveOperation { pub kind: OperationKind, /// Sink classification. `None` means the operation was recorded /// for taxonomy completeness but does not match any known resource - /// class — defensive, and currently unused. + /// class, defensive, and currently unused. pub sink_class: Option, pub callee: String, pub subjects: Vec, @@ -183,7 +210,7 @@ pub struct AnalysisUnit { /// "fetch-then-authorize" exemption in `checks.rs`: if a row-fetch /// operation produces variable `V` and SOME auth check elsewhere /// in the unit names `V`, the row-fetch operation is considered - /// authorized — even though the check appears textually after the + /// authorized, even though the check appears textually after the /// fetch.
This is the standard idiom in row-level authz code: /// fetch the row first to extract the resource id, then call /// `check__(&user, &row, ...)` to authorize it. @@ -199,7 +226,7 @@ pub struct AnalysisUnit { /// copies of `V.id` / `V.user_id` / `V.uid` / `V.userId` for some /// `V ∈ self_actor_vars`). Populated when the extractor sees /// `let X = V.id` or `let X = (V.id as ..).into()` / `V.id.into()` - /// shapes — anywhere a route-handler reduces the authenticated + /// shapes, anywhere a route-handler reduces the authenticated /// principal to a scalar id and reuses it as a SQL parameter. /// Consulted by `is_actor_context_subject` so subjects whose `name` /// is in this set count as actor context, not foreign scoped IDs. @@ -217,7 +244,7 @@ pub struct AnalysisUnit { /// one of these names. pub authorized_sql_vars: HashSet, /// Local variables bound (by `let`, `:=`, `var`, `const`) to a - /// pure literal — string, integer, float, or boolean. These are + /// pure literal, string, integer, float, or boolean. These are /// developer-chosen constants and cannot be user-controlled, so /// they must never trip `.auth.missing_ownership_check` /// even when the variable name passes `is_id_like`. Closes the @@ -231,22 +258,21 @@ pub struct AnalysisUnit { /// `is_typed_bounded_subject` so parameters like Spring `Long /// userId`, Axum `Path`, or FastAPI `user_id: int` are not /// classified as scoped-identifier subjects even when their name - /// passes `is_id_like` — the framework guarantees the value is a + /// passes `is_id_like`, the framework guarantees the value is a /// number that cannot carry a SQL/file/shell payload. pub typed_bounded_vars: HashSet, - /// Phase 6: per-DTO-extractor parameter, the field names whose + /// per-DTO-extractor parameter, the field names whose /// declared type is a payload-incompatible scalar. Map key is the /// parameter name (e.g. `dto`), value is the list of field names /// (e.g. `["age", "count"]`). 
Populated by /// [`super::apply_typed_bounded_params`] only when the parameter - /// itself was recognised as a typed extractor by a Phase 1-2 - /// matcher — bare parameters with no framework gate never lift - /// their fields. + /// itself was recognised as a typed extractor, bare parameters + /// with no framework gate never lift their fields. pub typed_bounded_dto_fields: HashMap>, /// Per-unit dynamic session-base text set, supplementing the /// hard-coded list in `is_self_scoped_session_base`. Populated by /// the extractor when a parameter's static type signals a known - /// auth-context shape — e.g. TRPC's `Options { ctx: { user: + /// auth-context shape, e.g. TRPC's `Options { ctx: { user: /// NonNullable } }` adds `.user` so /// downstream `ctx.user.id` accesses count as actor context. Each /// entry is the dotted base text (e.g. `"ctx.user"`, diff --git a/src/auth_analysis/sql_semantics.rs b/src/auth_analysis/sql_semantics.rs index 136a3df0..2a787987 100644 --- a/src/auth_analysis/sql_semantics.rs +++ b/src/auth_analysis/sql_semantics.rs @@ -28,7 +28,7 @@ pub enum SqlAuthClassification { /// Query is auth-gated. The JOIN (or direct WHERE) pins returned /// rows to the bound user. We don't track *which* bind position - /// here — the caller treats whichever bind value flows into the + /// here, the caller treats whichever bind value flows into the /// query as the user-id witness; that's safe because the caller /// already requires the row binding to come from a `let X = …` /// site we can name. @@ -37,12 +37,12 @@ pub enum SqlAuthClassification { /// Classify `sql` as auth-gated under the configured ACL tables. /// Returns `Some(Authorized)` when one of the recognized patterns -/// holds, `None` otherwise (conservative — unknown shapes are treated +/// holds, `None` otherwise (conservative, unknown shapes are treated /// as unauthorized). 
pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option { let normalized = normalize_sql(sql); if !normalized.trim_start().starts_with("select") { - // For B3 we only authorize SELECT queries — INSERT/UPDATE/DELETE + // For B3 we only authorize SELECT queries, INSERT/UPDATE/DELETE // need their own analysis and aren't in scope. (A literal // `DELETE … WHERE user_id = ?N` could be safely authorized, // but the call sites we care about for FP suppression are @@ -60,7 +60,7 @@ pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option [AS] ? JOIN [AS] ? ON … WHERE -/// .user_id = ?N` — verifies that an ACL table appears in a JOIN +/// .user_id = ?N`, verifies that an ACL table appears in a JOIN /// clause and that the WHERE clause contains a `<…>.user_id = ?` (or /// bare `user_id = ?`) predicate. Order of the WHERE predicates /// doesn't matter; AND/OR connectors are ignored. @@ -87,14 +87,14 @@ fn matches_join_through_acl(sql: &str, acl_tables: &[String]) -> bool { where_clause_contains_user_id_bind(where_clause) } -/// Direct ownership: `SELECT … FROM WHERE … user_id = ?N` — no +/// Direct ownership: `SELECT … FROM WHERE … user_id = ?N`, no /// JOIN. Covers single-table reads where the row already carries the /// owning user id (`SELECT … FROM docs WHERE user_id = ?1`). We do /// NOT require `id = ?M` to also be present; the `user_id = ?N` /// predicate alone is sufficient, since any row returned must be /// owned by the bound user. /// -/// Refuses to fire when a JOIN is present — the JOIN target may not +/// Refuses to fire when a JOIN is present, the JOIN target may not /// be in the ACL list, so the WHERE predicate (which may apply to /// the joined table, e.g. 
`WHERE al.user_id = ?N` against an /// `audit_log` JOIN) doesn't actually pin the primary rows to the @@ -125,7 +125,7 @@ fn where_clause_contains_user_id_bind(where_clause: &str) -> bool { for (idx, _) in where_only.match_indices(needle) { // Make sure this is a column boundary on the left side // (avoid matching `posted_user_id` or `target_user_id` - // — those don't pin to the actor). + // since those don't pin to the actor). let before = where_only[..idx].chars().last(); if !is_column_boundary_left(before) { continue; @@ -158,11 +158,11 @@ fn looks_like_bind_param(after_eq: &str) -> bool { return false; } match bytes[0] { - // ?N (sqlite/sqlx anonymous) — accept ?, ?1, ?2… + // ?N (sqlite/sqlx anonymous), accept ?, ?1, ?2… b'?' => true, - // $N (postgres style) — require a digit after. + // $N (postgres style), require a digit after. b'$' => bytes.get(1).is_some_and(|b| b.is_ascii_digit()), - // :name (named bind) — require an identifier char after. + // :name (named bind), require an identifier char after. b':' => bytes .get(1) .is_some_and(|b| b.is_ascii_alphabetic() || *b == b'_'), @@ -277,7 +277,7 @@ mod tests { #[test] fn join_against_non_acl_table_is_not_authorized() { - // `audit_log` is not in the configured ACL list — JOIN doesn't + // `audit_log` is not in the configured ACL list; the JOIN doesn't // pin rows to the bound user, so the query is unauthorized. let sql = "SELECT d.* FROM docs d \ JOIN audit_log al ON al.doc_id = d.id \ @@ -301,7 +301,7 @@ mod tests { #[test] fn similar_column_names_do_not_trip_user_id_match() { - // `posted_user_id` shouldn't satisfy the `user_id = ?` check — + // `posted_user_id` shouldn't satisfy the `user_id = ?` check; // that column doesn't pin to the actor.
let sql = "SELECT * FROM posts WHERE posted_user_id = ?1"; assert_eq!(classify_sql_query(sql, &acl()), None); diff --git a/src/callgraph.rs b/src/callgraph.rs index 819c998f..4b3f8710 100644 --- a/src/callgraph.rs +++ b/src/callgraph.rs @@ -16,7 +16,7 @@ use std::path::{Path, PathBuf}; #[derive(Debug, Clone)] pub struct CallEdge { /// The raw callee string as it appeared in source (e.g. `"env::var"`). - /// Preserved for diagnostics — **not** the normalized form used for resolution. + /// Preserved for diagnostics, **not** the normalized form used for resolution. #[allow(dead_code)] // used for future diagnostics and path display pub call_site: String, } @@ -28,7 +28,7 @@ pub struct UnresolvedCallee { pub callee_name: String, } -/// A callee that matched multiple function definitions — ambiguous. +/// A callee that matched multiple function definitions, ambiguous. #[derive(Debug, Clone)] pub struct AmbiguousCallee { pub caller: FuncKey, @@ -168,14 +168,14 @@ pub(crate) fn callee_container_hint(raw: &str) -> &str { /// /// Key design notes: /// -/// * Keys are **language-scoped** — a Java `findById` and a Python +/// * Keys are **language-scoped**, a Java `findById` and a Python /// `findById` never alias. Every other index in this module is also /// language-scoped (`by_lang_name`, `by_lang_qualified`); keeping the /// same partition here means devirtualisation's "subset of today's /// targets" invariant is structurally preserved. /// * The container key carries the [`FuncKey::container`] verbatim /// (e.g. `"Repository"` or nested `"Outer::Inner"`). Empty containers -/// are not indexed in `by_container` — free top-level functions live +/// are not indexed in `by_container`, free top-level functions live /// only in `by_name` and are looked up via the `None` container path. 
/// * `SmallVec` inline capacity is sized for the common case (≤ 2 same- /// container overloads, ≤ 4 same-name candidates across containers); @@ -199,7 +199,7 @@ impl ClassMethodIndex { /// Iteration is over every `FuncKey` in the map; each key is /// inserted into `by_name` and (when its container is non-empty) /// into `by_container`. No ordering guarantees on the candidate - /// vectors — call sites that need determinism should sort downstream. + /// vectors, call sites that need determinism should sort downstream. pub fn build(summaries: &GlobalSummaries) -> Self { let mut by_container: HashMap<(Lang, String, String), SmallVec<[FuncKey; 2]>> = HashMap::new(); @@ -223,11 +223,11 @@ impl ClassMethodIndex { /// Resolve `(container, method)` to its candidate target set. /// - /// * `container = Some(c)` — return only candidates whose defining + /// * `container = Some(c)`, return only candidates whose defining /// container equals `c`. Empty slice when no such target exists, /// even if a same-name function lives in another container. /// This is the **devirtualised** path: a hard subset of `by_name`. - /// * `container = None` — return every same-name candidate in the + /// * `container = None`, return every same-name candidate in the /// language. This is the **fallback** path used when the receiver /// type is unknown; matches today's name-only behaviour. /// @@ -264,48 +264,19 @@ impl ClassMethodIndex { } } -// ───────────────────────────────────────────────────────────────────────────── -// Type hierarchy index — Phase 6 (subtype awareness) -// ───────────────────────────────────────────────────────────────────────────── +// ── Type hierarchy index ──────────────────────────────────────────────── -/// Per-language `(super_type) → SmallVec<[sub_type]>` index built once -/// per call-graph construction from every merged -/// [`crate::summary::FuncSummary::hierarchy_edges`]. 
When a method -/// call's receiver is statically typed as a super-class / trait / -/// interface, the call-graph wedge fans out the edge to every concrete -/// implementer's matching method — recovering the dispatch precision -/// that would otherwise be lost to today's name-only resolution. +/// Per-language `(super_type) → sub-types` index built from every merged +/// [`crate::summary::FuncSummary::hierarchy_edges`]. Lets virtual +/// dispatch fan out to every concrete implementer's matching method. /// -/// Subtype semantics covered: -/// * Java `class X extends Y` / `class X implements I` / `interface -/// I extends J` -/// * Rust `impl Trait for Type` -/// * TypeScript `class X extends Y implements I` / -/// `interface I extends J` -/// * Python `class X(Base)` (excludes `object`) -/// * PHP, Ruby, C++ — see [`crate::cfg::hierarchy`] for the -/// per-language extraction rules. +/// Covers Java `extends`/`implements`, Rust `impl Trait for Type`, TS +/// `extends`/`implements`, Python `class X(Base)`, plus PHP/Ruby/C++ +/// (see [`crate::cfg::hierarchy`]). Go's structural interfaces are +/// intentionally omitted, name-only resolution is used instead. /// -/// Go's structural / implicit interface satisfaction is intractable to -/// enumerate from per-file information and is **deliberately omitted** -/// — Go callers fall back to today's name-only resolution, so -/// precision is unchanged from the pre-Phase-6 baseline. -/// -/// Key design notes -/// ──────────────── -/// -/// * **Language-scoped.** Mirrors [`ClassMethodIndex`]: a Java -/// `Repository` and a Python `Repository` never alias. -/// * **Bare container names.** No namespace qualification. 
When -/// container names alias across unrelated namespaces (rare in -/// practice, common in mono-repos) the resolver may over-fan-out; -/// that is conservative for *correctness* (a subset of dispatch -/// targets is unsafe — virtual dispatch may genuinely reach any -/// implementer) and may need namespace-qualified keying as a -/// Phase 6.5 follow-up if benchmark precision regresses. -/// * **`SmallVec` inline capacity.** 4 implementers per super-type -/// covers most real-world hierarchies without spillover; spillover -/// allocates but keeps lookups O(1) amortised. +/// Container names are bare (no namespace), so cross-namespace aliases +/// may over-fan-out. That is conservative for correctness. #[derive(Debug, Default, Clone)] pub struct TypeHierarchyIndex { /// `(lang, super_type)` → distinct sub-type / impl container names. @@ -438,15 +409,11 @@ impl TypeHierarchyIndex { /// 3. On ambiguity: use two-segment qualified name to narrow candidates /// 4. Interop edges (explicit cross-language bridges) /// -/// **Phase 3 (typed call-graph devirtualisation):** when an SSA -/// summary on the caller carries a `(call_ordinal, container_name)` -/// entry in [`crate::summary::ssa_summary::SsaFuncSummary::typed_call_receivers`], -/// the matching call site is first resolved via [`ClassMethodIndex`] -/// restricted to the receiver-typed container. An exact match (after -/// arity filter) becomes the edge; a multi-candidate hit is fed back -/// into the standard resolver via `CalleeQuery.receiver_type`; a -/// zero-candidate hit falls through to today's name-only resolution -/// so receiver-type misclassifications never silently drop edges. +/// Typed-call devirtualisation: when the caller's SSA summary carries +/// a typed container for a call ordinal, that site is first resolved +/// via [`ClassMethodIndex`] restricted to the receiver type. Exact +/// match → edge; multi-candidate → fed back through +/// `CalleeQuery.receiver_type`; zero match → name-only fallback. 
/// /// Unresolved and ambiguous callees are recorded for diagnostics but /// do **not** create edges. @@ -460,7 +427,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg index.insert(key.clone(), idx); } - // Phase 3: build a single `(lang, container, name) → candidates` + // build a single `(lang, container, name) → candidates` // index from the merged summaries. Used below to devirtualise // every method-call edge whose receiver has a recoverable type // fact. Cost is one allocation per FuncKey across the program; @@ -468,7 +435,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg // win on codebases with many same-name methods. let method_index = ClassMethodIndex::build(summaries); - // Phase 6: build a sibling `(lang, super_type) → sub_types` index + // build a sibling `(lang, super_type) → sub_types` index // from every merged summary's `hierarchy_edges`. Consumed below // to fan out method-call edges to all known concrete // implementers when a receiver's static type is a super-class / @@ -497,7 +464,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg None }; - // Phase 3: per-caller `(call_ordinal → container_name)` map + // per-caller `(call_ordinal → container_name)` map // pulled from the caller's SSA summary, when one exists. // Empty when the caller has no SSA summary (zero-param trivial // bodies skip extraction unless they had typed receivers) or @@ -520,23 +487,15 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg let leaf = callee_leaf_name(raw_callee); // Two-segment form for diagnostics / fallback disambiguation. let qualified = normalize_callee_name(raw_callee); - // Structured arity carried per call site — used to disambiguate + // Structured arity carried per call site, used to disambiguate // same-name/different-arity overloads during resolution. 
let arity_hint: Option = site.arity; - // Phase 3 devirtualisation entry point. Only fires for - // method calls (sites carrying a structured receiver) when - // the caller's SSA summary recorded a typed container for - // this ordinal. When `Some(container)` resolves to a - // single arity-matching target, we add the edge and skip - // the standard resolver. When it resolves to multiple, - // we fall through with the container hinted as - // `receiver_type` so `resolve_callee`'s authoritative - // step-1 picks the right one. When it resolves to zero, - // we fall through entirely so today's name-only path can - // still find the edge — preserving the - // "subset of today's targets, never a superset" rule - // even under type-fact misclassification. + // Devirtualisation: for method calls whose SSA summary + // recorded a typed container, resolve via ClassMethodIndex + // first. Single match → direct edge; multi → fall through + // with `receiver_type` set; zero → name-only fallback so + // misclassified receivers never silently drop edges. let typed_container: Option<&str> = if site.receiver.is_some() { typed_receivers.get(&site.ordinal).copied() } else { @@ -544,12 +503,10 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg }; if let Some(container) = typed_container { - // Phase 6: resolve the typed container *plus* every - // known sub-type / impl in the hierarchy index, so a - // receiver typed as a super-class / trait / interface - // fans out to every concrete implementer. When the - // hierarchy has no matching super-type entry, this - // collapses to the Phase 3 direct-container lookup. + // Resolve the typed container plus every known + // sub-type / impl, so a super-class / trait / interface + // receiver fans out to every concrete implementer. + // No hierarchy entry → direct-container lookup. 
let widened: Vec = hierarchy.resolve_with_hierarchy( &method_index, caller_key.lang, @@ -575,8 +532,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg } continue; } - // Phase 6: multiple arity-filtered candidates means - // genuine virtual dispatch through a super-type — fan + // multiple arity-filtered candidates means + // genuine virtual dispatch through a super-type, fan // out to *every* implementer. This widens edges // (correctly: the call genuinely may target any // implementer at runtime) so SCC sizes may grow on @@ -614,7 +571,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg continue; } // Either zero matches (fall through to legacy path) or - // multiple matches on the direct container — let + // multiple matches on the direct container, let // `resolve_callee` apply its authoritative // receiver_type filter + tie-breakers. if !arity_filtered.is_empty() { @@ -652,8 +609,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg // Rust callers with a module-qualified call (no receiver) go // through the `use`-map aware resolver first. When the call has - // a structured receiver it is a method call — the qualifier is - // an impl/trait name, not a module path — so we fall back to the + // a structured receiver it is a method call, the qualifier is + // an impl/trait name, not a module path, so we fall back to the // structured resolver. All other languages skip the use-map // branch entirely. let use_rust_path = caller_key.lang == Lang::Rust && site.receiver.is_none(); @@ -671,11 +628,11 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg // categorize each hint so the resolver can apply the right // policy: // - // * `namespace_qualifier` — structured module/namespace + // * `namespace_qualifier`, structured module/namespace // prefix (`env` in `env::var`, `http` in `http.Get`). 
- // * `receiver_var` — syntactic receiver variable (e.g. + // * `receiver_var`, syntactic receiver variable (e.g. // `obj` in `obj.method`); used only as a last tie-break. - // * `caller_container` — caller's own class/impl, so bare + // * `caller_container`, caller's own class/impl, so bare // `foo()` inside a method resolves to the same class. // // The raw text-parsed container (legacy @@ -815,7 +772,7 @@ fn resolve_via_interop( /// Compute SCC decomposition and topological ordering of the call graph. /// /// `petgraph::algo::tarjan_scc` returns SCCs in *reverse* topological order -/// of the condensation DAG — i.e. leaf SCCs (no outgoing cross-SCC edges) +/// of the condensation DAG, i.e. leaf SCCs (no outgoing cross-SCC edges) /// come **first**. That is exactly the **callee-first** order suitable for /// bottom-up taint propagation. pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis { @@ -850,7 +807,7 @@ pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis { /// [`crate::commands::scan::run_topo_batches`]. `cross_file` is a tighter /// signal used by joint fixed-point convergence: it implies the /// recursion involves at least one cross-file call edge, so the inline -/// cache and per-iteration findings need joint convergence — not just +/// cache and per-iteration findings need joint convergence, not just /// summary convergence. pub struct FileBatch<'a> { pub files: Vec<&'a PathBuf>, @@ -901,7 +858,7 @@ pub fn callers_of(cg: &CallGraph, callee: &FuncKey) -> Vec { /// result is a `HashSet` suitable for membership checks while /// filtering the batch's file list. 
/// -/// A changed callee's *own* namespace is also included — if the +/// A changed callee's *own* namespace is also included, if the /// callee's summary was refined, the file it lives in may itself /// have been a caller (intra-file recursion) or may carry sibling /// functions whose analysis should be re-run alongside the callee @@ -958,7 +915,7 @@ pub fn scc_file_batches_with_metadata<'a>( // 2. Build file relative-path → (min topo index, has_mutual_recursion, cross_file). // `cross_file` is set whenever the file participates in an SCC whose - // nodes span more than one namespace — the cross-file signal. + // nodes span more than one namespace, the cross-file signal. let mut file_topo: HashMap<&str, (usize, bool, bool)> = HashMap::new(); for (topo_pos, &scc_idx) in analysis.topo_scc_callee_first.iter().enumerate() { let scc_recursive = analysis.sccs[scc_idx].len() > 1; @@ -1015,7 +972,7 @@ pub fn scc_file_batches_with_metadata<'a>( /// of its functions appear. This ensures leaf callees are available as early /// as possible for files that depend on them. Caller functions in the same /// file that happen to be in a later SCC are no worse off than the current -/// fully-parallel approach — they simply don't yet benefit from ordering, +/// fully-parallel approach, they simply don't yet benefit from ordering, /// but nothing is lost. 
/// /// Returns `(ordered_batches, orphan_files)` where orphan_files are paths @@ -1188,7 +1145,7 @@ mod tests { fn same_name_python_and_rust() { let py_foo = make_summary("foo", "handler.py", "python", 0, vec![]); let rs_foo = make_summary("foo", "handler.rs", "rust", 0, vec![]); - // Python caller calls "foo" — should only see the Python one + // Python caller calls "foo", should only see the Python one let py_caller = make_summary("main", "app.py", "python", 0, vec!["foo"]); let gs = merge_summaries(vec![py_foo, rs_foo, py_caller], None); @@ -1315,7 +1272,7 @@ mod tests { let gs = merge_summaries(vec![helper_a, helper_b, caller], None); let cg = build_call_graph(&gs, &[]); - assert_eq!(cg.graph.edge_count(), 0); // no edge — ambiguous + assert_eq!(cg.graph.edge_count(), 0); // no edge, ambiguous assert!(cg.unresolved_not_found.is_empty()); assert_eq!(cg.unresolved_ambiguous.len(), 1); assert_eq!(cg.unresolved_ambiguous[0].callee_name, "helper"); @@ -1728,7 +1685,7 @@ mod tests { // Two "send" functions in different namespaces. let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]); let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]); - // Caller is in a third namespace, calling "http::send" — leaf "send" + // Caller is in a third namespace, calling "http::send", leaf "send" // is ambiguous, but "http" qualifier should match "src/http.rs". 
let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["http::send"]); @@ -1766,7 +1723,7 @@ mod tests { #[test] fn unqualified_callee_stays_ambiguous() { - // Same setup but caller uses unqualified "send" — no disambiguation + // Same setup but caller uses unqualified "send", no disambiguation let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]); let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]); let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["send"]); @@ -1806,7 +1763,7 @@ mod tests { // ── structured-metadata disambiguation (callee metadata) ───────────── /// Helper: build a summary whose callees carry structured CalleeSite - /// metadata — used by the tests below to exercise arity / receiver / + /// metadata, used by the tests below to exercise arity / receiver / /// qualifier propagation into resolution. fn summary_with_sites( name: &str, @@ -1840,7 +1797,7 @@ mod tests { // Two `encode` functions in the same file, different arities. let encode1 = make_summary("encode", "src/codec.rs", "rust", 1, vec![]); let encode2 = make_summary("encode", "src/codec.rs", "rust", 2, vec![]); - // Caller lives in *another* file so namespace does not disambiguate — + // Caller lives in *another* file so namespace does not disambiguate; // the only signal is the per-call-site arity. let caller = summary_with_sites( "driver", @@ -2007,7 +1964,7 @@ mod tests { #[test] fn legacy_string_callees_still_resolve() { let helper = make_summary("helper", "src/lib.rs", "rust", 0, vec![]); - // make_summary already returns CalleeSite::bare entries — i.e. the + // make_summary already returns CalleeSite::bare entries, i.e. the // "lifted legacy" form with no arity or receiver metadata.
let caller = make_summary("main", "src/lib.rs", "rust", 0, vec!["helper"]); let gs = merge_summaries(vec![helper, caller], None); @@ -2017,7 +1974,7 @@ mod tests { assert!(cg.unresolved_ambiguous.is_empty()); } - // ── ClassMethodIndex (Phase 1: structural index, no behaviour wiring) ── + // ── ClassMethodIndex ──────────────────────────────────────────────── /// Helper: `(name, container)` pairs in the same file. Builds two /// summaries with the same leaf name on different containers so the @@ -2058,7 +2015,7 @@ mod tests { assert_eq!(cache_hits.len(), 1); assert_eq!(cache_hits[0].container, "Cache"); - // Bare-name lookup keeps both candidates — fallback behaviour. + // Bare-name lookup keeps both candidates, fallback behaviour. let bare_hits = idx.resolve(Lang::Rust, None, "findById"); assert_eq!( bare_hits.len(), @@ -2070,7 +2027,7 @@ mod tests { #[test] fn class_method_index_falls_back_to_name_when_container_unknown() { // `None` container or empty-string container both route to - // the bare-name index — equivalent to today's name-only edge + // the bare-name index, equivalent to today's name-only edge // insertion. let svc = make_method_summary("process", "OrderService", "src/svc.rs", "rust", 1); let helper = make_summary("process", "src/util.rs", "rust", 1, vec![]); @@ -2082,7 +2039,7 @@ mod tests { let none_hits = idx.resolve(Lang::Rust, None, "process"); assert_eq!(none_hits.len(), 2); - // Empty string container behaves identically to None — it is + // Empty string container behaves identically to None, it is // not stored under any container key. let empty_hits = idx.resolve(Lang::Rust, Some(""), "process"); assert_eq!(empty_hits.len(), 2); @@ -2107,7 +2064,7 @@ mod tests { .is_empty() ); // Right method, wrong container → empty (no fallback to bare-name - // when a container is supplied — that's the whole devirtualisation + // when a container is supplied, that's the whole devirtualisation // promise). 
assert!( idx.resolve(Lang::Rust, Some("OtherClass"), "findById") @@ -2140,7 +2097,7 @@ mod tests { #[test] fn class_method_index_handles_arity_overloads() { // Two arity overloads on the same container are both kept under - // the same `(container, name)` key — arity narrowing is the + // the same `(container, name)` key, arity narrowing is the // caller's responsibility (today's resolver also does this). let one = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 1); let two = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 2); @@ -2156,7 +2113,7 @@ mod tests { ); } - // ── Phase 3: devirtualised edge insertion via typed_call_receivers ── + // ── devirtualised edge insertion via typed_call_receivers ── /// Two `findById` definitions live on different containers in /// different files. A caller whose SSA summary records the @@ -2241,7 +2198,7 @@ mod tests { use crate::summary::ssa_summary::SsaFuncSummary; // Single `process` on `Worker`. No `process` exists on - // `Other` — that's the receiver type the caller's SSA + // `Other`, that's the receiver type the caller's SSA // summary will (incorrectly) record. let worker = make_method_summary("process", "Worker", "src/worker.rs", "rust", 1); let caller = summary_with_sites( @@ -2270,7 +2227,7 @@ mod tests { gs.insert_ssa( caller_key.clone(), SsaFuncSummary { - // Wrong receiver type — `Other::process` does not exist. + // Wrong receiver type, `Other::process` does not exist. 
typed_call_receivers: vec![(0, "Other".to_string())], ..Default::default() }, @@ -2292,7 +2249,7 @@ mod tests { ); } - // ── Phase 6: TypeHierarchyIndex ─────────────────────────────────── + // ── TypeHierarchyIndex ─────────────────────────────────── /// Helper: build a hierarchy index from a list of /// `(lang, sub, super)` edges by injecting them onto a single @@ -2334,7 +2291,7 @@ mod tests { TypeHierarchyIndex::build(&gs) } - /// B-1: Round-trip — a hierarchy built from a small set of edges + /// B-1: Round-trip, a hierarchy built from a small set of edges /// answers `subs_of` correctly and `super_keys_len` matches the /// distinct super count. #[test] @@ -2356,7 +2313,7 @@ mod tests { assert_eq!(h.super_keys_len(), 2); } - /// B-2: Java interface dispatch — `Repository r; r.findById(...)` + /// B-2: Java interface dispatch, `Repository r; r.findById(...)` /// fans out to every concrete implementer's `findById`. #[test] fn b2_java_interface_dispatch_fans_out_to_all_impls() { @@ -2421,7 +2378,7 @@ mod tests { assert_eq!(targets.len(), 2, "B-2: exactly two fan-out edges expected"); } - /// B-3: Java extends — `Base b; b.foo()` reaches Base AND Derived + /// B-3: Java extends, `Base b; b.foo()` reaches Base AND Derived /// when Derived extends Base. Pins inheritance fan-out separately /// from interface implements. #[test] @@ -2479,7 +2436,7 @@ mod tests { ); } - /// B-4: Rust trait dispatch — `Box; r.find(...)` reaches + /// B-4: Rust trait dispatch, `Box; r.find(...)` reaches /// every `impl Repo for X` `find`. #[test] fn b4_rust_trait_dispatch_fans_out_to_impls() { @@ -2536,10 +2493,9 @@ mod tests { ); } - /// B-7: Empty hierarchy — when the typed container has no recorded + /// B-7: Empty hierarchy, when the typed container has no recorded /// sub-types, `resolve_with_hierarchy` collapses to the direct - /// `ClassMethodIndex::resolve` lookup. Pin: Phase 6 is a no-op - /// when no inheritance was extracted. + /// `ClassMethodIndex::resolve` lookup. 
#[test] fn b7_empty_hierarchy_falls_back_to_single_container() { use crate::summary::ssa_summary::SsaFuncSummary; @@ -2561,7 +2517,7 @@ mod tests { ); let mut gs = merge_summaries(vec![repo, cache, caller], None); - // No hierarchy_edges set anywhere — Repository has no + // No hierarchy_edges set anywhere, Repository has no // sub-types, so devirtualisation collapses to direct match. let caller_key = FuncKey { lang: Lang::Rust, @@ -2589,10 +2545,9 @@ mod tests { assert_eq!(targets[0].container, "Repository"); } - /// B-8: Concrete sub-type — when the receiver is typed as the + /// B-8: Concrete sub-type, when the receiver is typed as the /// concrete sub-class (not the super-type), no hierarchy - /// expansion fires. Pin: Phase 6 narrows on concrete types - /// exactly like Phase 3. + /// expansion fires. #[test] fn b8_concrete_subtype_does_not_widen() { use crate::summary::ssa_summary::SsaFuncSummary; @@ -2654,7 +2609,7 @@ mod tests { assert_eq!(targets[0].container, "UserRepo"); } - /// B-9: Diamond — multiple impls sharing a super-type, dedup + /// B-9: Diamond, multiple impls sharing a super-type, dedup /// applied per call site so each FuncKey is edged at most once. #[test] fn b9_diamond_dedup_one_edge_per_funckey() { @@ -2662,7 +2617,7 @@ mod tests { let a = make_method_summary("doIt", "A", "src/A.java", "java", 0); let b = make_method_summary("doIt", "B", "src/B.java", "java", 0); - // A and B both extend Iface in two separate file emissions — + // A and B both extend Iface in two separate file emissions, so // hierarchy_edges duplicates across files; dedup expected. let mut h1 = make_method_summary("__h", "Iface", "src/I1.java", "java", 0); h1.hierarchy_edges = vec![ @@ -2722,7 +2677,7 @@ mod tests { assert!(containers.contains("A") && containers.contains("B")); } - /// B-13: Stale hierarchy edge — sub-type referenced by an edge + /// B-13: Stale hierarchy edge, sub-type referenced by an edge /// no longer has a matching FuncKey.
Resolver must not panic /// and must still resolve to whatever IS present. #[test] @@ -2730,7 +2685,7 @@ mod tests { use crate::summary::ssa_summary::SsaFuncSummary; // `Base` exists; `Derived` referenced by hierarchy_edges but - // its `foo` is never defined. Phase 6 must not panic and + // its `foo` is never defined. Resolver must not panic and // must still emit the Base::foo edge. let base = make_method_summary("foo", "Base", "src/Base.java", "java", 0); let mut h = make_method_summary("__h", "X", "src/X.java", "java", 0); @@ -2815,7 +2770,7 @@ mod tests { arity: Some(0), ..Default::default() }; - // A typed_call_receivers entry with ordinal=0 — but since the + // A typed_call_receivers entry with ordinal=0, but since the // site has receiver=None, this MUST be ignored. gs.insert_ssa( caller_key.clone(), diff --git a/src/cfg/blocks.rs b/src/cfg/blocks.rs index 0a20181f..fb6b6ed1 100644 --- a/src/cfg/blocks.rs +++ b/src/cfg/blocks.rs @@ -10,7 +10,7 @@ use tree_sitter::Node; /// at the *case-level* shape `build_switch` sees here. Rust `match`, Go /// `switch`, and Java arrow-switches qualify; classic Java/C/C++/JS switches /// with fall-through do not. The check is per-language because Java mixes -/// arrow and classic shapes — that's handled by inspecting the case kind in +/// arrow and classic shapes, that's handled by inspecting the case kind in /// [`extract_case_literal_text`]. fn lang_has_exclusive_cases(lang: &str) -> bool { matches!(lang, "rust" | "go") @@ -19,7 +19,7 @@ fn lang_has_exclusive_cases(lang: &str) -> bool { /// Extract the scrutinee subtree from a switch-like AST node. /// /// Returns the AST node referenced by the language's scrutinee field. Only -/// fires for Rust `match`, Go `switch`, and Java `switch` statements — other +/// fires for Rust `match`, Go `switch`, and Java `switch` statements, other /// languages return `None` so [`build_switch`] keeps its legacy behavior. 
fn extract_scrutinee_node<'a>(ast: Node<'a>, lang: &str) -> Option> { let field = match lang { @@ -39,7 +39,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) -> let kind = case.kind(); match (lang, kind) { ("rust", "match_arm") => { - // Reject guarded arms — `match x { y if cond => ... }`. + // Reject guarded arms, `match x { y if cond => ... }`. if case.child_by_field_name("guard").is_some() { return None; } @@ -71,7 +71,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) -> text_of(inner, code) } ("go", "expression_case") => { - // Go case `case v1, v2: ...` — only handle exactly one expression. + // Go case `case v1, v2: ...`, only handle exactly one expression. let value = case.child_by_field_name("value")?; let mut named_children: Vec = Vec::new(); let mut cursor = value.walk(); @@ -195,7 +195,7 @@ pub(super) fn extract_catch_param_name<'a>( // ------------------------------------------------------------------------- /// Builds CFG for Ruby's `begin`/`rescue`/`ensure` blocks (and `body_statement` -/// with inline rescue). Ruby's `begin` has no `body` field — the try-body +/// with inline rescue). Ruby's `begin` has no `body` field, the try-body /// statements are direct children before `rescue`/`else`/`ensure` nodes. #[allow(clippy::too_many_arguments)] pub(super) fn build_begin_rescue<'a>( @@ -305,7 +305,7 @@ pub(super) fn build_begin_rescue<'a>( vec![synth] } else { - // No param name — will wire exception edges to first rescue body node + // No param name, will wire exception edges to first rescue body node Vec::new() }; @@ -333,7 +333,7 @@ pub(super) fn build_begin_rescue<'a>( current_body_id, ) } else { - // No body field — build rescue node itself as a block. + // No body field, build rescue node itself as a block. // Filter out meta-children (exceptions, exception_variable) by // iterating and building only statement children. 
let mut rescue_cursor = rescue_node.walk(); @@ -407,7 +407,7 @@ pub(super) fn build_begin_rescue<'a>( try_exits }; - // 6. Build ensure clause (Ruby's finally — always runs) + // 6. Build ensure clause (Ruby's finally, always runs) if let Some(ensure_node) = ensure_clause { let mut ensure_preds: Vec = Vec::new(); ensure_preds.extend(&normal_exits); @@ -443,7 +443,7 @@ pub(super) fn build_begin_rescue<'a>( } // ------------------------------------------------------------------------- -// switch handler — multi-way dispatch with fallthrough +// switch handler, multi-way dispatch with fallthrough // ------------------------------------------------------------------------- /// True for AST kinds that wrap a single switch case body. @@ -490,7 +490,7 @@ pub(super) fn case_has_default_label(case: Node<'_>) -> bool { /// Build CFG for a switch statement. /// /// The dispatch is decomposed into a chain of binary `StmtKind::If` headers -/// — one per non-default case — because the SSA terminator only models 0/1/2 +/// (one per non-default case) because the SSA terminator only models 0/1/2 /// successors. A monolithic N-way header would otherwise be collapsed to /// `Goto(first)` and silently drop every other case. Each header's True edge /// reaches its case body; the False edge falls through to the next header (or @@ -544,7 +544,7 @@ pub(super) fn build_switch<'a>( } } - // Grammar didn't expose recognisable case nodes — fall back to a single + // Grammar didn't expose recognisable case nodes, fall back to a single // header + Block-style walk so nodes still get linked. if cases.is_empty() { let header = push_node( @@ -603,7 +603,7 @@ pub(super) fn build_switch<'a>( // arrow-switch), pre-extract the scrutinee text + idents so the synthetic // dispatch headers can carry a ` == ` condition.
// Falls back to `None` when the scrutinee is structurally complex (calls, - // member chains, parenthesized expressions in Go) — the existing first- + // member chains, parenthesized expressions in Go), the existing first- // reachable behavior remains correct in that case. let supports_exclusive_cases = lang_has_exclusive_cases(lang) || lang == "java"; let (scrutinee_text, scrutinee_idents) = if supports_exclusive_cases { @@ -647,7 +647,7 @@ pub(super) fn build_switch<'a>( for (idx, (case, is_default)) in cases.iter().copied().enumerate() { let is_last = idx + 1 == cases.len(); - // Default at the chain tail doesn't get its own dispatch If — the + // Default at the chain tail doesn't get its own dispatch If, the // previous header's False edge already targets it directly. let case_first_preds: Vec = if is_default && is_last { // First node of the default body becomes the False target of the @@ -675,12 +675,13 @@ pub(super) fn build_switch<'a>( ); // The dispatch header is purely structural (it stands in for the // discriminant comparison). It must not inherit Sink/Source labels - // from the case body's text — push_node uses `text_of(ast)` for + // from the case body's text, push_node uses `text_of(ast)` for // non-call kinds, which would let the body text drive classification. 
g[header].taint.labels.clear(); g[header].call.callee = None; g[header].call.sink_payload_args = None; g[header].call.destination_uses = None; + g[header].call.gate_filters.clear(); // For mutually-exclusive switch shapes with a single-ident // scrutinee, synthesize a ` == ` // structured condition on the dispatch header so SSA lowering @@ -958,7 +959,7 @@ pub(super) fn build_try<'a>( vec![synth] } else { - // No param name — wire exception edges directly to first catch body node + // No param name, wire exception edges directly to first catch body node Vec::new() }; diff --git a/src/cfg/cfg_tests.rs b/src/cfg/cfg_tests.rs index 60223f67..4b5080b3 100644 --- a/src/cfg/cfg_tests.rs +++ b/src/cfg/cfg_tests.rs @@ -43,7 +43,7 @@ fn js_try_catch_has_exception_edges() { /// When a classifiable call (here `eval`, a built-in JS sink) is nested /// inside a multi-line statement, the CFG node's `classification_span()` -/// should point at the inner call, not at the outer statement's start — +/// should point at the inner call, not at the outer statement's start, /// so finding display reports the line the dangerous call actually lives /// on. `ast.span` must still cover the whole outer statement for /// structural passes that need the statement grain. @@ -86,7 +86,7 @@ fn inner_call_override_narrows_classification_span() { } /// `classification_span()` must fall back to `ast.span` when no narrower -/// sub-expression was recorded — so existing structural code paths keep +/// sub-expression was recorded, so existing structural code paths keep /// working unchanged for nodes whose classification applies to the whole /// outer node. #[test] @@ -125,7 +125,7 @@ fn callee_span_unset_when_no_narrowing_is_possible() { // A bare `eval(x);` on one line: `first_call_ident` finds the // call_expression whose span is nearly the whole expression_statement // (different by the trailing `;`).
`classification_span` still - // returns a sensible line — but the exact trimming is an + // returns a sensible line, but the exact trimming is an // implementation detail. What we assert here is the invariant: // if callee_span *is* set, it must be contained in ast.span. let src = b"function f() { eval(x); }"; @@ -708,7 +708,7 @@ fn python_if_and() { #[test] fn ruby_unless_and() { - // `unless a && b` — chain built, branches swapped + // `unless a && b`, chain built, branches swapped // Body should run when condition is false let src = b"def f\n unless a && b\n x\n end\nend\n"; let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE); @@ -848,7 +848,7 @@ fn parse_tree(src: &[u8], ts_lang: Language) -> tree_sitter::Tree { #[test] fn first_call_ident_skips_lambda_body() { - // `process(lambda: eval(dangerous))` — Python-style. + // `process(lambda: eval(dangerous))`, Python-style. // first_call_ident should return "process", not "eval". let src = b"process(lambda: eval(dangerous))"; let ts_lang = Language::from(tree_sitter_python::LANGUAGE); @@ -860,7 +860,7 @@ fn first_call_ident_skips_lambda_body() { #[test] fn first_call_ident_skips_arrow_function_body() { - // `process(() => eval(dangerous))` — JS arrow function in argument. + // `process(() => eval(dangerous))`, JS arrow function in argument. let src = b"process(() => eval(dangerous))"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); let tree = parse_tree(src, ts_lang); @@ -871,7 +871,7 @@ fn first_call_ident_skips_arrow_function_body() { #[test] fn first_call_ident_skips_named_function_in_arg() { - // `process(function inner() { eval(dangerous); })` — named function expression in arg. + // `process(function inner() { eval(dangerous); })`, named function expression in arg. 
let src = b"process(function inner() { eval(dangerous); })"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); let tree = parse_tree(src, ts_lang); @@ -882,7 +882,7 @@ fn first_call_ident_skips_named_function_in_arg() { #[test] fn first_call_ident_normal_nested_call() { - // `outer(inner(x))` — inner is NOT behind a function boundary, should be reachable. + // `outer(inner(x))`, inner is NOT behind a function boundary, should be reachable. let src = b"outer(inner(x))"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); let tree = parse_tree(src, ts_lang); @@ -895,7 +895,7 @@ fn first_call_ident_normal_nested_call() { #[test] fn first_call_ident_finds_call_not_blocked_by_function() { // Ensure a call at the same level as a function literal is still found. - // `[function() {}, actual_call()]` — array with function and call. + // `[function() {}, actual_call()]`, array with function and call. let src = b"[function() {}, actual_call()]"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); let tree = parse_tree(src, ts_lang); @@ -908,7 +908,7 @@ fn first_call_ident_finds_call_not_blocked_by_function() { #[test] fn callee_not_resolved_from_nested_function_arg() { - // `safe_wrapper(function() { eval(user_input); })` — the CFG for the + // `safe_wrapper(function() { eval(user_input); })`, the CFG for the // outer call should resolve the callee as "safe_wrapper", never "eval". let src = b"function f() { safe_wrapper(function() { eval(user_input); }); }"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); @@ -923,7 +923,7 @@ fn callee_not_resolved_from_nested_function_arg() { assert!(has_safe, "expected a node with callee 'safe_wrapper'"); // The outer body should NOT have a node with callee "eval" attributed - // to the outer expression — eval lives inside the nested function body. + // to the outer expression, eval lives inside the nested function body. 
let outer_eval = body.graph.node_weights().any(|info| { info.call.callee.as_deref() == Some("eval") && info.ast.enclosing_func.is_none() }); @@ -1117,6 +1117,7 @@ fn clone_preserves_all_sub_structs() { kwargs: vec![("shell".into(), vec!["True".into()])], arg_string_literals: vec![Some("lit".into())], destination_uses: None, + gate_filters: Vec::new(), }, taint: TaintMeta { labels: { @@ -1399,7 +1400,7 @@ fn js_promisify_ignored_for_non_js_langs() { #[test] fn js_promisify_non_call_value_ignored() { - // RHS is not a promisify call — no binding should be captured. + // RHS is not a promisify call, no binding should be captured. let src = b"const execAsync = child_process.exec;"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang); @@ -1471,7 +1472,7 @@ fn cpp_function_extracts_param_names() { // ── callee-site metadata extraction ────────────────────────────────── /// Callees collected into `LocalFuncSummary` should now carry structured -/// arity, receiver, and qualifier fields — not just a bare name. +/// arity, receiver, and qualifier fields, not just a bare name. #[test] fn local_summary_callees_carry_arity_and_receiver() { // Two calls: one is a plain function call with 2 args, the other is @@ -1703,7 +1704,7 @@ fn local_summary_callees_have_distinct_ordinals() { .find(|(k, _)| k.name == "outer") .unwrap(); - // Dedup key is (name, arity, receiver, qualifier, ordinal) — the two + // Dedup key is (name, arity, receiver, qualifier, ordinal), the two // `a()` sites have different ordinals, so both must appear. 
let a_sites: Vec<_> = outer.callees.iter().filter(|c| c.name == "a").collect(); assert_eq!( @@ -1825,7 +1826,7 @@ fn anon_fn_named_from_short_var_decl_go() { #[test] fn iife_callee_resolves_to_anon_body_js() { - // `(function(arg){eval(arg);})(q)` — the CallFn arm must produce + // `(function(arg){eval(arg);})(q)`, the CallFn arm must produce // a synthetic anon callee name so that taint can match the // inline body's FuncKey. let src = b"(function(arg){ eval(arg); })(q);"; @@ -1898,7 +1899,7 @@ fn strip_tags(s: &str) -> String { #[test] fn replace_chain_rejects_unrecognised_literals() { - // `.replace("foo", "bar")` contains no dangerous pattern — must NOT be + // `.replace("foo", "bar")` contains no dangerous pattern, must NOT be // credited as a sanitizer. Preserves the FP→TN guard: replace calls // that don't strip anything dangerous must stay transparent to taint. let src = br#" @@ -1916,7 +1917,7 @@ fn rewrite(s: &str) -> String { #[test] fn replace_chain_rejects_when_replacement_reintroduces_pattern() { - // `.replace("x", "..")` strips `x` but *reintroduces* `..` — be + // `.replace("x", "..")` strips `x` but *reintroduces* `..`, be // maximally conservative and abandon all credit for this chain. let src = br#" fn evil(s: &str) -> String { @@ -1933,7 +1934,7 @@ fn evil(s: &str) -> String { #[test] fn replace_chain_rejects_dynamic_arg() { - // `.replace(var, "")` — search is not a literal; pattern analysis can + // `.replace(var, "")`, search is not a literal; pattern analysis can // say nothing about what was stripped. Must not earn credit. let src = br#" fn dynamic(s: &str, needle: &str) -> String { @@ -1950,7 +1951,7 @@ fn dynamic(s: &str, needle: &str) -> String { #[test] fn replace_chain_rejects_non_identifier_base() { - // `get_s().replace("..", "")` — innermost receiver is a call, not a + // `get_s().replace("..", "")`, innermost receiver is a call, not a // parameter. 
We have no reason to believe `get_s()` returns a value // that benefits the caller; refuse credit. let src = br#" @@ -1976,7 +1977,7 @@ fn find_node_defining<'a>(cfg: &'a Cfg, var: &str) -> Option<&'a NodeInfo> { #[test] fn numeric_length_access_detected_on_js_property_read() { - // `var count = items.length` — property access on a member expression + // `var count = items.length`, property access on a member expression // should mark the CFG node as a numeric-length access so the // type-fact analysis infers TypeKind::Int for `count`. let src = br#"function f(items) { @@ -1994,7 +1995,7 @@ fn numeric_length_access_detected_on_js_property_read() { #[test] fn numeric_length_access_detected_on_js_zero_arg_method_call() { - // `var n = str.length()` — zero-arg method call form (uncommon in JS + // `var n = str.length()`, zero-arg method call form (uncommon in JS // but present in other languages). Detector should unwrap a // zero-arg call around a member expression. let src = br#"function f(list) { @@ -2012,7 +2013,7 @@ fn numeric_length_access_detected_on_js_zero_arg_method_call() { #[test] fn numeric_length_access_ignores_unrelated_properties() { - // `var v = arr.foo` — arbitrary property reads must not be flagged. + // `var v = arr.foo`, arbitrary property reads must not be flagged. let src = br#"function f(arr) { var v = arr.foo; return v; @@ -2028,7 +2029,7 @@ fn numeric_length_access_ignores_unrelated_properties() { #[test] fn numeric_length_access_ignores_method_calls_with_args() { - // `var r = s.indexOf('x')` — the detector must reject any call with + // `var r = s.indexOf('x')`, the detector must reject any call with // positional arguments because those aren't pure length reads. 
let src = br#"function f(s) { var r = s.indexOf('x'); @@ -2043,7 +2044,7 @@ fn numeric_length_access_ignores_method_calls_with_args() { ); } -// ── Pointer-Phase 6 / W5: subscript lowering tests ──────────────────────── +//── subscript lowering tests ──────────────────────── /// Scope for tests that flip `NYX_POINTER_ANALYSIS=1` so the CFG-side /// subscript synthesis activates. The env-var is restored afterwards @@ -2290,7 +2291,7 @@ fn js_switch_default_in_middle_reorders_to_tail() { ); } -/// JS switch fall-through (`case 1: a(); case 2: b();`) — case 1's +/// JS switch fall-through (`case 1: a(); case 2: b();`), case 1's /// exit should flow into case 2's body so taint from `first()` /// reaches `second()`'s sinks. /// @@ -2301,7 +2302,7 @@ fn js_switch_default_in_middle_reorders_to_tail() { /// structural shape. /// (b) `first()` has a non-Back forward out-edge that lands inside /// the case-2 sub-graph (the actual fall-through wire), so we -/// prove there *is* a fall-through edge — not just an +/// prove there *is* a fall-through edge, not just an /// Entry→…→Exit path that happens to walk through both calls /// via the dispatch chain. /// @@ -2309,7 +2310,7 @@ fn js_switch_default_in_middle_reorders_to_tail() { /// Seq passthrough nodes (one per surrounding scope), so the /// fall-through edge from `first()` lands on the *first wrapper /// Seq node* of case 2, not on `second()` itself. Asserting that -/// `second()` has ≥2 in-edges would therefore be wrong — the True +/// `second()` has ≥2 in-edges would therefore be wrong, the True /// edge from the case-2 dispatch If targets the wrapper node, and /// only a single Seq chain leads from there to `second()`. 
#[test] @@ -2800,7 +2801,7 @@ fn nested_loops_two_headers_two_back_edges() { #[test] fn loop_with_break_no_back_edge_from_break() { - // A `break` short-circuits the loop body — its edge must NOT be a + // A `break` short-circuits the loop body, its edge must NOT be a // back edge to the header (it leaves the loop entirely). let src = b"function f() { while (cond()) { if (done()) break; body(); } }"; let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); @@ -2879,7 +2880,7 @@ fn chained_method_call_rebinds_to_inner_gated_sink() { // no longer be the recorded callee for this node. if callee.ends_with("https.get") { // The inner-gate path must have populated sink_payload_args - // (the gate's payload arg is position 0 — the URL string). + // (the gate's payload arg is position 0, the URL string). assert!( info.call.sink_payload_args.is_some(), "expected sink_payload_args to be populated for chained \ diff --git a/src/cfg/conditions.rs b/src/cfg/conditions.rs index cde358e1..3b4e8244 100644 --- a/src/cfg/conditions.rs +++ b/src/cfg/conditions.rs @@ -4,6 +4,7 @@ use super::{ member_expr_text, push_node, text_of, }; use crate::labels::{DataLabel, LangAnalysisRules, classify}; +use crate::utils::snippet::truncate_at_char_boundary; use petgraph::graph::NodeIndex; use smallvec::SmallVec; use tree_sitter::Node; @@ -72,20 +73,15 @@ pub(super) fn push_condition_node<'a>( code: &'a [u8], enclosing_func: Option<&str>, ) -> NodeIndex { - // Pass cond_ast as both args — sub-conditions are never `unless` nodes + // Pass cond_ast as both args, sub-conditions are never `unless` nodes let (inner, negated) = detect_negation(cond_ast, cond_ast, lang); let mut vars = Vec::new(); collect_idents(inner, code, &mut vars); vars.sort(); vars.dedup(); vars.truncate(MAX_COND_VARS); - let text = text_of(cond_ast, code).map(|t| { - if t.len() > MAX_CONDITION_TEXT_LEN { - t[..MAX_CONDITION_TEXT_LEN].to_string() - } else { - t - } - }); + let text = text_of(cond_ast, code) + .map(|t| 
truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string()); let span = (cond_ast.start_byte(), cond_ast.end_byte()); g.add_node(NodeInfo { kind: StmtKind::If, @@ -140,7 +136,7 @@ pub(super) fn detect_rust_let_match_guard<'a>( /// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's /// condition text and vars. The let-binding name is added to `condition_vars` /// so `apply_branch_predicates` narrows validation to that specific variable -/// — the variable that receives the arm's value and flows to downstream sinks. +///, the variable that receives the arm's value and flows to downstream sinks. pub(super) fn emit_rust_match_guard_if<'a>( g: &mut Cfg, guard: Node<'a>, @@ -154,13 +150,8 @@ pub(super) fn emit_rust_match_guard_if<'a>( vars.sort(); vars.dedup(); vars.truncate(MAX_COND_VARS); - let text = text_of(guard, code).map(|t| { - if t.len() > MAX_CONDITION_TEXT_LEN { - t[..MAX_CONDITION_TEXT_LEN].to_string() - } else { - t - } - }); + let text = text_of(guard, code) + .map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string()); let span = (guard.start_byte(), guard.end_byte()); g.add_node(NodeInfo { kind: StmtKind::If, @@ -181,7 +172,7 @@ pub(super) fn emit_rust_match_guard_if<'a>( /// `lhs_text` is then synthesised by SSA lowering at the join. /// /// The condition's identifiers live on the If node's `condition_vars`, **not** -/// on the branch `uses`. This is the whole point of the split — cond is control +/// on the branch `uses`. This is the whole point of the split, cond is control /// flow, branches are data flow. /// /// Returns the exit frontier for downstream statement chaining (a single-element @@ -219,7 +210,7 @@ pub(super) fn build_ternary_diamond<'a>( g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang); connect_all(g, preds, cond_if, pred_edge); - // 2. Branches. Each branch produces its own exit frontier (≥ 1 node) — + // 2. Branches. 
Each branch produces its own exit frontier (≥ 1 node) , // a nested ternary recurses and returns its own join node. let true_exits = lower_ternary_branch( cons_ast, @@ -332,7 +323,7 @@ pub(super) fn lower_ternary_branch<'a>( analysis_rules, ); - // The branch expression's own `defines` (if any — typically None for a + // The branch expression's own `defines` (if any, typically None for a // pure value expression) is replaced with the outer LHS so that both // branches agree on the target, driving phi insertion at the join. g[node].taint.defines = Some(lhs_text.to_string()); @@ -410,7 +401,7 @@ pub(super) fn classify_ternary_lhs( .unwrap_or_default(); // Try the full dotted path first (e.g. "document.cookie"), then fall back - // to the property alone (e.g. "innerHTML") — mirrors the LHS classification + // to the property alone (e.g. "innerHTML"), mirrors the LHS classification // already performed in `push_node` for non-split assignments. if let Some(l) = classify(lang, &lhs_text, extra) { labels.push(l); @@ -429,7 +420,7 @@ pub(super) fn classify_ternary_lhs( /// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes /// with short-circuit edges. /// -/// Returns `(true_exits, false_exits)` — the sets of nodes from which True/False +/// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False /// edges should connect to the then/else branches. pub(super) fn build_condition_chain<'a>( cond_ast: Node<'a>, diff --git a/src/cfg/decorators.rs b/src/cfg/decorators.rs index 9b4ecd0f..8864d21a 100644 --- a/src/cfg/decorators.rs +++ b/src/cfg/decorators.rs @@ -5,7 +5,7 @@ use tree_sitter::Node; /// /// Used by decorator extraction to reduce `login_required`, `permission_required(...)`, /// `flask_login.login_required`, `hasRole('ADMIN')` to their first identifier -/// name — the matcher target. +/// name, the matcher target. 
fn leading_ident_text(node: Node<'_>, code: &[u8]) -> Option { let mut cur = node; loop { @@ -56,7 +56,7 @@ fn normalize_decorator_name(raw: &str) -> String { let trimmed = raw.trim(); let trimmed = trimmed.trim_start_matches(':').trim_start_matches('@'); // If a call syntax leaked through (e.g. `UseGuards(AuthGuard)`), keep only - // the head — callers that want the arg handle it separately. + // the head, callers that want the arg handle it separately. let head = trimmed .split(['(', ' ', '\t', '\n']) .next() @@ -115,7 +115,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec { /// are `decorator` nodes containing an `identifier` or `call` expression. /// - **JS/TS**: decorators attach to `method_definition` children or appear /// as siblings inside `class_body`; stage-3 decorators use `decorator` nodes. -/// `@UseGuards(AuthGuard)` — we include the call args too. +/// `@UseGuards(AuthGuard)`, we include the call args too. /// - **Java**: annotations live in the `modifiers` child of `method_declaration`; /// kinds are `marker_annotation` / `annotation`. /// - **Rust**: `function_item` has `attribute_item` siblings (outer `#[..]`). @@ -127,7 +127,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec { /// at class body scope applies to every method in the class. `only:` / /// `except:` hash args scope the filter to the listed action names; the /// filter is only recorded for the current method when the scope matches. -/// Conditional filters (`if:` / `unless:`) are not honored — those require +/// Conditional filters (`if:` / `unless:`) are not honored, those require /// predicate evaluation and are deferred. 
pub(super) fn extract_auth_decorators<'a>( func_node: Node<'a>, @@ -379,12 +379,12 @@ pub(super) fn extract_auth_decorators<'a>( } /// If a Ruby statement is `before_action :name` (or `before_filter :name`), -/// push the normalized filter name into `out` — honoring any `only:` / `except:` +/// push the normalized filter name into `out`, honoring any `only:` / `except:` /// hash arguments against `method_name`. /// /// Positional symbol args (`before_action :a, :b, only: [:x]`) all share the /// single trailing scope. Conditional filters (`if:` / `unless:`) are not -/// honored here — those require predicate evaluation and are deferred. +/// honored here, those require predicate evaluation and are deferred. fn collect_ruby_before_action( node: Node<'_>, code: &[u8], @@ -499,7 +499,7 @@ fn collect_ruby_before_action( /// Parse a single `only:` / `except:` hash pair and append the symbol list into /// the corresponding out-vec. Sets the `*_present` flag when the key is seen, -/// regardless of whether the value parses into any symbols — treating +/// regardless of whether the value parses into any symbols, treating /// `only: []` as "no actions match" is safer than ignoring the scope. fn collect_ruby_filter_pair( pair_node: Node<'_>, diff --git a/src/cfg/dto.rs b/src/cfg/dto.rs index 09072cc8..016c28a6 100644 --- a/src/cfg/dto.rs +++ b/src/cfg/dto.rs @@ -1,26 +1,28 @@ -//! Phase 6.1: per-language DTO definition collectors. +//! per-language DTO definition collectors. //! //! Walks a parsed file's AST and emits `(class_name, DtoFields)` pairs //! for class / interface / struct / Pydantic-model declarations whose //! field types resolve to a recognised [`TypeKind`]. //! //! Strictly additive: classes whose fields cannot be classified produce -//! a `DtoFields` with an empty `fields` map — the caller must decide +//! a `DtoFields` with an empty `fields` map, the caller must decide //! whether to use that as a "Dto with no inferred fields" or fall back //! 
to the pre-Phase-6 Object/Unknown classification. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use tree_sitter::Node; use super::helpers::text_of; -use super::params::{java_type_to_kind, python_primitive_to_kind, ts_type_to_kind}; +use super::params::{ + java_type_to_kind, python_primitive_to_kind, ts_type_to_kind, ts_type_to_local_collection, +}; use crate::ssa::type_facts::{DtoFields, TypeKind}; /// Collect all DTO-shaped class definitions in a parsed file. /// /// Dispatches per-language; returns an empty map for languages without -/// a Phase 6 collector (Go, Ruby, PHP, C/C++ — DTOs in those ecosystems +/// a collector (Go, Ruby, PHP, C/C++, DTOs in those ecosystems /// either don't follow framework conventions Nyx tracks today, or are /// already covered by other type-inference paths). pub(super) fn collect_dto_classes( @@ -39,6 +41,55 @@ pub(super) fn collect_dto_classes( out } +/// Collect same-file `type X = Map<...>` / `Set<...>` / `T[]` +/// aliases for TS / JS so the param classifier can resolve a +/// parameter typed `m: ElementsMap` (where +/// `type ElementsMap = Map`) to +/// [`TypeKind::LocalCollection`]. +/// +/// Empty for non-JS/TS languages. Cross-file aliases are not +/// resolved here, that requires the multi-file type-resolution +/// pipeline that doesn't yet exist for TS. Excalidraw's +/// `type ElementsMap = Map<...>` is in +/// `packages/element/src/types.ts`; users that import the alias +/// without a same-file copy still see the original FP. Most +/// real-repo aliases the FP cluster touched were declared in the +/// same file as their consumers (see fixture). 
+pub(super) fn collect_type_alias_local_collections( + root: Node<'_>, + lang: &str, + code: &[u8], +) -> HashSet { + let mut out: HashSet = HashSet::new(); + if matches!(lang, "typescript" | "ts" | "javascript" | "js") { + collect_ts_type_alias_local_collections(root, code, &mut out); + } + out +} + +fn collect_ts_type_alias_local_collections(root: Node<'_>, code: &[u8], out: &mut HashSet) { + walk(root, &mut |node| { + if node.kind() != "type_alias_declaration" { + return; + } + let Some(name_node) = node.child_by_field_name("name") else { + return; + }; + let Some(alias_name) = text_of(name_node, code) else { + return; + }; + let Some(value_node) = node.child_by_field_name("value") else { + return; + }; + let Some(value_text) = text_of(value_node, code) else { + return; + }; + if ts_type_to_local_collection(value_text.trim()).is_some() { + out.insert(alias_name); + } + }); +} + // ───────────────────────────────────────────────────────────────────── // Java // ───────────────────────────────────────────────────────────────────── @@ -163,7 +214,7 @@ fn extract_ts_property<'a>(node: Node<'a>, code: &'a [u8]) -> Option<(String, Ty let name_node = node.child_by_field_name("name")?; let field_name = text_of(name_node, code)?; let type_anno = node.child_by_field_name("type")?; - // type_annotation node text is `: T` — walk to the inner type. + // type_annotation node text is `: T`, walk to the inner type. let type_text = type_anno .named_child(0) .and_then(|t| text_of(t, code)) @@ -193,7 +244,7 @@ fn collect_rust(root: Node<'_>, code: &[u8], out: &mut HashMap, code: &[u8], out: &mut HashMap(class_node: Node<'a>, code: &'a [u8]) -> bool { let Some(supers) = class_node.child_by_field_name("superclasses") else { @@ -418,7 +469,7 @@ mod tests { "#; let dtos = collect("rust", src); // Tuple structs have no named fields and must NOT produce a - // DtoFields entry — Phase 6 only handles named-field DTOs. + // DtoFields entry, This collector only handles named-field DTOs. 
assert!(!dtos.contains_key("Wrap")); } diff --git a/src/cfg/helpers.rs b/src/cfg/helpers.rs index a18d7771..6a582ef6 100644 --- a/src/cfg/helpers.rs +++ b/src/cfg/helpers.rs @@ -19,11 +19,11 @@ pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option { /// /// For `Runtime.getRuntime().exec(cmd)`, the receiver of `exec` is the call /// `Runtime.getRuntime()`. This function drills through that to return -/// `"Runtime"` — the outermost non-call object. This lets labels like +/// `"Runtime"`, the outermost non-call object. This lets labels like /// `"Runtime.exec"` match correctly. pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option { match lookup(lang, n.kind()) { - // The receiver is itself a call — drill into ITS receiver. + // The receiver is itself a call, drill into ITS receiver. // e.g. for `Runtime.getRuntime()`, the object is `Runtime`. Kind::CallFn | Kind::CallMethod => { let inner = n @@ -53,7 +53,7 @@ pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option Option { let mut cur = n; - // Bounded walk — tree-sitter can nest deeply but we only need a handful + // Bounded walk, tree-sitter can nest deeply but we only need a handful // of hops for real code. for _ in 0..16 { match cur.kind() { @@ -68,7 +68,7 @@ pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option { cur = cur.child_by_field_name("value")?; } // Drill through nested calls / method chains to find the base - // identifier. E.g. `Connection::open(p).unwrap().execute(...)` — + // identifier. E.g. `Connection::open(p).unwrap().execute(...)` , // the receiver of `.execute` is the `.unwrap()` call whose // object is `Connection::open(p)`; we want the leftmost plain // identifier the chain resolves to (for SSA var_stacks lookup). 
@@ -212,7 +212,7 @@ pub(crate) fn first_call_ident_with_span<'a>( return ident.map(|s| (s, span)); } Kind::Function => { - // Do not descend into nested function/lambda bodies — + // Do not descend into nested function/lambda bodies , // they are separate scopes and should not contribute // callee identifiers to the parent expression. continue; @@ -240,7 +240,7 @@ pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> O /// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does. /// /// Returns `(callee_text, label, span)` where `span` is the byte range of the -/// inner call node itself — used to populate `CallMeta.callee_span` so that +/// inner call node itself, used to populate `CallMeta.callee_span` so that /// display sites can report the actual call location rather than the enclosing /// statement's span. pub(crate) fn find_classifiable_inner_call<'a>( @@ -251,7 +251,7 @@ pub(crate) fn find_classifiable_inner_call<'a>( ) -> Option<(String, DataLabel, (usize, usize))> { let mut cursor = n.walk(); for c in n.children(&mut cursor) { - // Do not descend into Kind::Function nodes — they will be extracted + // Do not descend into Kind::Function nodes, they will be extracted // as separate BodyCfg entries and should not contribute inner callees // to the parent expression. if lookup(lang, c.kind()) == Kind::Function { @@ -329,7 +329,7 @@ pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option { match n.kind() { "member_expression" | "attribute" | "selector_expression" => { // Tree-sitter exposes the receiver under `object` (JS/TS, Python), - // `value` (Rust field_expression — handled in the matching arm + // `value` (Rust field_expression, handled in the matching arm // above), or `operand` (Go selector_expression). 
Without the // `operand` fallback, Go member access like `r.Body` collapsed to // just the trailing field (`Body`), so source rules keyed on the @@ -442,7 +442,7 @@ pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option { /// This finds anonymous functions / arrow functions / closures that are /// passed as arguments to a call and should be analysed as separate /// function scopes. Only direct function-argument children are collected -/// (not functions nested inside other functions — those get handled when +/// (not functions nested inside other functions, those get handled when /// the outer function is recursed into). pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec> { let mut funcs = Vec::new(); @@ -558,7 +558,7 @@ pub(crate) fn derive_anon_fn_name_from_context<'a>( } // Python: `h = lambda: ...` parents as `assignment`, handled above. - // Python `default_parameter` assigning `def foo(x=lambda: 0)` — ambiguous, skip. + // Python `default_parameter` assigning `def foo(x=lambda: 0)`, ambiguous, skip. _ => { // Some grammars wrap the RHS in an `expression`, `expression_list`, // or similar node between the binding site and the function literal. @@ -709,7 +709,7 @@ pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec) { } } -/// Pointer-Phase 6 / W5: AST kind names for subscript / index expressions +/// AST kind names for subscript / index expressions /// across the languages whose container-element flow we model. /// /// JS/TS use `subscript_expression`; Python uses `subscript`; Go uses @@ -724,7 +724,7 @@ pub(crate) fn is_subscript_kind(kind: &str) -> bool { ) } -/// Pointer-Phase 6 / W5: when the LHS of an assignment statement is a +/// when the LHS of an assignment statement is a /// subscript / index expression (or a single-element wrapper around /// one), return that node. Returns `None` for multi-target Go /// `expression_list`s, identifier LHSs, member-expression LHSs, etc. 
@@ -745,10 +745,10 @@ pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option(n: Node<'a>, code: &'a [u8]) -> Option<(S n.named_children(&mut cur).nth(1) })?; let arr_kind = arr.kind(); - // Only proceed when the array is a plain identifier — otherwise + // Only proceed when the array is a plain identifier, otherwise // we can't bind a stable receiver name for the synth Call. if !matches!( arr_kind, @@ -780,7 +780,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S return None; } let arr_text = text_of(arr, code)?; - // PHP-style `$x` strip not needed here — Go/JS/Python don't use it. + // PHP-style `$x` strip not needed here, Go/JS/Python don't use it. let idx_text = text_of(idx, code)?; Some((arr_text, idx_text)) } diff --git a/src/cfg/hierarchy.rs b/src/cfg/hierarchy.rs index 4f351086..db7c6fc8 100644 --- a/src/cfg/hierarchy.rs +++ b/src/cfg/hierarchy.rs @@ -1,4 +1,4 @@ -//! Phase 6: per-language class / trait / interface hierarchy extraction. +//! per-language class / trait / interface hierarchy extraction. //! //! Walks a parsed file's AST and emits `(sub_container, super_container)` //! pairs for every declared inheritance / impl / implements relationship. @@ -47,7 +47,7 @@ pub(crate) fn collect_hierarchy_edges( "php" => collect_php(root, code, &mut push), "cpp" | "c++" => collect_cpp(root, code, &mut push), // Go: structural / implicit interface satisfaction is intractable - // per-file; Phase 6 deliberately skips it. + // per-file; deliberately skipped it. // C: no inheritance. _ => {} } @@ -70,7 +70,7 @@ fn collect_java(root: Node<'_>, code: &[u8], push: &mu let Some(sub) = text_of(name_node, code) else { return; }; - // `superclass` field on class_declaration — singular `extends Y`. + // `superclass` field on class_declaration, singular `extends Y`. 
if let Some(superclass) = node.child_by_field_name("superclass") { let mut cursor = superclass.walk(); for c in superclass.named_children(&mut cursor) { @@ -79,13 +79,13 @@ fn collect_java(root: Node<'_>, code: &[u8], push: &mu } } } - // `interfaces` field on class_declaration — `implements I, J` + // `interfaces` field on class_declaration, `implements I, J` // wraps a `super_interfaces` → `type_list`. if let Some(ifaces) = node.child_by_field_name("interfaces") { collect_java_type_list(ifaces, code, &sub, push); } // `extends_interfaces` is an unnamed child on - // interface_declaration — `extends Foo, Bar` for an + // interface_declaration, `extends Foo, Bar` for an // interface. Walk children directly since it's not a field. let mut cursor = node.walk(); for c in node.named_children(&mut cursor) { @@ -123,7 +123,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option { match n.kind() { "type_identifier" | "identifier" => text_of(n, code), "generic_type" => { - // `Foo` — the leading child is the bare type identifier. + // `Foo`, the leading child is the bare type identifier. let mut cursor = n.walk(); for c in n.named_children(&mut cursor) { if matches!( @@ -136,7 +136,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option { None } "scoped_type_identifier" => { - // `pkg.Foo` — return last segment. + // `pkg.Foo`, return last segment. text_of(n, code).map(|s| { let last = s.rsplit('.').next().unwrap_or(&s); last.to_string() @@ -152,7 +152,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option { /// Walk for `impl_item` nodes and emit edges from the concrete type to /// the trait being implemented. Inherent impls (`impl Foo {}`) emit -/// no edge — there is no super-trait relationship to record. +/// no edge, there is no super-trait relationship to record. 
fn collect_rust(root: Node<'_>, code: &[u8], push: &mut F) { walk(root, &mut |node| { if node.kind() != "impl_item" { @@ -179,7 +179,7 @@ fn rust_path_leaf(n: Node<'_>, code: &[u8]) -> Option { match n.kind() { "type_identifier" | "identifier" => text_of(n, code), "scoped_type_identifier" | "scoped_identifier" => { - // `crate::foo::Bar` — last segment. + // `crate::foo::Bar`, last segment. let s = text_of(n, code)?; Some(s.rsplit("::").next().unwrap_or(&s).to_string()) } @@ -286,12 +286,12 @@ fn collect_python(root: Node<'_>, code: &[u8], push: & let Some(superclasses) = node.child_by_field_name("superclasses") else { return; // no parents }; - // `superclasses` is an `argument_list` — each non-keyword + // `superclasses` is an `argument_list`, each non-keyword // argument is a base class. let mut cursor = superclasses.walk(); for arg in superclasses.named_children(&mut cursor) { if let Some(t) = python_base_text(arg, code) { - // Skip Python `object` — not informative. + // Skip Python `object`, not informative. if t != "object" { push(sub.clone(), t); } @@ -304,7 +304,7 @@ fn python_base_text(n: Node<'_>, code: &[u8]) -> Option { match n.kind() { "identifier" => text_of(n, code), "attribute" => { - // `pkg.Base` — last segment. + // `pkg.Base`, last segment. let s = text_of(n, code)?; Some(s.rsplit('.').next().unwrap_or(&s).to_string()) } @@ -474,7 +474,7 @@ mod tests { let src = "interface Mine extends Foo, Bar {}"; let edges = collect("java", src); // tree-sitter-java models `extends` on interface as `extends_interfaces` - // rooted at the same node — at least one of the parents should land. + // rooted at the same node, at least one of the parents should land. assert!( edges.iter().any(|(s, _)| s == "Mine"), "interface extends should emit at least one edge; got {edges:?}" @@ -516,8 +516,8 @@ mod tests { #[test] fn python_class_object_base_skipped() { - // Inheriting from `object` is not informative — Python's - // implicit root. 
Phase 6 omits these edges to keep the + // Inheriting from `object` is not informative, Python's + // implicit root. We omit these edges to keep the // hierarchy index focused on user-defined relationships. let src = "class Plain(object):\n pass\n"; let edges = collect("python", src); diff --git a/src/cfg/imports.rs b/src/cfg/imports.rs index 3dda462b..58ba2513 100644 --- a/src/cfg/imports.rs +++ b/src/cfg/imports.rs @@ -12,7 +12,7 @@ use tree_sitter::{Node, Tree}; /// - ES6: `import { A as B } from 'mod'` → B → ImportBinding { original: A, module: mod } /// - CommonJS: `const { A: B } = require('mod')` → B → ImportBinding { original: A, module: mod } /// -/// Only aliased (renamed) bindings are recorded — same-name imports (e.g. +/// Only aliased (renamed) bindings are recorded, same-name imports (e.g. /// `import { exec }`) are already resolvable by their original name. pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBindings { let mut bindings = ImportBindings::new(); @@ -149,7 +149,7 @@ pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBinding continue; } // The alias is accessed via the "alias" field (a `name` node). - // The qualified name has no field — find it by kind. + // The qualified name has no field, find it by kind. let alias_node = clause.child_by_field_name("alias"); let mut c2 = clause.walk(); let qname_node = clause diff --git a/src/cfg/literals.rs b/src/cfg/literals.rs index 7e3355e6..4f2b06c8 100644 --- a/src/cfg/literals.rs +++ b/src/cfg/literals.rs @@ -45,7 +45,7 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option> { /// (JS `object`, TS `object`, Python `dictionary`). `names` contains /// identifiers lifted from pair values whose key matches any entry in /// `fields` (case-sensitive; JS/TS identifiers). 
When no destination-field -/// pairs are present, returns `Some(vec![])` — the sink is effectively +/// pairs are present, returns `Some(vec![])`, the sink is effectively /// silenced because no destination identifier exists. /// * `None` if the arg is absent, is not an object literal (plain string /// / ident / expression), or has splat/spread children that break static @@ -77,7 +77,7 @@ pub(super) fn extract_destination_field_idents( match child.kind() { // `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't // statically attribute spread contents to specific fields, so - // bail out — caller falls back to the whole-arg filter, matching + // bail out, caller falls back to the whole-arg filter, matching // the conservative posture used by arg_uses for splats. "spread_element" | "dictionary_splat" => { return None; @@ -107,7 +107,7 @@ pub(super) fn extract_destination_field_idents( } }), // Computed keys like `[someVar]` can't be statically - // resolved — skip (conservative: not a destination field). + // resolved, skip (conservative: not a destination field). "computed_property_name" => continue, _ => text_of(key_node, code), }; @@ -200,7 +200,7 @@ pub(super) fn extract_const_keyword_arg( continue; } let value_node = child.child_by_field_name("value")?; - // Only return a literal — identifiers / calls / complex exprs are + // Only return a literal, identifiers / calls / complex exprs are // "dynamic" and must be reported as `None` so the gate can // distinguish literal-safe from dynamic. return match value_node.kind() { @@ -252,7 +252,7 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8]) /// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as /// `arg0`). Returns `None` when the call has no arguments. 
/// -/// Used by per-language shape-aware sink suppression — for example, Ruby +/// Used by per-language shape-aware sink suppression, for example, Ruby /// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically /// parameterised when arg 0 is a hash/symbol/array/non-interpolated string, /// regardless of taint reaching that argument. @@ -268,7 +268,7 @@ pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bo /// Walk a Java method-chain receiver looking for an inner `method_invocation` /// whose method name matches one of `target_methods` (e.g. `createQuery`, -/// `prepareStatement`). Returns the kind of that inner call's arg 0 — used +/// `prepareStatement`). Returns the kind of that inner call's arg 0, used /// to verify the SQL-bearing call up-chain was given a string literal rather /// than a concatenation / method call. /// @@ -307,7 +307,7 @@ pub(super) fn java_chain_arg0_kind_for_method( /// method identifier matches one of `target_methods`, then return that /// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node /// represents a chained expression like `Model.where(...).preload(...).to_a` -/// — the outermost call (`to_a`) has no arguments, so the shape suppressor +///, the outermost call (`to_a`) has no arguments, so the shape suppressor /// must reach down the chain to inspect `where`'s arg 0. /// /// Conservative: returns `None` if the chain doesn't contain a matching @@ -353,6 +353,116 @@ fn subtree_has_interpolation(n: Node) -> bool { n.named_children(&mut cursor).any(subtree_has_interpolation) } +/// Walk a JS/TS method-chain receiver-side to find an inner `call_expression` +/// whose member-property name matches one of `target_methods` (e.g. `query`, +/// `execute`). Returns the `(kind, has_interp)` of that inner call's arg 0. 
+/// +/// Used to recognise ORM-accessor chains where a labelled SQL sink sits on +/// the receiver side of a parameterised execute method: +/// `strapi.db.query('admin::api-token').findOne({...})`. The outer call +/// (`findOne`) is the CFG node; the inner labelled `db.query` call carries +/// the literal model UID that proves the chain is parameterised. +/// +/// Conservative: returns `None` when no matching inner call is found, so +/// callers fall through to the no-suppression path. +pub(super) fn js_chain_arg0_kind_for_method( + expr: Node, + target_methods: &[&str], + code: &[u8], +) -> Option<(String, bool)> { + let n = unwrap_parens(expr); + // tree-sitter-typescript / -javascript: call_expression with fields + // `function` (member_expression / identifier) and `arguments`. + if n.kind() == "call_expression" { + // Check this call's callee: if its property name (or full text) ends + // with one of `target_methods`, this is the inner labelled call. + if let Some(function) = n.child_by_field_name("function") { + // Property of a member_expression; falls back to the function + // text itself for bare-identifier calls. + let prop_text = function + .child_by_field_name("property") + .and_then(|p| text_of(p, code)); + let full_text = text_of(function, code); + let leaf_text = full_text + .as_ref() + .map(|s| s.rsplit('.').next().unwrap_or(s).to_string()); + let matched = target_methods.iter().any(|m| { + prop_text.as_deref() == Some(*m) + || leaf_text.as_deref() == Some(*m) + || full_text.as_deref() == Some(*m) + || full_text + .as_deref() + .is_some_and(|s| s.ends_with(&format!(".{m}"))) + }); + if matched { + return arg0_kind_and_interpolation(n); + } + // Drill down the receiver spine: function.object is the prior + // call in the chain. 
+ if let Some(object) = function.child_by_field_name("object") + && let Some(found) = js_chain_arg0_kind_for_method(object, target_methods, code) + { + return Some(found); + } + } + } + None +} + +/// Walk the receiver chain of a JS/TS call to count *non-execute* method +/// calls between the outer call and an inner labelled call to +/// `target_inner` (e.g. `query`, `execute`). Returns the immediate outer +/// chain method name (e.g. `findOne`) when an inner-call to `target_inner` +/// exists somewhere on the receiver spine, otherwise `None`. +/// +/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain +/// shape `.query(LITERAL).(...)`, bare +/// `connection.query("SELECT ...")` returns `None` because there is no +/// outer chain method. +pub(super) fn js_chain_outer_method_for_inner<'a>( + outer: Node<'a>, + target_inner: &[&str], + code: &'a [u8], +) -> Option { + let n = unwrap_parens(outer); + if n.kind() != "call_expression" { + return None; + } + let function = n.child_by_field_name("function")?; + let object = function.child_by_field_name("object")?; + // If `object` itself is a call_expression whose property matches + // `target_inner`, the immediate outer is `function.property`. 
+ if object.kind() == "call_expression" { + let inner_function = object.child_by_field_name("function"); + if let Some(inner_function) = inner_function { + let prop_text = inner_function + .child_by_field_name("property") + .and_then(|p| text_of(p, code)); + let full_text = text_of(inner_function, code); + let leaf_text = full_text + .as_ref() + .map(|s| s.rsplit('.').next().unwrap_or(s).to_string()); + let inner_matched = target_inner.iter().any(|m| { + prop_text.as_deref() == Some(*m) + || leaf_text.as_deref() == Some(*m) + || full_text.as_deref() == Some(*m) + || full_text + .as_deref() + .is_some_and(|s| s.ends_with(&format!(".{m}"))) + }); + if inner_matched { + return function + .child_by_field_name("property") + .and_then(|p| text_of(p, code).map(|s| s.to_string())); + } + } + // Recurse: outer chain may have more depth (`a.b().c().d()` , + // d is outermost, c is next, target may be at b or further in). + return js_chain_outer_method_for_inner(object, target_inner, code); + } + None +} + /// For a chained method call (`a.b().c().d()`), walk down the receiver /// chain (`function.object`) and return the innermost call_expression /// alongside its callee text (e.g. `"http.get"`). @@ -385,7 +495,7 @@ pub(super) fn find_chained_inner_call<'a>( return None; } // Recurse: the inner call may itself be chained - // (`axios.get(u).then(h).catch(h)` — innermost is `axios.get`). + // (`axios.get(u).then(h).catch(h)`, innermost is `axios.get`). if let Some(inner) = find_chained_inner_call(object, lang, code) { return Some(inner); } @@ -398,7 +508,7 @@ pub(super) fn find_chained_inner_call<'a>( .or_else(|| object.child_by_field_name("name"))?; // Multi-line dotted member expressions (`http\n .get`) include // formatting whitespace in the source-text slice. The labels map - // keys are literal `"http.get"` etc. 
— strip whitespace so the + // keys are literal `"http.get"` etc., strip whitespace so the // chained-call inner-gate rebinding fires for both single-line and // multi-line chain styles. Also strips `\r` for CRLF sources. // Motivated by upstream Parse Server CVE-2025-64430 which uses the @@ -410,18 +520,18 @@ pub(super) fn find_chained_inner_call<'a>( /// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod /// node) and yield each *named argument* of every inner call along the -/// way. Outer's own arguments are NOT included — the caller already +/// way. Outer's own arguments are NOT included, the caller already /// handles those via the standard `pre_emit_arg_source_nodes` pass over /// `outer.arguments`. /// /// For `json.NewDecoder(r.Body).Decode(emoji)`: -/// outer = `.Decode(emoji)` — caller iterates `emoji` -/// inner = `json.NewDecoder(r.Body)` — yielded arg: `r.Body` +/// outer = `.Decode(emoji)` , caller iterates `emoji` +/// inner = `json.NewDecoder(r.Body)` , yielded arg: `r.Body` /// /// We only pull from each inner call's `arguments` field, never from its /// `function`/`method`/receiver expressions. That distinction matters /// because chained source-receivers like `r.URL.Query()` expose a -/// member-text path that classifies as a Source — but it's the OUTER +/// member-text path that classifies as a Source, but it's the OUTER /// chain text (`r.URL.Query.Get`) that already classifies, so emitting /// a synth source for the inner-call's own callee would double-count. /// @@ -498,7 +608,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool return false; } let first_arg = named[0]; - // Extract the raw text of arg 0 — must be a string literal or + // Extract the raw text of arg 0, must be a string literal or // template string without interpolation. 
let query_text = match first_arg.kind() { "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { @@ -511,7 +621,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool .named_children(&mut c) .any(|ch| ch.kind() == "template_substitution") { - return false; // dynamic — not safe + return false; // dynamic, not safe } text_of(first_arg, code) } @@ -534,7 +644,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool /// - `$1`, `$2`, …, `$N` (PostgreSQL positional) /// - `?` (MySQL / SQLite positional) /// - `%s` (Python DB-API / psycopg2) -/// - `:identifier` (Oracle / named parameters) — requires the colon to be +/// - `:identifier` (Oracle / named parameters), requires the colon to be /// preceded by a space or `=` (to avoid matching JS ternary / object /// literals). pub(super) fn has_sql_placeholders(s: &str) -> bool { @@ -559,7 +669,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool { && i + 1 < len && bytes[i + 1].is_ascii_alphabetic() => { - // :identifier — must be preceded by whitespace/= to avoid + // :identifier, must be preceded by whitespace/= to avoid // false positives on object literals or ternary operators. return true; } @@ -581,7 +691,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool { #[allow(clippy::only_used_in_recursion)] pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool { match node.kind() { - // Scalar strings — but reject if they contain interpolation + // Scalar strings, but reject if they contain interpolation // (e.g. Ruby `"hello #{name}"`, Python f-strings). 
"string" | "string_literal" @@ -602,7 +712,7 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool { // PHP encapsed_string: safe only if no variable interpolation "encapsed_string" => !has_interpolation_cfg(node), - // Wrapper: PHP/Go wrap each arg in an `argument` node — unwrap + // Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap "argument" => { node.named_child_count() == 1 && node @@ -765,7 +875,7 @@ pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool { return false; } } - // Zero-arg calls are not "all literal" — taint can still flow via a + // Zero-arg calls are not "all literal", taint can still flow via a // non-literal receiver (e.g. `tainted.readObject()`), and the sink- // suppression gate (`info.all_args_literal`) must not skip these. if !any_arg { @@ -781,7 +891,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool { let mut cursor = node.walk(); for child in node.children(&mut cursor) { let kind = child.kind(); - // Skip argument lists — those are checked by the caller. + // Skip argument lists, those are checked by the caller. if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" { continue; } @@ -804,7 +914,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool { /// Returns one `Vec` per argument (in parameter-position order). /// Returns empty if argument list can't be found or contains spread/keyword args. pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec> { - // Ruby `subshell` (backticks) has no `arguments` field — its children are + // Ruby `subshell` (backticks) has no `arguments` field, its children are // string fragments and `interpolation` nodes. Lift each interpolation's // identifiers into a positional arg so taint flows from `#{var}` into the // synthetic "subshell" sink. 
@@ -834,7 +944,7 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec> for child in args_node.named_children(&mut cursor) { let kind = child.kind(); // Named / keyword arguments are tracked separately in `CallMeta.kwargs` - // and do not participate in positional indexing — skip them here so + // and do not participate in positional indexing, skip them here so // `arg_uses` remains strictly positional. Splats (spread/dict splat) // still invalidate positional mapping; bail out in that case. if kind == "spread_element" @@ -1058,13 +1168,13 @@ pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) - /// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call /// (non-method-chain) Go shape. The caller wires the resulting cap into /// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the -/// taint engine consumes via the standard sanitizer pathway — taint flows +/// taint engine consumes via the standard sanitizer pathway, taint flows /// in on `s`, the matching cap is stripped from the result. pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option { if call_ast.kind() != "call_expression" { return None; } - // The call's `function` field is a `selector_expression` — `operand` + // The call's `function` field is a `selector_expression`, `operand` // is the package ident (`strings`), `field` is the method ident. let func = call_ast.child_by_field_name("function")?; if func.kind() != "selector_expression" { @@ -1085,7 +1195,7 @@ pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> O let new_lit = extract_const_string_arg(call_ast, 2, code)?; // If the replacement itself reintroduces a dangerous sequence, don't - // credit the strip — matches the Rust chain detector's policy. + // credit the strip, matches the Rust chain detector's policy. 
if !caps_stripped_by_literal_pattern(&new_lit).is_empty() { return None; } @@ -1106,7 +1216,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti } match lookup(lang, n.kind()) { Kind::Function => { - // Function/closure expression passed as argument — return the same + // Function/closure expression passed as argument, return the same // synthetic anon name used by build_sub so callback_bindings and // source_to_callback can match it to the extracted BodyCfg. n.child_by_field_name("name") @@ -1155,7 +1265,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti /// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`]. /// /// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces -/// an empty vector — positional indices past the splat are meaningless and +/// an empty vector, positional indices past the splat are meaningless and /// downstream passes already treat an empty vector as "no info". pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec> { let Some(args_node) = call_node.child_by_field_name("arguments") else { @@ -1175,7 +1285,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec Vec { let raw = text_of(target, code); raw.and_then(|s| strip_literal_quotes(&s, target, code)) @@ -1212,7 +1322,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec Option { // Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child. @@ -1320,7 +1430,7 @@ pub(super) fn def_use( // Python/Ruby `expression_statement` → `assignment`) let mut cursor = ast.walk(); for child in ast.children(&mut cursor) { - // Only use left/right fields for actual assignment nodes — binary + // Only use left/right fields for actual assignment nodes, binary // expressions also have left/right but are not definitions. 
let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment); let child_name = child @@ -1403,7 +1513,7 @@ pub(super) fn def_use( (defs, uses, vec![]) } - // if‑let / while‑let — the `let_condition` binds a variable from + // if‑let / while‑let, the `let_condition` binds a variable from // the value expression. E.g. `if let Ok(cmd) = env::var("CMD")` // defines `cmd` and uses `env`, `var`, `CMD`. Kind::If | Kind::While => { @@ -1418,7 +1528,7 @@ pub(super) fn def_use( let mut tmp = Vec::::new(); collect_idents(pat, code, &mut tmp); // The first plain identifier in the pattern is the binding. - // Skip type identifiers (e.g. "Ok" in Ok(cmd)) — take the + // Skip type identifiers (e.g. "Ok" in Ok(cmd)), take the // last ident which is the inner binding name. defs = tmp.into_iter().last(); } diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 7620cfd8..5e9e0743 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -14,6 +14,7 @@ use crate::labels::{ }; use crate::summary::FuncSummary; use crate::symbol::{FuncKey, Lang}; +use crate::utils::snippet::truncate_at_char_boundary; use smallvec::SmallVec; use std::cell::RefCell; use std::collections::{HashMap, HashSet}; @@ -54,8 +55,8 @@ use literals::{ extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg, extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node, find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args, - is_parameterized_query_call, java_chain_arg0_kind_for_method, ruby_chain_arg0_for_method, - walk_chain_inner_call_args, + is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method, + js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args, }; use params::{ compute_container_and_kind, extract_param_meta, inject_framework_param_sources, @@ -74,7 +75,7 @@ pub fn extract_param_meta_for_test<'a>( } /// Test-only helper to populate the per-file DTO class map without 
-/// running `build_cfg`. Used by the Phase 6 audit harness in +/// running `build_cfg`. Used by the DTO audit harness in /// `tests/typed_extractors_audit.rs` to verify that /// `classify_param_type_*` resolves a same-file DTO via the /// thread-local map. @@ -91,30 +92,26 @@ pub fn clear_dto_classes_for_test() { DTO_CLASSES.with(|cell| cell.borrow_mut().clear()); } -// ------------------------------------------------------------------------- -// Structural DFS index for function bodies -// ------------------------------------------------------------------------- -// -// Per-file map of function-node start_byte → depth-first preorder index. -// Populated at the start of `build_cfg`, consumed by every site that -// previously formatted `` or stored `start_byte` as -// the disambig. The DFS index is stable against edits elsewhere in the -// file (inserting a line above a function does not change its index). -// -// Thread-local is safe because `build_cfg` is not re-entrant within a -// single rayon worker: each file is parsed and CFG-built to completion -// before the next one starts. +// Per-file map of function-node start_byte → DFS preorder index. Stable +// against unrelated edits (inserting a line above a function doesn't +// change its index). Thread-local is safe, `build_cfg` is not +// re-entrant within a single rayon worker. thread_local! { static FN_DFS_INDICES: RefCell> = RefCell::new(HashMap::new()); - /// Phase 6: per-file DTO class definitions. Populated at the top - /// of [`build_cfg`] by [`dto::collect_dto_classes`] so per-parameter - /// classifiers can resolve `@RequestBody T dto` / - /// `Json` / `Annotated[CreateUser, Body()]` to a - /// [`crate::ssa::type_facts::TypeKind::Dto`] when the DTO type is - /// declared in the same file. Cleared at the end of `build_cfg` - /// so thread-local state never leaks between files. 
+ /// Per-file DTO class definitions, populated at the top of + /// [`build_cfg`] so per-parameter classifiers can resolve typed + /// extractors against same-file DTOs. pub(crate) static DTO_CLASSES: RefCell> = RefCell::new(HashMap::new()); + /// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` / + /// `Array<...>` / `T[]`) aliases, populated at the top of + /// [`build_cfg`]. Lets `classify_param_type_ts` resolve a + /// parameter typed `m: ElementsMap` to + /// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via + /// same-file alias lookup. Cross-file aliases are not yet + /// resolved. + pub(crate) static TYPE_ALIAS_LC: RefCell> + = RefCell::new(std::collections::HashSet::new()); } /// Populate the per-file DFS-index map from a preorder walk of the @@ -148,11 +145,8 @@ fn fn_dfs_index(start_byte: usize) -> Option { FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied()) } -/// Synthetic name for an anonymous function. Uses the DFS index when -/// available (``), falls back to the byte offset when the map -/// is empty (e.g. during tests that bypass `build_cfg`). The `#` -/// sigil is intentionally different from `@` so the two formats are -/// distinguishable by downstream consumers. +/// Synthetic name for an anonymous function: `` from the DFS +/// index when available, `` as fallback. pub(crate) fn anon_fn_name(start_byte: usize) -> String { match fn_dfs_index(start_byte) { Some(idx) => format!(""), @@ -160,9 +154,7 @@ pub(crate) fn anon_fn_name(start_byte: usize) -> String { } } -/// Prefix check that accepts both the new `` and legacy -/// `` formats. Used by code paths that classify whether a -/// function name came from anonymous synthesis. +/// True for any anonymous-function synthesis prefix. 
pub(crate) fn is_anon_fn_name(name: &str) -> bool { name.starts_with(", @@ -248,14 +240,14 @@ pub struct CallMeta { pub outer_callee: Option, /// Byte span of the inner call that supplied the classification, when /// `find_classifiable_inner_call` overrode the outer callee. `None` when - /// the classification came from the outer AST node directly — in that + /// the classification came from the outer AST node directly, in that /// case `AstMeta.span` already points at the classified expression. /// /// Consumers that want the location of the *labeled* call (sink/source/ /// sanitizer display, flow-step rendering, taint origin attribution) /// should use [`NodeInfo::classification_span`] rather than reading this /// field directly. `AstMeta.span` remains the authoritative "whole - /// statement" span — used by structural passes (unreachability, + /// statement" span, used by structural passes (unreachability, /// resource lifecycle, guard byte scans, CFG/taint span dedup). #[serde(default)] pub callee_span: Option<(usize, usize)>, @@ -283,7 +275,7 @@ pub struct CallMeta { /// only positional arguments. pub kwargs: Vec<(String, Vec)>, /// String-literal value at each positional argument of this call, parallel - /// to `arg_uses` — `Some(s)` when the argument is a syntactic string + /// to `arg_uses`, `Some(s)` when the argument is a syntactic string /// literal, `None` otherwise. Empty for non-call nodes or when positional /// boundaries can't be determined. Consumed by the static-map abstract /// analysis (and future literal-aware passes) so they don't need the @@ -302,10 +294,41 @@ pub struct CallMeta { /// /// Takes priority over `sink_payload_args` in the SSA sink scan: when a /// call has an object-literal destination arg, only idents under the - /// listed fields may contribute sink findings — not every ident in the + /// listed fields may contribute sink findings, not every ident in the /// positional slot. 
+ /// + /// Legacy single-gate path: populated only when this call site matched + /// exactly one gate. When a callee carries multiple gates (e.g. `fetch` + /// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in + /// [`Self::gate_filters`] and this field is left `None`. #[serde(default)] pub destination_uses: Option>, + /// Per-gate filters for callees that carry multiple gated-sink rules. + /// + /// Each entry preserves one matching gate's `(label_caps, payload_args, + /// destination_uses)` so the SSA sink scan can attribute findings + /// per-cap. Empty when the call site matches zero or exactly one gate + /// (the single-gate case continues to use [`Self::sink_payload_args`] + + /// [`Self::destination_uses`]). + #[serde(default)] + pub gate_filters: Vec, +} + +/// One gate's contribution at a call site whose callee matches multiple +/// gates. The SSA taint engine processes each filter independently so a +/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF +/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each +/// carrying its own cap mask rather than a conflated union. +#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct GateFilter { + /// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`). + pub label_caps: crate::labels::Cap, + /// Argument positions that carry the tainted payload for this gate. + pub payload_args: Vec, + /// Destination-aware filter: when `Some(names)`, the sink check only + /// considers SSA values whose `var_name` matches one of `names` (object- + /// literal destination fields lifted at CFG time). `None` ⇒ whole arg. + pub destination_uses: Option>, } /// Taint-classification and variable-flow metadata. @@ -349,7 +372,7 @@ pub struct NodeInfo { /// /// This flag is scoped to taint-style sink suppression: it indicates /// that no attacker-controlled data enters through the immediate - /// arguments. 
It does NOT mean the call is "safe" in general — other + /// arguments. It does NOT mean the call is "safe" in general, other /// detectors (resource lifecycle, structural analysis) may still /// legitimately flag these calls. pub all_args_literal: bool, @@ -411,7 +434,7 @@ pub struct NodeInfo { pub is_eq_with_const: bool, /// True when this node reads a numeric-length property on a container: /// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()` - /// — either as a pure property access or as a zero-arg method call. + ///, either as a pure property access or as a zero-arg method call. /// Populated by inspecting the AST in `push_node` across JS/TS, Python, /// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return /// an integer. Consumed by the type-fact analysis (`ssa::type_facts`) @@ -419,12 +442,12 @@ pub struct NodeInfo { /// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric /// payloads. pub is_numeric_length_access: bool, - /// Phase 6.3: the field name read on the RHS of an assignment whose + /// the field name read on the RHS of an assignment whose /// RHS is a single member-access expression (e.g. `let x = dto.email`). /// Set to `Some("email")` for that shape; left `None` otherwise. /// Consumed by the type-fact analysis (`ssa::type_facts`) so reads /// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick - /// up the field's declared `TypeKind`. Strictly additive — when + /// up the field's declared `TypeKind`. Strictly additive, when /// `None`, the legacy copy-prop semantics apply. pub member_field: Option, } @@ -442,7 +465,7 @@ impl NodeInfo { /// lines, flow-step rendering, symbolic witness extraction, debug views. /// /// Use `ast.span` directly for **structural grain**: unreachability, - /// resource lifecycle, guard byte scans, CFG/taint span dedup — anywhere + /// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere /// the enclosing statement is the meaningful unit. 
#[inline] pub fn classification_span(&self) -> (usize, usize) { @@ -514,7 +537,7 @@ pub struct BodyMeta { /// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from /// decorators / annotations / static type text at CFG construction /// time. Same length as `params`; positions with no recoverable - /// type info are `None`. Strictly additive — when every entry is + /// type info are `None`. Strictly additive, when every entry is /// `None`, downstream behaviour is identical to the pre-Phase-1 /// engine. pub param_types: Vec>, @@ -528,7 +551,7 @@ pub struct BodyMeta { /// `LocalFuncSummary`. `None` for the synthetic top-level body. /// /// All intra-file maps keyed on function identity (SSA summaries, callee - /// bodies, inline cache, callback bindings) use this key — never the bare + /// bodies, inline cache, callback bindings) use this key, never the bare /// leaf `name`, which is collision-prone across (container, arity, /// disambig, kind). pub func_key: Option, @@ -589,7 +612,7 @@ pub struct FileCfg { /// Promisify wrapper aliases: local name → wrapped callee name. /// Only populated for JS/TS files. pub promisify_aliases: PromisifyAliases, - /// Phase 6: per-file class / trait / interface hierarchy edges. + /// per-file class / trait / interface hierarchy edges. /// Each entry is `(sub_container, super_container)` after /// language-specific normalisation. See /// [`crate::cfg::hierarchy`] for the per-language extraction @@ -711,14 +734,10 @@ fn extract_condition_raw<'a>( vars.dedup(); vars.truncate(MAX_COND_VARS); - // 4. Extract text, truncated. - let text = text_of(cond, code).map(|t| { - if t.len() > MAX_CONDITION_TEXT_LEN { - t[..MAX_CONDITION_TEXT_LEN].to_string() - } else { - t - } - }); + // 4. Extract text, truncated. UTF-8-safe, gogs (Gurmukhi) / + // discourse (Cyrillic) trip raw byte slices on regex literals. 
+ let text = text_of(cond, code) + .map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string()); (text, vars, negated) } @@ -739,7 +758,7 @@ pub(super) fn detect_negation<'a>( _if_ast: Node<'a>, _lang: &str, ) -> (Node<'a>, bool) { - // Unwrap parenthesized_expression — JS/Java/PHP wrap if-conditions in parens. + // Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens. // This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`. let cond = if cond.kind() == "parenthesized_expression" { cond.child_by_field_name("expression") @@ -811,7 +830,7 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option { "*" => Some(BinOp::Mul), "/" => Some(BinOp::Div), "%" => Some(BinOp::Mod), - // Bitwise (single-char tokens — no conflict with && / ||) + // Bitwise (single-char tokens, no conflict with && / ||) "&" => Some(BinOp::BitAnd), "|" => Some(BinOp::BitOr), "^" => Some(BinOp::BitXor), @@ -909,7 +928,7 @@ fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option /// `extract_template_prefix` for both assignment RHS and call arguments. /// /// Also descends through `await` / `yield` wrappers and into the first -/// argument of a call expression — this covers the common sink shape +/// argument of a call expression, this covers the common sink shape /// `await axios.get(\`https://host/…${x}\`)` where the template literal lives /// inside a call inside an `await` wrapper. fn prefix_of_expression(node: Node, code: &[u8]) -> Option { @@ -930,7 +949,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option { } "call_expression" | "call" | "new_expression" => { // Descend into the first positional argument (e.g. - // `axios.get(\`https://…${x}\`)` — the URL we want to lock + // `axios.get(\`https://…${x}\`)`, the URL we want to lock // is the template-literal first argument of the call). 
let args = cur .child_by_field_name("arguments") @@ -942,7 +961,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option { } } - // Case 1: template literal — `\`scheme://host/…${x}…\``. + // Case 1: template literal, `\`scheme://host/…${x}…\``. if cur.kind() == "template_string" { let mut w = cur.walk(); let first_child = cur.named_children(&mut w).next()?; @@ -957,7 +976,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option { return None; } - // Case 2: `"scheme://host/" + x` — LHS is a string literal. + // Case 2: `"scheme://host/" + x`, LHS is a string literal. if cur.kind() == "binary_expression" { let mut w2 = cur.walk(); let mut ops = cur.children(&mut w2).filter(|c| !c.is_named()); @@ -1028,7 +1047,7 @@ fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option { } } - // Try left, then right — one of them should be a literal + // Try left, then right, one of them should be a literal try_parse_number(left, code).or_else(|| try_parse_number(right, code)) } @@ -1067,7 +1086,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool { .named_child(0) .is_some_and(|c| is_boolean_eq_const_tree(c, lang)), "unary_expression" | "not_operator" => { - // `!` / `not` — operator is an anonymous child; operand is the + // `!` / `not`, operator is an anonymous child; operand is the // single named child. let mut w = node.walk(); let mut op_is_not = false; @@ -1084,7 +1103,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool { .is_some_and(|c| is_boolean_eq_const_tree(c, lang)) } "boolean_operator" => { - // Python `and`/`or` — operands are named children. + // Python `and`/`or`, operands are named children. 
let l = node.named_child(0); let r = node.named_child(1); l.is_some_and(|n| is_boolean_eq_const_tree(n, lang)) @@ -1137,9 +1156,9 @@ fn binary_operator_token(node: Node) -> Option { /// Property names whose value is provably an integer across the supported /// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size` /// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count` -/// (`str.count`, `list.count`, `tuple.count` — all return int); Ruby `.length` +/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length` /// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This -/// list is intentionally narrow — only properties whose semantics across every +/// list is intentionally narrow, only properties whose semantics across every /// host we scan return an integer, so the `TypeKind::Int` fact is sound. fn is_numeric_length_property(name: &str) -> bool { matches!(name, "length" | "size" | "byteLength" | "count" | "len") @@ -1157,7 +1176,7 @@ fn is_numeric_length_property(name: &str) -> bool { /// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to /// infer `TypeKind::Int` on the defined value so sink-cap suppression can /// treat `"row " + arr.length` as a non-injectable payload. -/// Phase 6.3: when the RHS of an assignment / declaration is a single +/// when the RHS of an assignment / declaration is a single /// member-access expression (`let x = dto.email`, `x = obj.field`, /// `let x = obj["field"]`), return the property name. 
The CFG type-fact /// analysis uses the recovered name to look up the field's declared @@ -1321,7 +1340,7 @@ fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option> { // Check if ast itself is a binary expression if is_binary_expr_kind(ast_kind, lang) { - // Verify it has exactly 2 named children (left, right) — no nesting + // Verify it has exactly 2 named children (left, right), no nesting let named_count = ast.named_child_count(); if named_count == 2 { // Ensure neither child is itself a binary expression (that would @@ -1435,7 +1454,7 @@ pub(super) fn push_node<'a>( // (e.g. PHP `object_creation_expression` has positional children). .or_else(|| find_constructor_type_child(ast)) .and_then(|n| { - // IIFE: `(function(x){...})(arg)` — the called expression is a + // IIFE: `(function(x){...})(arg)`, the called expression is a // function literal with no identifier. Bind the call to the // anonymous body's synthetic name so resolve_callee can find // the extracted BodyCfg/summary. Without this, text_of() would @@ -1512,7 +1531,7 @@ pub(super) fn push_node<'a>( // If this is a declaration/expression wrapper or an assignment that // *contains* a call, prefer the first inner call identifier instead of // the whole line. Track the inner call's byte span so we can populate - // `CallMeta.callee_span` once the labels settle — enabling narrow + // `CallMeta.callee_span` once the labels settle, enabling narrow // source-location reporting when the classified call lives several lines // below the enclosing statement (e.g. call inside a multi-line template // literal). @@ -1546,9 +1565,9 @@ pub(super) fn push_node<'a>( let mut labels = classify_all(lang, &text, extra); // If the outermost call didn't classify, try inner/nested calls. - // E.g. `str(eval(expr))` — `str` is not a sink, but `eval` is. + // E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is. // When the callee is overridden, save the original for container ops - // (e.g. 
`parts.add(req.getParameter(...))` — callee becomes + // (e.g. `parts.add(req.getParameter(...))`, callee becomes // "req.getParameter" but outer_callee preserves "parts.add"). let mut outer_callee: Option = None; let mut inner_callee_span: Option<(usize, usize)> = None; @@ -1568,7 +1587,7 @@ pub(super) fn push_node<'a>( // For assignments like `element.innerHTML = value`, the inner-call heuristic // above may have overridden `text` with a call on the RHS (e.g. getElementById). - // If that didn't produce a label, check the LHS property name — it may be a + // If that didn't produce a label, check the LHS property name, it may be a // sink like `innerHTML`. // // This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper` @@ -1588,7 +1607,7 @@ pub(super) fn push_node<'a>( if let Some(assign) = assign_node && let Some(lhs) = assign.child_by_field_name("left") { - // Try full member expression first (e.g. "location.href") — more + // Try full member expression first (e.g. "location.href"), more // specific and avoids false positives on `a.href`. if let Some(full) = member_expr_text(lhs, code) { if let Some(l) = classify(lang, &full, extra) { @@ -1612,7 +1631,7 @@ pub(super) fn push_node<'a>( // try to classify the member expression text as a source. // This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python), // and similar property-access-based source patterns. - // Skip when the assignment's RHS is itself a function/lambda literal — + // Skip when the assignment's RHS is itself a function/lambda literal , // labels found by `first_member_label` would come from inside the // closure body and shouldn't tag the outer wrapper (e.g. Go's // `run := func() { exec.Command(...) 
}` would otherwise inherit @@ -1687,7 +1706,7 @@ pub(super) fn push_node<'a>( if labels.is_empty() && let Some(outer) = call_ast && let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code) - && classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_some() + && !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty() { call_ast = Some(inner); outer_callee = Some(text.clone()); @@ -1707,13 +1726,14 @@ pub(super) fn push_node<'a>( // the outer statement `text`, so gate matcher names like `"fetch"` hit. let mut sink_payload_args: Option> = None; let mut destination_uses: Option> = None; + let mut gate_filters: Vec = Vec::new(); if labels.is_empty() { let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)); if let Some(cn) = gate_call { let gate_callee_text = if call_ast.is_some() { text.clone() } else { - // Inner call reached via wrapper — use the call-expression's + // Inner call reached via wrapper, use the call-expression's // function name directly. Falls back to `text` so non-call- // expression kinds (method calls, Ruby `call` nodes, macros) // still have a usable callee string. @@ -1723,51 +1743,84 @@ pub(super) fn push_node<'a>( .and_then(|f| text_of(f, code)) .unwrap_or_else(|| text.clone()) }; - if let Some(gm) = classify_gated_sink( + let matches = classify_gated_sink( lang, &gate_callee_text, |idx| extract_const_string_arg(cn, idx, code), |kw| extract_const_keyword_arg(cn, kw, code), |kw| has_keyword_arg(cn, kw, code), - ) { - labels.push(gm.label); - let payload = gm.payload_args; - if payload == crate::labels::ALL_ARGS_PAYLOAD { - // Dynamic-activation sentinel: every positional arg is - // conservatively a payload. Expand using the actual call - // arity so `collect_tainted_sink_values` checks each one. 
- let arity = extract_arg_uses(cn, code).len(); - if arity > 0 { - sink_payload_args = Some((0..arity).collect()); - } - } else if !payload.is_empty() { - sink_payload_args = Some(payload.to_vec()); - } + ); - // Destination-aware gates (outbound HTTP clients): when the - // gate declares destination-bearing object fields and the - // positional destination arg at call time is an object - // literal, narrow sink-taint checks to identifiers under - // those fields. Non-object arg forms (string / ident / - // expression) return `None` from the extractor and fall - // through to whole-arg positional filtering. - // - // We only populate destination_uses for the FIRST payload - // position that is an object literal. For outbound HTTP - // gates `payload_args` is always a single position (arg 0) - // so this is exact. - if !gm.object_destination_fields.is_empty() { - for &pos in gm.payload_args { - if let Some(names) = extract_destination_field_idents( - cn, - pos, - gm.object_destination_fields, - code, - ) { - destination_uses = Some(names); - break; + if !matches.is_empty() { + // Per-gate filter accumulation. Each match contributes: + // * its label (added to `labels` so `resolve_sink_caps` + // downstream sees the union), + // * a `GateFilter` carrying that gate's specific + // `(label_caps, payload_args, destination_uses)` so + // the SSA sink scan can attribute taint per-cap. + let mut union_payload: Vec = Vec::new(); + for gm in &matches { + labels.push(gm.label); + + let payload_vec: Vec = + if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD { + // Dynamic-activation sentinel: every positional arg is + // conservatively a payload. Expand using the actual + // call arity so `collect_tainted_sink_values` checks + // each one. 
+ let arity = extract_arg_uses(cn, code).len(); + (0..arity).collect() + } else { + gm.payload_args.to_vec() + }; + + // Destination-aware gates: when the gate declares + // destination-bearing object fields and a payload-position + // arg is an object literal at call time, narrow sink-taint + // checks to identifiers under those fields. Non-object + // arg forms return `None` from the extractor and the gate + // falls back to whole-arg positional filtering. + let mut dest_uses: Option> = None; + if !gm.object_destination_fields.is_empty() { + for &pos in gm.payload_args { + if let Some(names) = extract_destination_field_idents( + cn, + pos, + gm.object_destination_fields, + code, + ) { + dest_uses = Some(names); + break; + } } } + + let label_caps = match gm.label { + crate::labels::DataLabel::Sink(c) => c, + _ => crate::labels::Cap::empty(), + }; + + for &p in &payload_vec { + if !union_payload.contains(&p) { + union_payload.push(p); + } + } + gate_filters.push(GateFilter { + label_caps, + payload_args: payload_vec, + destination_uses: dest_uses, + }); + } + if !union_payload.is_empty() { + sink_payload_args = Some(union_payload); + } + // Legacy single-gate path keeps `destination_uses` populated so + // the SSA fast-path (one filter) continues to work without + // consulting `gate_filters`. When multiple gates match, + // per-position filters live in `gate_filters` and the legacy + // field is intentionally left `None`. + if gate_filters.len() == 1 { + destination_uses = gate_filters[0].destination_uses.clone(); } } } @@ -1778,7 +1831,7 @@ pub(super) fn push_node<'a>( // path-traversal or HTML metacharacters. The CFG collapses the whole // chain into a single call node, so detection must inspect the AST of // that node directly. Only fires when no Sanitizer label already - // classifies this node — existing label rules win. + // classifies this node, existing label rules win. 
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) { if let Some(cn) = call_ast { if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" { @@ -1815,7 +1868,7 @@ pub(super) fn push_node<'a>( // `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation // form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same // methods are intrinsically parameterised when arg 0 is a hash, symbol, - // array, or non-interpolated string — Rails escapes the values. Rather + // array, or non-interpolated string, Rails escapes the values. Rather // than dropping the sink (which would lose the genuine TPs), synthesise // a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears // SQL taint at the call and reflexively dominates the sink, suppressing @@ -1825,7 +1878,7 @@ pub(super) fn push_node<'a>( // Chained calls (`Model.where(...).preload(...).to_a`) collapse into a // single CFG node whose outer `call_ast` may be `to_a` (no args). The // shape inspection has to walk the receiver chain to reach the AR query - // call itself — `ruby_chain_arg0_for_method` does that walk. + // call itself, `ruby_chain_arg0_for_method` does that walk. if (lang == "ruby" || lang == "rb") && labels .iter() @@ -1859,7 +1912,7 @@ pub(super) fn push_node<'a>( // and `Statement.executeQuery(String)` overloads are real injection // sinks when given a concatenated SQL string. But the same method // names on JPA `javax.persistence.Query` and JDBC `PreparedStatement` - // are zero-arg — they execute SQL that was bound upstream by + // are zero-arg, they execute SQL that was bound upstream by // `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`, // and any bind values went through `setParameter` / `setString` // (which the JDBC/JPA driver escapes). 
Walk the receiver chain to @@ -1894,7 +1947,7 @@ pub(super) fn push_node<'a>( // (`createQuery` / `createNativeQuery` / `prepareStatement`) // and require its arg 0 to be a string literal. Anything // else (binary concat, identifier, method call) leaves - // the sink in place — we cannot prove the SQL is + // the sink in place, we cannot prove the SQL is // parameterised, so the structural finding stands. const JPA_BIND_METHODS: &[&str] = &[ "createQuery", @@ -1914,6 +1967,89 @@ pub(super) fn push_node<'a>( } } + // Shape-based sanitizer synthesis for JS/TS ORM-accessor chains. + // The static label table marks `db.query` / `connection.query` / + // `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)` + // because the bare `connection.query("SELECT ..." + name)` form is a + // real SQLi sink. But the same `db.query` method on Strapi-style ORMs + // takes a model UID literal and returns a chainable model accessor: + // `strapi.db.query('admin::api-token').findOne({ where: whereParams })`. + // The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)` + // calls are intrinsically parameterised, the actual SQL is generated + // by the ORM, and the per-call values arrive through field-keyed object + // literals that the ORM driver escapes. + // + // Recognition rule: when the CFG node's classified text reaches a sink + // with `SQL_QUERY` cap, walk the receiver chain looking for an inner + // `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal + // and whose result has at least one chained method call appended whose + // name is in the ORM-accessor whitelist. If both hold, synthesise a + // same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare + // `connection.query("SELECT ...")` (no chained method) and + // `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the + // sink in place, both are genuine SQLi shapes. 
+ if (lang == "javascript" + || lang == "js" + || lang == "typescript" + || lang == "ts" + || lang == "tsx") + && labels + .iter() + .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY))) + && !labels + .iter() + .any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY))) + { + const QUERY_TARGETS: &[&str] = &["query", "execute"]; + // ORM-accessor methods that take object-literal args and return + // promises of rows / row counts. Promise methods (`then`, `catch`, + // `finally`) deliberately excluded, they don't prove ORM shape. + const ORM_CHAIN_METHODS: &[&str] = &[ + "findOne", + "findMany", + "findFirst", + "findUnique", + "findById", + "find", + "create", + "createMany", + "update", + "updateMany", + "upsert", + "delete", + "deleteMany", + "count", + "aggregate", + "distinct", + "save", + ]; + // Fall back to a deeper walk (up to 4 levels) for await/return- + // wrapped calls (e.g. `const x = await db.query(...).findOne(...)` , + // call sits at depth 3 inside lexical_declaration > variable_declarator + // > await_expression > call_expression). + let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)); + if let Some(call_node) = chain_call { + // Outer method must be in the ORM whitelist *and* the chain must + // have a deeper inner call to a `query`/`execute` whose arg 0 is + // a string literal. Both checks gate the synthesis. + let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code); + let outer_is_orm = outer_method + .as_deref() + .is_some_and(|m| ORM_CHAIN_METHODS.contains(&m)); + if outer_is_orm + && let Some((arg0_kind, has_interp)) = + js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code) + && !has_interp + && matches!( + arg0_kind.as_str(), + "string" | "string_fragment" | "template_string" + ) + { + labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY)); + } + } + } + let span = (ast.start_byte(), ast.end_byte()); /* ── 3. 
GRAPH INSERTION + DEBUG ──────────────────────────────────── */ @@ -2036,7 +2172,7 @@ pub(super) fn push_node<'a>( // (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`). // // Two cases: - // 1. Kind::CallMethod — native method call AST (Java method_invocation, + // 1. Kind::CallMethod, native method call AST (Java method_invocation, // Rust method_call_expression, Ruby call, PHP member_call_expression). // Receiver is exposed via "object"/"receiver"/"scope" field on the call. // 2. Kind::CallFn whose function child is a member_expression (JS/TS) or @@ -2065,7 +2201,7 @@ pub(super) fn push_node<'a>( // value, which is what type-qualified resolution // anchors on. Falls back to `root_receiver_text` (which // returns raw text like "conn.execute") only if drilling - // fails — preserving prior behavior for types we can't + // fails, preserving prior behavior for types we can't // structurally reduce. root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code)) } else { @@ -2076,7 +2212,7 @@ pub(super) fn push_node<'a>( // JS/TS `obj.method(x)`: call_expression.function = member_expression. // Python `obj.method(x)`: call.function = attribute. // Rust `obj.method(x)`: call_expression.function = field_expression - // (field on `value`, not `object` — value can be another call + // (field on `value`, not `object`, value can be another call // for chained forms like `Connection::open(p).unwrap().execute(...)`). // Pull the receiver from the object/attribute-owner field. let func_child = cn.child_by_field_name("function"); @@ -2139,7 +2275,7 @@ pub(super) fn push_node<'a>( // Python `with` and Java try-with-resources. let is_raii_managed = is_raii_factory(lang, &text); - // Ruby block form auto-close: `File.open(path) { |f| f.read }` — + // Ruby block form auto-close: `File.open(path) { |f| f.read }` , // the block parameter receives the resource and Ruby guarantees close // at block exit. If assigned (`f = File.open(p) { ... 
}`), the // variable holds the block's return value, not an open resource. @@ -2156,7 +2292,7 @@ pub(super) fn push_node<'a>( // Prefer the span of the call found by `find_classifiable_inner_call` // (deeper, classification-driven) over the one from `first_call_ident` // (shallower, text-override-driven). Only record `callee_span` when it - // actually narrows against `ast.span` — storing a redundant copy would + // actually narrows against `ast.span`, storing a redundant copy would // just bloat every labeled Call node. let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span); @@ -2174,6 +2310,7 @@ pub(super) fn push_node<'a>( kwargs, arg_string_literals, destination_uses, + gate_filters, }, taint: TaintMeta { labels, @@ -2228,7 +2365,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind: /// Pre-emit dedicated Source CFG nodes for call arguments that contain source /// member expressions. /// -/// **Two-step API** — Source nodes must be created *before* the Call node so +/// **Two-step API**, Source nodes must be created *before* the Call node so /// they receive lower graph indices. This is critical because the If handler /// uses `NodeIndex::new(g.node_count())` to capture the first node built in a /// branch and wires a True/False edge to it. If the Source node has a lower @@ -2239,7 +2376,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind: /// the branch body. /// /// True when `ast` is an assignment / declaration whose RHS is a -/// function or lambda literal — i.e. shapes like +/// function or lambda literal, i.e. shapes like /// * Go `run := func() { ... }` /// * JS/TS `var run = function() { ... 
}` / `const run = () => ...` /// * Python `run = lambda x: ...` @@ -2311,7 +2448,7 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool { false } -/// Pointer-Phase 6 / W5: when `ast` is (or wraps) an assignment whose +/// when `ast` is (or wraps) an assignment whose /// LHS is a single subscript / index expression with a plain-identifier /// receiver, emit a synthetic `__index_set__` Call node and return its /// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target @@ -2328,7 +2465,7 @@ fn try_lower_subscript_write( enclosing_func: Option<&str>, call_ordinal: &mut u32, ) -> Option { - // Locate the assignment node — `ast` may be the assignment itself + // Locate the assignment node, `ast` may be the assignment itself // (Go `assignment_statement`) or a wrapper (`expression_statement` // containing JS `assignment_expression` / Python `assignment`). let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) { @@ -2383,7 +2520,7 @@ fn try_lower_subscript_write( /// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be /// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`. /// `uses_only_synth_names` carry synth names that should *only* be -/// appended to `taint.uses` — used for chain-inner-arg sources where the +/// appended to `taint.uses`, used for chain-inner-arg sources where the /// synth value is not a positional argument of the OUTER call but still /// participates in the call's implicit dependency chain (e.g. `r.Body` /// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver). @@ -2446,7 +2583,7 @@ fn pre_emit_arg_source_nodes( for (pos, child) in children.iter().enumerate() { let src_label = first_member_label(*child, lang, code, extra); if let Some(DataLabel::Source(caps)) = src_label { - // Use the *current* node count as a unique token — it equals the + // Use the *current* node count as a unique token, it equals the // index the new Source node will receive. 
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos); let member_text = first_member_text(*child, code); @@ -2481,7 +2618,7 @@ fn pre_emit_arg_source_nodes( continue; } - // Pointer-Phase 6 / W5: pre-emit `__index_get__` Call nodes for + //pre-emit `__index_get__` Call nodes for // subscript / index-expression args when pointer analysis is // enabled. This lets the W2/W4 container ELEM read hook fire // on the synth call, propagating must/may/caps from the cell @@ -2489,7 +2626,7 @@ fn pre_emit_arg_source_nodes( // // Gated on `pointer::is_enabled()` so the env-var=0 path keeps // CFG shapes bit-identical to today's output. Only fires when - // the array operand resolves to a plain identifier — see + // the array operand resolves to a plain identifier, see // `subscript_components` for the bail conditions. if pointer_on && is_subscript_kind(child.kind()) @@ -2539,7 +2676,7 @@ fn pre_emit_arg_source_nodes( // Gated to Go and to writeback-shaped outer callees (`Decode` / // `Unmarshal`) because the synth-source emission is only useful when // a downstream writeback consumer reads from the chain's tainted - // receiver — broader gating risks emitting synth sources whose taint + // receiver, broader gating risks emitting synth sources whose taint // never propagates and whose presence trips Layer B AST-pattern // suppression on unrelated sinks (see // `tests/fixtures/real_world/go/taint/func_literal_capture.go`). @@ -2613,7 +2750,7 @@ fn pre_emit_arg_source_nodes( /// Step 2: wire synthetic variable names from pre-emitted Source nodes into /// the Call node's `arg_uses` and `uses`. `uses_only` synth names are -/// appended only to `taint.uses` — used for chain-inner-arg sources whose +/// appended only to `taint.uses`, used for chain-inner-arg sources whose /// synth value is not a positional outer-call argument. 
fn apply_arg_source_bindings( g: &mut Cfg, @@ -2724,7 +2861,7 @@ pub(super) fn build_sub<'a>( .unwrap_or(false); // Check for negation wrapping the entire condition (e.g. `!(a && b)`) - // — if present, skip short-circuit decomposition (De Morgan out of scope). + //, if present, skip short-circuit decomposition (De Morgan out of scope). let has_short_circuit = has_short_circuit && cond_subtree.map_or(false, |c| { let unwrapped = unwrap_parens(c); @@ -3424,7 +3561,7 @@ pub(super) fn build_sub<'a>( // When the grammar-level name is anonymous, try to derive a binding // name from the surrounding declaration or assignment. This lets // `var h = function(x){...}` / `this.run = () => {...}` participate - // in callback resolution — callers referencing `h` or `run` can + // in callback resolution, callers referencing `h` or `run` can // find the body via `resolve_local_func_key` and intra-file calls // like `h()` can resolve to the anonymous body's summary. Without // this, the body is keyed with the synthetic anon name and there @@ -3731,7 +3868,7 @@ pub(super) fn build_sub<'a>( // would lower the return as a plain `StmtKind::Call`, losing // the return semantics and letting fall-through Seq edges // survive into the SSA terminator (the OR-chain rejection-arm - // defect — see `or_chain_rejection_block_terminates_with_return`). + // defect, see `or_chain_rejection_block_terminates_with_return`). if let Some(inner) = ast.children(&mut cursor).find(|c| { matches!( lookup(lang, c.kind()), @@ -3788,7 +3925,7 @@ pub(super) fn build_sub<'a>( ); } - // Pointer-Phase 6 / W5: subscript-write lowering when the + //subscript-write lowering when the // CallWrapper's inner expression is `arr[i] = v` (JS/TS, // Python). See `try_lower_subscript_write` for shape + // bail matrix. @@ -3824,7 +3961,7 @@ pub(super) fn build_sub<'a>( // Pre-emit Source nodes for call arguments containing source // member expressions (e.g. `req.body.returnTo` inside // `res.redirect(req.body.returnTo)`). 
Created BEFORE the Call - // node so they get lower indices — see doc comment on + // node so they get lower indices, see doc comment on // `pre_emit_arg_source_nodes` for why this ordering matters. let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call { pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds) @@ -3984,7 +4121,7 @@ pub(super) fn build_sub<'a>( // Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`) Kind::Assignment => { - // JS/TS ternary-RHS split — same rationale as the CallWrapper branch. + // JS/TS ternary-RHS split, same rationale as the CallWrapper branch. if matches!(lang, "javascript" | "typescript" | "tsx") && let (Some(left), Some(right)) = ( ast.child_by_field_name("left"), @@ -4011,7 +4148,7 @@ pub(super) fn build_sub<'a>( } } - // Pointer-Phase 6 / W5: subscript-write lowering. See + //subscript-write lowering. See // `try_lower_subscript_write` for the per-language shape // matrix and bail conditions. if crate::pointer::is_enabled() @@ -4099,12 +4236,19 @@ pub(crate) fn build_cfg<'a>( // function so thread-local state never leaks between files. populate_fn_dfs_indices(tree, lang); - // Phase 6: harvest DTO class definitions before any param classifier - // runs. Empty for languages without a Phase 6 collector. Cleared + // harvest DTO class definitions before any param classifier + // runs. Empty for languages without a collector. Cleared // alongside the DFS map at end-of-build_cfg. DTO_CLASSES.with(|cell| { *cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code); }); + // harvest same-file `type X = Map<...>` / `Set<...>` / `T[]` + // aliases so JS/TS param classifiers resolve `m: ElementsMap` + // to `LocalCollection`. Empty for non-JS/TS languages. + TYPE_ALIAS_LC.with(|cell| { + *cell.borrow_mut() = + dto::collect_type_alias_local_collections(tree.root_node(), lang, code); + }); // Create the top-level body graph (BodyId(0)). 
let (mut g, entry, exit) = create_body_graph(0, code.len(), None); @@ -4143,7 +4287,7 @@ pub(crate) fn build_cfg<'a>( connect_all(&mut g, &[e], exit, EdgeKind::Seq); } - debug!(target: "cfg", "CFG DONE — top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1); + debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1); if cfg!(debug_assertions) { for idx in g.node_indices() { @@ -4231,10 +4375,11 @@ pub(crate) fn build_cfg<'a>( // Clear the per-file DFS-index map so it does not leak to the next // file built on this thread. clear_fn_dfs_indices(); - // Phase 6: same hygiene for the DTO map. + // same hygiene for the DTO map. DTO_CLASSES.with(|cell| cell.borrow_mut().clear()); + TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear()); - // Phase 6 (typed call-graph subtype awareness): collect every + // collect every // declared inheritance / impl / implements relationship in the // file. Per-language extractor in `cfg::hierarchy`; empty for // Go and C. Each `(sub, super)` pair gets duplicated onto every @@ -4289,14 +4434,14 @@ fn apply_promisify_labels( /// Build a `CalleeSite` carrying the richer per-call-site metadata for a /// CFG node. /// -/// * `arity` — positional argument count. `None` when `extract_arg_uses` +/// * `arity`, positional argument count. `None` when `extract_arg_uses` /// bailed out on splats/keyword-args (length 0 does not distinguish /// zero-arg calls from unknown; we treat 0 as a concrete zero). The /// receiver is a separate channel via `CallMeta.receiver` and is not /// represented in `arg_uses`, so `arity == arg_uses.len()` for calls. -/// * `receiver` — forwarded verbatim from `CallMeta.receiver` (already +/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already /// normalized to the root identifier). -/// * `qualifier` — the segment(s) before the leaf identifier of the callee. +/// * `qualifier`, the segment(s) before the leaf identifier of the callee. 
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g. /// `"crate::auth::token"` for `crate::auth::token::validate`) so that /// cross-file `use`-map resolution in `callgraph.rs` has everything it @@ -4380,7 +4525,7 @@ pub(crate) fn export_summaries( module_path: None, rust_use_map: None, rust_wildcards: None, - // Phase 6 hierarchy edges live on `FileCfg`, not on the + // Hierarchy edges live on `FileCfg`, not on the // graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root` // attaches them after this transform returns. hierarchy_edges: Vec::new(), diff --git a/src/cfg/params.rs b/src/cfg/params.rs index 14118724..788452d4 100644 --- a/src/cfg/params.rs +++ b/src/cfg/params.rs @@ -8,7 +8,7 @@ use petgraph::graph::NodeIndex; use smallvec::smallvec; use tree_sitter::Node; -/// Phase 6.2 — resolve a syntactic class / struct / interface / model +/// resolve a syntactic class / struct / interface / model /// name against the per-file [`DTO_CLASSES`] map populated at the top /// of `build_cfg`. Returns the [`TypeKind::Dto`] carrying the /// per-field type map when the class is declared in the same file; @@ -21,7 +21,7 @@ fn lookup_dto_class(class_name: &str) -> Option { /// Extract parameter names + per-position [`TypeKind`] from a function /// AST node. Each entry's second slot is `Some(TypeKind)` when the /// parameter's decorator, attribute, or static type annotation maps to -/// a known kind, and `None` otherwise. Strictly additive — when no +/// a known kind, and `None` otherwise. Strictly additive, when no /// type info is recoverable, behaviour is identical to the names-only /// path. pub(super) fn extract_param_meta<'a>( @@ -109,7 +109,7 @@ pub(super) fn extract_param_meta<'a>( // Python `typed_parameter`, `default_parameter`, // `typed_default_parameter`): the wrapper node has no `name` // field but contains the identifier as a child. 
Pick the - // *first* identifier — that is the parameter name; subsequent + // *first* identifier, that is the parameter name; subsequent // identifiers are part of the type annotation or default // expression. if !found { @@ -123,7 +123,7 @@ pub(super) fn extract_param_meta<'a>( continue; } - // Bare identifier children — e.g. Rust untyped closure params `|cmd|` + // Bare identifier children, e.g. Rust untyped closure params `|cmd|` // where the child is an `identifier` node, not a `parameter` wrapper. if child.kind() == "identifier" { if let Some(txt) = text_of(child, code) { @@ -137,8 +137,8 @@ pub(super) fn extract_param_meta<'a>( /// Walk up from a function definition node and build a container path. /// /// Records the names of enclosing classes / impls / modules / namespaces / -/// structs — and, for anonymous / nested functions, the name of an enclosing -/// named function — joined with `::`. Also returns a `FuncKind` guess +/// structs, and, for anonymous / nested functions, the name of an enclosing +/// named function, joined with `::`. Also returns a `FuncKind` guess /// reflecting the structural role. /// /// Returns `(container, kind)`. @@ -185,7 +185,7 @@ pub(super) fn compute_container_and_kind( | "enum_item" | "struct_specifier" | "struct_item" => Some("name"), - // Rust impl blocks — pick the type name, not the trait name. + // Rust impl blocks, pick the type name, not the trait name. "impl_item" => Some("type"), // Go / C++ / PHP namespaces and modules. "namespace_definition" | "namespace_declaration" | "module_declaration" | "module" => { @@ -223,7 +223,7 @@ pub(super) fn compute_container_and_kind( || pk == "lambda_expression" || pk == "function_expression" { - // Nested definition — record the outer function's name and + // Nested definition, record the outer function's name and // classify self as Closure even if we got a real name. 
if let Some(name_node) = parent.child_by_field_name("name") { if let Some(text) = text_of(name_node, code) { @@ -428,15 +428,15 @@ pub(super) fn inject_framework_param_sources( /// no recognised pattern matches, returns `None` and the engine /// behaves exactly as before. /// -/// Recognised patterns (Phase 2): -/// * Java (Spring) — `@PathVariable`/`@RequestParam Long X` → +/// Recognised patterns: +/// * Java (Spring), `@PathVariable`/`@RequestParam Long X` → /// [`TypeKind::Int`]; `@RequestBody T` → object (no kind today). -/// * TypeScript (NestJS) — `@Param('id') id: number` → +/// * TypeScript (NestJS), `@Param('id') id: number` → /// [`TypeKind::Int`]; `@Body() dto: T` / `@Query('q') q: string`. -/// * Rust (Axum / Rocket / Actix) — `Path` / `Path` / +/// * Rust (Axum / Rocket / Actix), `Path` / `Path` / /// `web::Path` → [`TypeKind::Int`]; `Path` → /// [`TypeKind::String`]. -/// * Python (FastAPI) — `def h(x: int)` → [`TypeKind::Int`]; +/// * Python (FastAPI), `def h(x: int)` → [`TypeKind::Int`]; /// `Annotated[int, Path()]` → [`TypeKind::Int`]. pub(super) fn classify_param_type<'a>( param: Node<'a>, @@ -453,9 +453,9 @@ pub(super) fn classify_param_type<'a>( } } -/// Java (Spring) — recognise typed-extractor parameters via the +/// Java (Spring), recognise typed-extractor parameters via the /// surrounding annotation. Per Hard Rule 3, plain `Long X` without a -/// known framework annotation is **not** treated as a typed extractor — +/// known framework annotation is **not** treated as a typed extractor; /// the parameter could be a regular function argument that the /// framework never validates. Recognised annotations: /// `@PathVariable`, `@RequestParam`, `@RequestBody`, `@RequestHeader`, @@ -473,7 +473,7 @@ fn classify_param_type_java<'a>(param: Node<'a>, code: &'a [u8]) -> Option` still resolves on `Foo`. @@ -527,7 +527,7 @@ fn has_java_framework_annotation(param: Node<'_>, code: &[u8]) -> bool { } /// Map a Java type-text fragment to a [`TypeKind`]. 
Public to the -/// `cfg` module so the Phase 6 DTO collector can reuse the same +/// `cfg` module so the DTO collector can reuse the same /// classifier for class fields. pub(super) fn java_type_to_kind(t: &str) -> Option { let bare = t.trim().trim_start_matches('@').trim(); @@ -546,7 +546,7 @@ pub(super) fn java_type_to_kind(t: &str) -> Option { /// Map a TypeScript type-text fragment (already stripped of leading /// `:` / whitespace) to a primitive [`TypeKind`]. Used by both the -/// per-parameter classifier and the Phase 6 DTO collector. +/// per-parameter classifier and the DTO collector. pub(super) fn ts_type_to_kind(t: &str) -> Option { let head = t.split('<').next().unwrap_or(t).trim(); match head { @@ -557,13 +557,35 @@ pub(super) fn ts_type_to_kind(t: &str) -> Option { } } -/// TypeScript (NestJS) — recognise typed-extractor parameters via a +/// TypeScript (NestJS), recognise typed-extractor parameters via a /// known NestJS decorator (`@Param`, `@Body`, `@Query`, `@Headers`, /// `@Req`, `@Res`). Per Hard Rule 3, a bare `function h(id: number)` -/// is not a framework extractor — without a NestJS decorator no +/// is not a framework extractor, without a NestJS decorator no /// runtime gate is implied. Pipe coercions (`ParseIntPipe` / /// `ParseBoolPipe`) override the static type. +/// +/// Exception: parameters annotated as a known JS built-in collection +/// type (`Map<...>`, `Set<...>`, `WeakMap<...>`, `WeakSet<...>`, +/// `Array<...>` / `T[]` / `ReadonlyArray<...>`) resolve to +/// [`TypeKind::LocalCollection`] regardless of decorator presence. +/// `LocalCollection` is a *receiver-shape* claim, not a +/// framework-validated-input claim, it tells the auth analyser that +/// `param.get(k)` / `param.set(k, v)` / `param.find(p)` is a +/// container operation rather than a data-layer read/mutation. 
This +/// closes the Excalidraw FP cluster (`elementsMap: ElementsMap`, +/// `groupIdMapForOperation: Map`) without affecting +/// any input-validation reasoning. fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option { + let type_text = param + .child_by_field_name("type") + .and_then(|n| inner_ts_type_text(n, code)); + + if let Some(t) = type_text.as_deref() + && let Some(k) = ts_type_to_local_collection(t.trim().trim_start_matches(':').trim()) + { + return Some(k); + } + if !has_ts_decorator_argument( param, code, @@ -586,14 +608,12 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option` matches on `Foo`. @@ -601,8 +621,41 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option`, `Set`, `WeakMap`, `WeakSet`, the four +/// built-in keyed/unkeyed collection types. +/// * `Array`, `ReadonlyArray`, the named array generics. +/// * `T[]`, `readonly T[]`, the array shorthand syntax. +/// * Same-file `type X = Map<...>` aliases (resolved via the +/// per-file `TYPE_ALIAS_LC` map populated at the top of +/// [`build_cfg`]). +/// +/// Same-file user types named `Map` / `Set` / etc. (which would +/// shadow the built-ins) are vanishingly rare in TS codebases that +/// also define the methods (`get`, `set`, `has`, `find`); the +/// classifier accepts the head match. +pub(super) fn ts_type_to_local_collection(t: &str) -> Option { + let head_text = t.trim().trim_start_matches("readonly ").trim(); + // Array shorthand: `T[]` or `readonly T[]`. 
+ if head_text.ends_with("[]") { + return Some(TypeKind::LocalCollection); + } + let head = head_text.split('<').next().unwrap_or(head_text).trim(); + match head { + "Map" | "Set" | "WeakMap" | "WeakSet" | "Array" | "ReadonlyArray" => { + Some(TypeKind::LocalCollection) + } + _ => super::TYPE_ALIAS_LC + .with(|cell| cell.borrow().contains(head)) + .then_some(TypeKind::LocalCollection), + } +} + fn inner_ts_type_text<'a>(type_anno: Node<'a>, code: &'a [u8]) -> Option { - // type_annotation node text is `: T` — unwrap to T. + // type_annotation node text is `: T`, unwrap to T. if let Some(child) = type_anno.named_child(0) { return text_of(child, code); } @@ -643,10 +696,10 @@ fn has_ts_decorator_argument(param: Node<'_>, code: &[u8], wanted: &[&str]) -> b false } -/// Rust (Axum / Rocket / Actix) — read the parameter's type text and +/// Rust (Axum / Rocket / Actix), read the parameter's type text and /// look for `Path` / `Json` / `Form` / `Query` shapes. /// Per Hard Rule 3, bare primitives (`fn h(id: i64)` without an -/// extractor wrapper) are **not** treated as typed extractors — only +/// extractor wrapper) are **not** treated as typed extractors, only /// framework-wrapped types qualify. fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option { if param.kind() != "parameter" { @@ -654,9 +707,121 @@ fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option`, + // `new_shard_docids: &'a mut hashbrown::HashMap<...>` shapes from + // meilisearch/index-scheduler's bitmap bookkeeping where the + // verb-name dispatch (`is_mutation: insert/remove`) would otherwise + // classify these as DB writes. + if let Some(k) = rust_type_to_local_collection(&type_text) { + return Some(k); + } + rust_type_to_kind(&type_text) } +/// Strip Rust reference markers, lifetimes, and `mut` from the head of +/// a type-text fragment so the underlying type name is exposed for +/// matching. 
Handles `&T`, `&mut T`, `&'a T`, `&'a mut T`, and +/// repeated `&` prefixes (e.g. `&&mut T`). +fn strip_rust_ref_markers(t: &str) -> &str { + let mut s = t.trim(); + loop { + if let Some(rest) = s.strip_prefix('&') { + let rest = rest.trim_start(); + // Optional lifetime label: `'a`, `'static`, `'_`. + let rest = if let Some(after) = rest.strip_prefix('\'') { + let end = after + .find(|c: char| !c.is_alphanumeric() && c != '_') + .unwrap_or(after.len()); + after[end..].trim_start() + } else { + rest + }; + // Optional `mut` keyword. + let rest = rest.strip_prefix("mut ").unwrap_or(rest).trim_start(); + s = rest; + continue; + } + if let Some(rest) = s.strip_prefix("mut ") { + s = rest.trim_start(); + continue; + } + break; + } + s +} + +/// Map a Rust parameter / variable type-text to +/// [`TypeKind::LocalCollection`] when the head names a known +/// in-memory container. Strips reference / lifetime / `mut` markers, +/// drops module-path prefixes (`std::collections::`, `hashbrown::`, +/// `roaring::`), then matches the head against std and ecosystem +/// collection types. +/// +/// Recognises: +/// * Std: `Vec`, `HashMap`, `HashSet`, `BTreeMap`, `BTreeSet`, +/// `VecDeque`, `BinaryHeap`, `LinkedList`. +/// * Ecosystem: `IndexMap`, `IndexSet` (indexmap), `SmallVec` +/// (smallvec), `DashMap`, `DashSet` (dashmap), `FxHashMap`, +/// `FxHashSet` (rustc-hash / fxhash), `RoaringBitmap`, +/// `RoaringTreemap` (roaring). +/// * Array / slice shorthand: `[T; N]`, `[T]` (covered by the +/// leading-`[` check after ref-stripping). +/// +/// Returns `None` for `Database<...>` (heed/sled, persistent KV +/// store, NOT a local collection, keeping this `None` preserves +/// real IDOR detection on persistent-store calls), `Mutex<...>` / +/// `RwLock<...>` (synchronisation wrappers, not sink-shape claims), +/// and bare primitives. 
+pub(super) fn rust_type_to_local_collection(t: &str) -> Option { + let stripped = strip_rust_ref_markers(t); + + // Array / slice shorthand: `[T; N]` or `[T]` (the `&` was + // already stripped). + if stripped.starts_with('[') { + return Some(TypeKind::LocalCollection); + } + + // Drop module-path prefix: keep only the last segment before `<` + // or end (`std::collections::HashMap` → `HashMap`). + let head_with_generics = stripped.rsplit("::").next().unwrap_or(stripped); + let head = head_with_generics + .split('<') + .next() + .unwrap_or(head_with_generics) + .trim(); + + const TYPES: &[&str] = &[ + "Vec", + "VecDeque", + "BinaryHeap", + "LinkedList", + "HashMap", + "HashSet", + "BTreeMap", + "BTreeSet", + "IndexMap", + "IndexSet", + "SmallVec", + "DashMap", + "DashSet", + "FxHashMap", + "FxHashSet", + "RoaringBitmap", + "RoaringTreemap", + ]; + if TYPES.contains(&head) { + Some(TypeKind::LocalCollection) + } else { + None + } +} + fn rust_type_to_kind(t: &str) -> Option { let stripped = t.trim(); // Reject reference / mutability noise so `&Path` still matches @@ -666,7 +831,7 @@ fn rust_type_to_kind(t: &str) -> Option { .trim_start_matches('&') .trim_start_matches("mut ") .trim(); - // Only framework wrapper extractors qualify — bare primitives like + // Only framework wrapper extractors qualify, bare primitives like // `i64` could be regular function parameters with no framework // validation gate. for wrap in [ @@ -684,7 +849,7 @@ fn rust_type_to_kind(t: &str) -> Option { if let Some(rest) = stripped.strip_prefix(&prefix) { if let Some(inner) = rest.strip_suffix('>') { let inner = inner.trim(); - // Tuple extractor `Path<(i64, String)>` — first element wins. + // Tuple extractor `Path<(i64, String)>`, first element wins. 
if inner.starts_with('(') { let inside = inner.trim_start_matches('(').trim_end_matches(')'); let first = inside.split(',').next().unwrap_or("").trim(); @@ -696,16 +861,16 @@ fn rust_type_to_kind(t: &str) -> Option { if let Some(k) = rust_primitive_to_kind(inner) { return Some(k); } - // Phase 6.2: `Json` / `Form` / `Query` / - // `Path` with a same-file struct type — resolve via + // `Json` / `Form` / `Query` / + // `Path` with a same-file struct type, resolve via // the DTO map. Strip nested generics so `Json>` // matches on `Foo`. let head = inner.split('<').next().unwrap_or(inner).trim(); if let Some(k) = lookup_dto_class(head) { return Some(k); } - // Custom struct outside the same file — leave None - // (cross-file resolution is Phase 6.4). + // Custom struct outside the same file, leave None + // (cross-file resolution is a follow-up). return None; } } @@ -714,7 +879,7 @@ fn rust_type_to_kind(t: &str) -> Option { } /// Map a Rust primitive / `String` / `&str` to a [`TypeKind`]. Public -/// to the `cfg` module so the Phase 6 DTO collector can reuse it for +/// to the `cfg` module so the DTO collector can reuse it for /// `struct` field types. pub(super) fn rust_primitive_to_kind(t: &str) -> Option { let t = t.trim(); @@ -728,10 +893,10 @@ pub(super) fn rust_primitive_to_kind(t: &str) -> Option { } } -/// Python (FastAPI) — recognise typed-extractor parameters via the +/// Python (FastAPI), recognise typed-extractor parameters via the /// `Annotated[X, Path()/Query()/Body()/Header()/Cookie()]` shape. Per /// Hard Rule 3, a bare `def h(id: int)` is **not** a framework -/// extractor — the function may be a plain Python function and the +/// extractor, the function may be a plain Python function and the /// type annotation provides no runtime gate. 
fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option { let type_node = param.child_by_field_name("type")?; @@ -741,7 +906,7 @@ fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option Option { let stripped = t.trim(); - // `Annotated[int, Path()]` — only matches when one of the generic + // `Annotated[int, Path()]`, only matches when one of the generic // args names a recognised FastAPI binding marker. Otherwise no // framework gate is implied. if let Some(inner) = stripped @@ -756,8 +921,8 @@ fn python_type_to_kind(t: &str) -> Option { if let Some(k) = python_primitive_to_kind(first) { return Some(k); } - // Phase 6.2: `Annotated[CreateUser, Body()]` with a same-file - // Pydantic model — resolve via the DTO map. Generic args are + // `Annotated[CreateUser, Body()]` with a same-file + // Pydantic model, resolve via the DTO map. Generic args are // dropped via the same head-split as `python_primitive_to_kind`. let head = first.split('[').next().unwrap_or(first).trim(); return lookup_dto_class(head); @@ -773,7 +938,7 @@ fn contains_fastapi_marker(s: &str) -> bool { } /// Map a Python type expression to a primitive [`TypeKind`]. Used by -/// both the per-parameter classifier and the Phase 6 Pydantic-model +/// both the per-parameter classifier and the DTO Pydantic-model /// field collector. 
pub(super) fn python_primitive_to_kind(t: &str) -> Option { let head = t.trim().split('[').next().unwrap_or(t).trim(); @@ -806,10 +971,70 @@ pub(super) fn is_configured_terminator( mod typed_extractor_tests { use super::{ contains_fastapi_marker, java_type_to_kind, python_primitive_to_kind, python_type_to_kind, - rust_primitive_to_kind, rust_type_to_kind, + rust_primitive_to_kind, rust_type_to_kind, rust_type_to_local_collection, + ts_type_to_local_collection, }; use crate::ssa::type_facts::TypeKind; + // ── TypeScript / JavaScript local-collection types ─────────────────── + + #[test] + fn ts_built_in_collections_map_to_local_collection() { + // The four keyed/unkeyed built-in container generics. + assert_eq!( + ts_type_to_local_collection("Map"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("Set"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("WeakMap"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("WeakSet"), + Some(TypeKind::LocalCollection) + ); + // Array forms. + assert_eq!( + ts_type_to_local_collection("Array"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("ReadonlyArray"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("string[]"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + ts_type_to_local_collection("readonly string[]"), + Some(TypeKind::LocalCollection) + ); + // Excalidraw-style keyed map with index-type generic args. + assert_eq!( + ts_type_to_local_collection("Map"), + Some(TypeKind::LocalCollection) + ); + } + + #[test] + fn ts_non_collection_types_return_none() { + // Plain primitives. + assert_eq!(ts_type_to_local_collection("string"), None); + assert_eq!(ts_type_to_local_collection("number"), None); + assert_eq!(ts_type_to_local_collection("boolean"), None); + // Promise / Iterator / etc. are not LocalCollections. 
+ assert_eq!(ts_type_to_local_collection("Promise"), None); + assert_eq!(ts_type_to_local_collection("Iterator"), None); + // User types. + assert_eq!(ts_type_to_local_collection("CreateUserDto"), None); + assert_eq!(ts_type_to_local_collection("ElementsMap"), None); + } + // ── Java (Spring) ──────────────────────────────────────────────────── #[test] @@ -841,7 +1066,7 @@ mod typed_extractor_tests { #[test] fn java_request_body_dto_returns_none_until_phase_six() { - // @RequestBody CreateUserDto dto — no kind today; Phase 6 will + // @RequestBody CreateUserDto dto, no kind today; future passes will // return DtoObject(name) once cross-file class resolution lands. assert_eq!(java_type_to_kind("CreateUserDto"), None); assert_eq!(java_type_to_kind("List"), None); @@ -860,7 +1085,7 @@ mod typed_extractor_tests { #[test] fn rust_path_tuple_first_element_wins() { - // Path<(i64, String)> — first slot is the int extractor that + // Path<(i64, String)>, first slot is the int extractor that // matters for sink suppression. assert_eq!( rust_type_to_kind("Path<(i64, String)>"), @@ -876,15 +1101,15 @@ mod typed_extractor_tests { #[test] fn rust_json_dto_returns_none_until_phase_six() { - // Json / Form / Query with a custom struct type — no - // primitive resolution available; Phase 6 lifts to DTO. + // Json / Form / Query with a custom struct type, no + // primitive resolution available; future passes will lift to DTO. assert_eq!(rust_type_to_kind("Json"), None); assert_eq!(rust_type_to_kind("Form"), None); assert_eq!(rust_type_to_kind("Query"), None); } /// Per Hard Rule 3, bare primitives (`fn h(id: i64)`) are NOT - /// framework extractors — only wrapper types (`Path` etc.) + /// framework extractors, only wrapper types (`Path` etc.) /// imply a framework runtime gate. Bare i64 must return None. 
#[test] fn rust_bare_primitives_are_not_framework_extractors() { @@ -903,7 +1128,7 @@ mod typed_extractor_tests { #[test] fn python_bare_primitives_are_not_framework_extractors() { // Per Hard Rule 3: bare `def h(id: int)` is NOT a typed - // extractor — without an `Annotated[..., Path()/Query()/Body()]` + // extractor, without an `Annotated[..., Path()/Query()/Body()]` // wrapper, no FastAPI gate is implied. assert_eq!(python_type_to_kind("int"), None); assert_eq!(python_type_to_kind("float"), None); @@ -936,7 +1161,7 @@ mod typed_extractor_tests { #[test] fn python_annotated_without_marker_returns_none() { // Annotated without a FastAPI binding marker is a generic - // type-system tag — not a framework extractor. + // type-system tag, not a framework extractor. assert_eq!(python_type_to_kind("Annotated[int, str]"), None); assert_eq!(python_type_to_kind("Annotated[int, MyMeta]"), None); } @@ -954,4 +1179,128 @@ mod typed_extractor_tests { assert!(contains_fastapi_marker("bytes, File()")); assert!(!contains_fastapi_marker("int, str")); } + + // ── Rust local-collection types ────────────────────────────────────── + + #[test] + fn rust_std_collections_map_to_local_collection() { + for ty in [ + "Vec", + "HashMap", + "HashSet", + "BTreeMap", + "BTreeSet", + "VecDeque", + "BinaryHeap", + "LinkedList", + ] { + assert_eq!( + rust_type_to_local_collection(ty), + Some(TypeKind::LocalCollection), + "{ty} should map to LocalCollection" + ); + } + } + + #[test] + fn rust_ecosystem_collections_map_to_local_collection() { + for ty in [ + "IndexMap", + "IndexSet", + "SmallVec<[u32; 4]>", + "DashMap", + "DashSet", + "FxHashMap", + "FxHashSet", + "RoaringBitmap", + "RoaringTreemap", + ] { + assert_eq!( + rust_type_to_local_collection(ty), + Some(TypeKind::LocalCollection), + "{ty} should map to LocalCollection" + ); + } + } + + #[test] + fn rust_module_qualified_collections_map_to_local_collection() { + // Module-path prefixes: keep only the last segment for matching. 
+ assert_eq!( + rust_type_to_local_collection("std::collections::HashMap"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + rust_type_to_local_collection("hashbrown::HashMap"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + rust_type_to_local_collection("roaring::RoaringBitmap"), + Some(TypeKind::LocalCollection) + ); + } + + #[test] + fn rust_reference_and_lifetime_markers_stripped() { + // `&T`, `&mut T`, `&'a T`, `&'a mut T`, `&'static T`, + // repeated `&` prefixes, all reach the underlying type head. + for ty in [ + "&RoaringBitmap", + "&mut RoaringBitmap", + "&'a RoaringBitmap", + "&'a mut RoaringBitmap", + "&'static RoaringBitmap", + "&&mut RoaringBitmap", + "&'a mut hashbrown::HashMap", + ] { + assert_eq!( + rust_type_to_local_collection(ty), + Some(TypeKind::LocalCollection), + "{ty} should map to LocalCollection after ref stripping" + ); + } + } + + #[test] + fn rust_array_and_slice_shorthand_map_to_local_collection() { + // `[T; N]` arrays and `[T]` slices are local containers. + assert_eq!( + rust_type_to_local_collection("[u32; 4]"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + rust_type_to_local_collection("[u8]"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + rust_type_to_local_collection("&[u32]"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + rust_type_to_local_collection("&mut [u32]"), + Some(TypeKind::LocalCollection) + ); + } + + #[test] + fn rust_persistent_db_and_sync_wrappers_return_none() { + // heed / sled / rocksdb persistent-store handles are NOT local + // collections, preserves IDOR detection on real DB calls. + assert_eq!( + rust_type_to_local_collection("Database>"), + None + ); + assert_eq!(rust_type_to_local_collection("heed::Database"), None); + assert_eq!(rust_type_to_local_collection("sled::Db"), None); + // Sync wrappers don't claim a sink shape. 
+ assert_eq!(rust_type_to_local_collection("Mutex>"), None); + assert_eq!(rust_type_to_local_collection("RwLock>"), None); + // Bare primitives. + assert_eq!(rust_type_to_local_collection("u32"), None); + assert_eq!(rust_type_to_local_collection("&str"), None); + assert_eq!(rust_type_to_local_collection("String"), None); + // Unrelated user types. + assert_eq!(rust_type_to_local_collection("MyDao"), None); + assert_eq!(rust_type_to_local_collection("Connection"), None); + } } diff --git a/src/cfg_analysis/auth.rs b/src/cfg_analysis/auth.rs index 698c9b2f..d521d48f 100644 --- a/src/cfg_analysis/auth.rs +++ b/src/cfg_analysis/auth.rs @@ -107,11 +107,11 @@ fn has_web_handler_params(ctx: &AnalysisContext, func_name: &str) -> bool { /// Determine if a function qualifies as a web entrypoint (not just any entrypoint). /// /// A web entrypoint must: -/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.) — but NOT bare `main` +/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.), but NOT bare `main` /// unless it has web-like parameters /// 2. Have parameters resembling HTTP handler signatures fn is_web_entrypoint(ctx: &AnalysisContext, func_name: &str) -> bool { - // "main" without web params is a CLI entrypoint — skip + // "main" without web params is a CLI entrypoint, skip if func_name == "main" { return has_web_handler_params(ctx, func_name); } @@ -163,7 +163,7 @@ impl CfgAnalysis for AuthGap { fn run(&self, ctx: &AnalysisContext) -> Vec { // Decorator/annotation/attribute auth on the body declaration - // already gates every sink in the body — skip the + // already gates every sink in the body, skip the // structural-call dominance check entirely when the framework // enforces auth at the declaration level. 
Mirrors the // `classify_auth_decorators` lookup the state engine uses to diff --git a/src/cfg_analysis/error_handling.rs b/src/cfg_analysis/error_handling.rs index 58a06bec..09e2819a 100644 --- a/src/cfg_analysis/error_handling.rs +++ b/src/cfg_analysis/error_handling.rs @@ -14,7 +14,7 @@ use petgraph::visit::EdgeRef; /// Returns true if the identifier is exactly `err` / `error` or a /// snake-case error name (`err_x`, `error_x`, `x_err`, `x_error`). /// CamelCase names (`isErrorEnabled`, `getError`, `errorMsg`) are -/// rejected — the cost is occasional FNs on Java-style error fields, +/// rejected, the cost is occasional FNs on Java-style error fields, /// which is acceptable for a precision fix. fn is_error_var_ident(name: &str) -> bool { let lower = name.to_ascii_lowercase(); @@ -36,7 +36,7 @@ fn is_error_var_ident(name: &str) -> bool { /// Used by the error-fallthrough rule to skip happy-path checks /// like `if (!data.error && Array.isArray(results))` whose TRUE branch /// is the success path and is not expected to return. The original -/// rule fires on `if (err) { warn(); } sink_after()` — a positive +/// rule fires on `if (err) { warn(); } sink_after()`, a positive /// error check whose body forgets to early-return. fn contains_negated_err_identifier(text: &str) -> bool { let bytes = text.as_bytes(); @@ -46,7 +46,7 @@ fn contains_negated_err_identifier(text: &str) -> bool { i += 1; continue; } - // Skip the `!=` / `!==` operators — those are comparisons, not + // Skip the `!=` / `!==` operators, those are comparisons, not // logical-not. Only treat a `!` followed by whitespace or an // identifier-leading char as logical negation. 
if i + 1 < bytes.len() && bytes[i + 1] == b'=' { @@ -57,7 +57,7 @@ fn contains_negated_err_identifier(text: &str) -> bool { while j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') { j += 1; } - // Allow a leading `(` for `!(expr)` shapes — peek past one open + // Allow a leading `(` for `!(expr)` shapes, peek past one open // paren and continue capturing the identifier chain. if j < bytes.len() && bytes[j] == b'(' { j += 1; @@ -118,7 +118,95 @@ fn branch_terminates(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> bool { false } -/// Check if all paths from `node` reach a Return/Break/Continue before exiting scope. +/// Recognise calls that never return on the success path. +/// +/// `cfg-error-fallthrough` looks for `if err != nil { … }` whose body +/// fails to terminate. A `return`/`break`/`continue`/`throw` is the +/// canonical terminator and already produces a `StmtKind::Return` / +/// `Throw` / `Break` / `Continue` node. But a large class of real +/// terminators arrives as a *call* whose callee is documented to abort +/// the goroutine, process, or test: +/// +/// * Go testing, `t.Fatal`, `t.Fatalf`, `t.Fatalln`, `b.Fatal*`, +/// `*Helper()` chains ending in `Fatal*`, also third-party +/// `require.NoError(t, …)` (asserts and aborts on err) which the +/// common `c.Fatalf("...")` pattern in minio's table tests reduces +/// to. All `Fatal*` methods on a `testing.T`/`B`/`F` call +/// `runtime.Goexit()` which is documented as never returning to the +/// caller. +/// * Go std-library, `os.Exit`, `syscall.Exit`, `runtime.Goexit`, +/// `log.Fatal`, `log.Fatalf`, `log.Fatalln`, `log.Panic*`. +/// * Go builtin, bare `panic(…)`. +/// * Rust, `panic!`, `unreachable!`, `unimplemented!`, `todo!`, +/// `process::exit`, `std::process::exit`, `process::abort`, +/// `std::process::abort` (the macros currently lower to +/// `StmtKind::Throw` via tree-sitter's macro arm; the function +/// forms need explicit recognition). 
+/// * Python, `sys.exit`, `os._exit`, `os.abort`. +/// +/// The recogniser looks at the bare method name (last segment after +/// `.` or `::`) and, where the receiver is a closed token, the +/// receiver's first segment. Bare `panic` / `exit` callees are +/// recognised only when the namespace context matches (callee equals +/// the literal string, no other receiver). This keeps the recogniser +/// from claiming arbitrary user-defined `Exit(...)` / `Panic(...)` as +/// terminators. +/// +/// Closes the minio test-file cluster (49 in `xl-storage_test.go` +/// alone, 176 across the repo) where every `if err != nil { c.Fatalf(...) }` +/// fired `cfg-error-fallthrough`: the `Fatalf` aborts the goroutine +/// and the post-if code never executes, but the rule classified it as +/// fall-through. Conservative: only adds new terminators; never +/// removes the existing `Return`/`Throw`/`Break`/`Continue` recognition. +fn call_never_returns(info: &crate::cfg::NodeInfo) -> bool { + if info.kind != StmtKind::Call { + return false; + } + let Some(callee) = info.call.callee.as_deref() else { + return false; + }; + let last = callee.rsplit(['.', ':']).next().unwrap_or(callee); + + // Method names that always terminate when called on any receiver + // that's a testing handle (`*testing.T`, `*testing.B`, `*testing.F`) + // or a logger. Receiver type is unknown to this rule; the names + // are sufficiently distinctive that arbitrary user-defined methods + // sharing the name are vanishingly rare. + if matches!( + last, + // Go testing + "Fatal" | "Fatalf" | "Fatalln" | "FailNow" | + // Go log/slog terminating handlers + "Panic" | "Panicf" | "Panicln" | + // Rust process / never-return std fns + "abort" | "unreachable_unchecked" + ) { + return true; + } + + // Bare callees (no receiver) that are language builtins or + // unambiguous std-library terminators. 
+ match callee { + // Go builtin + "panic" => return true, + // Go std + "os.Exit" | "syscall.Exit" | "runtime.Goexit" | "log.Fatal" | "log.Fatalf" + | "log.Fatalln" | "log.Panic" | "log.Panicf" | "log.Panicln" | "slog.Fatal" + | "klog.Fatal" | "klog.Fatalf" | "klog.Exit" | "klog.Exitf" => return true, + // Rust std + "process::exit" | "process::abort" | "std::process::exit" | "std::process::abort" => { + return true; + } + // Python std + "sys.exit" | "os._exit" | "os.abort" => return true, + _ => {} + } + + false +} + +/// Check if all paths from `node` reach a Return/Break/Continue (or a +/// known never-returning call) before exiting scope. fn terminates_on_all_paths( cfg: &crate::cfg::Cfg, node: NodeIndex, @@ -142,10 +230,15 @@ fn terminates_on_all_paths( } _ => {} } + if call_never_returns(info) { + // Documented never-returning call (`t.Fatalf`, `os.Exit`, + // `panic`, `runtime.Goexit`, …), this path terminates. + continue; + } let successors: Vec<_> = cfg.neighbors(current).collect(); if successors.is_empty() { - // Reached a dead end without terminating — path does not terminate + // Reached a dead end without terminating, path does not terminate return false; } @@ -181,7 +274,7 @@ fn find_post_if_sinks(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> Vec = cfg .edges(if_node) @@ -225,9 +318,9 @@ impl CfgAnalysis for IncompleteErrorHandling { // Look for If nodes whose CONDITION involves "err" or "error". // `info.taint.uses` for an If node contains identifiers from the - // whole if statement (condition + body) — see + // whole if statement (condition + body), see // `cfg::literals::extract_defs_uses_extra_defs` Kind::If branch - // — so checking it would misfire on `if (!res.ok) { ... const + //, so checking it would misfire on `if (!res.ok) { ... const // err = await … ; return … }` shapes whose body happens to // mention `err` even though the condition doesn't. 
Use // `info.condition_vars`, which is populated strictly from the @@ -244,7 +337,7 @@ impl CfgAnalysis for IncompleteErrorHandling { // Polarity gate: only fire when the condition POSITIVELY // checks for an error. `if (!data.error && other)` is a - // happy-path check — the TRUE branch is the success branch + // happy-path check, the TRUE branch is the success branch // and is not expected to terminate. Detect by scanning the // condition text for any `!` (logical-not, distinct from // `!=`) preceding an identifier whose name contains "err". @@ -354,7 +447,7 @@ mod err_ident_tests { fn rejects_camelcase_method_names() { // Spring `logger.isErrorEnabled()` lifts `isErrorEnabled` into // `condition_vars`; under the old `lower.contains("err")` check - // this fired the rule. The new strict check rejects it — the + // this fired the rule. The new strict check rejects it, the // condition is asking "is logging enabled", not "is there an // error". assert!(!is_error_var_ident("isErrorEnabled")); @@ -371,3 +464,103 @@ mod err_ident_tests { assert!(!is_error_var_ident("perform")); } } + +#[cfg(test)] +mod terminator_call_tests { + use super::call_never_returns; + use crate::cfg::{CallMeta, NodeInfo, StmtKind}; + + fn call_node(callee: &str) -> NodeInfo { + NodeInfo { + kind: StmtKind::Call, + call: CallMeta { + callee: Some(callee.to_string()), + ..Default::default() + }, + ..Default::default() + } + } + + #[test] + fn recognises_go_testing_fatal_methods() { + // Bare method name on any receiver, the canonical minio test + // shape `c.Fatalf("bucket creat error: %v", err)`. + assert!(call_never_returns(&call_node("c.Fatalf"))); + assert!(call_never_returns(&call_node("t.Fatal"))); + assert!(call_never_returns(&call_node("t.Fatalf"))); + assert!(call_never_returns(&call_node("t.Fatalln"))); + assert!(call_never_returns(&call_node("b.Fatal"))); + assert!(call_never_returns(&call_node("t.FailNow"))); + // Logger panics (handler-style fatal). 
+ assert!(call_never_returns(&call_node("logger.Panic"))); + assert!(call_never_returns(&call_node("logger.Panicf"))); + } + + #[test] + fn recognises_go_std_terminators() { + assert!(call_never_returns(&call_node("os.Exit"))); + assert!(call_never_returns(&call_node("syscall.Exit"))); + assert!(call_never_returns(&call_node("runtime.Goexit"))); + assert!(call_never_returns(&call_node("log.Fatal"))); + assert!(call_never_returns(&call_node("log.Fatalf"))); + assert!(call_never_returns(&call_node("log.Fatalln"))); + assert!(call_never_returns(&call_node("log.Panic"))); + assert!(call_never_returns(&call_node("klog.Exit"))); + // Bare builtin + assert!(call_never_returns(&call_node("panic"))); + } + + #[test] + fn recognises_rust_and_python_std_terminators() { + assert!(call_never_returns(&call_node("std::process::exit"))); + assert!(call_never_returns(&call_node("std::process::abort"))); + assert!(call_never_returns(&call_node("process::exit"))); + assert!(call_never_returns(&call_node("sys.exit"))); + assert!(call_never_returns(&call_node("os._exit"))); + } + + #[test] + fn does_not_claim_user_defined_lookalikes() { + // Bare `Exit` on a custom receiver is a normal method, not the + // process-level terminator. The bare callee path only matches + // exact std-library forms. + assert!(!call_never_returns(&call_node("server.Exit"))); + assert!(!call_never_returns(&call_node("Exit"))); + assert!(!call_never_returns(&call_node("session.exit"))); + // Bare `panic` is a Go builtin; method `panic` is not. + // The recogniser keys off the full callee path so + // `widget.panic` does not match. + assert!(!call_never_returns(&call_node("widget.panic"))); + // Common helpers that *don't* terminate. 
+ assert!(!call_never_returns(&call_node("log.Print"))); + assert!(!call_never_returns(&call_node("log.Println"))); + assert!(!call_never_returns(&call_node("t.Errorf"))); + assert!(!call_never_returns(&call_node("t.Logf"))); + assert!(!call_never_returns(&call_node("c.Skip"))); + } + + #[test] + fn requires_call_kind() { + // Only StmtKind::Call nodes are inspected; an If or Seq node + // carrying the same callee text wouldn't ever come through + // this path. Defensive: confirm the kind gate. + let mut node = call_node("t.Fatal"); + node.kind = StmtKind::Seq; + assert!(!call_never_returns(&node)); + node.kind = StmtKind::If; + assert!(!call_never_returns(&node)); + } + + #[test] + fn missing_callee_does_not_panic() { + let node = NodeInfo { + kind: StmtKind::Call, + call: CallMeta { + callee: None, + ..Default::default() + }, + ..Default::default() + }; + assert!(!call_never_returns(&node)); + } +} diff --git a/src/cfg_analysis/guards.rs b/src/cfg_analysis/guards.rs index f10522d6..672a5a11 100644 --- a/src/cfg_analysis/guards.rs +++ b/src/cfg_analysis/guards.rs @@ -29,7 +29,7 @@ pub struct UnguardedSink; /// receiver recorded as a compound identifier rather than a named binding). fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool { // Fast path: syntactic literal detection from CFG construction. - // Strictly weaker than the one-hop trace below — serves as an + // Strictly weaker than the one-hop trace below, serves as an // optimization for the common case of inline literal arguments. if ctx.cfg[sink].all_args_literal { return true; @@ -127,17 +127,17 @@ fn ssa_all_sink_operands_constant( /// SSA-backed reassign-aware safety probe: every operand of the sink /// resolves to a constant, callee fragment, OR a function parameter that /// is not itself a Source. 
Used at the cfg-unguarded-sink site under -/// `!has_taint` — the taint engine has already proved no source-tainted +/// `!has_taint`, the taint engine has already proved no source-tainted /// data reaches the sink, so a non-source Param at operand position is /// inert payload-wise (e.g. HTTP writer in `Fprintf(w, "

", "Guest")`). /// /// Gated on the function body actually exhibiting the reassign-to-constant -/// signature — at least one named SSA def whose RHS is a literal Const +/// signature, at least one named SSA def whose RHS is a literal Const /// (`name = "Guest"`). In a thin wrapper without a same-block named /// const assignment (`fn wrap(p) { sink(p) }`, or C `popen(buf, "r")` where /// `buf` is filled in-place by `sprintf` with no Const Assign on `buf`), /// the bare Param at operand position IS the payload and the suppression's -/// rationale does not apply — `cfg-unguarded-sink` must still fire. +/// rationale does not apply, `cfg-unguarded-sink` must still fire. fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) -> bool { let Some(facts) = ctx.body_const_facts else { return false; @@ -165,13 +165,13 @@ fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) } /// Return true if the SSA body contains a *named* variable whose definition -/// is a constant — the SSA signature of an explicit `name = "literal"` +/// is a constant, the SSA signature of an explicit `name = "literal"` /// reassignment. Used as the gate for the broader operand-Param suppression: /// the suppression's purpose is the reassign-to-constant idiom, which by /// definition has at least one named const assignment. In a thin wrapper /// (`fn wrap(p) { sink(p) }` or `popen(buf, "r")` where `buf` is filled by /// `sprintf`), no such named const assignment exists and the suppression's -/// rationale doesn't apply — so the bare-Param structural finding fires. +/// rationale doesn't apply, so the bare-Param structural finding fires. fn func_body_has_named_const_assign(facts: &BodyConstFacts) -> bool { for block in &facts.ssa.blocks { for inst in &block.body { @@ -228,7 +228,7 @@ fn ssa_operand_const_or_param( // CFG-node-level Source label: when an SSA `Call` corresponds to a // Source-labeled CFG node (e.g. 
`env::var(...)` whose callee // matches a `LabelRule` Source matcher), the call's result is - // tainted user input — refuse, regardless of how the SSA + // tainted user input, refuse, regardless of how the SSA // happened to lower. Catches the `SsaOp::Call` lowering of // labeled Source functions, which the `SsaOp::Source` arm only // sees for callee-less pure sources like PHP `$_GET`. @@ -266,7 +266,7 @@ fn ssa_operand_const_or_param( } SsaOp::Source => return false, SsaOp::Nop | SsaOp::Undef => {} - // FieldProj: walk the receiver — `obj.f` is constant iff `obj` + // FieldProj: walk the receiver, `obj.f` is constant iff `obj` // is constant under the same definition. The field name itself // is structural and adds no runtime value. SsaOp::FieldProj { receiver, .. } => stack.push(*receiver), @@ -321,7 +321,7 @@ fn ssa_operand_constant( } SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam | SsaOp::Source => { // Only acceptable when the param's `var_name` is a callee - // fragment — i.e. an identifier that only appears because + // fragment, i.e. an identifier that only appears because // the CFG recorded name components of the dotted/chained // callee as uses. Real parameters and sources are dynamic. let name = inst.var_name.as_deref().unwrap_or(""); @@ -333,7 +333,7 @@ fn ssa_operand_constant( } } SsaOp::Nop => {} - // Undef is a non-user, non-dynamic sentinel — treat like Const + // Undef is a non-user, non-dynamic sentinel, treat like Const // (no additional operands to trace). SsaOp::Undef => {} // FieldProj: structural field read; constness reduces to the @@ -440,7 +440,7 @@ fn sink_args_typed_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) !is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts) } // Constant string literals used as inline args (e.g. 
`"listener"`, - // `"-c"`) are not user-controlled — treat as non-real for the + // `"-c"`) are not user-controlled, treat as non-real for the // "all int-typed" test so they don't block suppression. SsaOp::Const(_) => false, _ => true, @@ -477,7 +477,7 @@ fn type_facts_suppress(values: &[SsaValue], sink_caps: Cap, type_facts: &TypeFac /// lookup idiom (e.g. `map.get(x).unwrap_or("safe")` over literal inserts) /// should clear a command-injection sink. /// -/// Only fires for `Cap::SHELL_ESCAPE` — SQL / path suppression from this +/// Only fires for `Cap::SHELL_ESCAPE`, SQL / path suppression from this /// domain would require stronger reasoning (literal keys can still carry /// SQL tokens if the inserts themselves contain them). fn sink_args_static_map_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { @@ -595,6 +595,71 @@ fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option Option { + if info.condition_vars.len() != 1 { + return None; + } + let var_name = info.condition_vars[0].as_str(); + let cond_func = info.ast.enclosing_func.as_deref(); + let cond_span_start = info.ast.span.0; + + // Walk the CFG for any node that DEFINES `var_name` via a Call + // expression. Same-function only, and only consider definitions + // textually before the condition: a reassignment after the `if` + // cannot be the def reaching it. Among the eligible defs, take + // the textually-last one (highest span start), a conservative + // latest-def proxy without paying for full dominator analysis. 
+ let mut best: Option<(usize, &str)> = None; + for nidx in ctx.cfg.node_indices() { + let n = &ctx.cfg[nidx]; + if n.kind != crate::cfg::StmtKind::Call { + continue; + } + if n.taint.defines.as_deref() != Some(var_name) { + continue; + } + if n.ast.enclosing_func.as_deref() != cond_func { + continue; + } + let span_start = n.ast.span.0; + if span_start >= cond_span_start { + continue; + } + let Some(callee) = n.call.callee.as_deref() else { + continue; + }; + match best { + Some((s, _)) if s >= span_start => {} + _ => best = Some((span_start, callee)), + } + } + let (_, callee) = best?; + + crate::ssa::type_facts::classify_input_validator_callee(callee).map(|_| callee.to_string()) +} + /// Find all nodes in the CFG that are calls to guard functions. fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> { let guard_rules = rules::guard_rules(ctx.lang); @@ -620,6 +685,24 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> { | PredicateKind::ValidationCall ) { result.push((idx, Cap::all())); + } else if cond_indirect_validator_callee(info, ctx).is_some() { + // Indirect-validator pattern: + // const err = validate(x); if (err) throw …; + // const ok = isValid(x); if (!ok) throw …; + // The classifier returns Unknown / NullCheck / ErrorCheck + // because the if-condition is a bare result variable, not + // a direct call expression. `cond_indirect_validator_callee` + // handles that by scanning the CFG for nodes whose + // `TaintMeta.defines` matches the condition variable and + // checking whether any defining Call has an + // `is_input_validator_callee`-recognised callee. This keeps + // cfg-unguarded-sink suppression aligned with the same + // structural validator recognition the SSA branch-narrowing + // layer uses, without requiring the condition itself to be + // a direct call expression. + // + // Motivated by Novu CVE GHSA-4x48-cgf9-q33f. 
+ result.push((idx, Cap::all())); } else if matches!( kind, PredicateKind::ShellMetaValidated | PredicateKind::BoundedLength @@ -733,7 +816,7 @@ fn sink_arg_is_parameter_only(ctx: &AnalysisContext, sink: NodeIndex) -> bool { let sink_uses = &sink_info.taint.uses; if sink_uses.is_empty() { - // No identifiable arguments — could be a constant call like Command::new("ls") + // No identifiable arguments, could be a constant call like Command::new("ls") return true; // treat as non-dangerous (constant arg) } @@ -787,7 +870,7 @@ pub(crate) fn has_redirect_path_prefix(source_bytes: &[u8], span: (usize, usize) false } -/// Check if this sink is an internal redirect — a `res.redirect` (SSRF sink) +/// Check if this sink is an internal redirect, a `res.redirect` (SSRF sink) /// whose argument is a template literal or string starting with `/`, indicating /// a server-relative path rather than an attacker-controlled URL. fn is_internal_redirect(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool { @@ -896,7 +979,7 @@ impl CfgAnalysis for UnguardedSink { let source_derived = sink_arg_is_source_derived(ctx, *sink); // If sink args are all constants (including one-hop constant bindings) - // and taint didn't confirm, this is a false positive — skip it. + // and taint didn't confirm, this is a false positive, skip it. if is_all_args_constant(ctx, *sink) && !has_taint { continue; } @@ -904,7 +987,7 @@ impl CfgAnalysis for UnguardedSink { // SSA latest-def suppression: when the taint engine has already // proved no source-tainted data reaches this sink (`!has_taint`) // and every SSA operand resolves to a constant, callee-fragment - // pseudo-name, OR a function parameter that is not a Source — + // pseudo-name, OR a function parameter that is not a Source , // the sink's actual arguments cannot carry an injection payload. 
// Catches the reassign-to-constant idiom (`name := req.x; name = // "Guest"; sink(name)`) where the latest SSA def is a literal @@ -919,7 +1002,7 @@ impl CfgAnalysis for UnguardedSink { // Type-aware suppression: when all SSA operand values of the sink // are proven to carry non-injectable types (e.g. integers parsed // from a raw source), the arguments cannot form a payload for - // SHELL/SQL/FILE sinks. Skip the structural finding — the taint + // SHELL/SQL/FILE sinks. Skip the structural finding, the taint // engine already covers the source→sink flow via type-aware // suppression. Unknown-typed or mixed operands fall through. if !has_taint && sink_args_typed_safe(ctx, *sink, sink_caps) { @@ -936,13 +1019,13 @@ impl CfgAnalysis for UnguardedSink { // Parameterized SQL queries: arg 0 is a string literal with // placeholders ($1, ?, %s, :name) and a params argument exists. - // These are safe by construction — the driver handles escaping. + // These are safe by construction, the driver handles escaping. if sink_info.parameterized_query { continue; } // Internal redirects: res.redirect(`/path/...`) with a path-prefix - // argument are server-relative — not attacker-controlled URLs. + // argument are server-relative, not attacker-controlled URLs. if is_internal_redirect(ctx, *sink, sink_caps) { continue; } @@ -953,10 +1036,10 @@ impl CfgAnalysis for UnguardedSink { let (severity, confidence) = if has_taint || source_derived { (Severity::High, Confidence::High) } else if param_only && !in_entrypoint { - // Wrapper function with param-only args — zero signal. Suppress. + // Wrapper function with param-only args, zero signal. Suppress. continue; } else if !ctx.taint_active { - // AST-only / cfg-only mode — preserve as LOW (unchanged) + // AST-only / cfg-only mode, preserve as LOW (unchanged) (Severity::Low, Confidence::Low) } else { // taint_active=true but found nothing. 
@@ -970,7 +1053,7 @@ impl CfgAnalysis for UnguardedSink { // If the function containing the sink has no Source-labeled // nodes AND no parameters (through which taint could flow // from callers), taint ran and found nothing because there - // is nothing to find. Suppress — the structural finding + // is nothing to find. Suppress, the structural finding // is noise. let sink_func = sink_info.ast.enclosing_func.as_deref(); let has_sources = ctx.cfg.node_indices().any(|n| { diff --git a/src/cfg_analysis/mod.rs b/src/cfg_analysis/mod.rs index 4cb539b6..7f68f6a3 100644 --- a/src/cfg_analysis/mod.rs +++ b/src/cfg_analysis/mod.rs @@ -1,3 +1,5 @@ +#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))] + pub mod auth; pub mod dominators; pub mod error_handling; @@ -30,17 +32,15 @@ pub struct BodyConstFacts { pub type_facts: TypeFactResult, /// Field-sensitive Steensgaard points-to facts. /// - /// Computed only when [`crate::pointer::is_enabled()`] (i.e. the - /// `NYX_POINTER_ANALYSIS=1` env var is set). Phase 2 of the - /// pointer-analysis rollout consumes this in `state::transfer.rs` - /// to suppress proxy-acquire mis-attribution on field-aliased - /// locals like `m := c.mu`. When `None`, every consumer must fall - /// back to its existing pointer-unaware behaviour. + /// Computed only when [`crate::pointer::is_enabled()`]. + /// `state::transfer.rs` consumes this to suppress proxy-acquire + /// mis-attribution on field-aliased locals like `m := c.mu`. When + /// `None`, consumers fall back to pointer-unaware behaviour. pub pointer_facts: Option, } /// Lower a body to SSA and run constant propagation. Returns `None` when -/// lowering fails (empty CFG, invalid entry) — callers treat absence as +/// lowering fails (empty CFG, invalid entry), callers treat absence as /// "no SSA facts available" and fall back to the syntactic path. 
pub fn build_body_const_facts(body: &crate::cfg::BodyCfg, lang: Lang) -> Option { let mut ssa = crate::ssa::lower_to_ssa_with_params( @@ -116,13 +116,13 @@ pub struct AnalysisContext<'a> { /// Structural analyses use it to suppress findings when a sink's argument /// SSA values are proven to carry non-injectable types (e.g. integers /// parsed from a raw source can't form SHELL/SQL/path payloads). Sourced - /// from `body_const_facts` when present — keep both pointers coherent. + /// from `body_const_facts` when present, keep both pointers coherent. pub type_facts: Option<&'a TypeFactResult>, /// Decorators / annotations / attributes attached to the body's /// declaration (e.g. Python `@login_required`, Java `@PreAuthorize`, /// Symfony `#[IsGranted(...)]`). Consumed by the AuthGap analysis to /// suppress `cfg-auth-gap` when the framework already enforces auth at - /// the function-declaration level — the gap only matters when the + /// the function-declaration level, the gap only matters when the /// auth call has to live inside the body. pub auth_decorators: &'a [String], } diff --git a/src/cfg_analysis/resources.rs b/src/cfg_analysis/resources.rs index 2f93ac19..3feffd0d 100644 --- a/src/cfg_analysis/resources.rs +++ b/src/cfg_analysis/resources.rs @@ -25,7 +25,7 @@ fn find_acquire_nodes( } if let Some(callee) = &info.call.callee { let callee_lower = callee.to_ascii_lowercase(); - // Check exclusions first — if the callee matches an exclude + // Check exclusions first, if the callee matches an exclude // pattern, it is NOT an acquire even if it also matches an // acquire pattern (e.g. `freopen` ends with `fopen`). 
let excluded = exclude_patterns.iter().any(|p| { @@ -74,7 +74,7 @@ fn find_release_nodes(ctx: &AnalysisContext, release_patterns: &[&str]) -> Vecnext = ...` (linked-list insertion) /// -/// If the variable is transferred, there is no leak — the receiving struct is +/// If the variable is transferred, there is no leak, the receiving struct is /// responsible for the lifetime. fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool { let acquired_var = match &ctx.cfg[acquire].taint.defines { @@ -258,7 +258,7 @@ fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool { false }; if !is_field_write { - continue; // genuine redefinition — stop this path + continue; // genuine redefinition, stop this path } } @@ -343,7 +343,7 @@ fn is_consumed_by_owner(ctx: &AnalysisContext, acquire: NodeIndex) -> bool { } } - // Also check the span text for consuming calls — handles cases where + // Also check the span text for consuming calls, handles cases where // the call is embedded in a return statement (e.g. `return FileResponse(f)`) if info.taint.uses.iter().any(|u| u == &acquired_var) { let (start, end) = info.ast.span; diff --git a/src/cfg_analysis/rules.rs b/src/cfg_analysis/rules.rs index adc446f0..3af3fd04 100644 --- a/src/cfg_analysis/rules.rs +++ b/src/cfg_analysis/rules.rs @@ -141,7 +141,7 @@ static JAVA_AUTH: &[AuthRule] = &[AuthRule { "hasPermission", "requireRole", // Spring Security / JAX-RS annotation names (used by decorator - // detection — see `extract_auth_decorators` in src/cfg.rs). + // detection, see `extract_auth_decorators` in src/cfg.rs). "PreAuthorize", "PostAuthorize", "Secured", @@ -174,7 +174,7 @@ static JS_AUTH: &[AuthRule] = &[AuthRule { "jwt.verify", // NestJS-style decorators and guard class names (seeded by decorator // arg extraction in `extract_auth_decorators`). `UseGuards` alone is - // too generic — we still match on guard *argument* identifiers here. 
+ // too generic, we still match on guard *argument* identifiers here. "Authenticated", "AuthGuard", "JwtAuthGuard", @@ -268,7 +268,7 @@ static CPP_AUTH: &[AuthRule] = &[AuthRule { "check_auth", "verify_token", "validate_token", - // Custom C++ attributes — framework-defined, bare-name match. + // Custom C++ attributes, framework-defined, bare-name match. "authenticated", "require_auth", "admin_only", @@ -287,7 +287,7 @@ static RUST_AUTH: &[AuthRule] = &[AuthRule { "check_auth", "verify_token", "validate_token", - // Custom proc-macro attributes — framework-defined, bare-name match. + // Custom proc-macro attributes, framework-defined, bare-name match. "authenticated", "require_auth", "admin_only", diff --git a/src/cfg_analysis/tests.rs b/src/cfg_analysis/tests.rs index 9b20752d..852de09a 100644 --- a/src/cfg_analysis/tests.rs +++ b/src/cfg_analysis/tests.rs @@ -127,7 +127,7 @@ fn unreachable_code_detection_runs_without_panic() { #[test] fn all_branches_reachable_no_findings() { - // All branches reachable — no unreachable-code findings + // All branches reachable, no unreachable-code findings let src = br#" use std::process::Command; fn main() { @@ -282,7 +282,7 @@ fn ssa_const_prop_preserves_sink_on_dynamic_source_arg() { #[test] fn unguarded_sink_detected() { - // Sink with no validation — should be flagged + // Sink with no validation, should be flagged let src = br#" use std::process::Command; fn main() { @@ -335,6 +335,90 @@ fn guarded_sink_with_sanitizer_not_flagged() { ); } +/// Regression: `cond_indirect_validator_callee` used to pick the +/// textually-last call defining the condition variable across the +/// whole function, including reassignments that occur **after** the +/// `if`. When that later call wasn't a recognised validator, the +/// validator pattern was missed and the downstream sink was +/// (incorrectly) flagged as `cfg-unguarded-sink`. 
+/// +/// Pattern: +/// let err = validateInput(cmd); // real validator, before the if +/// if (err) throw …; // sink-guarding branch +/// eval(cmd); // sink dominated by the guard +/// err = recordMetric(); // later reassignment, NOT a validator +/// +/// The defining call reaching the `if` is `validateInput`; the +/// `recordMetric` reassignment is downstream of the use and must not +/// shadow it. +#[test] +fn indirect_validator_ignores_reassignment_after_if() { + let src = br#" +async function handler(req) { + const cmd = req.query.cmd; + let err = await validateInput(cmd); + if (err) { + throw new Error('blocked'); + } + eval(cmd); + err = recordMetric(); +} +"#; + + let findings = parse_and_analyse( + &guards::UnguardedSink, + src, + "javascript", + Language::from(tree_sitter_javascript::LANGUAGE), + ); + + let guard_findings: Vec<_> = findings + .iter() + .filter(|f| f.rule_id == "cfg-unguarded-sink") + .collect(); + assert!( + guard_findings.is_empty(), + "later non-validator reassignment must not shadow the real validator def reaching the if; got {:?}", + guard_findings + ); +} + +/// Companion sanity check for `indirect_validator_ignores_reassignment_after_if`: +/// without the trailing reassignment the same pattern is already +/// suppressed by `cond_indirect_validator_callee`. Pinned so a future +/// change to the indirect-validator recognition can't silently regress +/// this baseline alongside the regression case above. 
+#[test] +fn indirect_validator_baseline_suppresses_dominated_sink() { + let src = br#" +async function handler(req) { + const cmd = req.query.cmd; + const err = await validateInput(cmd); + if (err) { + throw new Error('blocked'); + } + eval(cmd); +} +"#; + + let findings = parse_and_analyse( + &guards::UnguardedSink, + src, + "javascript", + Language::from(tree_sitter_javascript::LANGUAGE), + ); + + let guard_findings: Vec<_> = findings + .iter() + .filter(|f| f.rule_id == "cfg-unguarded-sink") + .collect(); + assert!( + guard_findings.is_empty(), + "indirect-validator pattern (no reassignment) must suppress dominated sink; got {:?}", + guard_findings + ); +} + // ─── Auth gap tests ──────────────────────────────────────────────────── #[test] @@ -397,7 +481,7 @@ fn auth_check_before_sink_no_finding() { #[test] fn error_fallthrough_analysis_runs_on_go() { // Go pattern: err check without return, followed by dangerous call. - // This is a heuristic analysis — we verify it runs without panicking. + // This is a heuristic analysis, we verify it runs without panicking. let src = br#" package main import "os/exec" @@ -422,7 +506,7 @@ fn error_fallthrough_analysis_runs_on_go() { #[test] fn proper_error_return_no_finding_go() { - // Go pattern: err check with return — should not flag error fallthrough. + // Go pattern: err check with return, should not flag error fallthrough. let src = br#" package main import "os/exec" @@ -820,6 +904,7 @@ fn taint_and_unguarded_sink_deduped() { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }]; let findings = parse_and_run_all_with_taint( @@ -949,7 +1034,7 @@ function readFile() { #[test] fn js_throw_terminates_block() { - // throw should act as a terminator — code directly after throw in the same + // throw should act as a terminator, code directly after throw in the same // block should be unreachable. 
let src = br#" function fail() { @@ -1031,7 +1116,7 @@ fn configured_terminator_stops_flow() { "eval should be unreachable after process.exit terminator" ); } - // If eval_nodes is empty it means the node wasn't created (also acceptable — + // If eval_nodes is empty it means the node wasn't created (also acceptable , // it's after a terminator so the CFG may not even emit it) } @@ -1480,7 +1565,7 @@ void process() { let reachable = dominators::reachable_set(cfg, entry); - // All nodes should be reachable — the preproc recovery should prevent + // All nodes should be reachable, the preproc recovery should prevent // the dangling-else from orphaning downstream code. let unreachable_count = cfg.node_count() - reachable.len(); assert!( @@ -1515,7 +1600,7 @@ void process() { let reachable = dominators::reachable_set(cfg, entry); - // All nodes should be reachable — break exits the loop and post-loop + // All nodes should be reachable, break exits the loop and post-loop // code (free(x)) should be connected. let unreachable_count = cfg.node_count() - reachable.len(); assert!( @@ -1878,7 +1963,7 @@ def run(): #[test] fn python_one_hop_constant_still_suppressed() { - // cmd = "ls"; os.system(cmd) — `all_args_literal` is false (identifier arg), + // cmd = "ls"; os.system(cmd), `all_args_literal` is false (identifier arg), // but should still be suppressed via existing one-hop constant trace in cfg_analysis. 
let src = br#" import os @@ -1959,7 +2044,7 @@ def run(): #[test] fn python_constant_receiver_tainted_arg_produces_finding() { - // safe_obj.system(user_input) — constant receiver is irrelevant, tainted arg must report + // safe_obj.system(user_input), constant receiver is irrelevant, tainted arg must report let src = br#" import os import sys diff --git a/src/cfg_analysis/unreachable.rs b/src/cfg_analysis/unreachable.rs index 30e85215..7968f095 100644 --- a/src/cfg_analysis/unreachable.rs +++ b/src/cfg_analysis/unreachable.rs @@ -26,7 +26,7 @@ fn event_handler_callbacks(ctx: &AnalysisContext) -> HashSet { .iter() .any(|h| callee_lower.ends_with(&h.to_ascii_lowercase())); if is_handler { - // The callback function is typically used within the call — any function + // The callback function is typically used within the call, any function // that appears as `uses` of this call node is a potential callback. for u in &info.taint.uses { callbacks.insert(u.clone()); @@ -113,7 +113,7 @@ impl CfgAnalysis for UnreachableCode { Severity::Medium, ) } else { - // Plain unreachable code — low severity + // Plain unreachable code, low severity continue; } }; diff --git a/src/commands/config.rs b/src/commands/config.rs index 4f615375..b97ea940 100644 --- a/src/commands/config.rs +++ b/src/commands/config.rs @@ -57,7 +57,7 @@ fn print_toml_with_highlights(toml_str: &str) { continue; } // key = value lines (but not `[xxx]`). Split on the first `=` - // that isn't inside a quoted string — TOML keys don't contain + // that isn't inside a quoted string, TOML keys don't contain // `=` outside quotes, so a leading-segment split is safe enough // for the common case. Continuation lines from multi-line // arrays/strings won't have `=` and fall through to plain. @@ -149,7 +149,7 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option { - // Key absent in defaults — keep entirely. + // Key absent in defaults, keep entirely. 
out.insert(k.clone(), v.clone()); } } @@ -160,9 +160,9 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option None, - // Differing leaf or shape change — keep the effective value. + // Differing leaf or shape change, keep the effective value. _ => Some(effective.clone()), } } @@ -180,13 +180,13 @@ fn count_top_level_keys(toml_str: &str) -> usize { continue; } if trimmed.starts_with('[') { - // Section header — not an override on its own. Reset + // Section header, not an override on its own. Reset // any stuck multi-line state defensively. in_multiline = false; continue; } if in_multiline { - // Inside a multi-line array/inline table — closing bracket + // Inside a multi-line array/inline table, closing bracket // ends it, intermediate lines don't count. if trimmed.starts_with(']') || trimmed.starts_with('}') { in_multiline = false; diff --git a/src/commands/index.rs b/src/commands/index.rs index 72999b49..fa92e5d3 100644 --- a/src/commands/index.rs +++ b/src/commands/index.rs @@ -123,7 +123,7 @@ pub fn build_index_with_observer( logs: Option<&Arc>, ) -> NyxResult<()> { // Pass 1 of the indexed scan reads persisted summaries produced here, so - // framework context must be populated at index-build time — otherwise + // framework context must be populated at index-build time, otherwise // framework-conditional label rules never contribute to the summaries // and indexed scans diverge from non-indexed ones. Matches the // auto-fill in scan_filesystem_with_observer / @@ -152,7 +152,7 @@ pub fn build_index_with_observer( let walk_start = std::time::Instant::now(); let (rx, handle) = spawn_file_walker(project_path, config); - // Drain the channel BEFORE joining — the bounded channel will deadlock + // Drain the channel BEFORE joining, the bounded channel will deadlock // if we join first and the walker blocks on send. 
let paths: Vec = rx.into_iter().flatten().collect(); if let Err(err) = handle.join() { @@ -205,7 +205,7 @@ pub fn build_index_with_observer( .try_for_each(|path| -> NyxResult<()> { let mut idx = Indexer::from_pool(project_name, &pool)?; - // Read once, hash once — pass bytes to both rule execution and + // Read once, hash once, pass bytes to both rule execution and // summary extraction. Use pre-computed hash for upsert to avoid // a redundant file read inside upsert_file. let bytes = std::fs::read(&path)?; diff --git a/src/commands/mod.rs b/src/commands/mod.rs index 20810a63..cfc826e9 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -21,7 +21,7 @@ pub fn handle_command( // Resolve engine options once for the whole process. Scan overlays CLI // flags below; other subcommands use the config values verbatim. The // install is a no-op after the first call, so Scan's overlay must happen - // before we reach this point for its own call path — we delay the install + // before we reach this point for its own call path, we delay the install // to the Scan arm and gate non-scan commands behind a fallback install of // the bare config values. let install_from_config = |config: &Config| { @@ -378,7 +378,7 @@ fn print_engine_explanation(config: &Config, engine_profile: Option String { diff --git a/src/commands/scan.rs b/src/commands/scan.rs index 4006ce08..470f8f31 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -54,7 +54,7 @@ fn record_persist_error(errors: &Arc>>, message: String) { /// When `enabled` is true, a panic inside `f` is caught, logged, and /// converted into a `NyxError::Msg`; callers that already match on /// `Err(_)` will gracefully skip the file. When `enabled` is false, -/// the panic propagates unchanged — preserving the default behaviour +/// the panic propagates unchanged, preserving the default behaviour /// for users who want to catch engine bugs loudly. 
/// /// `AssertUnwindSafe` is load-bearing: closures over `&Config` / @@ -222,7 +222,7 @@ fn is_false(b: &bool) -> bool { /// Framework detection drives framework-conditional label rules (e.g. actix / /// axum / rocket handler-arg sources, Rails route helpers) and auth-analysis /// extractors. If any scan entry point forgets to populate it, the indexed -/// and non-indexed paths silently diverge — missing framework-specific +/// and non-indexed paths silently diverge, missing framework-specific /// findings in whichever path skipped detection. This helper exists so the /// auto-fill stays consistent across `scan_filesystem_with_observer`, /// `scan_with_index_parallel_observer`, and `build_index_with_observer`. @@ -239,7 +239,7 @@ pub(crate) fn ensure_framework_ctx(root: &Path, cfg: &Config) -> Option /// /// Drives the one-time `preview-tier scan` banner in `handle()`. Tracks /// the extensions `lang_for_path` in `ast.rs` maps to the `"c"` and `"cpp"` -/// slugs — keep this aligned with that mapping. +/// slugs, keep this aligned with that mapping. pub(crate) fn is_preview_tier_path(path: &Path) -> bool { matches!( path.extension() @@ -514,14 +514,14 @@ pub fn retain_converged_findings(diags: &mut Vec) { /// the same function; tiebreak by source line asc, source col asc). /// /// Rule IDs of the form `taint-unsanitised-flow (source L:C)` share a single -/// base `taint-unsanitised-flow`. The grouping key is column-agnostic — +/// base `taint-unsanitised-flow`. The grouping key is column-agnostic , /// multiple flows to the same sink line differing only in column or source /// are collapsed to one. The rule_id preserves the source location, so the /// kept representative still identifies which flow was reported. /// /// The grouping key **includes the resolved sink capability bits** so that /// two different sinks on the same line (e.g. 
`sink_sql(x); sink_shell(x);`) -/// are not collapsed into one finding — they represent materially different +/// are not collapsed into one finding, they represent materially different /// vulnerabilities and must surface independently. Findings with different /// base rule IDs (e.g. `js.code_exec.eval`) or different severities are /// left untouched per guardrails. @@ -560,7 +560,7 @@ pub(crate) fn deduplicate_taint_flows(diags: &mut Vec) { let src_col = src.map(|s| s.col).unwrap_or(u32::MAX); // Same-function check: first flow_step (Source) and the step at the // sink share an `enclosing_func`. If flow_steps are absent or the - // function markers are missing, treat as "unknown" — worse than a + // function markers are missing, treat as "unknown", worse than a // confirmed same-function match but better than a confirmed mismatch. let same_function_flag: u32 = ev .and_then(|e| { @@ -677,7 +677,7 @@ pub const SCC_UNCONVERGED_CROSS_FILE_NOTE_PREFIX: &str = "scc_unconverged:cross- /// [`GlobalSummaries::snapshot_caps`] results. /// /// Used by the Phase-B worklist to derive the next iteration's dirty -/// file set. Semantics match [`diff_cap_snapshots`] — a key that +/// file set. Semantics match [`diff_cap_snapshots`], a key that /// appears or disappears counts as changed. fn changed_cap_keys_of( before: &HashMap)>, @@ -728,7 +728,7 @@ fn changed_ssa_keys_of( /// /// Called once per unconverged batch (after the pass-2 rayon parallelism /// has collected `iteration_diags`) so the cost is O(n) over the batch's -/// findings — much cheaper than a per-finding `warn!`. +/// findings, much cheaper than a per-finding `warn!`. /// /// Confidence is **capped** at `Low` rather than unconditionally set: /// upstream analysis may have proven something particularly strong about @@ -795,7 +795,7 @@ fn tag_unconverged_findings( /// Safety cap on SCC fixed-point iterations. 
/// -/// The convergence predicate is *snapshot equality* — we break as soon as +/// The convergence predicate is *snapshot equality*, we break as soon as /// an iteration leaves both `snapshot_caps()` and `snapshot_ssa()` /// unchanged. The cap only triggers if something prevents monotone /// progress (e.g. a non-monotone SSA summary refinement or an SCC larger @@ -809,7 +809,7 @@ fn tag_unconverged_findings( /// SCC with `k` functions arranged in a chain, fresh taint introduced at /// one end of the chain needs up to `k` iterations to reach the other /// end. A hard cap of 3 was silently truncating propagation for any -/// SCC of 4+ cross-file functions — findings vanished with no warning. +/// SCC of 4+ cross-file functions, findings vanished with no warning. /// /// `FuncSummary` is a finite-height lattice (≤ 48 bits of caps + a /// bounded vector of parameter indices) and `insert()` is strictly @@ -865,7 +865,7 @@ fn effective_scc_cap() -> usize { /// persisted by non-recursive topo batches in the most recent /// [`run_topo_batches`] invocation. Intended for the regression tests /// that prove the topo-refinement pipeline is wired and producing -/// observable cross-batch state — see +/// observable cross-batch state, see /// `tests/topo_pass2_refinement_tests.rs`. Cheap relaxed load. static LAST_TOPO_NONRECURSIVE_REFINEMENTS: AtomicUsize = AtomicUsize::new(0); @@ -905,7 +905,7 @@ fn topo_refine_enabled() -> bool { /// /// When `call_graph` is missing an edge (e.g. a summary was inserted /// after graph construction), we conservatively fall back to -/// re-analysing the full batch — correctness is preserved at the cost +/// re-analysing the full batch, correctness is preserved at the cost /// of the worklist optimisation for that iteration. 
#[allow(clippy::too_many_arguments)] fn run_topo_batches( @@ -1104,7 +1104,7 @@ fn run_topo_batches( // A file becomes dirty for iteration N+1 iff it // contains at least one caller of a FuncKey that // changed in iteration N. If no key changed, the - // dirty set is empty — which implies convergence (and + // dirty set is empty, which implies convergence (and // matches `iter_converged` above). let changed_cap_keys = changed_cap_keys_of(&snap_before, &snap_after); let changed_ssa_keys = @@ -1124,7 +1124,7 @@ fn run_topo_batches( // changed key. Fall back to the full batch when the // call graph does not resolve any caller (e.g. all // changes happened in leaf functions that no one in - // this batch calls — rare but must not regress to + // this batch calls, rare but must not regress to // missed analysis). let namespaces_needing_reanalysis = crate::callgraph::namespaces_for_callers(call_graph, &all_changed_keys); @@ -1165,7 +1165,7 @@ fn run_topo_batches( } if iter_converged { // Snapshots equal but dirty_files non-empty is - // anomalous — log and treat as converged + // anomalous, log and treat as converged // (snapshot equality is the correctness-preserving // signal). tracing::debug!( @@ -1182,7 +1182,7 @@ fn run_topo_batches( // After the loop, flatten per-file diags into the // iteration_diags vector in batch order for deterministic // output. Files that were in the batch but never made - // dirty (shouldn't happen — iter 0 runs all of them) are + // dirty (shouldn't happen, iter 0 runs all of them) are // skipped silently. let mut iteration_diags: Vec = Vec::new(); for p in &batch.files { @@ -1268,7 +1268,7 @@ fn run_topo_batches( // parallel section completes, persist those refinements into // `global_summaries` sequentially. 
Subsequent batches in // topo order (caller-most batches) then resolve their call - // sites against the refined cross-file context — the final + // sites against the refined cross-file context, the final // step in the callee-first topo pipeline that pass-2 // sequencing was always meant to deliver. // @@ -1455,7 +1455,7 @@ fn run_topo_batches( } } - // Orphan files (no functions in call graph) — process last, single pass. + // Orphan files (no functions in call graph), process last, single pass. if !orphans.is_empty() { let orphan_diags: Vec = orphans .par_iter() @@ -2099,7 +2099,7 @@ pub fn scan_with_index_parallel_observer( if let Some(p) = &progress_ref { p.set_current_file(&path.to_string_lossy()); } - // Read once, hash once — use the hash for the change check + // Read once, hash once, use the hash for the change check // to avoid a second file read inside should_scan. if let Ok(bytes) = std::fs::read(path) { let hash = Indexer::digest_bytes(&bytes); @@ -2681,7 +2681,7 @@ pub fn scan_with_index_parallel_observer( // pipeline intends to produce (taint + cfg-* + state-* from state // analysis + auth.* when configured). A previous revision clipped this // to `taint*`/`cfg-*` only, silently dropping state-model findings and - // breaking parity with `scan_filesystem` — fixed. Mode-scoped + // breaking parity with `scan_filesystem`, fixed. Mode-scoped // filtering, if ever needed, belongs in the analysis layer, not here. let post_process_start = std::time::Instant::now(); @@ -3134,7 +3134,7 @@ mod dedup_taint_flow_tests { #[test] fn dedup_collapses_same_line_different_columns() { - // Two findings at line 10 but different columns — the widened key + // Two findings at line 10 but different columns, the widened key // (path, line, severity) collapses them; the tighter source wins. 
let mut diags = vec![ make_taint("a.rs", 10, 3, 4, 1), @@ -3151,7 +3151,7 @@ mod dedup_taint_flow_tests { #[test] fn dedup_does_not_drop_different_sink_caps_on_same_line() { - // Two findings at line 10, same column, same severity — but with + // Two findings at line 10, same column, same severity, but with // different resolved sink capability bits (SQL vs SHELL). They must // NOT collapse: different sink kinds are materially different // vulnerabilities. Regression guard. @@ -3175,7 +3175,7 @@ mod dedup_taint_flow_tests { #[test] fn dedup_collapses_same_sink_caps_on_same_line() { - // Same line, same severity, same sink caps — this is the canonical + // Same line, same severity, same sink caps, this is the canonical // dedup case (two flows to the same sink, differing only in source). let mut diags = vec![ make_taint("a.rs", 10, 5, 3, 1), diff --git a/src/commands/serve.rs b/src/commands/serve.rs index 2b404454..e5117a03 100644 --- a/src/commands/serve.rs +++ b/src/commands/serve.rs @@ -88,7 +88,7 @@ pub fn handle( // Invalidate the findings cache whenever a scan finishes so the next // request rebuilds against fresh diags. The next-request rebuild keeps - // this hot-path simple — we only clear the slot here, never recompute. + // this hot-path simple, we only clear the slot here, never recompute. let cache_for_invalidate = Arc::clone(&state.findings_cache); let mut event_rx = event_tx.subscribe(); tokio::spawn(async move { @@ -152,7 +152,7 @@ async fn shutdown_signal() { .expect("failed to listen for Ctrl+C"); eprintln!("\n Shutting down..."); // SSE connections block graceful shutdown indefinitely. - // Use a raw OS thread to force exit — tokio tasks may not + // Use a raw OS thread to force exit, tokio tasks may not // run reliably during shutdown. 
std::thread::spawn(|| { std::thread::sleep(std::time::Duration::from_millis(250)); diff --git a/src/constraint/domain.rs b/src/constraint/domain.rs index 363b0c5e..342c3897 100644 --- a/src/constraint/domain.rs +++ b/src/constraint/domain.rs @@ -106,7 +106,7 @@ impl ConstValue { if let Ok(i) = t.parse::() { return Some(ConstValue::Int(i)); } - // Negative with space: "- 5" — not supported, conservative + // Negative with space: "- 5", not supported, conservative None } } @@ -118,9 +118,9 @@ impl ConstValue { pub struct TypeSet(u16); impl TypeSet { - /// All 12 type bits set — no type constraint (Top). + /// All 12 type bits set, no type constraint (Top). pub const TOP: Self = Self(0x0FFF); - /// No type bits — unsatisfiable (Bottom). + /// No type bits, unsatisfiable (Bottom). pub const BOTTOM: Self = Self(0); pub fn singleton(kind: &TypeKind) -> Self { @@ -149,7 +149,7 @@ impl TypeSet { self == Self::TOP } - /// Complement — all types NOT in this set. + /// Complement, all types NOT in this set. pub fn complement(self) -> Self { Self(!self.0 & Self::TOP.0) } @@ -184,7 +184,7 @@ fn type_kind_index(kind: &TypeKind) -> u32 { TypeKind::Url => 10, TypeKind::HttpClient => 11, TypeKind::LocalCollection => 12, - // Phase 6 DTO types carry per-field structural info that the + // the analysis DTO types carry per-field structural info that the // bitset domain can't represent. Collapse to Unknown so callers // still see "any type possible" rather than crashing on an // unhandled variant. Same-file/cross-file Dto-aware paths read @@ -274,7 +274,7 @@ impl Nullability { /// Boolean state lattice. /// -/// Same shape as [`Nullability`]. No `negate()` — negation is structural +/// Same shape as [`Nullability`]. No `negate()`, negation is structural /// on [`ConditionExpr`](super::lower::ConditionExpr). #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum BoolState { @@ -313,7 +313,7 @@ impl BoolState { /// Abstract fact about a single SSA value. 
/// /// Combines interval, constant, type, null, and boolean constraints. -/// There is intentionally no generic `negate()` on ValueFact — negation +/// There is intentionally no generic `negate()` on ValueFact, negation /// is structural on [`ConditionExpr`](super::lower::ConditionExpr) and /// then applied as atomic refinements by the solver. #[derive(Clone, Debug, PartialEq, Eq)] @@ -857,14 +857,14 @@ impl PathEnv { // `assume_neq`, and a few internal sites. Large generated inputs // (thousands of short statements on one line) can drive millions // of calls and overflow a plain u16 `refine_count`. Saturate to - // stay within bounds — the refinement pipeline is already + // stay within bounds, the refinement pipeline is already // idempotent past the cap, so saturation is semantically a no-op. self.refine_count = self.refine_count.saturating_add(1); // Check size bound let pos = self.facts.binary_search_by_key(&v, |(k, _)| *k); if pos.is_err() && self.facts.len() >= MAX_PATH_ENV_ENTRIES { - return; // bounded — don't grow + return; // bounded, don't grow } // Get meet count for widening @@ -963,7 +963,7 @@ impl PathEnv { let ra = self.uf.find_immutable(a); let rb = self.uf.find_immutable(b); if ra == rb { - // Already known equal — contradiction + // Already known equal, contradiction self.unsat = true; return; } @@ -1040,7 +1040,7 @@ impl PathEnv { return; } - // Step 4: dedup check — if this exact constraint already exists, skip + // Step 4: dedup check, if this exact constraint already exists, skip let already_present = self .relational .iter() @@ -1052,7 +1052,7 @@ impl PathEnv { if self.relational.len() < MAX_RELATIONAL { self.relational.push((ra, op, rb)); } - // If at capacity, skip — conservative: losing a constraint only + // If at capacity, skip, conservative: losing a constraint only // loses pruning power, never introduces unsoundness. } @@ -1089,7 +1089,7 @@ impl PathEnv { if has_strict || op == RelOp::Lt { return true; } - // All Le: a <= b <= ... 
<= a means all equal — satisfiable + // All Le: a <= b <= ... <= a means all equal, satisfiable return false; } // Continue walking (take first outgoing edge) @@ -1181,11 +1181,11 @@ impl PathEnv { while i < self.facts.len() && j < other.facts.len() { match self.facts[i].0.cmp(&other.facts[j].0) { std::cmp::Ordering::Less => { - // Only in self — drop (absent on other side = Top) + // Only in self, drop (absent on other side = Top) i += 1; } std::cmp::Ordering::Greater => { - // Only in other — drop + // Only in other, drop j += 1; } std::cmp::Ordering::Equal => { diff --git a/src/constraint/lower.rs b/src/constraint/lower.rs index c5bd9bf2..2115deb6 100644 --- a/src/constraint/lower.rs +++ b/src/constraint/lower.rs @@ -8,10 +8,10 @@ //! 1. **Structural:** `condition_negated` (AST-level boolean) //! 2. **Structural:** `condition_vars` (AST-extracted identifiers) //! 3. **Structural:** compound decomposition (already handled by -//! `build_condition_chain` — each leaf is a separate Block/Branch) -//! 4. **Structural:** `value_defs` — resolve var names to [`SsaValue`]s -//! 5. **Structural:** `const_values` — augment with known constants -//! 6. **Text fallback:** `condition_text` — parse comparison operator and +//! `build_condition_chain`, each leaf is a separate Block/Branch) +//! 4. **Structural:** `value_defs`, resolve var names to [`SsaValue`]s +//! 5. **Structural:** `const_values`, augment with known constants +//! 6. **Text fallback:** `condition_text`, parse comparison operator and //! literal operand. Necessary because individual comparisons are NOT //! decomposed into separate SSA operations (condition nodes → `Nop`). @@ -82,7 +82,7 @@ impl CompOp { /// Structured condition expression with SSA-resolved operands. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum ConditionExpr { - /// `lhs op rhs` — e.g., `x > 5`, `x == y`. + /// `lhs op rhs`, e.g., `x > 5`, `x == y`. 
Comparison { lhs: Operand, op: CompOp, @@ -98,7 +98,7 @@ pub enum ConditionExpr { }, /// Boolean truthiness test: `if (x)`. BoolTest { var: SsaValue }, - /// Could not parse or resolve — conservatively no refinement. + /// Could not parse or resolve, conservatively no refinement. Unknown, } @@ -240,7 +240,7 @@ pub fn lower_condition_with_stacks( .map(|(name, val)| (name.as_str(), *val)) .collect(); - // No const_values at lowering time — empty lookup + // No const_values at lowering time, empty lookup let const_lookup: HashMap = HashMap::new(); let lower = text.to_ascii_lowercase(); diff --git a/src/constraint/solver.rs b/src/constraint/solver.rs index 5744dec3..c8573a19 100644 --- a/src/constraint/solver.rs +++ b/src/constraint/solver.rs @@ -1,6 +1,6 @@ //! Constraint solver: apply conditions to [`PathEnv`] and check satisfiability. //! -//! The solver operates on structured [`ConditionExpr`] values — never on raw +//! The solver operates on structured [`ConditionExpr`] values, never on raw //! text. Negation is always structural (via [`ConditionExpr::negate`] / //! [`CompOp::negate`]), not via a generic "negate ValueFact" operation. @@ -13,7 +13,7 @@ use super::lower::{CompOp, ConditionExpr, Operand}; /// for the branch where the condition has the given polarity. /// /// `polarity = true`: condition holds (true branch). -/// `polarity = false`: condition does NOT hold (false branch) — negate +/// `polarity = false`: condition does NOT hold (false branch), negate /// the condition structurally, then apply. 
pub fn refine_env(env: &PathEnv, cond: &ConditionExpr, polarity: bool) -> PathEnv { if env.is_unsat() { @@ -97,7 +97,7 @@ fn apply_condition(env: &mut PathEnv, cond: &ConditionExpr) { } ConditionExpr::Unknown => { - // No information — no refinement + // No information, no refinement } } } @@ -232,7 +232,7 @@ pub fn class_name_to_type_kind(name: &str) -> Option { "Boolean" => Some(TypeKind::Bool), "List" | "ArrayList" | "Collection" | "Set" | "HashSet" => Some(TypeKind::Array), "URL" | "URI" => Some(TypeKind::Url), - // Framework HTTP clients — also listed in JAVA_HIERARCHY (type_facts.rs) + // Framework HTTP clients, also listed in JAVA_HIERARCHY (type_facts.rs) // for subtype resolution. Both locations needed: this function is called // directly by the constraint solver, while the hierarchy provides // is_subtype_of() for instanceof checks. diff --git a/src/constraint/tests.rs b/src/constraint/tests.rs index 69ec9aa1..bd4381f1 100644 --- a/src/constraint/tests.rs +++ b/src/constraint/tests.rs @@ -156,7 +156,7 @@ fn valuefact_widen_stable_bound() { b.lo = Some(0); b.lo_strict = true; let w = a.widen(&b); - assert_eq!(w.lo, Some(0)); // stable — preserved + assert_eq!(w.lo, Some(0)); // stable, preserved assert!(w.lo_strict); } @@ -357,7 +357,7 @@ fn pathenv_max_refine_per_block() { let v = SsaValue(0); // Reset counter env.reset_refine_count(); - // Refine many times — should stop after MAX_REFINE_PER_BLOCK + // Refine many times, should stop after MAX_REFINE_PER_BLOCK for _ in 0..(MAX_REFINE_PER_BLOCK + 50) { let mut f = ValueFact::top(); f.null = Nullability::NonNull; diff --git a/src/convergence_telemetry.rs b/src/convergence_telemetry.rs index f9952c60..6a6d46a7 100644 --- a/src/convergence_telemetry.rs +++ b/src/convergence_telemetry.rs @@ -1,69 +1,20 @@ //! Convergence-loop telemetry: per-batch and per-file JSONL sidecar. //! -//! Records how many iterations each fix-point loop (cross-file SCC; -//! 
JS/TS in-file pass-2) actually used on real inputs, plus the -//! per-iteration change-set size trajectory, so we can tune caps on -//! evidence rather than by guess. -//! -//! # Why this module exists -//! -//! The SCC fix-point safety cap ([`crate::commands::scan::SCC_FIXPOINT_SAFETY_CAP`]) -//! and the JS/TS pass-2 cap ([`crate::taint::JS_TS_PASS2_SAFETY_CAP`]) -//! are both 64 iterations — chosen as "generous for every realistic -//! input we've seen". Neither value is backed by telemetry from a -//! production corpus (React, VSCode, Webpack, enterprise -//! monorepos). Without that data we cannot: -//! -//! * tell how often the cap actually fires under real workloads, -//! * distinguish tuneable-budget problems from non-monotonicity -//! regressions (Phase-D classifier addresses this on cap-hit, but -//! tells us nothing about the near-cap distribution), -//! * decide whether further Phase-B worklist optimisation is needed. -//! -//! The telemetry emitted here is consumed by offline analysis tools -//! (`tools/convergence_report.py`, not tracked here) that compute -//! P50/P95/P99 iteration counts per corpus. -//! -//! # Lifecycle -//! -//! Telemetry is **opt-in** via `NYX_CONVERGENCE_TELEMETRY=1` — production -//! scans are unaffected by default. When enabled: -//! -//! * [`is_enabled`] returns true. -//! * The SCC loop and JS/TS pass-2 loop each call [`record`] when -//! they terminate (early-convergence or cap-hit). -//! * On scan shutdown, the collected records are written to a JSONL -//! file alongside the SARIF output (or to the path specified by -//! `NYX_CONVERGENCE_TELEMETRY_PATH`). -//! -//! Records never touch the critical path — [`record`] is a cheap -//! push onto a `Mutex>` and the write happens once at scan end. -//! -//! # Schema stability -//! -//! Records serialize as JSONL (one JSON object per line, newline -//! separated). The `kind` tag is snake_case and stable; adding new -//! 
fields is backwards-compatible because unknown fields are ignored -//! by downstream tooling. Removing fields, or changing existing -//! fields' types, is a **breaking change** — bump the schema version -//! in [`SCHEMA_VERSION`] if you must. +//! Opt-in via `NYX_CONVERGENCE_TELEMETRY=1`. Records iteration counts +//! and change-set trajectories for the cross-file SCC and JS/TS +//! pass-2 fix-point loops so caps can be tuned from evidence. Output +//! goes to `NYX_CONVERGENCE_TELEMETRY_PATH` or a SARIF-adjacent file. use serde::{Deserialize, Serialize}; use smallvec::SmallVec; use std::sync::{Mutex, OnceLock}; -/// Stable schema version for the JSONL records emitted by this module. -/// -/// Bump when the record shape changes in a way that breaks downstream -/// consumers (field removed, type changed). Adding optional fields is -/// backwards-compatible and does not require a bump. +/// JSONL schema version. Bump on breaking shape changes; optional +/// fields don't require a bump. pub const SCHEMA_VERSION: u32 = 1; -/// One convergence event: either a cross-file SCC batch or a JS/TS -/// in-file pass-2 run. The `kind` discriminator selects between them. -/// -/// Serialized as JSON with `kind` as a snake_case tag so downstream -/// tooling can pattern-match without depending on Rust enum layout. +/// One convergence event, either a cross-file SCC batch or a JS/TS +/// in-file pass-2 run. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] pub enum ConvergenceEvent { @@ -98,7 +49,7 @@ pub struct SccBatchRecord { /// True iff the batch reached the fixed point before the cap /// fired. pub converged: bool, - /// Per-iteration change-set size — the same trajectory the + /// Per-iteration change-set size, the same trajectory the /// [`crate::engine_notes::CapHitReason`] classifier consumes. Empty /// when the loop terminated on iteration 0 (pathological case). 
pub trajectory: SmallVec<[u32; 4]>, @@ -130,20 +81,10 @@ pub struct InFilePass2Record { pub trajectory: SmallVec<[u32; 4]>, } -/// Global collector for convergence events recorded during a scan. -/// -/// Stored behind a `OnceLock>>` so multiple rayon workers -/// can record events concurrently without a startup cost when -/// telemetry is disabled. The mutex contention is negligible because -/// each scan produces O(batches + JS/TS files) events, not per-task -/// events. static COLLECTOR: OnceLock>> = OnceLock::new(); -/// Returns true when telemetry collection is active for this process. -/// -/// Controlled by the `NYX_CONVERGENCE_TELEMETRY` env var: any value -/// except `"0"`, `"false"`, or empty enables it. Cached on first -/// read so the env lookup is paid once per process. +/// True when `NYX_CONVERGENCE_TELEMETRY` is set to anything other than +/// `"0"`, `"false"`, or empty. Cached. pub fn is_enabled() -> bool { static ENABLED: OnceLock = OnceLock::new(); *ENABLED.get_or_init(|| match std::env::var("NYX_CONVERGENCE_TELEMETRY") { @@ -152,11 +93,7 @@ pub fn is_enabled() -> bool { }) } -/// Record a convergence event. No-op when telemetry is disabled. -/// -/// Safe to call from parallel rayon contexts — the underlying mutex -/// is reentrant-safe and the push is O(1). Events are retained in -/// memory until [`drain`] is called at scan end. +/// Record a convergence event. No-op when telemetry is disabled. pub fn record(event: ConvergenceEvent) { if !is_enabled() { return; @@ -167,9 +104,7 @@ pub fn record(event: ConvergenceEvent) { } } -/// Drain and return all recorded events. Leaves the collector empty -/// so subsequent scans in the same process (e.g. integration tests) -/// do not see stale events. +/// Drain all recorded events. 
pub fn drain() -> Vec { let Some(lock) = COLLECTOR.get() else { return Vec::new(); @@ -207,7 +142,7 @@ pub fn write_jsonl(path: &std::path::Path) -> std::io::Result { /// Canonical sidecar path: uses `NYX_CONVERGENCE_TELEMETRY_PATH` if /// set, otherwise derives from the current working directory. /// -/// The `_derive_from_root` hint is the scan root — when no explicit +/// The `_derive_from_root` hint is the scan root, when no explicit /// path is configured we place the sidecar next to it as /// `nyx-convergence.jsonl` so the file lands alongside the SARIF /// output by default. @@ -230,7 +165,7 @@ mod tests { static COLLECTOR_TEST_GUARD: Mutex<()> = Mutex::new(()); /// Clear the global collector so each test starts with a known - /// state. Does **not** force `is_enabled()` true — the unit + /// state. Does **not** force `is_enabled()` true, the unit /// tests below bypass `record()` (which is a no-op unless /// env-enabled) by pushing directly into the collector. fn reset_and_enable_telemetry() { diff --git a/src/database.rs b/src/database.rs index e6c854c1..4addac12 100644 --- a/src/database.rs +++ b/src/database.rs @@ -202,16 +202,16 @@ pub mod index { /// /// Bumped independently of `ENGINE_VERSION` whenever the serialized /// layout or identity of a cached artefact changes in an incompatible - /// way — e.g. a `FuncKey` field semantic change that would cause old + /// way, e.g. a `FuncKey` field semantic change that would cause old /// summaries to misbehave when rehydrated. /// /// History: - /// * `"1"` — initial. - /// * `"2"` — 0.5.0: `FuncKey.disambig` changed from the function-node + /// * `"1"`, initial. + /// * `"2"`, 0.5.0: `FuncKey.disambig` changed from the function-node /// byte offset to a depth-first structural index. Pre-0.5.0 caches /// store byte-offset disambigs and would fail to match bodies built /// by the new engine, so they are silently rebuilt on open. 
- /// * `"3"` — `ssa_function_bodies.body` changed from JSON TEXT to + /// * `"3"`, `ssa_function_bodies.body` changed from JSON TEXT to /// bincode BLOB. Old JSON payloads cannot be deserialised by the /// new engine, so they are silently rebuilt on open. pub const SCHEMA_VERSION: &str = "3"; @@ -432,7 +432,7 @@ pub mod index { match stored { Some(ref v) if v == current => { - // Schema version matches — nothing to do. + // Schema version matches, nothing to do. } _ => { let old = stored.as_deref().unwrap_or(""); @@ -475,7 +475,7 @@ pub mod index { match stored { Some(ref v) if v == current => { - // Version matches — nothing to do. + // Version matches, nothing to do. } _ => { let old = stored.as_deref().unwrap_or(""); @@ -601,10 +601,10 @@ pub mod index { Ok(match row { Some((stored_hash, stored_mtime)) => { if stored_mtime != mtime { - // mtime changed — must re-scan + // mtime changed, must re-scan true } else { - // mtime matches — compare hash only if cheap + // mtime matches, compare hash only if cheap // (the caller already read the file and can use // should_scan_with_hash instead for full accuracy) let digest = Self::digest_file(path)?; @@ -811,7 +811,7 @@ pub mod index { /// Atomically replace all SSA function summaries for a single file. /// /// The input tuple is - /// `(name, arity, lang, namespace, container, disambig, kind, summary)` — + /// `(name, arity, lang, namespace, container, disambig, kind, summary)` , /// matching the fields required to reconstruct a full [`crate::symbol::FuncKey`] /// on load. pub fn replace_ssa_summaries_for_file( @@ -1040,7 +1040,7 @@ pub mod index { /// Load symbol metadata (name, arity, lang, namespace, container, kind) /// for a single file. /// - /// Lighter than `load_all_ssa_summaries` — skips JSON deserialization of + /// Lighter than `load_all_ssa_summaries`, skips JSON deserialization of /// the full summary body and filters by file_path in the query. 
`kind` /// is the [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`, /// `"closure"`, ...) so consumers can distinguish anonymous functions @@ -1074,7 +1074,7 @@ pub mod index { /// /// Persists cross-file callee bodies for interprocedural symex. /// Bodies are serialized as MessagePack (rmp-serde, named-field - /// encoding) BLOBs — JSON proved too costly at indexing time on + /// encoding) BLOBs, JSON proved too costly at indexing time on /// large SSA structures, and bincode's positional format trips /// over the `#[serde(skip_serializing_if = ...)]` attributes /// scattered through `OptimizeResult` and friends. @@ -1260,7 +1260,7 @@ pub mod index { /// /// Mirrors [`Self::replace_ssa_summaries_for_file`]. Each input tuple /// is `(name, arity, lang, namespace, container, disambig, kind, summary)` - /// — the full identity needed to reconstruct the callee's + ///, the full identity needed to reconstruct the callee's /// [`crate::symbol::FuncKey`] on load. pub fn replace_auth_summaries_for_file( &mut self, @@ -1326,7 +1326,7 @@ pub mod index { /// [`Self::replace_ssa_summaries_for_file`], /// [`Self::replace_ssa_bodies_for_file`] and /// [`Self::replace_auth_summaries_for_file`] in sequence, but - /// issues a single fsync at commit instead of four — the + /// issues a single fsync at commit instead of four, the /// dominant cost on large scans. /// /// Behaviour parity with the four-call sequence: @@ -1376,7 +1376,7 @@ pub mod index { let path_str = file_path.to_string_lossy(); let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64; - // function_summaries — always replace. + // function_summaries, always replace. tx.execute( "DELETE FROM function_summaries WHERE project = ?1 AND file_path = ?2", params![self.project, path_str], @@ -1408,7 +1408,7 @@ pub mod index { } } - // ssa_function_summaries — only touched when non-empty. + // ssa_function_summaries, only touched when non-empty. 
if !ssa_summaries.is_empty() { tx.execute( "DELETE FROM ssa_function_summaries @@ -1444,7 +1444,7 @@ pub mod index { } } - // ssa_function_bodies — only touched when non-empty. + // ssa_function_bodies, only touched when non-empty. if !ssa_bodies.is_empty() { tx.execute( "DELETE FROM ssa_function_bodies @@ -1478,7 +1478,7 @@ pub mod index { } } - // auth_check_summaries — always replace, even when empty, + // auth_check_summaries, always replace, even when empty, // so a helper that lost its ownership check no longer // leaks lifts into subsequent pass-2 runs. tx.execute( @@ -2203,7 +2203,7 @@ pub mod index { Ok(rows) } - /// Record the first time a finding fingerprint was observed. Idempotent — + /// Record the first time a finding fingerprint was observed. Idempotent , /// the earliest call wins via INSERT OR IGNORE. Used by the overview /// backlog-age computation; ts should be the originating scan's /// `started_at` (RFC-3339). @@ -2246,7 +2246,7 @@ pub mod index { if fingerprints.is_empty() { return Ok(std::collections::HashMap::new()); } - // SQLite IN-clause cap is high but parameter count is bounded — chunk + // SQLite IN-clause cap is high but parameter count is bounded, chunk // for safety with large fingerprint sets. let mut out = std::collections::HashMap::with_capacity(fingerprints.len()); let conn = self.c(); @@ -2590,7 +2590,7 @@ fn ssa_summaries_round_trip() { /// asserts that `return_path_facts` survive serialise → SQLite persist → /// load → deserialise. Regression guard for the per-return-path PathFact /// decomposition that closes the rs-safe-014 / tar-rs / rs-safe-016 FP -/// cluster — without this round-trip working, cross-file callers lose +/// cluster, without this round-trip working, cross-file callers lose /// the per-arm narrowing and inline-only callees regain the joined-fact /// dilution. 
#[test] @@ -2955,7 +2955,7 @@ fn ssa_bodies_replace_on_rescan() { assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1); assert_eq!(idx.load_all_ssa_bodies().unwrap()[0].8.ssa.blocks.len(), 2); - // Store v2 with 5 blocks — should replace, not accumulate + // Store v2 with 5 blocks, should replace, not accumulate let hash2 = index::Indexer::digest_bytes(b"v2"); let bodies2 = vec![( "func".to_string(), @@ -3053,7 +3053,7 @@ fn ssa_bodies_removed_on_file_delete() { idx.replace_ssa_bodies_for_file(&f, &hash, &bodies).unwrap(); assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1); - // Delete file — should also remove bodies + // Delete file, should also remove bodies idx.remove_file_and_related(&f).unwrap(); assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 0); } @@ -3215,7 +3215,7 @@ fn version_mismatch_triggers_reset() { 1 ); - // Reopen — version mismatch should trigger full wipe + // Reopen, version mismatch should trigger full wipe drop(pool); let pool2 = index::Indexer::init(&db).unwrap(); @@ -3286,7 +3286,7 @@ fn multiple_opens_no_repeated_resets() { populate_project(&pool, "proj", td.path()); drop(pool); - // Second open — should preserve data + // Second open, should preserve data let pool2 = index::Indexer::init(&db).unwrap(); assert_eq!( index::Indexer::count_rows(&pool2, "function_summaries", "proj").unwrap(), @@ -3297,7 +3297,7 @@ fn multiple_opens_no_repeated_resets() { populate_project(&pool2, "proj2", td.path()); drop(pool2); - // Third open — should still preserve both projects + // Third open, should still preserve both projects let pool3 = index::Indexer::init(&db).unwrap(); assert_eq!( index::Indexer::count_rows(&pool3, "function_summaries", "proj").unwrap(), @@ -3376,7 +3376,7 @@ fn missing_ssa_namespace_column_triggers_recreate() { .unwrap(); } - // Open via init — should detect missing namespace and recreate + // Open via init, should detect missing namespace and recreate let pool = index::Indexer::init(&db).unwrap(); // Verify the table 
now has the namespace column by inserting with it @@ -3405,12 +3405,12 @@ fn valid_schema_no_recreate() { let td = tempfile::tempdir().unwrap(); let db = td.path().join("nyx.sqlite"); - // First init — creates all tables + // First init, creates all tables let pool = index::Indexer::init(&db).unwrap(); populate_project(&pool, "proj", td.path()); drop(pool); - // Second init — schema is valid, should NOT drop/recreate + // Second init, schema is valid, should NOT drop/recreate let pool2 = index::Indexer::init(&db).unwrap(); // Data survives because schema was already correct assert_eq!( @@ -3735,7 +3735,7 @@ fn metadata_table_survives_clear() { assert_eq!(stored.as_deref(), Some(index::ENGINE_VERSION)); } -/// Pointer-Phase 5 / A3 audit: field_points_to round-trips through +/// field_points_to round-trips through /// the SsaFuncSummary SQLite blob. Pin that the new field_points_to /// records preserve param_field_reads, param_field_writes, the /// receiver sentinel (`u32::MAX`), the container-element marker @@ -3817,7 +3817,7 @@ fn ssa_summaries_round_trip_preserves_field_points_to() { } /// Pre-Phase-5 blob compatibility: a summary serialised without -/// `field_points_to` deserialises with the empty default — no +/// `field_points_to` deserialises with the empty default, no /// migration needed because the field is `#[serde(default)]`. #[test] fn ssa_summaries_pre_phase5_blob_decodes_with_empty_field_points_to() { diff --git a/src/engine_notes.rs b/src/engine_notes.rs index 9e01b6f0..ebf299ec 100644 --- a/src/engine_notes.rs +++ b/src/engine_notes.rs @@ -1,98 +1,43 @@ //! Provenance notes attached to findings when the engine has hit an //! internal budget, widening, or lowering cap. //! -//! The notes are surfaced through `Finding.engine_notes` (and -//! `Evidence.engine_notes` once the finding reaches the `Diag` layer) so -//! downstream consumers can tell "we found nothing" from "we stopped -//! looking". -//! -//! 
Each note carries a [`LossDirection`] classification that describes -//! *how* the engine deviated from a fully-converged analysis. The -//! direction drives two downstream behaviours: -//! -//! * [`crate::evidence::compute_confidence`] caps confidence at -//! `Medium` when any attached note has direction -//! [`LossDirection::OverReport`] or [`LossDirection::Bail`] (the -//! finding itself may be spurious). -//! * [`crate::rank`] applies a direction-aware `completeness` penalty -//! to the attack-surface score (see `rank.rs::completeness_penalty`). -//! -//! This replaces the earlier Phase-3 stance of "notes are purely -//! additive and never influence score". A release audit flagged that -//! users sorting thousands of findings by rank could not distinguish -//! converged analysis from capped analysis, which produced false -//! confidence in fragile findings. The direction-aware pipeline -//! preserves the observability goal while fixing the credibility gap. +//! Each note carries a [`LossDirection`] classification. +//! [`crate::evidence::compute_confidence`] caps confidence at `Medium` +//! for `OverReport`/`Bail` notes, and [`crate::rank`] applies a +//! direction-aware completeness penalty. use serde::{Deserialize, Serialize}; use smallvec::SmallVec; -/// Classification of *why* a fix-point loop hit its safety cap. -/// -/// The cap-hit alone is not actionable — "we ran 64 iterations and did -/// not detect convergence" can mean several very different things: -/// -/// * the lattice is still shrinking but slowly (e.g. a 72-function chain -/// SCC that legitimately needs >64 iterations), -/// * the lattice stopped shrinking but the convergence predicate still -/// detects change (the change set stabilised at a non-zero value — -/// monotonicity is fine but something in the convergence predicate is -/// spurious), or -/// * the lattice is oscillating (two iterations alternating with the -/// same change-set size; this is a *bug*, not a tuning issue). 
-/// -/// Recording the reason makes cap-hit telemetry actionable: operators -/// can tell when "raise the cap" would actually help vs. when they are -/// looking at a summary-non-monotonicity regression. -/// -/// Serialized as a nested snake_case tagged enum so SARIF/JSON consumers -/// can pattern-match without depending on Rust layout. +/// Why a fix-point loop hit its safety cap. Distinguishes "raise the +/// cap" cases from non-monotonicity bugs in cap-hit telemetry. +/// Serialized as a tagged snake_case enum for SARIF/JSON consumers. #[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] pub enum CapHitReason { - /// The change-set size was still decreasing when the cap fired. - /// `trajectory` is the last N iteration deltas (most recent last). - /// Operators can safely raise the cap; the underlying analysis is - /// healthy but the SCC is larger than the current budget. + /// Change-set still decreasing when the cap fired. Safe to raise + /// the cap; the SCC is just larger than budget. MonotoneShrinking { trajectory: SmallVec<[u32; 4]> }, - /// The change-set size stayed constant for the last ≥2 iterations - /// without reaching zero. This is unusual: every iteration is - /// updating the *same* keys, which suggests a summary that changes - /// the same fields back and forth even though the cap bits are - /// saturating. Raise the cap **and** investigate. + /// Change-set held steady at a non-zero value for ≥2 iterations. + /// Same keys updating back and forth, investigate. Plateau { delta: u32 }, - /// The change-set size oscillated with a detected period ≤ N/2. - /// Genuinely bad — the analysis is not monotone, convergence will - /// *never* be reached, and raising the cap will not help. File a - /// bug with the fixture attached. + /// Period-2 oscillation detected. Non-monotone; raising the cap + /// will not help. File a bug. 
SuspectedOscillation { period: u8, trajectory: SmallVec<[u32; 4]>, }, - /// Default when the engine did not record a trajectory (e.g. the - /// cap fired after only one iteration so there is nothing to - /// classify). Preserves backwards compatibility for old notes - /// deserialized from disk. + /// No trajectory recorded (e.g. cap fired after a single iteration). #[default] Unknown, } impl CapHitReason { - /// Classify a trajectory of per-iteration change-set sizes. - /// - /// `deltas` should carry the *changed-key counts* from the last N - /// iterations (most recent last). Classification rules: - /// - /// 1. Fewer than 2 samples → `Unknown` (nothing to diff against). - /// 2. A period-2 pattern (a,b,a,b) with a ≠ b → `SuspectedOscillation`. - /// 3. Last two samples equal and non-zero → `Plateau`. - /// 4. Strictly decreasing tail → `MonotoneShrinking`. - /// 5. Otherwise → `Unknown` (inconclusive; rare in practice). - /// - /// The function is pure — no allocation beyond the returned - /// [`SmallVec`] — so it is safe to call from within a hot loop when - /// a cap actually fires. Callers should accumulate deltas in a - /// fixed-size ring buffer to bound memory. + /// Classify a trajectory of per-iteration change-set sizes + /// (most recent last). Rules: <2 samples → `Unknown`; a,b,a,b with + /// a≠b → `SuspectedOscillation`; last two equal non-zero → + /// `Plateau`; strictly decreasing tail → `MonotoneShrinking`; + /// otherwise `Unknown`. pub fn classify(deltas: &[u32]) -> CapHitReason { if deltas.len() < 2 { return CapHitReason::Unknown; @@ -161,44 +106,26 @@ impl CapHitReason { } /// Direction of precision loss encoded by an [`EngineNote`]. -/// -/// Every new [`EngineNote`] variant must declare a direction via -/// [`EngineNote::direction`] — the match is exhaustive by design so the -/// classification cannot silently default. -/// -/// Ordering matters: variants are sorted by worsening impact on a -/// specific finding's credibility. 
[`combine`](Self::combine) uses the -/// `Ord` impl to merge directions when multiple notes are attached. +/// Variants are ordered by worsening credibility impact; +/// [`combine`](Self::combine) takes the max. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum LossDirection { - /// The note is informational only. Analysis was fully converged; - /// the note records a harmless event such as a cache reuse. + /// Analysis converged; the note records a harmless event. Informational, - /// The analysis may have *missed* additional findings (e.g. the - /// worklist was capped before fully propagating taint). Findings - /// that *were* reported are still sound — they correspond to real - /// flows — but the result set is a lower bound. + /// Analysis may have missed findings (worklist was capped). Reported + /// findings remain sound, the result set is a lower bound. UnderReport, - /// The analysis may have reported a *spurious* finding (e.g. - /// predicate state was widened to top, so a validation guard that - /// would have suppressed the finding was lost). The specific - /// finding is more likely to be a false positive than one produced - /// from converged state. + /// Analysis may have reported a spurious finding (e.g. predicate + /// state widened to top, dropping a guard). Likely FP. OverReport, - /// Analysis of this finding's body aborted before producing a - /// trustworthy result (e.g. SSA lowering bailed, parse timed out). - /// The finding is weakly supported; a human reviewer should treat - /// it as a starting point rather than a confirmed flow. + /// Analysis aborted before producing a trustworthy result. + /// Treat the finding as a starting point, not a confirmed flow. Bail, } impl LossDirection { - /// Merge two directions by taking the worse (later in `Ord`). 
- /// - /// A body with both `UnderReport` and `OverReport` notes is treated - /// as `OverReport` because over-reporting is the more credibility- - /// damaging failure mode for a specific emitted finding. + /// Merge by taking the worse (later in `Ord`). pub fn combine(self, other: LossDirection) -> LossDirection { self.max(other) } @@ -215,111 +142,46 @@ impl LossDirection { } /// A single provenance event recorded during analysis. -/// -/// `kind` is serialized as a snake_case tag so tooling can pattern-match -/// across JSON and SARIF output without depending on Rust enum layout. #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] pub enum EngineNote { - /// The taint worklist hit its iteration budget before converging. - /// Direction: [`LossDirection::UnderReport`] — the fixpoint was - /// aborted, so some flows may have been missed, but emitted flows - /// are still backed by propagated taint. + /// Taint worklist hit its iteration budget. UnderReport. WorklistCapped { iterations: u32 }, - /// Origin tracking was truncated when a value exceeded the configured - /// per-value origin cap (`analysis.engine.max_origins`, default 32). - /// Direction: [`LossDirection::UnderReport`] — each dropped origin - /// corresponds to a real source flow whose independent finding will - /// not be emitted. Other survivors still produce findings, so the - /// counter is a strict lower bound on under-reporting. Raise - /// `max_origins` if operators observe this note on realistic inputs. - /// Truncation is deterministic: origins are sorted by source - /// location and the largest-by-location are dropped first, so the - /// survivor set is stable across runs and merge orderings. + /// Per-value origin set truncated to `analysis.engine.max_origins` + /// (default 32). UnderReport, dropped origins correspond to real + /// source flows whose findings won't emit. 
OriginsTruncated { dropped: u32 }, - /// JS/TS pass-2 in-file global propagation hit its iteration cap. - /// Direction: [`LossDirection::UnderReport`] — global state may - /// not have reached fixpoint; cross-function flows could be missed. - /// - /// `reason` classifies *why* the cap fired (monotone-but-slow, - /// plateau, suspected oscillation) so operators can tell a - /// tunable-budget problem from a monotonicity regression. Older - /// serialized notes without this field default to - /// [`CapHitReason::Unknown`]. + /// JS/TS pass-2 in-file global propagation hit its cap. UnderReport. InFileFixpointCapped { iterations: u32, #[serde(default)] reason: CapHitReason, }, - /// Cross-file SCC fixpoint hit `SCC_FIXPOINT_SAFETY_CAP`. - /// Direction: [`LossDirection::UnderReport`] — the iterative - /// cross-file join aborted; summaries for members of this SCC may - /// be incomplete. - /// - /// `reason` classifies *why* the cap fired (monotone-but-slow, - /// plateau, suspected oscillation) so operators can tell a - /// tunable-budget problem from a monotonicity regression. Older - /// serialized notes without this field default to - /// [`CapHitReason::Unknown`]. + /// Cross-file SCC fixpoint hit `SCC_FIXPOINT_SAFETY_CAP`. UnderReport. CrossFileFixpointCapped { iterations: u32, #[serde(default)] reason: CapHitReason, }, - /// SSA lowering produced an empty body (parse failure or - /// unsupported shape). Direction: [`LossDirection::Bail`] — any - /// finding attributed to this body is weakly supported because the - /// IR itself is malformed. + /// SSA lowering produced an empty body. Bail. SsaLoweringBailed { reason: String }, - /// Tree-sitter parse exceeded the configured timeout. - /// Direction: [`LossDirection::Bail`] — parse aborted; findings - /// surfaced from the partial tree should be treated as a human- - /// review starting point. + /// Tree-sitter parse exceeded the timeout. Bail. 
ParseTimeout { timeout_ms: u32 }, - /// Predicate state was widened to top to maintain monotonicity. - /// Direction: [`LossDirection::OverReport`] — validation guards - /// that would have suppressed the finding may have been lost, so - /// the finding is more likely to be a false positive. + /// Predicate state widened to top to keep the lattice monotone. + /// OverReport, guards may have been lost. PredicateStateWidened, - /// Path-environment constraints exceeded internal cap; widened to - /// top. Direction: [`LossDirection::OverReport`] — same reasoning - /// as [`Self::PredicateStateWidened`]: dropped path constraints can - /// only turn infeasible paths into apparent-feasible ones. + /// Path-environment constraints widened to top. OverReport. PathEnvCapped, - /// Inline cache reused a cached body summary; origins were - /// re-attributed. Direction: [`LossDirection::Informational`] — - /// the cache hit does not affect precision, but surfacing the - /// re-attribution helps explain why origin locations move between - /// runs that share a body signature. + /// Inline cache reused a cached body. Informational. InlineCacheReused, - /// Points-to analysis dropped heap object members when an - /// intra-procedural points-to set exceeded - /// `analysis.engine.max_pointsto` (default 32). - /// Direction: [`LossDirection::UnderReport`] — stores and loads - /// that flow through the truncated set miss the dropped abstract - /// heap objects, so any taint into those objects via this alias - /// path will not reach downstream sinks. Other aliasing paths to - /// the same objects still propagate normally, so the counter is a - /// strict lower bound on under-reporting. Raise `max_pointsto` - /// if operators observe this note on factory-heavy codebases. + /// Points-to set truncated to `analysis.engine.max_pointsto` + /// (default 32). UnderReport. PointsToTruncated { dropped: u32 }, } impl EngineNote { - /// Classify this note by direction of precision loss. 
- /// - /// The match is exhaustive: every `EngineNote` variant must declare - /// a direction. When adding a new cap site, pick the direction - /// that most honestly describes the impact on an emitted finding: - /// - /// * `Informational` — analysis fully converged; note is a - /// provenance breadcrumb (e.g. cache reuse). - /// * `UnderReport` — analysis was cut short, but anything emitted - /// is still backed by real propagation. - /// * `OverReport` — precision was widened, so the emitted finding - /// is *more* likely to be a false positive than the baseline. - /// * `Bail` — analysis of this body aborted; the finding is weakly - /// supported. + /// Direction of precision loss for this note. New variants must + /// declare one explicitly. pub fn direction(&self) -> LossDirection { match self { EngineNote::WorklistCapped { .. } => LossDirection::UnderReport, @@ -335,23 +197,15 @@ impl EngineNote { } } - /// True if this note indicates the engine may have deviated from a - /// fully-converged analysis (any non-informational direction). - /// - /// This is a convenience over - /// `self.direction() != LossDirection::Informational` and drives - /// the `confidence_capped` SARIF property. + /// True for any non-informational direction. Drives the + /// `confidence_capped` SARIF property. pub fn lowers_confidence(&self) -> bool { self.direction() != LossDirection::Informational } } -/// Compute the worst direction across a slice of notes. -/// -/// Returns `None` when `notes` is empty or contains only -/// [`LossDirection::Informational`] notes. Returns `Some(dir)` with -/// the most impactful direction otherwise — this is what downstream -/// consumers (rank, confidence) use to decide how to degrade a finding. +/// Worst non-informational direction across a slice of notes, or +/// `None` if the slice is empty or only carries informational notes. 
pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> { let mut worst: Option<LossDirection> = None; for note in notes { @@ -367,9 +221,7 @@ pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> { worst } -/// Deduplicating push: does not append if an identical note is already -/// present. Used to keep per-finding note lists small when a cap site -/// fires repeatedly inside the same body. +/// Push-if-not-present. pub fn push_unique(notes: &mut smallvec::SmallVec<[EngineNote; 2]>, note: EngineNote) { if !notes.iter().any(|n| n == &note) { notes.push(note); diff --git a/src/evidence.rs b/src/evidence.rs index 52e6d669..6f16f5c7 100644 --- a/src/evidence.rs +++ b/src/evidence.rs @@ -289,7 +289,7 @@ pub struct StateEvidence { /// (validation guards may have been lost, so the finding is more /// likely to be a false positive); `Bail` means analysis of the body /// aborted before producing a trustworthy result. `UnderReport` notes -/// (e.g. `WorklistCapped`) do *not* cap confidence — the reported flow +/// (e.g. `WorklistCapped`) do *not* cap confidence, the reported flow /// is still real, just surrounded by an incomplete result set. pub fn compute_confidence(diag: &Diag) -> Confidence { // Degraded analysis caps confidence @@ -343,7 +343,7 @@ fn apply_engine_notes_cap(diag: &Diag, base: Confidence) -> Confidence { | crate::engine_notes::LossDirection::Bail => base.min(Confidence::Medium), // UnderReport: result set is a lower bound, but the emitted // finding itself remains as credible as the analysis decided. - // Do not cap — the rank completeness penalty is the right lever + // Do not cap, the rank completeness penalty is the right lever // for that case (see rank.rs::completeness_penalty). crate::engine_notes::LossDirection::UnderReport => base, // Informational is filtered out upstream by `worst_direction`, @@ -600,7 +600,7 @@ pub fn generate_explanation(diag: &Diag) -> Option { /// Extract a vulnerability category label from the Diag (used in explanation text).
fn extract_category_from_id(id: &str) -> String { - // Rule IDs like "taint-unsanitised-flow (source 3:1)" — category comes + // Rule IDs like "taint-unsanitised-flow (source 3:1)", category comes // from the finding category field, but we approximate from the ID here. if id.contains("sql") || id.contains("SQL") { "SQL injection".to_string() @@ -680,7 +680,7 @@ pub fn compute_confidence_limiters(diag: &Diag) -> Vec { "Backwards demand-driven analysis exceeded its budget (verdict not reached)".into(), ); } - // Confirmation is *not* a limiter — it is a positive signal. The + // Confirmation is *not* a limiter, it is a positive signal. The // taint-confidence scorer picks it up separately. let _ = NOTE_CONFIRMED; } @@ -976,7 +976,7 @@ mod tests { #[test] fn confidence_capped_at_medium_by_over_report() { // OverReport (PredicateStateWidened) means validation predicates - // were lost — the emitted finding is more likely to be spurious. + // were lost, the emitted finding is more likely to be spurious. let d = with_notes( taint_high_confidence_diag(), vec![crate::engine_notes::EngineNote::PredicateStateWidened], @@ -995,7 +995,7 @@ mod tests { #[test] fn confidence_cap_does_not_upgrade_low() { - // `base.min(Medium)` is what caps — it must not *raise* a Low + // `base.min(Medium)` is what caps, it must not *raise* a Low // baseline to Medium. Use a taint finding with weak evidence so // the points scorer gives us Low, then attach a Bail note. let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::Low); diff --git a/src/fmt.rs b/src/fmt.rs index 73fb5b94..829a2f71 100644 --- a/src/fmt.rs +++ b/src/fmt.rs @@ -31,7 +31,7 @@ pub fn render_console( } for (path, issues) in &grouped { - // File path header — dim blue, never brighter than severity. + // File path header, dim blue, never brighter than severity. 
out.push_str(&format!("{}\n", style(path).blue().dim().underlined())); for d in issues { out.push_str(&render_diag(d, width)); @@ -261,7 +261,7 @@ fn render_diag(d: &Diag, width: usize) -> String { // Engine provenance notes: show count + worst direction so a user // scanning the console can see "this finding is from capped analysis" // at a glance. Direction tags ("under-report", "over-report", "bail") - // are stable strings from `LossDirection::tag()` — kept in sync with + // are stable strings from `LossDirection::tag()`, kept in sync with // the SARIF `result.properties.engine_notes[].kind` serialization so // downstream tooling can cross-reference console and SARIF output. // Informational-only notes (e.g. InlineCacheReused) are not surfaced @@ -453,7 +453,7 @@ fn state_remediation_hint(rule_id: &str) -> Option<&'static str> { /// Colored severity tag with icon. The tag is the visual anchor of each finding. /// /// - HIGH: bold red -/// - MEDIUM: bold 208 (orange) — distinct from yellow +/// - MEDIUM: bold 208 (orange), distinct from yellow /// - LOW: dim 67 (muted blue-gray) fn severity_tag(sev: Severity) -> String { match sev { @@ -503,7 +503,7 @@ fn collapse_chain_spacing(s: &str) -> String { // Collapse: emit `.` directly after `)` continue; } else { - // Not a chain continuation — emit the whitespace we skipped + // Not a chain continuation, emit the whitespace we skipped for c in &chars[ws_start..i] { out.push(*c); } diff --git a/src/interop.rs b/src/interop.rs index 4deedbd6..ab2023ea 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -18,7 +18,7 @@ pub struct CallSiteKey { /// An explicit cross-language bridge edge. /// /// Connects a call site in one language to a function definition in another. -/// Without an `InteropEdge`, cross-language resolution is never attempted — +/// Without an `InteropEdge`, cross-language resolution is never attempted; /// this prevents false positives from name collisions across languages.
#[derive(Clone, Debug)] pub struct InteropEdge { diff --git a/src/labels/c.rs b/src/labels/c.rs index f688e5e2..c38010aa 100644 --- a/src/labels/c.rs +++ b/src/labels/c.rs @@ -115,8 +115,8 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig { /// Benchmark-driven output-parameter source positions for known C APIs. /// Maps callee name → argument positions that receive Source taint. pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[ - ("fgets", &[0]), // fgets(buf, size, stream) — buf receives input - ("gets", &[0]), // gets(buf) — buf receives input + ("fgets", &[0]), // fgets(buf, size, stream), buf receives input + ("gets", &[0]), // gets(buf), buf receives input ("recv", &[1]), // recv(fd, buf, len, flags) ("recvfrom", &[1]), // recvfrom(fd, buf, len, flags, ...) ]; diff --git a/src/labels/cpp.rs b/src/labels/cpp.rs index fbf71f40..1587ad92 100644 --- a/src/labels/cpp.rs +++ b/src/labels/cpp.rs @@ -120,7 +120,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! { // and extract them as separate bodies. Without these, a // `class_specifier` / `struct_specifier` falls through to the // generic `_ =>` arm in `build_sub`, which records a leaf `Seq` - // node and never walks the body — so inline member-function + // node and never walks the body, so inline member-function // definitions (and methods of nested classes) are silently dropped. "declaration_list" => Kind::Block, "field_declaration_list" => Kind::Block, @@ -160,7 +160,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig { /// Benchmark-driven output-parameter source positions for known C++ APIs. 
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[ - ("getline", &[1]), // std::getline(stream, str) — str receives input + ("getline", &[1]), // std::getline(stream, str), str receives input ("std::getline", &[1]), ("fgets", &[0]), ("gets", &[0]), diff --git a/src/labels/go.rs b/src/labels/go.rs index 9ee97c13..ae4f6dca 100644 --- a/src/labels/go.rs +++ b/src/labels/go.rs @@ -66,7 +66,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: false, }, - // fmt.Printf/Sprintf write to stdout or build strings in memory — not + // fmt.Printf/Sprintf write to stdout or build strings in memory, not // security sinks. fmt.Fprintf writes to an io.Writer (often http.ResponseWriter) // so it IS a security sink for XSS. LabelRule { @@ -110,7 +110,7 @@ pub static RULES: &[LabelRule] = &[ // Idiomatic Go SSRF sinks (Owncast CVE-2023-3188) use the // `http.DefaultClient.Get(url)` form rather than the bare // `http.Get(url)` helper, so the suffix-matched callee text needs - // an explicit entry here — bare `Get/Post/Do/Head` would + // an explicit entry here, bare `Get/Post/Do/Head` would // over-match unrelated method names.
"http.DefaultClient.Get", "http.DefaultClient.Post", diff --git a/src/labels/java.rs b/src/labels/java.rs index 59483fe4..8c04e9f6 100644 --- a/src/labels/java.rs +++ b/src/labels/java.rs @@ -53,13 +53,13 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::URL_ENCODE), case_sensitive: false, }, - // OWASP ESAPI input validator — validates and canonicalizes input + // OWASP ESAPI input validator, validates and canonicalizes input LabelRule { matchers: &["Validator.getValidInput"], label: DataLabel::Sanitizer(Cap::all()), case_sensitive: false, }, - // Type-check sanitizers — parsing to a primitive erases taint + // Type-check sanitizers, parsing to a primitive erases taint LabelRule { matchers: &[ "Integer.parseInt", @@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::CODE_EXEC), case_sensitive: false, }, - // HTTP response sinks — println/print are broad (also match System.out) + // HTTP response sinks, println/print are broad (also match System.out) // but necessary to catch response.getWriter().println() via suffix matching. LabelRule { matchers: &["println", "print"], @@ -107,7 +107,7 @@ pub static RULES: &[LabelRule] = &[ case_sensitive: false, }, // openConnection() is the standard java.net.URL API for initiating a connection. - // It is the correct interception point — the URL is already set on the object. + // It is the correct interception point, the URL is already set on the object. LabelRule { matchers: &[ "openConnection", @@ -153,9 +153,9 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: true, }, - // NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks — + // NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks; // logging format injection is not a real security vulnerability in Java.
- // String.format also removed — it builds strings in memory (not a sink); + // String.format also removed, it builds strings in memory (not a sink); // the real sink is wherever the formatted string is used (SQL, HTTP, etc.). // ─── JNDI injection sinks ─── LabelRule { diff --git a/src/labels/javascript.rs b/src/labels/javascript.rs index 6937c89d..2ebace19 100644 --- a/src/labels/javascript.rs +++ b/src/labels/javascript.rs @@ -36,7 +36,7 @@ pub static RULES: &[LabelRule] = &[ case_sensitive: false, }, // `encodeURIComponent` percent-encodes every character outside the - // ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'` — so + // ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'`, so // the result is safe to embed in HTML text content and HTML // attribute values, not just URL components. Treating it as // covering both URL_ENCODE and HTML_ESCAPE caps avoids FPs when a @@ -92,7 +92,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE), case_sensitive: false, }, - // he library — HTML entity encoding + // he library, HTML entity encoding LabelRule { matchers: &["he.encode", "he.escape"], label: DataLabel::Sanitizer(Cap::HTML_ESCAPE), @@ -148,16 +148,16 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SHELL_ESCAPE), case_sensitive: true, }, - // ── Outbound HTTP clients — modeled as destination-aware gated sinks ── + // ── Outbound HTTP clients, modeled as destination-aware gated sinks ── // Flat-Sink modeling of fetch/axios/got/undici/http.request was producing // a dominant FP class where any tainted body/payload arg appeared as SSRF // (e.g. `fetch("/api/telemetry", { body: navigator.userAgent })`). SSRF // semantics require attacker control over the *destination*, not the - // payload. The gated entries in `GATED_SINKS` below narrow activation to - // URL / host / path / origin arguments or object fields. 
Taint flowing - // only to body / data / json / headers is no longer flagged as SSRF — - // cross-boundary data-exfiltration detection is a separate future - // capability (`Cap::DATA_EXFIL`, not yet introduced). + // payload. The gated entries in `GATED_SINKS` below narrow SSRF + // activation to URL / host / path / origin arguments or object fields. + // Taint flowing only to body / data / json / headers is captured by a + // *separate* gate class (`Cap::DATA_EXFIL`) so the two can coexist on + // the same callee without one over-flagging the other. // Express response sinks LabelRule { matchers: &["res.send", "res.json"], @@ -222,6 +222,21 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // ── Cross-boundary data exfiltration (DATA_EXFIL) ───────────────────── + // + // `XMLHttpRequest.prototype.send(body)`, when the receiver type is + // tracked back to `new XMLHttpRequest()`, the SSA engine's type-qualified + // resolver converts `xhr.send` to `HttpClient.send`; matching that form + // fires DATA_EXFIL on tainted body flow. The explicit + // `XMLHttpRequest.prototype.send.apply(...)` form is also covered. The + // `fetch` body / headers / json case is covered by the gated entry in + // `GATED_SINKS` (so SSRF on the URL and DATA_EXFIL on the payload can + // coexist on a single call site). + LabelRule { + matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, // ─────────── SQL injection sinks ───────────── // Database drivers: mysql, mysql2, pg, better-sqlite3 LabelRule { @@ -314,7 +329,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ // only to body / data / json / headers / payload is silenced. See the // commentary at the top of RULES for the rationale. // - // `fetch(input, init)` — arg 0 can be a URL string OR a Request/config + // `fetch(input, init)`, arg 0 can be a URL string OR a Request/config // object with `url`. 
Per WHATWG Fetch, when `input` is a dictionary, the // URL field is canonically `url`. Init-object body/headers at arg 1 are // *not* destination-bearing. @@ -332,7 +347,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["url"], }, }, - // `axios(config)` / `axios.request(config)` — config object exposes + // `axios(config)` / `axios.request(config)`, config object exposes // `url` and `baseURL`. Body-ish fields (`data`, `params`, `headers`) // are excluded. SinkGate { @@ -363,7 +378,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["url", "baseURL"], }, }, - // `axios.get(url[, config])` — arg 0 is URL; arg 1 is config. + // `axios.get(url[, config])`, arg 0 is URL; arg 1 is config. SinkGate { callee_matcher: "axios.get", arg_index: 0, @@ -378,7 +393,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &[], }, }, - // `axios.post(url, data[, config])` — arg 0 is URL; `data` at arg 1 is + // `axios.post(url, data[, config])`, arg 0 is URL; `data` at arg 1 is // the request body and must NOT activate SSRF. SinkGate { callee_matcher: "axios.post", @@ -394,7 +409,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &[], }, }, - // `axios.put / axios.patch / axios.delete` follow the same shape — + // `axios.put / axios.patch / axios.delete` follow the same shape: // (url, data?, config?). Keep the model consistent across verbs. SinkGate { callee_matcher: "axios.put", @@ -438,7 +453,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &[], }, }, - // `got(url[, options])` / `got(options)` — options exposes `url` and + // `got(url[, options])` / `got(options)`, options exposes `url` and // `prefixUrl`. Body-ish fields (`body`, `json`, `form`, `searchParams`, // `headers`) are excluded.
SinkGate { @@ -455,7 +470,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["url", "prefixUrl"], }, }, - // `undici.request(url | opts[, opts])` — opts exposes `origin` and + // `undici.request(url | opts[, opts])`, opts exposes `origin` and // `path`. Body-ish fields (`body`, `headers`) are excluded. SinkGate { callee_matcher: "undici.request", @@ -471,11 +486,11 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["origin", "path"], }, }, - // Node `http.request(options[, cb])` / `https.request(options[, cb])` — + // Node `http.request(options[, cb])` / `https.request(options[, cb])`: // options exposes `host`, `hostname`, `path`, `protocol`, `port`, // `origin`. Body is sent via `.write()`/`.end()` on the returned // ClientRequest, so it never appears as a positional arg here. - // Arg 0 may also be a URL string — the "whole arg is destination" + // Arg 0 may also be a URL string, the "whole arg is destination" // fallback (triggered when arg 0 is not an object literal) covers that. SinkGate { callee_matcher: "http.request", @@ -505,7 +520,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"], }, }, - // Node `http.get(options[, cb])` / `https.get(options[, cb])` — + // Node `http.get(options[, cb])` / `https.get(options[, cb])`: // convenience wrappers around `.request()` that auto-call `.end()`. // Same destination semantics as `.request`. Motivated by // CVE-2025-64430 (Parse Server SSRF via http.get(uri)).
@@ -537,6 +552,31 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"], }, }, + // ── Cross-boundary data exfiltration ────────────────────────────────── + // + // Sensitive data flowing into the *payload* of an outbound request is a + // distinct vulnerability class from SSRF: the destination is fixed but + // attacker-influenced bytes leave the process via the request body / + // headers / json field. These gates fire on the body-bearing positions + // and emit `Cap::DATA_EXFIL`, which is intentionally separate from + // `Cap::SSRF` so a `fetch(taintedUrl, {body: tainted})` site reports + // both classes independently. + // + // `fetch(input, init)`, `init` at arg 1 carries body / headers / json. + SinkGate { + callee_matcher: "fetch", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["body", "headers", "json"], + }, + }, ]; pub static KINDS: Map<&'static str, Kind> = phf_map! { diff --git a/src/labels/mod.rs b/src/labels/mod.rs index d40844fa..248d0695 100644 --- a/src/labels/mod.rs +++ b/src/labels/mod.rs @@ -32,7 +32,7 @@ pub struct LabelRule { /// expands it to `(0..arity)` using the actual call arity. /// /// The value `usize::MAX` is used because `args.get(usize::MAX)` is a guaranteed -/// miss for any real argument list — an accidental direct-lookup would be a no-op +/// miss for any real argument list, an accidental direct-lookup would be a no-op /// rather than silently aliasing position 0. pub const ALL_ARGS_PAYLOAD: &[usize] = &[usize::MAX]; @@ -54,7 +54,7 @@ pub enum GateActivation { /// arg selects the MIME type). ValueMatch, /// Destination-bearing flow activation. 
The gate fires when taint reaches - /// a declared destination location at the call site — no literal + /// a declared destination location at the call site, no literal /// inspection, no prefix heuristic. /// /// For callees whose destination is a positional argument (e.g. `fetch`'s @@ -80,7 +80,7 @@ pub enum GateActivation { } /// Argument-sensitive sink activation. Whether a call becomes a sink is -/// determined by the gate's [`GateActivation`] mode — literal-value matching +/// determined by the gate's [`GateActivation`] mode, literal-value matching /// for traditional role-selector APIs, or destination-flow activation for /// outbound HTTP clients and other APIs where a specific location in the /// call carries the attacker-controlled destination. @@ -144,6 +144,13 @@ bitflags! { /// carrier cap for folding `auth_analysis` into the SSA/taint /// engine. const UNAUTHORIZED_ID = 0b0001_0000_0000_0000; // bit 12 + /// Cross-boundary data-exfiltration: tainted sensitive data flowing + /// into outbound request bodies, headers, or other payload-bearing + /// fields of network egress APIs. Distinct from `SSRF` (attacker + /// control over the destination URL), `DATA_EXFIL` fires when the + /// destination is fixed but attacker-influenced data leaves the + /// process via the request payload. + const DATA_EXFIL = 0b0010_0000_0000_0000; // bit 13 } } @@ -192,7 +199,7 @@ pub enum Kind { /// reachability does not depend on sibling-case execution order. Switch, Trivia, - /// Simple sequential expression (e.g. cast/type-assertion) — treated like + /// Simple sequential expression (e.g. cast/type-assertion), treated like /// any other sequential statement in the CFG but explicitly classified so /// code that inspects `Kind` can recognise it. 
Seq, @@ -472,9 +479,9 @@ pub enum SourceKind { FileSystem, /// Database query results Database, - /// Caught exception — may carry user-controlled data + /// Caught exception, may carry user-controlled data CaughtException, - /// Could not determine — treat conservatively + /// Could not determine, treat conservatively Unknown, } @@ -511,7 +518,7 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind { // File system patterns if cl.contains("read") || cl.contains("fopen") || cl.contains("open") { - // Distinguish from db reads — file reads typically have FILE_IO cap + // Distinguish from db reads, file reads typically have FILE_IO cap if caps.contains(Cap::FILE_IO) { return SourceKind::FileSystem; } @@ -570,6 +577,7 @@ pub fn parse_cap(s: &str) -> Option { "code_exec" => Some(Cap::CODE_EXEC), "crypto" => Some(Cap::CRYPTO), "unauthorized_id" => Some(Cap::UNAUTHORIZED_ID), + "data_exfil" | "data_exfiltration" => Some(Cap::DATA_EXFIL), "all" => Some(Cap::all()), _ => None, } @@ -621,7 +629,7 @@ pub fn build_lang_rules( Vec::new() }; - // Phase C: fold `auth_analysis` into the taint engine by injecting + // fold `auth_analysis` into the taint engine by injecting // `Cap::UNAUTHORIZED_ID` sink/sanitizer rules. Gated by config; default // OFF so the standalone `auth_analysis` subsystem remains authoritative. if config.scanner.enable_auth_as_taint { @@ -636,7 +644,7 @@ pub fn build_lang_rules( } } -/// Return Phase C auth-as-taint rules for a given language (currently Rust-only). +/// Return the auth-as-taint rules for a given language (Rust-only). fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec { match lang_slug { "rust" | "rs" => rust::phase_c_auth_rules(), @@ -718,7 +726,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool { if exact_only { // `=foo` matchers fire only when `text` IS `foo` (no `Mod.foo`, // `Class::foo`, or any preceding namespace). 
Lets a label rule - // distinguish bare `Kernel#open` from `File.open` — the former + // distinguish bare `Kernel#open` from `File.open`, the former // shells out on `|cmd`, the latter never does (CVE-2020-8130). start == 0 } else { @@ -731,7 +739,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool { /// Strip an optional `=` "exact-match" sigil from the start of a matcher. /// Matchers prefixed with `=` (e.g. `"=open"`) only fire when the candidate -/// text equals the matcher exactly — the boundary-`.`-or-`:` allowance is +/// text equals the matcher exactly, the boundary-`.`-or-`:` allowance is /// suppressed. Used to distinguish bare-callee Ruby/Python builtins from /// methods of the same name on a typed receiver. #[inline] @@ -767,7 +775,7 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O let full_normalized = normalize_chained_call(text); let full_norm_bytes = full_normalized.as_bytes(); - // ── Check runtime (config) rules first — they take priority ────── + // ── Check runtime (config) rules first, they take priority ────── if let Some(extras) = extra { // Pass 1: exact / suffix for rule in extras { @@ -865,7 +873,7 @@ pub fn classify_all( } } - // ── Check runtime (config) rules first — they take priority ────── + // ── Check runtime (config) rules first, they take priority ────── if let Some(extras) = extra { // Pass 1: exact / suffix for rule in extras { @@ -941,7 +949,7 @@ pub fn classify_all( /// (or [`ALL_ARGS_PAYLOAD`] for dynamic-activation conservative fallback). /// `object_destination_fields`, when non-empty, restricts sink-taint checks /// to identifiers found under those field names within an object-literal -/// positional argument — used by destination-aware outbound-HTTP gates so +/// positional argument, used by destination-aware outbound-HTTP gates so /// `fetch({url, body})` fires only when taint reaches `url`, not `body`. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct GateMatch { @@ -952,9 +960,13 @@ pub struct GateMatch { /// Classify a call against gated sink rules. /// -/// Returns `Some(GateMatch)` if the callee matches a gated rule AND the -/// activation conditions fire. Returns `None` if the callee doesn't match -/// any gated rule, or matches but the activation is provably safe. +/// Returns every gate whose callee matches AND whose activation conditions +/// fire. An empty result means the callee did not match any gated rule, or +/// every match was provably safe. Multiple matches are possible when the +/// same callee carries gates for different sink classes, e.g. `fetch` is +/// both an SSRF gate (URL flow) and a `DATA_EXFIL` gate (body / headers / +/// json flow); each gate carries its own [`GateMatch`] so downstream code +/// can attribute findings per-cap. /// /// `const_arg_at` extracts positional argument values. /// `const_keyword_arg` extracts keyword argument values (for languages like Python). 
@@ -964,11 +976,15 @@ pub fn classify_gated_sink( const_arg_at: impl Fn(usize) -> Option, const_keyword_arg: impl Fn(&str) -> Option, kwarg_present: impl Fn(&str) -> bool, -) -> Option { - let gates = GATED_REGISTRY.get(lang).or_else(|| { +) -> SmallVec<[GateMatch; 2]> { + let mut out: SmallVec<[GateMatch; 2]> = SmallVec::new(); + let gates = match GATED_REGISTRY.get(lang).or_else(|| { let key = lang.to_ascii_lowercase(); GATED_REGISTRY.get(key.as_str()) - })?; + }) { + Some(g) => g, + None => return out, + }; let callee_bytes = callee_text.as_bytes(); @@ -985,11 +1001,12 @@ pub fn classify_gated_sink( object_destination_fields, } = gate.activation { - return Some(GateMatch { + out.push(GateMatch { label: gate.label, payload_args: gate.payload_args, object_destination_fields, }); + continue; } // ── ValueMatch activation (legacy) ─────────────────────────────── @@ -1012,7 +1029,7 @@ pub fn classify_gated_sink( any_dangerous = true; break; } - // Present with a safe literal — continue checking other kwargs. + // Present with a safe literal, continue checking other kwargs. } None => { any_dynamic_present = true; @@ -1020,23 +1037,25 @@ pub fn classify_gated_sink( } } if any_dangerous { - return Some(GateMatch { + out.push(GateMatch { label: gate.label, payload_args: gate.payload_args, object_destination_fields: &[], }); + continue; } if any_dynamic_present { - // Dynamic kwarg value — we can't prove safe. Conservatively + // Dynamic kwarg value, we can't prove safe. Conservatively // flag every positional arg so the activation pathway isn't // silently narrowed to the gate's declared `payload_args`. - return Some(GateMatch { + out.push(GateMatch { label: gate.label, payload_args: ALL_ARGS_PAYLOAD, object_destination_fields: &[], }); + continue; } - return None; // all listed kwargs absent or safe-literal → suppress + continue; // all listed kwargs absent or safe-literal → suppress } // Single-kwarg / positional gate path (original semantics). 
@@ -1058,22 +1077,22 @@ pub fn classify_gated_sink( .iter() .any(|p| lower.starts_with(&p.to_ascii_lowercase())); if is_dangerous { - return Some(GateMatch { + out.push(GateMatch { label: gate.label, payload_args: gate.payload_args, object_destination_fields: &[], }); } - return None; // safe constant → suppress + // safe constant → suppress (no push) } // Unknown / dynamic activation arg: the gate fires conservatively, // but we can't prove that only the declared `payload_args` carry - // risk — a tainted activation arg (e.g. `setAttribute(userAttr, …)` + // risk, a tainted activation arg (e.g. `setAttribute(userAttr, …)` // where `userAttr` is user-controlled) is itself a vulnerability // path. Return ALL_ARGS_PAYLOAD so downstream sink scanning // considers every positional argument. None => { - return Some(GateMatch { + out.push(GateMatch { label: gate.label, payload_args: ALL_ARGS_PAYLOAD, object_destination_fields: &[], @@ -1081,7 +1100,7 @@ pub fn classify_gated_sink( } } } - None + out } /// Public wrapper for [`normalize_chained_call`] so callers outside the module @@ -1090,25 +1109,11 @@ pub fn normalize_chained_call_for_classify(text: &str) -> String { normalize_chained_call(text) } -/// Return the bare method-name segment of a callee text. -/// -/// Centralised replacement for the textual `callee.rsplit('.').next().unwrap_or(callee)` -/// pattern that used to be scattered across the codebase. -/// -/// Behaviour-preserving across the Phase 2 SSA chain decomposition rollout: -/// - When SSA lowering rewrites a chained-receiver call (`c.mu.Lock()` → -/// `Call("Lock", [v_mu])`), the call's `callee` is already the bare method -/// name, so this helper is a no-op pass-through. -/// - For 1-dot callees (`obj.method`) and for languages where Phase 2 lowering -/// doesn't run yet (PHP/Ruby) the helper still extracts the trailing method -/// from the textual form, exactly as the old per-callsite split did. 
-/// - For bare callees (no dot), it returns the input unchanged. -/// -/// Use this helper when you need the *terminal* method name from a callee -/// string regardless of whether the call had a chained receiver. When you -/// have an `SsaOp::Call` in hand, prefer reading `callee` directly and -/// walking `receiver` through `FieldProj` ops — that's the precise path. -/// This helper is the textual fallback for callsites that only see a `&str`. +/// Return the bare method-name segment of a callee text. Returns the +/// input unchanged for bare callees. When you have an `SsaOp::Call`, +/// prefer reading `callee` directly and walking `receiver` through +/// `FieldProj` ops, this helper is the textual fallback for callsites +/// that only see a `&str`. pub fn bare_method_name(callee: &str) -> &str { callee.rsplit('.').next().unwrap_or(callee) } @@ -1314,19 +1319,15 @@ mod tests { fn bare_method_name_strips_chain() { // No-dot input → returned as-is. assert_eq!(bare_method_name("foo"), "foo"); - // 1-dot → trailing segment (Phase 2 leaves these alone in SSA). + // 1-dot → trailing segment. assert_eq!(bare_method_name("obj.method"), "method"); - // Multi-dot → trailing segment (matches AST-only callees from - // PHP/Ruby and any pre-Phase-2 textual paths kept around in - // `callee_text` for display). + // Multi-dot → trailing segment. assert_eq!(bare_method_name("a.b.c.method"), "method"); - // Trailing dot → empty trailing segment, matching the legacy - // `rsplit('.').next()` behaviour bit-for-bit. + // Trailing dot → empty trailing segment. assert_eq!(bare_method_name("foo."), ""); // Empty input. assert_eq!(bare_method_name(""), ""); - // Phase 2 invariant: when SSA decomposed a chain, `callee` is - // the bare method already and the helper is a no-op. + // SSA-decomposed chains pass through untouched. 
assert_eq!(bare_method_name("Lock"), "Lock"); } @@ -1399,7 +1400,7 @@ mod tests { #[test] fn classify_bare_href_is_none() { - // Bare "href" should NOT be a sink — only "location.href" and variants + // Bare "href" should NOT be a sink, only "location.href" and variants let result = classify("javascript", "href", None); assert_eq!(result, None); } @@ -1497,7 +1498,7 @@ mod tests { #[test] fn classify_go_user_client_get_is_not_ssrf_sink() { // `client.Get` on a user-named *http.Client variable should NOT - // match — the Go SSRF set is restricted to the stdlib package + // match, the Go SSRF set is restricted to the stdlib package // helper `http.DefaultClient`. Type-aware resolution would be the // path to a broader rule, not a bare-name match. let result = classify("go", "client.Get", None); @@ -1530,7 +1531,7 @@ mod tests { #[test] fn classify_ruby_io_open_is_not_shell_escape_sink() { - // `IO.open` takes a file descriptor — never pipes. The bare- + // `IO.open` takes a file descriptor, never pipes. The bare- // open CMDI rule must leave it alone. let result = classify("ruby", "IO.open", None); assert_ne!(result, Some(DataLabel::Sink(Cap::SHELL_ESCAPE))); @@ -1572,7 +1573,7 @@ mod tests { #[test] fn classify_cpp_sto_family_is_sanitizer() { - // Phase 1: full `std::sto*` family (including 64-bit and `long + // full `std::sto*` family (including 64-bit and `long // double` variants) clears every taint cap that flows through it, // matching the existing `std::stoi`/`std::stol` rule. for callee in [ @@ -1621,6 +1622,16 @@ mod tests { false } + /// Find the first matching gate whose label sink-caps overlap `caps`. + /// Lets tests target a specific gate when a callee carries multiple + /// (e.g. `fetch` is both an SSRF and a `DATA_EXFIL` gate). 
+ fn find_match_with_caps(matches: &[GateMatch], caps: Cap) -> Option<GateMatch> { + matches + .iter() + .find(|m| matches!(m.label, DataLabel::Sink(c) if c.intersects(caps))) + .copied() + } + #[test] fn gated_sink_dangerous_exact() { let result = classify_gated_sink( @@ -1631,12 +1642,12 @@ mod tests { no_kw_present, ); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::HTML_ESCAPE), payload_args: [1usize].as_slice(), object_destination_fields: &[], - }) + }] ); } @@ -1650,12 +1661,12 @@ mod tests { no_kw_present, ); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::HTML_ESCAPE), payload_args: [1usize].as_slice(), object_destination_fields: &[], - }) + }] ); } @@ -1668,24 +1679,24 @@ mod tests { no_kw, no_kw_present, ); - assert_eq!(result, None); + assert!(result.is_empty()); } #[test] fn gated_sink_dynamic_conservative() { // Dynamic activation (e.g. `setAttribute(attrVar, val)`) returns the // ALL_ARGS_PAYLOAD sentinel so callers expand payload tracking to - // every positional arg — the activation arg itself is a vulnerability + // every positional arg, the activation arg itself is a vulnerability // path when attacker-controlled.
let result = classify_gated_sink("javascript", "setAttribute", |_| None, no_kw, no_kw_present); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::HTML_ESCAPE), payload_args: ALL_ARGS_PAYLOAD, object_destination_fields: &[], - }) + }] ); } @@ -1698,7 +1709,7 @@ mod tests { no_kw, no_kw_present, ); - assert_eq!(result, None); + assert!(result.is_empty()); } #[test] @@ -1711,7 +1722,7 @@ mod tests { no_kw, no_kw_present, ); - assert_eq!(result.unwrap().payload_args, &[1]); + assert_eq!(result[0].payload_args, &[1]); // parseFromString: payload is arg 0 let result = classify_gated_sink( @@ -1727,7 +1738,7 @@ mod tests { no_kw, no_kw_present, ); - assert_eq!(result.unwrap().payload_args, &[0]); + assert_eq!(result[0].payload_args, &[0]); } #[test] @@ -1745,7 +1756,7 @@ mod tests { no_kw, no_kw_present, ); - assert_eq!(result, None); + assert!(result.is_empty()); } #[test] @@ -1764,12 +1775,12 @@ mod tests { |kw| kw == "shell", ); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::SHELL_ESCAPE), payload_args: [0usize].as_slice(), object_destination_fields: &[], - }) + }] ); } @@ -1788,7 +1799,7 @@ mod tests { }, |kw| kw == "shell", ); - assert_eq!(result, None); + assert!(result.is_empty()); } #[test] @@ -1797,12 +1808,12 @@ mod tests { // literal available → unknown activation → ALL_ARGS_PAYLOAD sentinel. 
let result = classify_gated_sink("python", "Popen", |_| None, |_| None, no_kw_present); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::SHELL_ESCAPE), payload_args: ALL_ARGS_PAYLOAD, object_destination_fields: &[], - }) + }] ); } @@ -1825,12 +1836,12 @@ mod tests { |kw| kw == "shell", ); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::SHELL_ESCAPE), payload_args: [0usize].as_slice(), object_destination_fields: &[], - }) + }] ); } @@ -1850,7 +1861,7 @@ mod tests { }, |kw| kw == "shell", ); - assert_eq!(result, None); + assert!(result.is_empty()); } /// `subprocess.run(cmd)` → no shell kwarg → presence-aware gate suppresses. @@ -1864,7 +1875,7 @@ mod tests { |_| None, no_kw_present, ); - assert_eq!(result, None); + assert!(result.is_empty()); } /// `subprocess.run(cmd, shell=flag)` → shell kwarg present but dynamic → @@ -1880,12 +1891,12 @@ mod tests { |kw| kw == "shell", ); assert_eq!( - result, - Some(GateMatch { + result.as_slice(), + &[GateMatch { label: DataLabel::Sink(Cap::SHELL_ESCAPE), payload_args: ALL_ARGS_PAYLOAD, object_destination_fields: &[], - }) + }] ); } @@ -1893,18 +1904,18 @@ mod tests { /// verbatim for the caller to apply object-literal field filtering. #[test] fn gated_sink_destination_positional_always_fires() { - // `fetch(url)` — arg 0 is the URL (positional destination) OR an + // `fetch(url)`, arg 0 is the URL (positional destination) OR an // object with a `url` field. The gate fires unconditionally, with // `url` declared as the object-literal destination-field for the // `fetch({url, body})` shape. 
let result = classify_gated_sink( "javascript", "fetch", - |_| None, // no literal — Destination mode doesn't inspect it + |_| None, // no literal, Destination mode doesn't inspect it no_kw, no_kw_present, ); - let m = result.expect("fetch gate should fire"); + let m = find_match_with_caps(&result, Cap::SSRF).expect("fetch SSRF gate should fire"); assert_eq!(m.label, DataLabel::Sink(Cap::SSRF)); assert_eq!(m.payload_args, &[0]); assert_eq!(m.object_destination_fields, &["url"]); @@ -1914,10 +1925,13 @@ mod tests { /// the CFG caller to drive object-literal field filtering. #[test] fn gated_sink_destination_object_fields_surfaced() { - // `http.request(opts, cb)` — opts is an object with destination fields. + // `http.request(opts, cb)`, opts is an object with destination fields. let result = classify_gated_sink("javascript", "http.request", |_| None, no_kw, no_kw_present); - let m = result.expect("http.request gate should fire"); + let m = result + .first() + .copied() + .expect("http.request gate should fire"); assert_eq!(m.label, DataLabel::Sink(Cap::SSRF)); assert_eq!(m.payload_args, &[0]); assert!( @@ -1929,6 +1943,27 @@ mod tests { ); } + /// `fetch` carries both SSRF (URL flow) and `DATA_EXFIL` (body / headers / + /// json flow) gates. Both must fire from a single classify call so the + /// downstream CFG can build per-cap filters. 
+ #[test] + fn gated_sink_fetch_emits_ssrf_and_data_exfil() { + let result = classify_gated_sink("javascript", "fetch", |_| None, no_kw, no_kw_present); + let ssrf = find_match_with_caps(&result, Cap::SSRF).expect("SSRF gate fires"); + assert_eq!(ssrf.label, DataLabel::Sink(Cap::SSRF)); + assert_eq!(ssrf.payload_args, &[0]); + assert_eq!(ssrf.object_destination_fields, &["url"]); + + let exfil = find_match_with_caps(&result, Cap::DATA_EXFIL).expect("DATA_EXFIL gate fires"); + assert_eq!(exfil.label, DataLabel::Sink(Cap::DATA_EXFIL)); + assert_eq!(exfil.payload_args, &[1]); + assert!( + exfil.object_destination_fields.contains(&"body"), + "expected body in DATA_EXFIL destination fields, got {:?}", + exfil.object_destination_fields, + ); + } + #[test] fn classify_all_single_label() { let result = classify_all("javascript", "innerHTML", None); diff --git a/src/labels/python.rs b/src/labels/python.rs index 6870fe09..ff00110d 100644 --- a/src/labels/python.rs +++ b/src/labels/python.rs @@ -106,6 +106,19 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::URL_ENCODE), case_sensitive: false, }, + // SQLAlchemy bound-parameter sanitizer. Values passed as keyword + // arguments to `text("…:name…").bindparams(name=value)` are bound + // by the driver, so injection cannot break out of the literal + // context. The accompanying SQL-string check (py.sqli.text_format) + // already flags the `text(f"…")` shape at construction, so this + // sanitizer only clears flow when the SQL is a literal and the + // values reach the engine via bindparams. Recognises both the + // method form (`text(…).bindparams(...)`) and the bare call form. 
+ LabelRule { + matchers: &["bindparams", ".bindparams"], + label: DataLabel::Sanitizer(Cap::SQL_QUERY), + case_sensitive: false, + }, // Path canonicalization LabelRule { matchers: &["os.path.abspath", "os.path.normpath"], @@ -119,7 +132,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::CODE_EXEC), case_sensitive: false, }, - // Jinja2 / string.Template — tainted template string enables SSTI + // Jinja2 / string.Template, tainted template string enables SSTI LabelRule { matchers: &["Template"], label: DataLabel::Sink(Cap::HTML_ESCAPE), @@ -141,7 +154,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::HTML_ESCAPE), case_sensitive: false, }, - // Flask Markup — bypasses auto-escaping + // Flask Markup, bypasses auto-escaping LabelRule { matchers: &["Markup"], label: DataLabel::Sink(Cap::HTML_ESCAPE), @@ -216,7 +229,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, - // aiohttp HTTP client — SSRF sinks + // aiohttp HTTP client, SSRF sinks LabelRule { matchers: &[ "aiohttp.get", @@ -228,6 +241,30 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // Type-qualified SSRF sinks: when the receiver is tracked as + // TypeKind::HttpClient (e.g. `client = requests.Session()`, + // `client = httpx.Client()`, or `s = aiohttp.ClientSession()`), + // resolve_type_qualified_labels() constructs `"HttpClient."` + // call texts so the receiver-name is no longer load-bearing. Matches + // the existing Rust HttpClient. sink set so both languages + // stay in step on the type-aware SSRF model. Motivated by the + // upstream LMDeploy CVE-2026-33626 shape: + // client = requests.Session() + // response = client.get(url, ...) 
+ LabelRule { + matchers: &[ + "HttpClient.get", + "HttpClient.post", + "HttpClient.put", + "HttpClient.delete", + "HttpClient.patch", + "HttpClient.head", + "HttpClient.request", + "HttpClient.send", + ], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + }, LabelRule { matchers: &[ "pickle.loads", @@ -256,7 +293,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ dangerous_kwargs: &[], activation: GateActivation::ValueMatch, }, - // subprocess.run(cmd, shell=True) — multi-kwarg gate using the new + // subprocess.run(cmd, shell=True), multi-kwarg gate using the new // presence-aware mechanism. Payload is arg 1 (after receiver offset // applied by the CFG layer when the call is modelled method-style). SinkGate { @@ -361,7 +398,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec { let mut rules = Vec::new(); if ctx.has(DetectedFramework::Django) { - // QuerySet.extra() — raw SQL injection risk. + // QuerySet.extra(), raw SQL injection risk. // Framework-conditional because `extra` is too generic as a static matcher. rules.push(RuntimeLabelRule { matchers: vec!["extra".into()], diff --git a/src/labels/ruby.rs b/src/labels/ruby.rs index 1a73b42b..cceecead 100644 --- a/src/labels/ruby.rs +++ b/src/labels/ruby.rs @@ -14,7 +14,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Source(Cap::all()), case_sensitive: false, }, - // Rails request object — user-controlled HTTP request data. + // Rails request object, user-controlled HTTP request data. // Dotted matchers work via push_node receiver.method text construction // (confirmed by existing Net::HTTP.get matcher in ssrf_net_http fixture). LabelRule { @@ -75,7 +75,7 @@ pub static RULES: &[LabelRule] = &[ }, // Bare `Kernel#open(path)` interprets a path beginning with `|` as a // shell command (`open("|cmd")` runs `cmd`). 
`=open` exact-matcher - // syntax limits this rule to the bare call — `File.open`, `IO.open`, + // syntax limits this rule to the bare call, `File.open`, `IO.open`, // `URI.open` etc. each have their own non-pipe semantics and are // covered by their own labels (or intentionally not labeled as CMDI). // CVE-2020-8130 (rake `Rake::FileList#egrep`) was the canonical @@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[ // File I/O sinks: user-controlled paths flowing into File.open/File.new // are a path-traversal / arbitrary-read vector. File.open also participates // in the resource-lifecycle acquire/release pair (cfg_analysis::RUBY_RESOURCES), - // so this entry is additive — it does not disturb resource-leak detection. + // so this entry is additive, it does not disturb resource-leak detection. LabelRule { matchers: &["File.open", "File.new", "File.read", "IO.read"], label: DataLabel::Sink(Cap::FILE_IO), @@ -115,7 +115,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::HTML_ESCAPE), case_sensitive: false, }, - // URI.open is the network-capable Kernel#open wrapper — more specific than + // URI.open is the network-capable Kernel#open wrapper, more specific than // plain `open` (excluded to avoid file I/O false positives). LabelRule { matchers: &[ @@ -140,7 +140,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::DESERIALIZE), case_sensitive: false, }, - // Reflection / dynamic class resolution — arbitrary class instantiation from + // Reflection / dynamic class resolution, arbitrary class instantiation from // user-controlled names enables gadget chains (similar risk profile to // deserialization). Rails adds `constantize`/`safe_constantize` to String. LabelRule { @@ -157,7 +157,7 @@ pub static RULES: &[LabelRule] = &[ // SQL injection: ActiveRecord query methods that accept raw SQL strings. 
// `where` and `order` are the most common Rails SQLi vectors when called // with string interpolation (e.g., User.where("name = '#{params[:name]}'")). - // Broad matchers — verified against fixture fallout. + // Broad matchers, verified against fixture fallout. LabelRule { matchers: &["where", "order", "group", "having", "joins", "pluck"], label: DataLabel::Sink(Cap::SQL_QUERY), @@ -240,7 +240,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig { /// ActiveRecord query methods that the static [`RULES`] table classifies as /// `Sink(Cap::SQL_QUERY)`. These are SQL injection vectors only when arg 0 -/// is a string with interpolation (`#{x}`) or a non-literal identifier — the +/// is a string with interpolation (`#{x}`) or a non-literal identifier, the /// hash form (`where(id: x)`) and the parameterised form (`where("a = ?", x)`) /// are intrinsically safe because Rails escapes the values. const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "joins", "pluck"]; @@ -249,7 +249,7 @@ const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "j /// shape-safe. Hash literals (`pair`, `hash`), symbol literals /// (`simple_symbol`, `hash_key_symbol`), array literals (`array`), and pure /// string literals without `#{...}` interpolation are all safe. Strings WITH -/// interpolation and identifiers / method calls are *not* in this list — +/// interpolation and identifiers / method calls are *not* in this list , /// callers must check `has_interpolation` and the kind separately. const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[ "pair", @@ -270,15 +270,15 @@ const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[ /// `cfg-unguarded-sink` (sanitiser dominates the sink reflexively). 
/// /// Real-world FP shapes this closes (redmine, mastodon, diaspora): -/// * `Issue.where(:id => params[:id])` — hash form -/// * `Model.where(id: x, name: y)` — keyword-shorthand pairs -/// * `Project.order(:created_at)` — symbol literal -/// * `Issue.pluck(:id, :name)` — symbol literals -/// * `Model.where("active = ?", x)` — parameterised string +/// * `Issue.where(:id => params[:id])`, hash form +/// * `Model.where(id: x, name: y)`, keyword-shorthand pairs +/// * `Project.order(:created_at)`, symbol literal +/// * `Issue.pluck(:id, :name)`, symbol literals +/// * `Model.where("active = ?", x)`, parameterised string /// /// Real-world TPs preserved: -/// * `User.where("name = '#{name}'")` — string with interpolation -/// * `Model.where(some_string_var)` — dynamic identifier (conservative) +/// * `User.where("name = '#{name}'")`, string with interpolation +/// * `Model.where(some_string_var)`, dynamic identifier (conservative) pub fn ar_query_safe_shape(callee_text: &str, arg0_kind: &str, has_interpolation: bool) -> bool { // Match the callee's last segment ("Model.where" → "where", "where" → "where"). 
let leaf = callee_text.rsplit(['.', ':']).next().unwrap_or(callee_text); @@ -297,7 +297,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec { let mut rules = Vec::new(); if ctx.has(DetectedFramework::Rails) { - // Strong parameters — permit/require sanitize user input + // Strong parameters, permit/require sanitize user input rules.push(RuntimeLabelRule { matchers: vec!["permit".into(), "require".into()], label: DataLabel::Sanitizer(Cap::all()), @@ -306,7 +306,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec { } if ctx.has(DetectedFramework::Sinatra) { - // Sinatra template rendering — user content flows to rendered output + // Sinatra template rendering, user content flows to rendered output rules.push(RuntimeLabelRule { matchers: vec!["erb".into(), "haml".into()], label: DataLabel::Sink(Cap::HTML_ESCAPE), @@ -323,7 +323,7 @@ mod ar_query_tests { #[test] fn hash_form_is_safe() { - // Model.where(:id => x) — pair node directly in argument_list + // Model.where(:id => x) , pair node directly in argument_list assert!(ar_query_safe_shape("Model.where", "pair", false)); // Model.where(id: x) assert!(ar_query_safe_shape("where", "pair", false)); @@ -338,32 +338,32 @@ mod ar_query_tests { #[test] fn parameterised_string_is_safe() { - // Model.where("a = ?", x) — first arg is a string literal w/o interpolation + // Model.where("a = ?", x) , first arg is a string literal w/o interpolation assert!(ar_query_safe_shape("where", "string", false)); assert!(ar_query_safe_shape("where", "string_literal", false)); } #[test] fn interpolated_string_is_dangerous() { - // Model.where("a = #{x}") — string node WITH interpolation child + // Model.where("a = #{x}") , string node WITH interpolation child assert!(!ar_query_safe_shape("where", "string", true)); } #[test] fn dynamic_identifier_is_dangerous() { - // Model.where(some_var) — kind is identifier, not in safe list + // Model.where(some_var), kind is identifier, not in safe list 
assert!(!ar_query_safe_shape("where", "identifier", false)); } #[test] fn array_form_is_safe() { - // Model.pluck([:id, :name]) — uncommon but valid + // Model.pluck([:id, :name]), uncommon but valid assert!(ar_query_safe_shape("pluck", "array", false)); } #[test] fn non_ar_method_is_never_suppressed() { - // find_by_sql is a real raw-SQL sink — never suppress. + // find_by_sql is a real raw-SQL sink, never suppress. assert!(!ar_query_safe_shape("find_by_sql", "string", false)); assert!(!ar_query_safe_shape("connection.execute", "pair", false)); } diff --git a/src/labels/rust.rs b/src/labels/rust.rs index a59ead03..68826c11 100644 --- a/src/labels/rust.rs +++ b/src/labels/rust.rs @@ -168,7 +168,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! { "expression_statement" => Kind::CallWrapper, "assignment_expression" => Kind::Assignment, - // struct expressions — recurse so env::var() calls inside field + // struct expressions, recurse so env::var() calls inside field // initialisers produce Source-labelled CFG nodes (needed for summaries). "struct_expression" => Kind::Block, "field_initializer_list" => Kind::Block, @@ -287,7 +287,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec { rules } -/// Phase C: auth-as-taint label rules for Rust. Gated by +/// auth-as-taint label rules for Rust. Gated by /// `config.scanner.enable_auth_as_taint`; appended to the runtime rule set /// when the flag is enabled. These declare **sinks** (state-changing or /// outbound operations that should not be reached by an un-checked @@ -343,10 +343,8 @@ pub fn phase_c_auth_rules() -> Vec { case_sensitive: false, }, // ── Sanitizers clearing Cap::UNAUTHORIZED_ID ── - // Ownership and membership guards from the auth_analysis default - // `authorization_check_names` list. Phase C consumes these via - // call-site argument sanitization (see - // `is_auth_as_taint_arg_sanitizer` in ssa_transfer). 
+ // Ownership and membership guards consumed via call-site + // argument sanitization (see `is_auth_as_taint_arg_sanitizer`). RuntimeLabelRule { matchers: vec![ "check_ownership".into(), diff --git a/src/labels/typescript.rs b/src/labels/typescript.rs index 97130676..fdc37e89 100644 --- a/src/labels/typescript.rs +++ b/src/labels/typescript.rs @@ -86,7 +86,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE), case_sensitive: false, }, - // he library — HTML entity encoding + // he library, HTML entity encoding LabelRule { matchers: &["he.encode", "he.escape"], label: DataLabel::Sanitizer(Cap::HTML_ESCAPE), @@ -131,7 +131,7 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SHELL_ESCAPE), case_sensitive: true, }, - // ── Outbound HTTP clients — modeled as destination-aware gated sinks ── + // ── Outbound HTTP clients, modeled as destination-aware gated sinks ── // See GATED_SINKS below; rationale mirrors javascript.rs. LabelRule { matchers: &[ @@ -206,6 +206,14 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // ── Cross-boundary data exfiltration (DATA_EXFIL) ───────────────────── + // See javascript.rs for rationale. `xhr.send(body)` resolves to + // `HttpClient.send` via type-qualified resolution. 
+ LabelRule { + matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, // ─────────── SQL injection sinks ───────────── // Database drivers: mysql, mysql2, pg, better-sqlite3 LabelRule { @@ -283,7 +291,7 @@ pub static GATED_SINKS: &[SinkGate] = &[ dangerous_kwargs: &[], activation: GateActivation::ValueMatch, }, - // ── Outbound HTTP clients (SSRF) — see javascript.rs for rationale ──── + // ── Outbound HTTP clients (SSRF), see javascript.rs for rationale ──── SinkGate { callee_matcher: "fetch", arg_index: 0, @@ -452,6 +460,24 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"], }, }, + // ── Cross-boundary data exfiltration ────────────────────────────────── + // `fetch(input, init)`, payload-bearing fields of `init` (arg 1) flow + // into the request body / headers / json, distinct from SSRF on the URL + // (arg 0). See javascript.rs for full rationale. + SinkGate { + callee_matcher: "fetch", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["body", "headers", "json"], + }, + }, ]; pub static KINDS: Map<&'static str, Kind> = phf_map! { diff --git a/src/lib.rs b/src/lib.rs index 762d1da7..39c486a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,43 +1,14 @@ -//! # Nyx Scanner +//! Multi-language static vulnerability scanner. Tree-sitter parsing, petgraph +//! CFGs, SSA-based dataflow, and cross-file taint analysis with a +//! capability-based sanitizer system. Supports Rust, C, C++, Java, Go, PHP, +//! Python, Ruby, TypeScript, and JavaScript. //! -//! A multi-language static vulnerability scanner. Nyx parses source files with -//! 
[tree-sitter](https://tree-sitter.github.io/), builds intra-procedural -//! control-flow graphs ([petgraph](https://docs.rs/petgraph)), and runs -//! cross-file taint analysis with a capability-based sanitizer system. -//! -//! ## Architecture -//! -//! Nyx uses a **two-pass architecture**: -//! -//! 1. **Pass 1 — Summary extraction**: Parse each file, build a CFG per function, -//! and export a [`summary::FuncSummary`] capturing source/sanitizer/sink capabilities, -//! taint propagation behavior, and callee lists. Summaries are persisted to SQLite. -//! -//! 2. **Pass 2 — Analysis**: Load all summaries into a [`summary::GlobalSummaries`] map, -//! re-parse files, and run taint analysis with cross-file callee resolution. CFG -//! structural analysis checks for auth gaps, unguarded sinks, and resource leaks. -//! -//! ## Four Detector Families -//! -//! - **Taint** ([`taint`]) — Monotone forward dataflow tracking source-to-sink flows -//! - **CFG Structural** ([`cfg_analysis`]) — Dominator-based guard and auth-gap detection -//! - **State Model** ([`state`]) — Resource lifecycle and authentication state lattices -//! - **AST Patterns** ([`patterns`]) — Tree-sitter structural queries per language -//! -//! ## Supported Languages -//! -//! Rust, C, C++, Java, Go, PHP, Python, Ruby, TypeScript, JavaScript. -//! -//! ## Entry Points -//! -//! - [`scan_no_index`] — Run a two-pass scan without indexing (for tests) -//! - [`commands::scan::scan_filesystem`] — Filesystem scan with optional indexing -//! - [`commands::scan::scan_with_index_parallel`] — Index-backed parallel scan -//! -//! ## Documentation -//! -//! See the [`docs/`](https://github.com/elicpeter/nyx/tree/master/docs) directory -//! for user and contributor documentation. +//! The handbook below is embedded verbatim from +//! [`docs/how-it-works.md`](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md). +//! Per-detector documentation lives on the [`taint`], [`cfg_analysis`], +//! 
[`state`], [`patterns`], and [`auth_analysis`] modules. The primary +//! library entry point for tests and embedders is [`scan_no_index`]. +#![doc = include_str!(concat!(env!("OUT_DIR"), "/lib_intro.md"))] pub mod abstract_interp; pub mod ast; diff --git a/src/main.rs b/src/main.rs index 9245470f..100830b0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,7 +69,7 @@ fn main() -> NyxResult<()> { let quiet = config.output.quiet || cli.command.is_structured_output(&config); // Print config note before scanning (human-readable mode only). Pure - // informational commands suppress it too — their output is usually + // informational commands suppress it too, their output is usually // piped or grepped and the preamble is noise. if let Some(note) = config_note.filter(|_| !quiet && !is_info) { eprint!("{note}"); diff --git a/src/output.rs b/src/output.rs index b54a48c2..86e257c9 100644 --- a/src/output.rs +++ b/src/output.rs @@ -47,14 +47,28 @@ fn cfg_rule_description(id: &str) -> Option<&'static str> { } } -/// Look up a human-readable description for any rule ID. -fn rule_description(id: &str) -> &str { - // Strip taint-specific suffix for lookup (e.g. "taint-unsanitised-flow:foo.rs:42" → base) - let base_id = if id.starts_with("taint-") { +/// Normalise a finding's id to the base SARIF rule id. +/// +/// Findings carry source-location-suffixed ids like +/// `"taint-unsanitised-flow (source 12:3)"` so identical (source, sink) +/// pairs can be deduped, but SARIF wants a single rule per category. +/// Cap-specific taint rule classes (e.g. `taint-data-exfiltration`) are +/// preserved as distinct bases so consumers can filter on them rather than +/// folding everything into `taint-unsanitised-flow`. +fn sarif_base_id(id: &str) -> &str { + if id.starts_with("taint-data-exfiltration") { + "taint-data-exfiltration" + } else if id.starts_with("taint-") { "taint-unsanitised-flow" } else { id - }; + } +} + +/// Look up a human-readable description for any rule ID. 
+fn rule_description(id: &str) -> &str { + // Strip taint-specific suffix for lookup (e.g. "taint-unsanitised-flow:foo.rs:42" → base) + let base_id = sarif_base_id(id); if let Some(desc) = PATTERN_DESCRIPTIONS.get(base_id) { return desc; @@ -62,10 +76,13 @@ fn rule_description(id: &str) -> &str { if let Some(desc) = cfg_rule_description(base_id) { return desc; } - if base_id == "taint-unsanitised-flow" { - return "Unsanitised data flows from source to sink"; + match base_id { + "taint-unsanitised-flow" => "Unsanitised data flows from source to sink", + "taint-data-exfiltration" => { + "Sensitive data flows into the payload of an outbound network request" + } + _ => id, } - id } fn severity_to_level(sev: Severity) -> &'static str { @@ -83,11 +100,7 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value { let mut rule_index_map: HashMap = HashMap::new(); for d in diags { - let base = if d.id.starts_with("taint-") { - "taint-unsanitised-flow".to_string() - } else { - d.id.clone() - }; + let base = sarif_base_id(&d.id).to_string(); if !rule_index_map.contains_key(&base) { let idx = rule_ids.len(); rule_index_map.insert(base.clone(), idx); @@ -108,15 +121,11 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value { let results: Vec = diags .iter() .map(|d| { - let base = if d.id.starts_with("taint-") { - "taint-unsanitised-flow" - } else { - &d.id - }; + let base = sarif_base_id(&d.id); let rule_index = rule_index_map[base]; // Make path relative to scan root. Fall back to a deterministic - // sentinel instead of the absolute path — SARIF must not leak + // sentinel instead of the absolute path, SARIF must not leak // home-directory or host-specific prefixes. 
let uri = match Path::new(&d.path).strip_prefix(scan_root) { Ok(p) => p.to_string_lossy().to_string(), @@ -213,17 +222,17 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value { props.insert("relatedFindings".into(), json!(d.alternative_finding_ids)); } - // Engine provenance notes — surface any cap-hit / lowering + // Engine provenance notes, surface any cap-hit / lowering // bail / timeout signals recorded by the analysis engine so // downstream consumers can tell "nothing found" from "engine // stopped looking". // // Three properties are emitted together: - // * `engine_notes` — raw list of {kind, ...} entries - // * `confidence_capped` — true iff any non-informational + // * `engine_notes` , raw list of {kind, ...} entries + // * `confidence_capped` , true iff any non-informational // note is present (back-compat // boolean; drives legacy dashboards) - // * `loss_direction` — worst `LossDirection` across + // * `loss_direction` , worst `LossDirection` across // the list ("under-report", // "over-report", "bail"). Absent // when only informational notes @@ -590,7 +599,7 @@ mod tests { #[test] fn build_sarif_path_outside_scan_root_is_redacted() { - // Absolute host paths leak home-directory information — SARIF must + // Absolute host paths leak home-directory information, SARIF must // substitute a deterministic token when a finding falls outside the // scan root. let mut diag = make_diag("rule-x", Severity::High); diff --git a/src/patterns/ejs.rs b/src/patterns/ejs.rs index 8c7401b6..ff3adf86 100644 --- a/src/patterns/ejs.rs +++ b/src/patterns/ejs.rs @@ -43,7 +43,7 @@ pub fn scan_ejs_file(path: &Path, bytes: &[u8]) -> Vec { // Advance past this match for the next iteration. search_from = abs_end + 2; // skip "%>" - // Skip <%- include(...) %> — EJS partial inclusion, not user-controlled. + // Skip <%- include(...) %>, EJS partial inclusion, not user-controlled. 
if is_include_call(expr) { continue; } diff --git a/src/patterns/java.rs b/src/patterns/java.rs index d1c7037b..0f6218a6 100644 --- a/src/patterns/java.rs +++ b/src/patterns/java.rs @@ -12,7 +12,7 @@ pub const PATTERNS: &[Pattern] = &[ Pattern { id: "java.deser.readobject", description: "ObjectInputStream.readObject() performs unsafe deserialization", - // Match any .readObject() call — the method name is specific enough. + // Match any .readObject() call, the method name is specific enough. query: r#"(method_invocation name: (identifier) @id (#eq? @id "readObject")) @vuln"#, @@ -21,6 +21,46 @@ pub const PATTERNS: &[Pattern] = &[ category: PatternCategory::Deserialization, confidence: Confidence::High, }, + // ── Tier A: SnakeYAML deserialization (CVE-2022-1471) ────────────── + // `new Yaml()` constructed without a `SafeConstructor` argument + // accepts arbitrary YAML tags (`!!javax.script.ScriptEngineManager`, + // `!!java.net.URLClassLoader`, …) and instantiates any class via + // reflection. SnakeYAML 2.0 swapped the default to SafeConstructor + // but pre-2.0 deployments stay vulnerable until call sites are + // patched. We match the empty-arg form `new Yaml()` only, so the + // explicit-SafeConstructor remediation form + // `new Yaml(new SafeConstructor(new LoaderOptions()))` is silent. + Pattern { + id: "java.deser.snakeyaml_unsafe_constructor", + description: "new Yaml() without SafeConstructor accepts arbitrary class tags (CVE-2022-1471)", + query: r#"(object_creation_expression + type: (type_identifier) @t (#eq? @t "Yaml") + arguments: (argument_list) @args (#eq? 
@args "()")) + @vuln"#, + severity: Severity::High, + tier: PatternTier::A, + category: PatternCategory::Deserialization, + confidence: Confidence::High, + }, + // ── Tier A: Apache Commons Text Text4Shell (CVE-2022-42889) ──────── + // `StringSubstitutor.createInterpolator()` enables `script:`, + // `dns:`, and `url:` lookups by default, `${script:js:…}` + // evaluates JavaScript via the JSR-223 ScriptEngineManager. The + // factory call is itself the structural bug; the recommended app- + // side mitigation builds a `StringSubstitutor` directly with a + // restricted lookup map. + Pattern { + id: "java.code_exec.text4shell_interpolator", + description: "StringSubstitutor.createInterpolator() enables script:/dns:/url: evaluation (CVE-2022-42889)", + query: r#"(method_invocation + object: (identifier) @c (#eq? @c "StringSubstitutor") + name: (identifier) @id (#eq? @id "createInterpolator")) + @vuln"#, + severity: Severity::High, + tier: PatternTier::A, + category: PatternCategory::CodeExec, + confidence: Confidence::High, + }, // ── Tier A: Command execution ────────────────────────────────────── Pattern { id: "java.cmdi.runtime_exec", diff --git a/src/patterns/mod.rs b/src/patterns/mod.rs index a9b8e44c..dc30f839 100644 --- a/src/patterns/mod.rs +++ b/src/patterns/mod.rs @@ -1,42 +1,4 @@ -//! # AST Pattern Conventions -//! -//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs. -//! -//! ## ID format -//! -//! `..` — e.g. `java.deser.readobject`, `py.cmdi.os_system`. -//! -//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`. -//! -//! ## Tiers -//! -//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`). -//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated -//! arg, format-string with variable first arg). -//! -//! ## Severity -//! -//! * **High** — command exec, deserialization, banned C functions. -//! 
* **Medium** — SQL concat, reflection, XSS sinks, casts. -//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`). -//! -//! Note: the default `min_severity` filter skips Low patterns; they only appear when -//! the user explicitly lowers the threshold. -//! -//! ## No-duplicate rule -//! -//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a -//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but -//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`). -//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location. -//! -//! ## Adding a new pattern -//! -//! 1. Pick the language file under `src/patterns/.rs`. -//! 2. Choose tier, category, severity per the rules above. -//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`. -//! 4. Add a snippet to `tests/fixtures/patterns//positive.`. -//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`. +#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))] pub mod c; pub mod cpp; @@ -68,7 +30,7 @@ pub enum Severity { impl Severity { /// Bracketed, colored, fixed-width tag for aligned console output. /// - /// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` — always 8 visible characters + /// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"`, always 8 visible characters /// so the column after the tag lines up regardless of severity. #[allow(dead_code)] // public API for lib consumers pub fn colored_tag(self) -> String { @@ -123,9 +85,9 @@ impl FromStr for Severity { /// A parsed severity filter expression. 
/// /// Supports three forms: -/// - Single level: `"HIGH"` — matches only that level -/// - Comma list: `"HIGH,MEDIUM"` — matches any listed level -/// - Threshold: `">=MEDIUM"` — matches that level and above +/// - Single level: `"HIGH"`, matches only that level +/// - Comma list: `"HIGH,MEDIUM"`, matches any listed level +/// - Threshold: `">=MEDIUM"`, matches that level and above /// /// Parsing is case-insensitive and tolerates whitespace around tokens. #[derive(Debug, Clone, PartialEq, Eq)] @@ -242,7 +204,7 @@ impl PatternCategory { /// One AST pattern with a tree-sitter query and meta-data. #[derive(Debug, Clone, Serialize, PartialEq)] pub struct Pattern { - /// Unique identifier — `..` preferred. + /// Unique identifier, `..` preferred. pub id: &'static str, /// Human-readable explanation. pub description: &'static str, diff --git a/src/patterns/python.rs b/src/patterns/python.rs index 74c0c135..8364b58b 100644 --- a/src/patterns/python.rs +++ b/src/patterns/python.rs @@ -5,7 +5,7 @@ use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity}; /// /// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`, /// and `cursor.execute`. AST patterns here add coverage for **deserialization**, -/// **subprocess shell=True** (Tier B — taint doesn't check keyword args), and +/// **subprocess shell=True** (Tier B, taint doesn't check keyword args), and /// **code execution** sinks that taint cannot structurally verify. pub const PATTERNS: &[Pattern] = &[ // ── Tier A: Code execution ───────────────────────────────────────── @@ -121,14 +121,45 @@ pub const PATTERNS: &[Pattern] = &[ confidence: Confidence::High, }, // ── Tier B: SQL injection (format/concat heuristic) ──────────────── + // Catches both `cursor.execute(query + user)` (binary_operator concat) + // and `cursor.execute(f"... {user} ...")` (f-string with interpolation). 
+ // f-strings appear as a `string` node with `interpolation` children in + // tree-sitter-python; the alternation lets the same pattern cover both + // the historical % / + concat shapes and the modern f-string SQLi shape + // that surfaces in CVE-2025-24793 (snowflake-connector-python), + // CVE-2025-69662 (geopandas), and dozens of similar cursor.execute + // call sites across the corpus. Pattern { id: "py.sqli.execute_format", - description: "cursor.execute with string concatenation risks SQL injection", + description: "cursor.execute with string concatenation or f-string risks SQL injection", query: r#"(call function: (attribute attribute: (identifier) @fn (#eq? @fn "execute")) arguments: (argument_list - (binary_operator) @arg)) + [(binary_operator) + (string (interpolation))] @arg)) + @vuln"#, + severity: Severity::Medium, + tier: PatternTier::B, + category: PatternCategory::SqlInjection, + confidence: Confidence::Medium, + }, + // SQLAlchemy `text()`, same Tier B heuristic + // applied to the SQLAlchemy raw-SQL constructor. Catches the + // CVE-2025-69662 (geopandas) shape: + // connection.execute(text(f"SELECT … '{geom_name}' …")) + // where the f-string interpolation is the injection point and the + // surrounding `connection.execute` would otherwise hide the unsafe + // construction from the simple execute_format pattern. + Pattern { + id: "py.sqli.text_format", + description: "sqlalchemy text() with f-string or string concat risks SQL injection", + query: r#"(call + function: [(identifier) @fn (attribute attribute: (identifier) @fn)] + (#eq? @fn "text") + arguments: (argument_list + [(binary_operator) + (string (interpolation))] @arg)) @vuln"#, severity: Severity::Medium, tier: PatternTier::B, diff --git a/src/pointer/analysis.rs b/src/pointer/analysis.rs index 265b9b0d..b7364441 100644 --- a/src/pointer/analysis.rs +++ b/src/pointer/analysis.rs @@ -1,33 +1,8 @@ //! Field-sensitive Steensgaard points-to analysis driver. //! -//! 
Walks the SSA body once per fixpoint pass, emitting equality -//! constraints for each instruction. The constraints are resolved -//! via standard union-find with path compression and union-by-rank; -//! propagation through `FieldProj` requires a worklist because the -//! representative of a receiver may change after the field projection -//! is first visited. -//! -//! The analysis is flow-insensitive (Steensgaard) — every assignment -//! that joins two values unifies their points-to sets across the -//! entire body. Field sensitivity is recovered by representing each -//! `obj.f` access as a structural [`AbsLoc::Field`] location with a -//! distinct identity per `(parent_loc, field)` pair. -//! -//! ## Phase 1 scope -//! -//! - Field READS via [`SsaOp::FieldProj`] — sufficient for Phase 2's -//! resource-lifecycle attribution fix (the gin/`context.go` proxy -//! acquire FP). -//! - Param/SelfParam → fresh caller-relative locations. -//! - Phi/Assign → Steensgaard unification. -//! - Call results → fresh allocation-site locations (one per call -//! instruction, keyed by SSA value). -//! - Source/Const/Nop/Undef → empty (scalars don't reach the heap). -//! -//! Field WRITES land in Phase 3 alongside the cross-method field-flow -//! consumer; they require careful handling of the synthetic -//! base-update `Assign` instructions emitted by SSA lowering and are -//! not load-bearing for Phase 1's "no behaviour change" gate. +//! Flow-insensitive union-find over SSA values; field sensitivity comes +//! from representing each `obj.f` access as a structural +//! [`AbsLoc::Field`] keyed by `(parent_loc, field)`. use std::collections::HashMap; @@ -41,13 +16,9 @@ use super::domain::{AbsLoc, LOC_TOP, LocId, LocInterner, PointsToSet, PtrProxyHi /// in a small number of passes for any well-formed body. const MAX_FIXPOINT_ITERS: usize = 8; -/// Pointer-Phase 4: container-read callees that pull a single element -/// out of a collection without indexing through a key. 
Recognised -/// across the languages nyx supports as a cross-cut surface — exact -/// per-language specialisation is intentionally skipped for the -/// minimum-viable rollout. The receiver-side projection through -/// [`FieldId::ELEM`] is conservative: a callee not in this list still -/// gets the existing fresh-alloc behaviour and does not lose precision. +/// Container-read callees that pull a single element out of a +/// collection without a key. Cross-language; non-listed callees still +/// get fresh-alloc behaviour, so the list is conservative. fn is_container_read_callee(callee: &str) -> bool { let bare = match callee.rsplit_once('.') { Some((_, m)) => m, @@ -67,19 +38,12 @@ fn is_container_read_callee(callee: &str) -> bool { | "dequeue" | "remove" | "popleft" - // Pointer-Phase 6 / W5: synthetic callee emitted by CFG - // lowering for subscript / index-expression reads - // (`arr[i]`, `map[k]`, `cmds[0]`). + // synthetic callee for subscript reads (`arr[i]`, `map[k]`) | "__index_get__" ) } -/// Pointer-Phase 4: container-write callees that store an element into -/// a collection. Mirror of [`is_container_read_callee`]. The pointer -/// analysis itself doesn't track stored values (the Steensgaard -/// receiver/result aliasing already covers the common cases), but the -/// helper is exposed so the taint engine's ELEM-cell write hook can -/// share a single classifier with the points-to pass. +/// Container-write callees, mirror of [`is_container_read_callee`]. pub fn is_container_write_callee(callee: &str) -> bool { let bare = match callee.rsplit_once('.') { Some((_, m)) => m, @@ -97,37 +61,34 @@ pub fn is_container_write_callee(callee: &str) -> bool { | "insert" | "enqueue" | "unshift" - // Pointer-Phase 6 / W5: synthetic callee emitted by CFG - // lowering for subscript / index-expression writes - // (`arr[i] = v`, `map[k] = v`). 
+ // synthetic callee for subscript writes (`arr[i] = v`, `map[k] = v`) | "__index_set__" ) } -/// Pointer-Phase 4: callee-name aware container-read recognition. -/// Public for unit tests + reuse from the taint engine. +/// Public re-export of [`is_container_read_callee`] for the taint engine. pub fn is_container_read_callee_pub(callee: &str) -> bool { is_container_read_callee(callee) } -/// Pointer-Phase 5: derive a [`crate::summary::points_to::FieldPointsToSummary`] -/// from per-body points-to facts. +/// Derive a [`crate::summary::points_to::FieldPointsToSummary`] from +/// per-body points-to facts. /// /// Records two channels: /// -/// 1. **Reads** — walks every [`SsaOp::FieldProj`] in the body; for +/// 1. **Reads**, walks every [`SsaOp::FieldProj`] in the body; for /// each `loc ∈ pt(receiver)` that resolves to a parameter /// location ([`AbsLoc::Param`] / [`AbsLoc::SelfParam`]), records /// the projected field name into the summary's /// `param_field_reads`. -/// 2. **Writes** — walks the body's [`SsaBody::field_writes`] side- +/// 2. **Writes**, walks the body's [`SsaBody::field_writes`] side- /// table (populated by SSA lowering's W1 synth-Assign hook) and /// records each `(receiver, FieldId)` pair against the receiver's /// pt set the same way reads are recorded. /// /// Field name resolution goes through the body's /// [`SsaBody::field_interner`] because [`crate::ssa::ir::FieldId`] -/// is body-local — names are the only stable cross-file identity. +/// is body-local, names are the only stable cross-file identity. /// /// Receiver (`SelfParam`) reads/writes are recorded under the /// [`u32::MAX`] sentinel parameter index, mirroring the convention in @@ -226,7 +187,7 @@ pub fn extract_field_points_to( /// Per-body points-to result. /// /// Owns the body-local [`LocInterner`] and a flat `SsaValue → PointsToSet` -/// table. The table is dense — one slot per SSA value — so lookups +/// table. 
The table is dense, one slot per SSA value, so lookups /// are O(1). #[derive(Clone, Debug)] pub struct PointsToFacts { @@ -242,7 +203,7 @@ pub struct PointsToFacts { } impl PointsToFacts { - /// Empty result — every value points to nothing. Used by callers + /// Empty result, every value points to nothing. Used by callers /// that need a "no facts" placeholder when the analysis is /// disabled or the body could not be analysed. pub fn empty(body: BodyId) -> Self { @@ -288,11 +249,6 @@ impl PointsToFacts { /// [`PtrProxyHint::FieldOnly`] iff every member is an /// [`AbsLoc::Field`]. /// - /// Phase 2 consumer: the resource-lifecycle proxy attribution in - /// `state::transfer.rs` uses `FieldOnly` to recognise locals like - /// `m` in `m := c.mu` and route the proxy entry through - /// `chain_proxies` instead of marking the local as a leakable - /// SymbolId-keyed resource. pub fn proxy_hint(&self, v: SsaValue) -> PtrProxyHint { let set = self.pt(v); if set.is_empty() || set.is_top() { @@ -310,7 +266,7 @@ impl PointsToFacts { /// Build a `var_name → PtrProxyHint` map by scanning the body's /// value defs for the latest definition of each named variable. /// Names that resolve to no variable, or whose latest definition is - /// `Other`, are omitted — only `FieldOnly` entries appear. + /// `Other`, are omitted, only `FieldOnly` entries appear. /// /// Iterates over [`SsaBody::value_defs`] in *reverse* order so the /// last (post-renaming) SSA definition for each name wins. Used by @@ -340,13 +296,13 @@ impl PointsToFacts { /// Analyse a single body and return its [`PointsToFacts`]. /// /// `body_id` is used as the disambiguator inside the abstract -/// locations — supplying a stable id (e.g. the file's +/// locations, supplying a stable id (e.g. the file's /// `BodyMeta.id`) lets callers compare facts emitted by different /// bodies in the same file. 
pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts { let mut state = AnalysisState::new(body_id, body.num_values()); - // Pass 1 — emit constraints from ops that don't depend on + // Pass 1, emit constraints from ops that don't depend on // representative resolution (Param, SelfParam, Call result, // etc.). These produce the "leaf" points-to sets. for block in &body.blocks { @@ -355,7 +311,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts { } } - // Pass 2+ — propagate through field projections, phis, and + // Pass 2+, propagate through field projections, phis, and // assignments until a fixpoint. Field projections need iteration // because a `FieldProj` whose receiver's representative changes // (via a later unification) must re-emit its constraint with the @@ -377,7 +333,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts { // ── Constraint solver internals ──────────────────────────────────── -/// Mutable analysis state — the interner, points-to table, and +/// Mutable analysis state, the interner, points-to table, and /// union-find arrays. Lives inside `analyse_body` only. struct AnalysisState { /// Body-id forwarded to [`PointsToFacts::body`] when the analysis @@ -457,7 +413,7 @@ impl AnalysisState { /// `pt(rep_a) ∪= pt(rep_b)`. Caller is responsible for passing /// already-resolved representatives if it wants Steensgaard - /// unification — see `union` for that. + /// unification, see `union` for that. fn copy_pt(&mut self, dst: u32, src: u32) -> bool { let dr = self.find(dst); let sr = self.find(src); @@ -486,7 +442,7 @@ impl AnalysisState { self.add_loc(v, loc); } SsaOp::CatchParam => { - // Exception bindings come from the runtime — model as + // Exception bindings come from the runtime, model as // an opaque allocation-site keyed by the SSA value. 
let loc = self.interner.intern_alloc(body_id, v); self.add_loc(v, loc); @@ -494,14 +450,14 @@ impl AnalysisState { SsaOp::Call { callee, receiver, .. } => { - // Pointer-Phase 4: container element retrieval ops + // container element retrieval ops // (`shift`, `pop`, `peek`, `front`, …) project through // the abstract `Field(pt(receiver), ELEM)` cell so // per-element taint flows independently of the SSA // value referencing the container. The receiver's // points-to set may not be fully resolved on this // pass, so we *also* add a fresh allocation site as a - // fallback — the fixpoint pass below absorbs the + // fallback, the fixpoint pass below absorbs the // proper Field projection once the receiver's set // converges. let loc = self.interner.intern_alloc(body_id, v); @@ -538,7 +494,7 @@ impl AnalysisState { } } SsaOp::FieldProj { .. } => { - // Resolved during the fixpoint pass — see + // Resolved during the fixpoint pass, see // `propagate_inst`. } SsaOp::Source | SsaOp::Const(_) | SsaOp::Nop | SsaOp::Undef => { @@ -548,7 +504,7 @@ impl AnalysisState { } /// Fixpoint-pass transfer. Re-runs constraints whose result - /// depends on the current set of representatives — i.e. field + /// depends on the current set of representatives, i.e. field /// projections, phis, and assignments may need to absorb new /// members emitted after the first pass. Returns `true` when /// any points-to set changed. @@ -608,7 +564,7 @@ impl AnalysisState { } /// Materialise the dense `SsaValue → PointsToSet` table. Each - /// value's set is the set of its representative — values in the + /// value's set is the set of its representative, values in the /// same Steensgaard class share the same set. 
fn into_facts(mut self) -> PointsToFacts { let mut by_value = Vec::with_capacity(self.pt.len()); @@ -714,7 +670,7 @@ mod tests { } } - /// `let c = self; let m = c.mu;` — pt(m) must be `{Field(SelfParam, mu)}`, + /// `let c = self; let m = c.mu;` , pt(m) must be `{Field(SelfParam, mu)}`, /// distinct from pt(c) = `{SelfParam}`. #[test] fn field_subobject_distinct_from_receiver() { @@ -762,7 +718,7 @@ mod tests { } } - /// `let y = x;` — y and x share the same points-to set. + /// `let y = x;` , y and x share the same points-to set. #[test] fn copy_propagation_unifies() { let mut b = BodyBuilder::new(); @@ -783,7 +739,7 @@ mod tests { assert!(!facts.pt(y).is_empty()); } - /// `if (cond) z = a; else z = b;` — phi at the merge unifies + /// `if (cond) z = a; else z = b;` , phi at the merge unifies /// `pt(z)` with both `pt(a)` and `pt(b)`. #[test] fn phi_unifies_branches() { @@ -793,7 +749,7 @@ mod tests { let b_v = b.fresh(Some("b")); b.emit(b_v, SsaOp::Param { index: 1 }, Some("b")); - // Phi(0: a, 0: b) — predecessor block ids are placeholders. + // Phi(0: a, 0: b), predecessor block ids are placeholders. let z = b.fresh(Some("z")); b.emit( z, @@ -812,7 +768,7 @@ mod tests { assert_eq!(pt_z.len(), 2); } - /// `node = node.next;` — the `FieldProj` self-cycle must + /// `node = node.next;`, the `FieldProj` self-cycle must /// terminate via the union-find / depth bound, not loop. #[test] fn self_referential_field_chain_terminates() { @@ -847,7 +803,7 @@ mod tests { let facts = analyse_body(&body, body_id()); let pt_node = facts.pt(node); // Either we converge to a non-empty set including a Field chain, - // or we saturate to Top — either is a valid termination outcome. + // or we saturate to Top, either is a valid termination outcome. 
assert!(!pt_node.is_empty()); } @@ -864,7 +820,7 @@ mod tests { assert!(facts.pt(s).is_empty()); } - /// `Call` produces a fresh allocation-site location for its result — + /// `Call` produces a fresh allocation-site location for its result , /// distinct from its arguments. #[test] fn call_result_is_fresh_alloc() { @@ -901,7 +857,7 @@ mod tests { /// Driver smoke-test: the analysis runs on an SsaBody produced by /// the real lowering pipeline without panicking. This pins the - /// "no behaviour change" gate — analysis runs to completion on + /// "no behaviour change" gate, analysis runs to completion on /// representative input. #[test] fn smoke_runs_on_lowered_body() { @@ -929,12 +885,10 @@ mod tests { assert!(facts.is_trivial()); assert_eq!(facts.len(), 0); - // Suppress unused-import warning for `Cfg` — it's exposed for - // future Phase 1.b tests that need a real CFG. let _ = std::marker::PhantomData::; } - /// Pointer-Phase 2 contract pin: a value defined by a `FieldProj` + /// Contract pin: a value defined by a `FieldProj` /// classifies as [`PtrProxyHint::FieldOnly`]. Consumed by the /// resource-lifecycle pass to recognise field-aliased locals. #[test] @@ -965,7 +919,7 @@ mod tests { assert_eq!(facts.proxy_hint(c), crate::pointer::PtrProxyHint::Other); } - /// Pointer-Phase 4: container-read callee classifier covers a + /// container-read callee classifier covers a /// representative sample across nyx's languages. Pinned because /// the taint engine relies on the same classifier. #[test] @@ -992,7 +946,7 @@ mod tests { } } - /// Pointer-Phase 4: container-write classifier (mirror). + /// container-write classifier (mirror). 
#[test] fn container_write_callee_classifier() { for c in [ @@ -1014,7 +968,7 @@ mod tests { } } - /// Pointer-Phase 4: a `Call("shift", receiver=container)` projects + /// a `Call("shift", receiver=container)` projects /// `Field(pt(container), ELEM)` into the result, alongside the /// fresh allocation site that fall-back paths still emit. #[test] @@ -1023,7 +977,7 @@ mod tests { // `arr` is the parameter container. let arr = b.fresh(Some("arr")); b.emit(arr, SsaOp::Param { index: 0 }, Some("arr")); - // `e := arr.shift()` — container read. + // `e := arr.shift()`, container read. let e = b.fresh(Some("e")); b.emit( e, @@ -1055,7 +1009,7 @@ mod tests { ); } - /// Pointer-Phase 5: `extract_field_points_to` records a field + /// `extract_field_points_to` records a field /// READ on the parameter index when a `FieldProj` traces back to /// an `AbsLoc::Param`. #[test] @@ -1064,7 +1018,7 @@ mod tests { // `obj` is parameter 0. let obj = b.fresh(Some("obj")); b.emit(obj, SsaOp::Param { index: 0 }, Some("obj")); - // `let n = obj.name;` — field projection from a param. + // `let n = obj.name;`, field projection from a param. let name_field = b.intern_field("name"); let n = b.fresh(Some("n")); b.emit( @@ -1088,7 +1042,7 @@ mod tests { assert!(entry.1.iter().any(|s| s == "name")); } - /// Pointer-Phase 5 / W3: `extract_field_points_to` records field + /// `extract_field_points_to` records field /// WRITES from the body's `field_writes` side-table populated by /// SSA lowering. A synth Assign whose receiver traces back to /// `AbsLoc::Param` produces a `param_field_writes` entry. @@ -1124,7 +1078,7 @@ mod tests { ); } - /// Pointer-Phase 5 / W3: writes through the receiver (`this.f = + /// writes through the receiver (`this.f = /// rhs`) are recorded under the same `u32::MAX` sentinel as /// reads. 
#[test] @@ -1151,7 +1105,7 @@ mod tests { assert!(entry.1.iter().any(|s| s == "cache")); } - /// Pointer-Phase 5 / W3: container-element writes (`` + /// container-element writes (`` /// marker) flow through the same channel as named-field writes /// when the synth Assign carries `FieldId::ELEM`. #[test] @@ -1180,7 +1134,7 @@ mod tests { ); } - /// Pointer-Phase 5: receiver projections are recorded under the + /// receiver projections are recorded under the /// `u32::MAX` sentinel parameter index (mirror of /// `SsaFuncSummary::receiver_to_*`). #[test] @@ -1233,7 +1187,7 @@ mod tests { assert!(is_container_write_callee("arr.__index_set__")); } - /// W5: regression guard — neither synth name should match the + /// W5: regression guard, neither synth name should match the /// opposite predicate, otherwise the W2 read/write hooks would /// double-fire on the same call. #[test] @@ -1245,10 +1199,10 @@ mod tests { #[test] fn name_proxy_hints_collects_field_only_locals() { let mut b = BodyBuilder::new(); - // `c` is the receiver — root location, hint=Other. + // `c` is the receiver, root location, hint=Other. let c = b.fresh(Some("c")); b.emit(c, SsaOp::SelfParam, Some("c")); - // `m := c.mu` — field projection, hint=FieldOnly. + // `m := c.mu`, field projection, hint=FieldOnly. let mu = b.intern_field("mu"); let m = b.fresh(Some("m")); b.emit( diff --git a/src/pointer/domain.rs b/src/pointer/domain.rs index 4630ef57..ab1371d6 100644 --- a/src/pointer/domain.rs +++ b/src/pointer/domain.rs @@ -2,7 +2,7 @@ //! //! Locations are interned to compact `LocId(u32)` handles so the //! union-find resolver can operate on dense integer keys. Field -//! locations are keyed structurally by `(parent_loc_id, field_id)` — +//! locations are keyed structurally by `(parent_loc_id, field_id)` , //! interning a `Field(parent, f)` always returns the same `LocId` no //! matter how many times the same `(parent, f)` pair is requested. 
@@ -29,14 +29,14 @@ pub const MAX_POINTSTO_MEMBERS: usize = 16; /// Compact handle for an interned [`AbsLoc`]. /// /// All abstract locations referenced by a single body share one -/// [`LocInterner`] — `LocId`s are only meaningful relative to that +/// [`LocInterner`], `LocId`s are only meaningful relative to that /// interner. IDs are assigned densely from 0 and are stable for the /// lifetime of the interner so the union-find can index parent / rank /// arrays directly. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct LocId(pub u32); -/// Sentinel "anywhere" location. Always `LocId(0)` — the interner +/// Sentinel "anywhere" location. Always `LocId(0)`, the interner /// reserves the first slot at construction so callers can compare /// against it cheaply. pub const LOC_TOP: LocId = LocId(0); @@ -48,7 +48,7 @@ pub const LOC_TOP: LocId = LocId(0); /// is exceeded the chain folds to [`AbsLoc::Top`]. #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum AbsLoc { - /// "Anywhere" — the over-approximation used when precision is + /// "Anywhere", the over-approximation used when precision is /// unrecoverable (e.g. a value sourced from outside the analysed /// body, or a points-to set that exceeded the cap). Top, @@ -60,7 +60,7 @@ pub enum AbsLoc { /// file. The interned `u32` is the `SsaValue.0` of the call / /// constructor instruction. Alloc(BodyId, u32), - /// Function parameter — the abstract identity of the value + /// Function parameter, the abstract identity of the value /// supplied by the caller for parameter `index`. The receiver /// (`self` / `this`) uses [`AbsLoc::SelfParam`] instead. Param(BodyId, usize), @@ -69,7 +69,7 @@ pub enum AbsLoc { /// receiver" sentinel index. SelfParam(BodyId), /// Heap field of a parent location: `parent.f`. `parent` is - /// itself a [`LocId`] — chains of field accesses produce nested + /// itself a [`LocId`], chains of field accesses produce nested /// `Field` locations. 
Depth is bounded by [`MAX_FIELD_DEPTH`]. Field { parent: LocId, field: FieldId }, } @@ -130,7 +130,7 @@ impl LocInterner { } /// Resolve a [`LocId`] back to its [`AbsLoc`]. Panics on out-of- - /// range ids — only ids the interner produced are valid. + /// range ids, only ids the interner produced are valid. #[inline] pub fn resolve(&self, id: LocId) -> &AbsLoc { &self.locs[id.0 as usize] @@ -202,7 +202,7 @@ impl LocInterner { } /// Coarse classification of a value's points-to set, used by consumers -/// (Phase 2: resource lifecycle) that don't need full set membership but +/// (Hierarchy: resource lifecycle) that don't need full set membership but /// do need to know "is this value's heap identity a *field* of some /// other value, or does it stand on its own?". /// @@ -213,7 +213,7 @@ impl LocInterner { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum PtrProxyHint { /// Every member of the points-to set is an [`AbsLoc::Field`]. The - /// value is a sub-object alias — e.g. `m` in `m := c.mu`. + /// value is a sub-object alias, e.g. `m` in `m := c.mu`. FieldOnly, /// Anything else: the set is empty, contains a root location /// ([`AbsLoc::SelfParam`] / [`AbsLoc::Param`] / [`AbsLoc::Alloc`]), @@ -242,7 +242,7 @@ impl Default for PointsToSet { } impl PointsToSet { - /// Empty set — the value points to nothing tracked by the + /// Empty set, the value points to nothing tracked by the /// analysis (e.g. a scalar constant). pub fn empty() -> Self { Self { @@ -257,7 +257,7 @@ impl PointsToSet { Self { ids } } - /// `{Top}` — the universal over-approximation. + /// `{Top}`, the universal over-approximation. pub fn top() -> Self { Self::singleton(LOC_TOP) } @@ -313,7 +313,7 @@ impl PointsToSet { } } - /// Set-union, in place. Returns `true` when `self` changed — + /// Set-union, in place. Returns `true` when `self` changed , /// the constraint solver uses the bit to decide whether the /// containing equivalence class needs another pass. 
pub fn union_in_place(&mut self, other: &PointsToSet) -> bool { diff --git a/src/pointer/mod.rs b/src/pointer/mod.rs index 93fd3236..78a83e73 100644 --- a/src/pointer/mod.rs +++ b/src/pointer/mod.rs @@ -1,24 +1,12 @@ //! Field-sensitive Steensgaard alias / points-to analysis. //! -//! Sibling pass to [`crate::ssa::heap`]. Where `heap.rs` tracks per-value -//! container identity for taint propagation through container element -//! abstractions, this module tracks **field-sensitive** points-to so the -//! engine can distinguish a receiver from one of its sub-fields: -//! -//! - `c.mu.Lock()` — the lock is acquired on `Field(c, mu)`, not on `c` -//! itself. Without this distinction the resource-lifecycle pass -//! mis-attributes the acquire to the receiver and emits a spurious -//! "leakable resource" finding (the gin / `context.go` FP class). -//! - Cross-method field flow — method A writes `this.cache`, method B -//! reads `this.cache`; both observe a shared abstract location -//! `Field(SelfParam, cache)` only when fields have a stable identity -//! independent of the parent value. -//! -//! Phase 1 of the rollout (this commit) ships the analysis but no -//! consumer. Behaviour is unchanged whether `NYX_POINTER_ANALYSIS=1` is -//! set or not — the analysis is opt-in and only computed when callers -//! ask for it. Phase 2 (resource lifecycle) and Phase 3 (taint engine) -//! will start consuming the resulting facts. +//! Sibling to [`crate::ssa::heap`]: where heap tracks per-value +//! container identity for element abstractions, this module tracks +//! field-sensitive points-to so the engine can distinguish a receiver +//! from a sub-field. `c.mu.Lock()` acquires on `Field(c, mu)`, not `c`, +//! so the resource-lifecycle pass doesn't mis-attribute the acquire. +//! Cross-method field flow (method A writes `this.cache`, method B +//! reads it) observes the shared `Field(SelfParam, cache)` location. 
pub mod analysis; pub mod domain; @@ -29,12 +17,8 @@ pub use analysis::{ }; pub use domain::{AbsLoc, LocId, LocInterner, PointsToSet, PtrProxyHint}; -/// Returns whether the field-sensitive pointer analysis is enabled at runtime. -/// -/// Default: enabled (post-Phase-6 flip on 2026-04-26). Set -/// `NYX_POINTER_ANALYSIS=0` (or `false`) to disable for one release -/// cycle so customer scans can compare baselines. The env-var -/// override is removed entirely in the next release. +/// Returns whether the field-sensitive pointer analysis is enabled. +/// Set `NYX_POINTER_ANALYSIS=0` (or `false`) to disable. #[inline] pub fn is_enabled() -> bool { !matches!( diff --git a/src/rank.rs b/src/rank.rs index 8c49fae6..217d4fc4 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -97,14 +97,14 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank { // direction of precision loss is classified by // `EngineNote::direction()` and drives a bounded penalty: // - // * `Bail` — analysis aborted on this body → -8.0 - // * `OverReport` — widening may have produced a false positive → -8.0 - // * `UnderReport` — fixpoint was cut short but this finding is + // * `Bail` , analysis aborted on this body → -8.0 + // * `OverReport` , widening may have produced a false positive → -8.0 + // * `UnderReport`, fixpoint was cut short but this finding is // still a real flow → -3.0 - // * `Informational` — no penalty (cache reuse etc.) + // * `Informational`, no penalty (cache reuse etc.) // - // The penalty is the *worst* direction across all attached notes — - // not additive — so a body with ten `OriginsTruncated` notes is not + // The penalty is the *worst* direction across all attached notes , + // not additive, so a body with ten `OriginsTruncated` notes is not // ranked below a body with one `ParseTimeout`. 
Magnitudes are // chosen so that `High + capped` (60 − 8 = 52) still exceeds // `Medium + taint + UserInput` (30 + 10 + 6 = 46), preserving the @@ -125,7 +125,7 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank { /// /// `None` when the finding has no evidence struct, no engine notes, or /// only informational notes. Uses `worst_direction` so the penalty is -/// the single most credibility-damaging direction present — adding more +/// the single most credibility-damaging direction present, adding more /// notes of the same direction does not compound the penalty. struct CompletenessPenalty { value: f64, @@ -289,16 +289,16 @@ fn source_kind_priority(source_value: &str) -> f64 { // Strong user-input signals 6.0 } else if lower.contains("env") || lower.contains("var(") || lower.contains("getenv") { - // Environment / config — still attacker-controllable in many deployments + // Environment / config, still attacker-controllable in many deployments 5.0 } else if lower.contains("read") || lower.contains("file") || lower.contains("open") { - // File system — needs indirect vector + // File system, needs indirect vector 3.0 } else if lower.contains("query") || lower.contains("fetch") || lower.contains("select") { - // Database — needs prior injection + // Database, needs prior injection 2.0 } else { - // Unknown / unrecognised — treat as moderately exploitable + // Unknown / unrecognised, treat as moderately exploitable 4.0 } } @@ -931,7 +931,7 @@ mod tests { #[test] fn completeness_penalty_is_not_additive_across_notes() { - // Ten OriginsTruncated notes must produce the same penalty as one — + // Ten OriginsTruncated notes must produce the same penalty as one , // the penalty reflects the worst direction, not a count. let mut d_many = clean_diag_with_evidence(); let many = (0..10) diff --git a/src/rust_resolve.rs b/src/rust_resolve.rs index 6c48f7b7..55a914fb 100644 --- a/src/rust_resolve.rs +++ b/src/rust_resolve.rs @@ -3,11 +3,11 @@ //! 
This module is entirely Rust-flavored helpers for the cross-file call graph. //! Other languages do not need it. The two pieces are: //! -//! * [`derive_module_path`] — given a Rust source file path and an optional +//! * [`derive_module_path`], given a Rust source file path and an optional //! crate root, produce its canonical crate-relative module path //! (`src/foo/bar.rs` → `"foo::bar"`, `src/lib.rs` → `""`). //! -//! * [`parse_rust_use_map`] — walk the top-level `use_declaration` nodes of a +//! * [`parse_rust_use_map`], walk the top-level `use_declaration` nodes of a //! parsed tree and produce a [`RustUseMap`] mapping local aliases to fully //! qualified paths plus a list of wildcard imports. //! @@ -27,7 +27,7 @@ //! * Macro-expanded `use` statements //! * `pub use` re-exports across modules //! * `extern crate alias_name;` -//! * Self-prefixed imports (`use self::sub::foo;`) — treated as `self::sub::foo` +//! * Self-prefixed imports (`use self::sub::foo;`), treated as `self::sub::foo` //! //! These are flagged in the final pass-1 telemetry but do not block resolution. @@ -102,7 +102,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option< let mut segments: Vec<&str> = rel.iter().filter_map(|s| s.to_str()).collect(); // Strip a leading `src` directory if present. Files outside `src/` (e.g. - // tests, examples, build.rs) get a `None` here — we do not have a stable + // tests, examples, build.rs) get a `None` here, we do not have a stable // module path for them and resolution should fall back to file-based. match segments.first().copied() { Some("src") => { @@ -145,7 +145,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option< /// [`RustUseMap`]. /// /// The walk only inspects direct children of the source root. 
Nested `use`s -/// inside functions or impls are deliberately skipped — their scope is local +/// inside functions or impls are deliberately skipped, their scope is local /// and does not influence the cross-file call graph at the module level. pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap { let mut map = RustUseMap::default(); @@ -160,7 +160,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap { Some(n) => n, None => { // tree-sitter-rust 0.24 sometimes exposes the body as a named - // child instead of a field — fall back to the first named child. + // child instead of a field, fall back to the first named child. match child.named_child(0) { Some(n) => n, None => continue, @@ -179,7 +179,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap { /// `b::c` inside `a::{b::c}` is flattened to `a::b::c`). fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut RustUseMap) { match node.kind() { - // `crate::auth::token::validate` — terminal scoped path, leaf is the alias. + // `crate::auth::token::validate`, terminal scoped path, leaf is the alias. "scoped_identifier" => { let segments = scoped_segments(node, src); if segments.is_empty() { @@ -191,7 +191,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru map.aliases.insert(leaf, full); } } - // `validate` — bare identifier (e.g. `use foo::validate`). + // `validate`, bare identifier (e.g. `use foo::validate`). "identifier" => { let name = node_text(node, src).to_string(); if name.is_empty() { @@ -201,7 +201,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru segs.push(name.clone()); map.aliases.insert(name, segs.join("::")); } - // `crate::auth::token::{validate, verify}` — left side is the prefix, + // `crate::auth::token::{validate, verify}`, left side is the prefix, // right side is a use_list of further use clauses. 
"scoped_use_list" => { // path field carries the prefix; the list field carries the body. @@ -239,7 +239,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru collect_use_paths(c, src, prefix, map); } } - // `crate::auth::token::validate as ok` — alias the leaf identifier. + // `crate::auth::token::validate as ok`, alias the leaf identifier. "use_as_clause" => { let path_node = node .child_by_field_name("path") @@ -256,7 +256,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru map.aliases.insert(alias_name, full); } } - // `crate::auth::token::*` — record the prefix as a wildcard import. + // `crate::auth::token::*`, record the prefix as a wildcard import. "use_wildcard" => { // The wildcard's child is the path being wildcarded. let path_node = node.named_child(0); @@ -270,7 +270,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru } _ => { // Unknown/unsupported form (e.g. macro_invocation in use position, - // attribute-prefixed clauses) — flag in pass-1 telemetry, skip + // attribute-prefixed clauses), flag in pass-1 telemetry, skip // here to keep the walk total. } } @@ -452,7 +452,7 @@ mod tests { #[test] fn module_path_no_cargo_toml_with_scan_root() { - // No Cargo.toml anywhere — fall back to scan root. + // No Cargo.toml anywhere, fall back to scan root. let dir = PathBuf::from("/tmp/nyx_mp_test_no_cargo"); std::fs::create_dir_all(dir.join("src")).ok(); // Make sure no Cargo.toml exists. @@ -535,7 +535,7 @@ mod tests { #[test] fn use_map_malformed_does_not_panic() { - // Truncated input — must not panic. + // Truncated input, must not panic. 
let src = b"use crate::auth::"; let tree = parse(std::str::from_utf8(src).unwrap()); let _ = parse_rust_use_map(src, &tree); diff --git a/src/server/app.rs b/src/server/app.rs index 315a7424..ea290d31 100644 --- a/src/server/app.rs +++ b/src/server/app.rs @@ -72,7 +72,7 @@ pub struct AppState { pub findings_cache: Arc>>, } -/// 50 MiB cap on request bodies — generous for config uploads, tight +/// 50 MiB cap on request bodies, generous for config uploads, tight /// enough to prevent OOM from a rogue client. const MAX_BODY_BYTES: usize = 50 * 1024 * 1024; @@ -286,7 +286,7 @@ mod tests { } /// Panic inside a thread that holds a write guard on the shared config lock. - /// With `parking_lot::RwLock`, the lock must remain usable afterwards — + /// With `parking_lot::RwLock`, the lock must remain usable afterwards , /// this is the poison-recovery contract we rely on in every route handler. #[tokio::test] async fn config_lock_survives_panic_in_write_guard() { diff --git a/src/server/debug.rs b/src/server/debug.rs index 818ffffc..581264f6 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -782,7 +782,7 @@ pub struct FuncSummaryView { /// Enclosing container path (class / impl / module / outer function). /// Empty for free top-level functions. pub container: String, - /// Structural [`crate::symbol::FuncKind`] slug — `"fn"`, `"method"`, + /// Structural [`crate::symbol::FuncKind`] slug, `"fn"`, `"method"`, /// `"closure"`, etc. Lets the UI distinguish anonymous closures from /// named functions for filtering. pub func_kind: String, @@ -934,10 +934,10 @@ pub struct PointerView { pub locations: Vec, pub values: Vec, /// Field reads attributed to params/receiver via the field-points-to - /// extractor (Phase 5). + /// extractor. pub field_reads: Vec, /// Field writes attributed to params/receiver via the field-points-to - /// extractor (Phase 5). + /// extractor. pub field_writes: Vec, /// Number of distinct interned locations beyond the reserved Top sentinel. 
pub location_count: usize, @@ -998,7 +998,7 @@ impl PointerView { }); } - // Per-value pt sets — emit only values with non-empty sets to keep + // Per-value pt sets, emit only values with non-empty sets to keep // the payload focused on interesting facts. let mut values: Vec = Vec::new(); for v in 0..ssa.num_values() as u32 { @@ -1064,12 +1064,12 @@ pub struct TypeFactDetailView { pub ssa_value: u32, pub var_name: Option, pub line: usize, - /// Type kind tag — matches the [`TypeKind`] discriminant + /// Type kind tag, matches the [`TypeKind`] discriminant /// (`String`, `Int`, `HttpClient`, `Dto`, …). pub kind: String, /// True when the value is allowed to be null/None. pub nullable: bool, - /// Container/class name — set for `HttpClient`, `DatabaseConnection`, + /// Container/class name, set for `HttpClient`, `DatabaseConnection`, /// `Dto`, etc. Mirrors [`TypeKind::container_name`]. #[serde(skip_serializing_if = "Option::is_none")] pub container: Option, @@ -1437,7 +1437,7 @@ pub fn function_list(analysis: &FileAnalysis) -> Vec { /// Lower a single function to SSA and optimize it. /// /// Returns the per-function body graph alongside the SSA. SSA is lowered -/// against `body.graph`, whose `NodeIndex` space is body-local — the file's +/// against `body.graph`, whose `NodeIndex` space is body-local, the file's /// top-level CFG (`analysis.cfg()`) has a different index space, so any /// downstream analysis that indexes by `inst.cfg_node` must use the returned /// `&Cfg`, not `analysis.cfg()`. @@ -1638,7 +1638,7 @@ pub fn analyse_file_summaries( /// Run the file-level authorization extraction pipeline for the debug UI. 
/// /// Returns the structured `AuthorizationModel` (routes, units, sensitive -/// operations, auth checks) plus the file bytes and an `enabled` flag — +/// operations, auth checks) plus the file bytes and an `enabled` flag , /// the bytes drive line-number resolution in the view, and `enabled` /// surfaces "auth analysis is off for this language" without conflating /// it with an empty result. @@ -1651,7 +1651,7 @@ pub fn analyse_file_auth( .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)? .ok_or(StatusCode::BAD_REQUEST)?; // Determine whether the auth rules were actually enabled for this - // file's language — `extract_auth_model_for_debug` returns an empty + // file's language, `extract_auth_model_for_debug` returns an empty // model both when the rules are disabled and when the file just // happens to have no routes. The view distinguishes the two so the // UI can show "analysis disabled" instead of "no routes found". @@ -2122,7 +2122,7 @@ fn main() { // Belt-and-suspenders: assert that calling with the wrong (top-level) // CFG would have panicked. We can't catch the panic across rayon // worker threads here, but we can confirm at least one `inst.cfg_node` - // index lies outside `analysis.cfg()`'s range — that's what triggers + // index lies outside `analysis.cfg()`'s range, that's what triggers // the OOB indexing inside `transfer_inst`. let toplevel_count = analysis.cfg().node_count(); let max_inst_idx = ssa diff --git a/src/server/health.rs b/src/server/health.rs index 5e73144e..cc3bac71 100644 --- a/src/server/health.rs +++ b/src/server/health.rs @@ -1,4 +1,4 @@ -//! Health-score scoring engine — v3.5. +//! Health-score scoring engine, v3.5. //! //! Pure-function scoring over a `HealthInputs` struct. Documented in //! `docs/health-score-audit.md` (calibration, rationale) and @@ -15,7 +15,7 @@ //! //! 2. **HIGH-count guardrails.** The *qualitative* axis: HIGH counts //! cap the maximum grade and floor "no HIGH" to at least C. These -//! 
are non-negotiable promises — even a perfect-everywhere-else +//! are non-negotiable promises, even a perfect-everywhere-else //! repo with 6 confirmed HIGHs grades F. //! //! Modifiers (triage, trend, stale, regression, suppression hygiene) @@ -27,17 +27,17 @@ //! * Verdict-weighted credibility (`Confirmed > NotAttempted > //! Inconclusive > Infeasible`). This is the structural protection //! against false-positive-driven F grades while the scanner is -//! still maturing — it auto-tightens as symex coverage grows. +//! still maturing, it auto-tightens as symex coverage grows. //! * Cross-file vs intra-file vs AST-only weighting via //! `context_factor`. -//! * Test-path downweight (0.3×) — a HIGH in a test fixture is +//! * Test-path downweight (0.3×), a HIGH in a test fixture is //! genuinely less concerning than one in a request handler. -//! * Effective HIGH count for ceilings — the HIGH-count caps key on +//! * Effective HIGH count for ceilings, the HIGH-count caps key on //! credibility-adjusted HIGHs, not raw HIGHs. A repo with 5 //! low-confidence HIGHs that got `NotAttempted` from symex doesn't //! pay the same ceiling cost as a repo with 5 `Confirmed` HIGHs. //! * Tighter modifier ranges so they can't flip a band. -//! * No `parse_success_rate` (it's actually a cache-miss metric — +//! * No `parse_success_rate` (it's actually a cache-miss metric , //! see `project_parse_success_rate_misnomer.md`). use crate::commands::scan::Diag; @@ -48,11 +48,11 @@ use crate::server::models::{BacklogStats, FindingSummary, HealthComponent, Healt // ── Tunables ───────────────────────────────────────────────────────────────── // // Calibrated for v0.5.0 scanner FP rate. As Nyx symex coverage and -// rule precision improve, the HIGH ceilings should tighten — see +// rule precision improve, the HIGH ceilings should tighten, see // `docs/health-score-audit.md` "Calibration trajectory" for the // roadmap. 
-/// Below this file count, we floor the size divisor at 1.0 — tiny +/// Below this file count, we floor the size divisor at 1.0, tiny /// repos can't claim infinite per-LOC dilution from one finding. const FILES_FLOOR: f64 = 100.0; @@ -66,7 +66,7 @@ const QUALITY_DRAG_PER_FINDING: f64 = 0.05; const QUALITY_DRAG_CAP: f64 = 15.0; /// Below this finding count, the Triage component contributes -/// weight 0 — we don't punish fresh users for not having triaged +/// weight 0, we don't punish fresh users for not having triaged /// what didn't need triaging. const TRIAGE_FLOOR: usize = 20; @@ -77,7 +77,7 @@ const STALE_PENALTY_CAP: f64 = 10.0; // ── Public API ─────────────────────────────────────────────────────────────── /// Pure inputs to the health-score calculation. No app state, no DB -/// handles — those upstream concerns are flattened into primitives the +/// handles, those upstream concerns are flattened into primitives the /// scorer actually consumes. #[derive(Debug, Clone, Copy)] pub struct HealthInputs<'a> { @@ -120,7 +120,7 @@ pub fn compute(inp: &HealthInputs<'_>) -> HealthScore { let quality_drag = quality_drag(weighted.quality_count); let base_after_drag = (base_score - quality_drag).clamp(0.0, 100.0); - // Step 5: HIGH-count guardrails — keyed on *effective* HIGH count + // Step 5: HIGH-count guardrails, keyed on *effective* HIGH count // (credibility-weighted), not raw count. This is what protects // users from FP-driven F grades while the scanner is maturing. let ceiling = high_total_ceiling(weighted.effective_high); @@ -161,9 +161,9 @@ struct WeightedAggregate { /// context_factor` across security findings. Quality lints are /// handled separately via `quality_drag`. raw_weight: f64, - /// Number of `*.quality.*` findings — drives `quality_drag`. + /// Number of `*.quality.*` findings, drives `quality_drag`. 
quality_count: usize, - /// Credibility-adjusted HIGH count (rounded) — drives the HIGH + /// Credibility-adjusted HIGH count (rounded), drives the HIGH /// ceiling and floor. A low-confidence + Inconclusive HIGH might /// contribute 0.2; five of them would round to 1. effective_high: usize, @@ -171,10 +171,10 @@ struct WeightedAggregate { raw_high: usize, raw_medium: usize, raw_low_security: usize, - /// Confidence rate (high+medium*0.5)/total — drives the + /// Confidence rate (high+medium*0.5)/total, drives the /// confidence component. 100 if no findings. confidence_rate: f64, - /// Symex coverage — % of taint findings with any non-NotAttempted + /// Symex coverage, % of taint findings with any non-NotAttempted /// verdict. Surfaced in component detail; not currently in score. symex_coverage: f64, } @@ -218,7 +218,7 @@ fn aggregate_findings(findings: &[Diag]) -> WeightedAggregate { _ => 0.0, }; - // Symex coverage tracking — only meaningful for findings with + // Symex coverage tracking, only meaningful for findings with // taint-flow evidence (the ones symex even attempts). if let Some(ev) = f.evidence.as_ref() && ev.symbolic.is_some() @@ -294,7 +294,7 @@ fn context_factor(f: &Diag) -> f64 { return 0.3; } let Some(ev) = f.evidence.as_ref() else { - return 0.75; // No evidence at all — pattern match + return 0.75; // No evidence at all, pattern match }; if ev.flow_steps.is_empty() { return 0.75; @@ -351,7 +351,7 @@ fn quality_drag(quality_count: usize) -> f64 { (quality_count as f64 * QUALITY_DRAG_PER_FINDING).min(QUALITY_DRAG_CAP) } -// ── HIGH guardrails — calibrated for v0.5.0 FP rate ────────────────────────── +// ── HIGH guardrails, calibrated for v0.5.0 FP rate ────────────────────────── /// Final-score ceiling keyed on *effective* HIGH count (credibility- /// weighted, not raw). See module docstring for the rationale. 
@@ -398,7 +398,7 @@ fn build_components( let sev_score = base_after_drag.round().clamp(0.0, 100.0) as u8; let sev_detail = severity_detail(weighted, size_divisor, inp.repo_files, inp.backlog); - // Confidence component — high-conf rate scaled into 0..=100. + // Confidence component, high-conf rate scaled into 0..=100. let conf_score = weighted.confidence_rate.round().clamp(0.0, 100.0) as u8; let conf_detail = format!( "High-confidence rate {:.0}% across {} security finding{}", @@ -407,7 +407,7 @@ fn build_components( plural_s(total - weighted.quality_count) ); - // Trend component — only contributes weight when has_history. + // Trend component, only contributes weight when has_history. let net = inp.fixed_since_last as i64 - inp.new_since_last as i64; let trend_score = (50 + net * 5).clamp(0, 100) as u8; let trend_weight = if inp.has_history { 0.20 } else { 0.0 }; @@ -420,7 +420,7 @@ fn build_components( "Not applicable: no prior scan to compare against (re-scan to populate)".into() }; - // Triage — drops out when total < TRIAGE_FLOOR. + // Triage, drops out when total < TRIAGE_FLOOR. let triage_active = total >= TRIAGE_FLOOR; let triage_score = (inp.triage_coverage * 100.0).round().clamp(0.0, 100.0) as u8; let triage_weight = if triage_active { 0.20 } else { 0.0 }; @@ -470,7 +470,7 @@ fn build_components( HealthComponent { label: "Severity pressure".into(), score: sev_score, - weight: 1.0, // Severity is the *base*, not a modifier — full weight in the blend. + weight: 1.0, // Severity is the *base*, not a modifier, full weight in the blend. detail: sev_detail, }, HealthComponent { @@ -770,7 +770,7 @@ mod tests { .collect(); let s = summary_of(&findings); let h = compute(&first_scan(&s, &findings, 0.0, 100)); - // The score reflects credibility — should NOT crater to F. + // The score reflects credibility, should NOT crater to F. 
assert!( h.score >= 60, "low-credibility HIGHs shouldn't crater to F, got {}", diff --git a/src/server/jobs.rs b/src/server/jobs.rs index 9c5b173d..9efd107a 100644 --- a/src/server/jobs.rs +++ b/src/server/jobs.rs @@ -65,7 +65,7 @@ pub struct JobManager { job_order: Mutex>, active_job_id: Mutex>, max_jobs: usize, - /// Dedicated rayon pool for scans — keeps the global pool (and tokio + /// Dedicated rayon pool for scans, keeps the global pool (and tokio /// worker threads) free so the web UI stays responsive during a scan. scan_pool: rayon::ThreadPool, } diff --git a/src/server/models.rs b/src/server/models.rs index 9c219a2a..0acc5f70 100644 --- a/src/server/models.rs +++ b/src/server/models.rs @@ -632,7 +632,7 @@ pub struct HealthScore { pub struct HealthComponent { /// Human label (e.g. "Severity pressure", "Trend", "Triage"). pub label: String, - /// 0–100 — already inverted so higher = healthier. + /// 0–100, already inverted so higher = healthier. pub score: u8, /// Weight applied when blending into the final score (0.0–1.0). pub weight: f64, @@ -662,7 +662,7 @@ pub struct BacklogStats { pub median_age_days: Option, /// Findings older than 30 days that remain open. pub stale_count: usize, - /// Histogram buckets (label, count) — fixed 5 buckets. + /// Histogram buckets (label, count), fixed 5 buckets. pub age_buckets: Vec, } @@ -691,12 +691,12 @@ pub struct ConfidenceDistribution { pub struct ScannerQuality { pub files_scanned: u64, pub files_skipped: u64, - /// 0.0–1.0 — files_scanned / (files_scanned + files_skipped). + /// 0.0–1.0, files_scanned / (files_scanned + files_skipped). pub parse_success_rate: f64, pub functions_analyzed: u64, pub call_edges: u64, pub unresolved_calls: u64, - /// 0.0–1.0 — call_edges / (call_edges + unresolved_calls). + /// 0.0–1.0, call_edges / (call_edges + unresolved_calls). pub call_resolution_rate: f64, /// % of taint findings that received a symbolic verdict (Confirmed|Infeasible|Inconclusive). 
pub symex_verified_rate: f64, @@ -712,7 +712,7 @@ pub struct IssueCategoryBucket { pub count: usize, } -/// "Hot sink" — a single callee that absorbs many findings. +/// "Hot sink", a single callee that absorbs many findings. #[derive(Debug, Clone, Serialize)] pub struct HotSink { /// Callee name (best-effort; from flow_steps last Sink). @@ -723,7 +723,7 @@ pub struct HotSink { /// One OWASP Top-10 (2021) bucket. #[derive(Debug, Clone, Serialize)] pub struct OwaspBucket { - /// "A01:2021 — Broken Access Control" etc. + /// "A01:2021, Broken Access Control" etc. pub code: String, pub label: String, pub count: usize, diff --git a/src/server/observability.rs b/src/server/observability.rs index 381ebc10..9ab89f18 100644 --- a/src/server/observability.rs +++ b/src/server/observability.rs @@ -41,7 +41,7 @@ pub async fn observe(mut request: Request, next: Next) -> Response { response.headers_mut().insert(REQUEST_ID_HEADER, value); } - // Skip noisy SSE channel — long-lived stream pollutes logs. + // Skip noisy SSE channel, long-lived stream pollutes logs. if path != "/api/events" { if status.is_server_error() { tracing::error!( diff --git a/src/server/owasp.rs b/src/server/owasp.rs index 1d34c240..0ca9c16a 100644 --- a/src/server/owasp.rs +++ b/src/server/owasp.rs @@ -1,15 +1,15 @@ //! Static rule-id → OWASP Top-10 (2021) mapping for the dashboard. //! //! Rule IDs follow the convention `{lang}.{family}.{name}` (e.g. `js.xss.outer_html`). -//! The family segment is what determines the bucket. Conservative — when in doubt, +//! The family segment is what determines the bucket. Conservative, when in doubt, //! map to the closest fit; rules with no obvious bucket are left unbucketed. use crate::server::models::OwaspBucket; use std::collections::HashMap; /// Extract the family token from a rule ID. Handles two ID shapes: -/// 1. `lang.family.name` — typical (e.g. `js.xss.outer_html`) -/// 2. `family-subname` or single-segment — engine-emitted (e.g. +/// 1. 
`lang.family.name`, typical (e.g. `js.xss.outer_html`) +/// 2. `family-subname` or single-segment, engine-emitted (e.g. /// `state-resource-leak`, `taint-unsanitised-flow`, `cfg-error-fallthrough`) fn extract_family(rule_id: &str) -> &str { if let Some(idx) = rule_id.find('.') { @@ -33,23 +33,23 @@ pub fn owasp_bucket_for(rule_id: &str) -> Option<(&'static str, &'static str)> { } Some(match family { - // A01 — Broken Access Control + // A01, Broken Access Control "auth" | "csrf" | "mass_assign" | "path" | "redirect" => ("A01", "Broken Access Control"), - // A02 — Cryptographic Failures + // A02, Cryptographic Failures "crypto" | "secrets" => ("A02", "Cryptographic Failures"), - // A03 — Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection, + // A03, Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection, // and engine-level taint findings without a more specific family tag). "sqli" | "xss" | "cmdi" | "code_exec" | "template" | "nosql" | "ldap" | "reflection" | "taint" => ("A03", "Injection"), - // A05 — Security Misconfiguration (TLS verify off, cookie flags, prototype pollution) + // A05, Security Misconfiguration (TLS verify off, cookie flags, prototype pollution) "config" | "transport" | "prototype" => ("A05", "Security Misconfiguration"), - // A08 — Software and Data Integrity Failures + // A08, Software and Data Integrity Failures "deser" => ("A08", "Software and Data Integrity Failures"), - // A09 — Logging & Monitoring Failures + // A09, Logging & Monitoring Failures "log" => ("A09", "Logging and Monitoring Failures"), - // A10 — SSRF + // A10, SSRF "ssrf" => ("A10", "Server-Side Request Forgery"), - // Memory-safety + state-machine resource lifecycle bugs — closest OWASP fit is + // Memory-safety + state-machine resource lifecycle bugs, closest OWASP fit is // A04 Insecure Design (defensive depth). "memory" | "state" => ("A04", "Insecure Design"), // Quality findings (e.g. 
rs.quality.unwrap) and CFG structural issues @@ -162,7 +162,7 @@ mod tests { fn malformed_rule_returns_none() { // single-segment "not" → family "not" → unmapped → None assert_eq!(owasp_bucket_for("not-a-rule"), None); - // "js.onlytwo" — family is "onlytwo" which is unmapped + // "js.onlytwo", family is "onlytwo" which is unmapped assert_eq!(owasp_bucket_for("js.onlytwo"), None); } diff --git a/src/server/routes/config.rs b/src/server/routes/config.rs index d987cb80..73ef35ed 100644 --- a/src/server/routes/config.rs +++ b/src/server/routes/config.rs @@ -282,7 +282,7 @@ async fn remove_terminator( // ── Sources / Sinks / Sanitizers (by kind) ─────────────────────────────────── fn list_by_kind(state: &AppState, target_kind: &str) -> Vec { - // Built-in rules live on /api/rules — keep this endpoint focused on the + // Built-in rules live on /api/rules, keep this endpoint focused on the // user's own additions in nyx.local. let target_rule_kind = match target_kind { "source" => RuleKind::Source, diff --git a/src/server/routes/debug.rs b/src/server/routes/debug.rs index d0890e98..8df3de01 100644 --- a/src/server/routes/debug.rs +++ b/src/server/routes/debug.rs @@ -306,8 +306,8 @@ async fn get_type_facts( } /// GET /api/debug/auth?file= -/// Return the file-scoped authorization model — routes, units, -/// sensitive operations, and auth checks — for the debug UI. +/// Return the file-scoped authorization model, routes, units, +/// sensitive operations, and auth checks, for the debug UI. async fn get_auth( State(state): State, Query(q): Query, diff --git a/src/server/routes/explorer.rs b/src/server/routes/explorer.rs index 6bfd2921..cb0ca332 100644 --- a/src/server/routes/explorer.rs +++ b/src/server/routes/explorer.rs @@ -55,7 +55,7 @@ struct TreeEntry { struct SymbolEntry { name: String, /// Legacy display kind (`"function"` / `"method"`) used by existing CSS - /// classes in the frontend. Kept for backward-compat — new consumers + /// classes in the frontend. 
Kept for backward-compat, new consumers /// should prefer `func_kind`. kind: String, /// Structural [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`, @@ -291,7 +291,7 @@ async fn get_symbols( let entries: Vec = symbols .into_iter() .map(|(name, arity, _lang, namespace, container, func_kind)| { - // Legacy `kind` field — still used by existing CSS classes + // Legacy `kind` field, still used by existing CSS classes // (`symbol-kind-method`, `symbol-kind-function`). Map any // method-like FuncKind onto `"method"` and everything else // onto `"function"` so the rendered icon stays sensible. diff --git a/src/server/routes/findings.rs b/src/server/routes/findings.rs index 3372cc77..30822d4e 100644 --- a/src/server/routes/findings.rs +++ b/src/server/routes/findings.rs @@ -73,7 +73,7 @@ fn load_latest_findings_internal(state: &AppState) -> LoadedFindings { /// Build (or fetch from cache) the per-scan derived views. /// /// Returns clones of `Arc`s so callers can drop the lock immediately and work -/// without contention. Triage state is *not* baked into the cached views — it +/// without contention. Triage state is *not* baked into the cached views, it /// changes on a different cadence and is overlaid per request. fn cached_for_latest(state: &AppState) -> CachedFindings { let loaded = load_latest_findings_internal(state); @@ -85,7 +85,7 @@ fn cached_for_latest(state: &AppState) -> CachedFindings { } } - // Slow path: rebuild. Guard against concurrent rebuilds of the same key — + // Slow path: rebuild. Guard against concurrent rebuilds of the same key , // a second writer that finds the cache already populated for our key // simply returns it. 
let mut guard = state.findings_cache.write(); diff --git a/src/server/routes/overview.rs b/src/server/routes/overview.rs index b596e078..622670ea 100644 --- a/src/server/routes/overview.rs +++ b/src/server/routes/overview.rs @@ -29,7 +29,7 @@ pub fn routes() -> Router { .route("/overview/baseline/{scan_id}", post(set_baseline_path)) } -/// GET /api/overview — aggregated dashboard data. +/// GET /api/overview, aggregated dashboard data. async fn overview(State(state): State) -> Json { // 1. Load latest findings (in-memory → DB fallback) let findings = crate::server::routes::findings::load_latest_findings(&state); @@ -121,7 +121,7 @@ async fn overview(State(state): State) -> Json { new_since_last, fixed_since_last, reintroduced: reintroduced_count, - // Files-scanned proxy for repo size — used for size-aware + // Files-scanned proxy for repo size, used for size-aware // severity dampening in `health::compute`. See // `docs/health-score-audit.md` for calibration data. repo_files: scanner_quality @@ -129,10 +129,10 @@ async fn overview(State(state): State) -> Json { .map(|q| q.files_scanned) .filter(|&f| f > 0), backlog: backlog.as_ref(), - // Trend is meaningless without ≥2 completed scans — + // Trend is meaningless without ≥2 completed scans , // matches the first-scan check `compare_to_current` uses. has_history: history.scans.len() >= 2, - // Suppression-hygiene modifier — populated when the + // Suppression-hygiene modifier, populated when the // suppression panel was computable for this scan. blanket_suppression_rate: suppression_hygiene.as_ref().map(|s| s.blanket_rate), }, @@ -173,7 +173,7 @@ async fn overview(State(state): State) -> Json { }) } -/// GET /api/overview/trends — scan-over-scan finding counts. +/// GET /api/overview/trends, scan-over-scan finding counts. 
async fn overview_trends(State(state): State) -> Json> { let mut points = Vec::new(); @@ -218,7 +218,7 @@ struct BaselineBody { scan_id: String, } -/// POST /api/overview/baseline { scan_id } — pin a scan as the baseline for drift comparison. +/// POST /api/overview/baseline { scan_id }, pin a scan as the baseline for drift comparison. async fn set_baseline( State(state): State, Json(body): Json, @@ -226,7 +226,7 @@ async fn set_baseline( set_baseline_inner(&state, &body.scan_id) } -/// POST /api/overview/baseline/:scan_id — convenience path-form for clients without a JSON body. +/// POST /api/overview/baseline/:scan_id, convenience path-form for clients without a JSON body. async fn set_baseline_path( State(state): State, AxPath(scan_id): AxPath, @@ -248,7 +248,7 @@ fn set_baseline_inner(state: &AppState, scan_id: &str) -> Result) -> Result { let pool = state .db_pool @@ -381,7 +381,7 @@ impl ScanHistory { (new_count, fixed_count, reintroduced) } - /// Trend slope across the last N totals — 1.0 means strictly improving, + /// Trend slope across the last N totals, 1.0 means strictly improving, /// -1.0 strictly regressing, 0.0 unchanged. Returns None with <3 points. fn trend_slope(&self) -> Option { if self.scans.len() < 3 { @@ -712,7 +712,7 @@ fn compute_cross_file_ratio(findings: &[Diag]) -> f64 { cross as f64 / findings.len() as f64 } -/// Hot sinks are *only* meaningful for taint findings — counting AST rule IDs +/// Hot sinks are *only* meaningful for taint findings, counting AST rule IDs /// (e.g. `rs.quality.unwrap`) here just duplicates the Top Rules table. So we /// deliberately require a real Sink-step callee (or a parsable sink snippet) /// and skip everything else. Empty result → frontend hides the card. 
@@ -751,7 +751,7 @@ fn compute_hot_sinks(findings: &[Diag], limit: usize) -> Vec { rows } -/// Pull the leading identifier from a sink snippet — a best-effort heuristic +/// Pull the leading identifier from a sink snippet, a best-effort heuristic /// for the dashboard's "hot sinks" list. fn extract_callee_from_snippet(s: &str) -> String { let trimmed = s.trim(); @@ -932,7 +932,7 @@ fn compute_suppression_hygiene(state: &AppState, findings: &[Diag]) -> Suppressi } fn compute_backlog(state: &AppState, findings: &[Diag], history: &ScanHistory) -> BacklogStats { - // No useful aging data on the first scan — every fingerprint was first-seen + // No useful aging data on the first scan, every fingerprint was first-seen // today by definition. Avoid the misleading "0d / 0d / 0" display. if history.scans.len() <= 1 { return BacklogStats { @@ -1046,7 +1046,7 @@ fn build_posture( current_total: usize, ) -> PostureSummary { // First-scan case: no prior data to diff against. Saying "stable / no change" - // is misleading — we genuinely don't know yet. + // is misleading, we genuinely don't know yet. if history.scans.len() <= 1 { return PostureSummary { trend: "unknown".into(), diff --git a/src/server/routes/rules.rs b/src/server/routes/rules.rs index 7f6647d4..25205fe5 100644 --- a/src/server/routes/rules.rs +++ b/src/server/routes/rules.rs @@ -61,7 +61,7 @@ fn build_rule_list(state: &AppState) -> Vec { rules } -/// GET /api/rules — list all rules with finding counts. +/// GET /api/rules, list all rules with finding counts. async fn list_rules(State(state): State) -> Json> { let rules = build_rule_list(&state); @@ -99,7 +99,7 @@ async fn list_rules(State(state): State) -> Json> { Json(items) } -/// GET /api/rules/:id — full detail for one rule. +/// GET /api/rules/:id, full detail for one rule. async fn get_rule( State(state): State, Path(id): Path, @@ -140,7 +140,7 @@ async fn get_rule( })) } -/// POST /api/rules/:id/toggle — enable/disable a rule. 
+/// POST /api/rules/:id/toggle, enable/disable a rule. async fn toggle_rule( State(state): State, Path(id): Path, @@ -162,7 +162,7 @@ async fn toggle_rule( Ok(Json(serde_json::json!({ "status": "ok", "rule_id": id }))) } -/// POST /api/rules/clone — clone a built-in rule to custom. +/// POST /api/rules/clone, clone a built-in rule to custom. async fn clone_rule( State(state): State, Json(body): Json, diff --git a/src/server/routes/scans.rs b/src/server/routes/scans.rs index d8225258..17ffb50c 100644 --- a/src/server/routes/scans.rs +++ b/src/server/routes/scans.rs @@ -213,7 +213,7 @@ async fn delete_scan( Json(serde_json::json!({ "error": msg })), )); } - // "Scan not found" in memory is fine — may be DB-only + // "Scan not found" in memory is fine, may be DB-only } // Delete from DB (CASCADE handles metrics + logs) diff --git a/src/server/triage_sync.rs b/src/server/triage_sync.rs index 7b66d102..72903618 100644 --- a/src/server/triage_sync.rs +++ b/src/server/triage_sync.rs @@ -3,8 +3,8 @@ //! This file is designed to be committed to version control so that triage //! decisions travel with the code and are shared across team members. //! -//! The file uses **portable fingerprints** — computed with paths relative to the -//! project root — so they match across machines regardless of where the repo is +//! The file uses **portable fingerprints**, computed with paths relative to the +//! project root, so they match across machines regardless of where the repo is //! checked out. use crate::commands::scan::Diag; diff --git a/src/ssa/alias.rs b/src/ssa/alias.rs index 251eb854..a3c01371 100644 --- a/src/ssa/alias.rs +++ b/src/ssa/alias.rs @@ -59,7 +59,7 @@ impl BaseAliasResult { /// /// For each entry `(dst_val, src_val)` where copy prop replaced `dst` with /// `src`, looks up the original variable names. If both are plain identifiers -/// (no dots — i.e. not field paths), they are registered as base aliases. +/// (no dots, i.e. 
not field paths), they are registered as base aliases. /// Transitive closure is computed so `b = a; c = b` yields group `{a, b, c}`. pub fn compute_base_aliases( copy_map: &HashMap, @@ -103,7 +103,7 @@ pub fn compute_base_aliases( let ra = find(parent, a); let rb = find(parent, b); if ra != rb { - // Arbitrary root choice — alphabetically smaller becomes root + // Arbitrary root choice, alphabetically smaller becomes root // for determinism. if ra < rb { parent.insert(rb, ra); @@ -130,7 +130,7 @@ pub fn compute_base_aliases( None => continue, }; - // Only alias plain idents — dotted paths (field accesses) are tracked + // Only alias plain idents, dotted paths (field accesses) are tracked // independently in SSA and handled by field-aware suppression. if dst_name.contains('.') || src_name.contains('.') { continue; diff --git a/src/ssa/const_prop.rs b/src/ssa/const_prop.rs index b00ad3d4..9dfcc470 100644 --- a/src/ssa/const_prop.rs +++ b/src/ssa/const_prop.rs @@ -17,7 +17,7 @@ pub enum ConstLattice { Bool(bool), /// Null / nil / None. Null, - /// Multiple possible values — not constant. + /// Multiple possible values, not constant. Varying, } @@ -70,7 +70,7 @@ impl ConstLattice { return ConstLattice::Str(inner.to_string()); } - // Bare string (no quotes) — treat as string constant + // Bare string (no quotes), treat as string constant ConstLattice::Str(trimmed.to_string()) } @@ -283,7 +283,7 @@ fn eval_inst(inst: &SsaInst, values: &HashMap) -> ConstL | SsaOp::SelfParam | SsaOp::CatchParam => ConstLattice::Varying, // FieldProj: projecting a field is dynamic with respect to the - // const-propagation lattice — there is no general way to fold + // const-propagation lattice, there is no general way to fold // `obj.field` to a known scalar at this phase. Returning Varying // matches Call: callers needing field-level constness will go // through the points-to / heap analysis. 
@@ -452,7 +452,7 @@ fn mark_edge_executable( if executable_blocks.insert(to) { cfg_worklist.push_back(to); } else { - // Block already executable but new edge — re-evaluate phis + // Block already executable but new edge, re-evaluate phis cfg_worklist.push_back(to); } } @@ -863,7 +863,7 @@ mod tests { /// Const parsing must round-trip integer signs. i64::MIN/MAX must /// parse without overflow; arbitrary text falls back to a bare-string - /// const (current contract — tested here so a future change is + /// const (current contract, tested here so a future change is /// caught explicitly). #[test] fn const_parse_extremes_and_fallback() { diff --git a/src/ssa/copy_prop.rs b/src/ssa/copy_prop.rs index 68248327..fb18c2ec 100644 --- a/src/ssa/copy_prop.rs +++ b/src/ssa/copy_prop.rs @@ -25,7 +25,7 @@ pub fn copy_propagate(body: &mut SsaBody, cfg: &Cfg) -> (usize, HashMap usize { /// condition variable. Without counting these, a value used solely by a /// terminator (the canonical case for short helpers like /// `def f(s): return s`) is judged dead, and DCE strips every instruction -/// in the body — leaving empty blocks whose terminators reference +/// in the body, leaving empty blocks whose terminators reference /// nonexistent SsaValues, breaking downstream analyses (per-return-path /// PathFact narrowing, inline-summary extraction, etc.). 
fn build_use_counts(body: &SsaBody) -> HashMap { @@ -170,8 +170,8 @@ mod tests { #[test] fn dead_const_removed() { - // v0 = const("42") — unused, should be removed - // v1 = source() — must survive even if unused + // v0 = const("42"), unused, should be removed + // v1 = source(), must survive even if unused let mut cfg: Cfg = Graph::new(); let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq)); let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq)); @@ -228,7 +228,7 @@ mod tests { #[test] fn dead_sanitizer_label_preserved() { - // v0 has a Sanitizer label on its CFG node — must survive even if unused + // v0 has a Sanitizer label on its CFG node, must survive even if unused use crate::labels::{Cap, DataLabel}; let mut cfg: Cfg = Graph::new(); @@ -277,7 +277,7 @@ mod tests { #[test] fn dead_source_label_preserved() { - // v0 has a Source label on its CFG node — must survive even if unused + // v0 has a Source label on its CFG node, must survive even if unused use crate::labels::{Cap, DataLabel}; let mut cfg: Cfg = Graph::new(); @@ -541,7 +541,7 @@ mod tests { #[test] fn used_def_preserved() { - // v0 = const("42"), v1 = assign(v0) — v0 is used, both survive + // v0 = const("42"), v1 = assign(v0), v0 is used, both survive let mut cfg: Cfg = Graph::new(); let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq)); let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq)); @@ -597,7 +597,7 @@ mod tests { } /// DCE must NEVER remove a Call instruction even when its result has - /// zero uses — calls have side effects (I/O, throws, mutations) that + /// zero uses, calls have side effects (I/O, throws, mutations) that /// cannot be modeled as SSA-value uses. This is the conservative /// invariant `is_dead()` enforces; regressing it would silently drop /// real-world code from analysis (sinks, sanitizers expressed as diff --git a/src/ssa/heap.rs b/src/ssa/heap.rs index 9ede505c..51e5bc53 100644 --- a/src/ssa/heap.rs +++ b/src/ssa/heap.rs @@ -8,7 +8,7 @@ //! Key design: //! 
- HeapObjectId is keyed by allocation-site SsaValue (deterministic, zero-cost) //! - PointsToSet is bounded to `analysis.engine.max_pointsto` entries -//! (default 32, widening on overflow — see [`effective_max_pointsto`]). +//! (default 32, widening on overflow, see [`effective_max_pointsto`]). //! Overflow drops emit an [`crate::engine_notes::EngineNote::PointsToTruncated`] //! note and increment [`POINTSTO_TRUNCATION_COUNT`] so operators can //! tell when the cap is firing on their corpus. @@ -16,7 +16,7 @@ //! - HeapSlot::Index(u64) for constant-index container access (proven by const propagation) //! - HeapSlot::Elements for coarse element access (push/pop, dynamic index, overflow) //! - Intraprocedural: constant-index sensitivity is guaranteed when const propagation proves it -//! - Interprocedural: best-effort — relies on correct const_values threading (already handled) +//! - Interprocedural: best-effort, relies on correct const_values threading (already handled) //! - Unknown/unproven indices fall back to Elements (conservative) //! - Analysis runs as a pre-pass in optimize_ssa(), like type_facts @@ -32,7 +32,7 @@ use serde::{Deserialize, Serialize}; use smallvec::SmallVec; use std::collections::HashMap; -// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4` — now +// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4`, now // governed by the shared `analysis.engine.max_origins` knob through // `crate::taint::ssa_transfer::push_origin_bounded`. Unifying the two // lattices behind a single tunable means operators raise *one* value to @@ -47,7 +47,7 @@ static MAX_POINTSTO_OVERRIDE: std::sync::atomic::AtomicUsize = /// Total heap-object members dropped by [`PointsToSet`] truncation since /// the last reset. Captured from `insert`/`union` so tests (and /// operators inspecting scan output) can detect truncation events that -/// don't propagate to a finding — e.g. when the cap is tight enough +/// don't propagate to a finding, e.g. 
when the cap is tight enough /// that no taint flow survives to emit a sink event. pub(crate) static POINTSTO_TRUNCATION_COUNT: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); @@ -114,7 +114,7 @@ pub const MAX_TRACKED_INDICES: usize = 8; /// Distinguishes constant-index container access from coarse element access. /// -/// `Elements` is the conservative default — all container elements merge into +/// `Elements` is the conservative default, all container elements merge into /// a single taint. `Index(n)` provides per-index precision when the index is /// provably a non-negative integer constant (via the function's own const /// propagation pass). @@ -302,10 +302,10 @@ impl HeapTaint { /// union of per-slot taint), matching the `SsaTaintState` pattern. /// /// Load semantics: -/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)` — +/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)`; /// indexed reads also see taint from dynamic/push operations. /// - `load(id, Elements)`: union of `(id, Elements)` and ALL `(id, Index(*))` -/// entries — dynamic reads conservatively see all indexed taint. +/// entries, dynamic reads conservatively see all indexed taint. #[derive(Clone, Debug, PartialEq, Eq)] pub struct HeapState { entries: SmallVec<[((HeapObjectId, HeapSlot), HeapTaint); 4]>, @@ -927,7 +927,7 @@ mod tests { set_max_pointsto_override(4); reset_points_to_observability(); - // a = {0,1,2,3}, b = {4,5,6} — union wants 7 members; cap is 4 + // a = {0,1,2,3}, b = {4,5,6}, union wants 7 members; cap is 4 // so 3 members are dropped. Deterministic order: smallest // ids survive.
let mut a = PointsToSet::empty(); @@ -1215,7 +1215,7 @@ mod tests { #[test] fn heap_elements_load_unions_all_indices() { - // Store to Index(0) and Index(2) — Elements load should see both + // Store to Index(0) and Index(2), Elements load should see both let mut h = HeapState::empty(); let id = HeapObjectId(SsaValue(0)); h.store(id, HeapSlot::Index(0), Cap::HTML_ESCAPE, &[origin(0)]); diff --git a/src/ssa/invariants.rs b/src/ssa/invariants.rs index de35dd67..5705aba8 100644 --- a/src/ssa/invariants.rs +++ b/src/ssa/invariants.rs @@ -20,33 +20,33 @@ //! //! Invariants are split into two groups: //! -//! **Group A — SSA integrity (must hold unconditionally):** +//! **Group A, SSA integrity (must hold unconditionally):** //! -//! 1. `BlockId` indexing — `blocks[i].id == BlockId(i)` +//! 1. `BlockId` indexing, `blocks[i].id == BlockId(i)` //! 2. Entry block has no predecessors -//! 3. Pred/succ symmetry — `B.succs.contains(S)` ⇔ `S.preds.contains(B)` -//! 4. Phi placement — every phi appears in `block.phis` (never in body) -//! 5. Phi operand arity — ≤ `block.preds.len()` -//! 6. Phi operand sources — every `(pred_bid, _)` operand has +//! 3. Pred/succ symmetry, `B.succs.contains(S)` ⇔ `S.preds.contains(B)` +//! 4. Phi placement, every phi appears in `block.phis` (never in body) +//! 5. Phi operand arity, ≤ `block.preds.len()` +//! 6. Phi operand sources, every `(pred_bid, _)` operand has //! `block.preds.contains(pred_bid)` -//! 7. Unique SSA definitions — every `SsaValue` is defined at most once +//! 7. Unique SSA definitions, every `SsaValue` is defined at most once //! across all phi + body instructions -//! 8. `value_defs` coverage — every defined `SsaValue.0` is a valid index +//! 8. `value_defs` coverage, every defined `SsaValue.0` is a valid index //! into `value_defs`, and `value_defs[v.0].block` matches the block //! containing the defining instruction -//! 9. `cfg_node_map` consistency — every `(node, SsaValue)` pair points +//! 9. 
`cfg_node_map` consistency, every `(node, SsaValue)` pair points //! to an instruction whose `cfg_node == node` //! -//! **Group B — terminator and reachability (loose, reflecting lowering):** +//! **Group B, terminator and reachability (loose, reflecting lowering):** //! //! 10. Terminator/succs agreement *subset* form: -//! * `Goto(t)` → `succs.contains(t)` — extras tolerated +//! * `Goto(t)` → `succs.contains(t)`, extras tolerated //! (3-successor collapse fallback) //! * `Branch{t, f, …}` → `succs` contains both `t` and `f` //! * `Return`/`Unreachable` → no constraint on `succs` (CFG may carry //! finally/cleanup continuation edges that downstream analysis //! propagates through) -//! 11. Reachability from entry — tolerated exceptions: +//! 11. Reachability from entry, tolerated exceptions: //! * blocks that appear as the `catch` side of an exception edge //! //! Group B is deliberately permissive: the SSA body's `succs` field is the @@ -61,8 +61,8 @@ use super::ir::*; /// Errors returned by targeted invariant checks. /// -/// Wraps a list of human-readable violation messages — one per offending -/// block — so callers can include every failure in a single panic / +/// Wraps a list of human-readable violation messages, one per offending +/// block, so callers can include every failure in a single panic / /// warning. #[derive(Debug, Clone, PartialEq, Eq)] pub struct InvariantError { @@ -106,12 +106,12 @@ pub fn check_structural_invariants(body: &SsaBody) -> Vec { errors } -/// Every block carrying an [`SsaOp::CatchParam`] — an exception-handler -/// entry — must be reachable from either the function entry (via normal +/// Every block carrying an [`SsaOp::CatchParam`], an exception-handler +/// entry, must be reachable from either the function entry (via normal /// flow) or from at least one entry in [`SsaBody::exception_edges`]. 
/// /// When this fails, the CFG builder has produced an orphan catch block -/// that should have been wired up as an exception successor but was not — +/// that should have been wired up as an exception successor but was not, /// a real construction bug that otherwise manifests as silent false /// negatives in resource-cleanup / exception-flow findings. pub fn check_catch_block_reachability(body: &SsaBody) -> Result<(), InvariantError> { @@ -252,7 +252,7 @@ fn check_pred_succ_symmetry(body: &SsaBody, errors: &mut Vec) { } fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec) { - // Group B — loose agreement. See module docs for rationale. + // Group B, loose agreement. See module docs for rationale. for block in &body.blocks { match &block.terminator { Terminator::Goto(target) => { @@ -301,7 +301,7 @@ fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec) { } } Terminator::Return(_) | Terminator::Unreachable => { - // Loose by design — cleanup/finally continuation edges in + // Loose by design, cleanup/finally continuation edges in // `succs` are expected. Downstream consumers (taint // `compute_succ_states`, SCCP `process_terminator`) treat // `succs` as authoritative and propagate across these edges, @@ -443,7 +443,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec) { // Multi-root BFS: start from the entry *and* from every catch target // recorded in `exception_edges`. Exception-handler blocks are reached // via stripped exception edges, so from the SSA body's perspective they - // look like roots — as does anything transitively reachable from them + // look like roots, as does anything transitively reachable from them // (e.g. a `finally` block chained after a `catch`).
let mut visited = vec![false; n]; let mut stack: Vec = Vec::new(); @@ -487,7 +487,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec) { /// fingerprint have the same block structure, terminator shape, per-block /// phi/body instruction counts and op-kind sequences. SsaValue numbers are /// not part of the fingerprint, so renumbering between runs does not cause -/// spurious diffs — only shape changes do. +/// spurious diffs, only shape changes do. /// /// Phis are emitted in their natural (insertion) order. Lowering now drives /// phi placement through a `BTreeSet`, so that order is deterministic diff --git a/src/ssa/ir.rs b/src/ssa/ir.rs index 8238c7e7..94b9c882 100644 --- a/src/ssa/ir.rs +++ b/src/ssa/ir.rs @@ -24,21 +24,14 @@ pub struct BlockId(pub u32); pub struct FieldId(pub u32); impl FieldId { - /// Pointer-Phase 4 sentinel for the abstract "any element of a - /// container" field. Steensgaard-grade precision: every numeric - /// or dynamic index access (`arr[i]`, `arr.shift()`, `map[k]`) - /// projects through the same `Field(pt(container), ELEM)` cell so - /// per-element taint propagation is independent of the SSA value - /// referencing the container. - /// - /// `u32::MAX` is reserved by convention; the per-body - /// [`FieldInterner`] never assigns it because interning is - /// monotone-ascending from `0` and bodies don't approach 4 billion - /// fields. Consumers should compare with `==` rather than reach - /// into the wrapped `u32`. + /// Sentinel for the abstract "any element of a container" field. + /// Every numeric or dynamic index access (`arr[i]`, `arr.shift()`, + /// `map[k]`) projects through the same `Field(pt(container), ELEM)` + /// cell. `u32::MAX` is reserved; the per-body interner never + /// assigns it. 
pub const ELEM: FieldId = FieldId(u32::MAX); - /// "Tainted at every field" wildcard sentinel — distinct from + /// "Tainted at every field" wildcard sentinel, distinct from /// [`Self::ELEM`] (which is container-element semantics: every /// numeric/dynamic index access projects through it). /// `ANY_FIELD` represents the case where a writeback-shaped sink @@ -91,17 +84,14 @@ impl FieldInterner { /// Read-only lookup: returns the [`FieldId`] for `name` if it has /// already been interned, or `None` otherwise. /// - /// Used by cross-call resolvers (Pointer-Phase 5 / W3) to avoid - /// growing the caller's interner with field names introduced - /// solely by the callee summary — such IDs would never be referenced - /// by any other instruction in the caller's body, so the cells - /// would be write-only and consume space without contributing - /// to taint flow. + /// Used by cross-call resolvers to avoid growing the caller's + /// interner with field names introduced solely by callee summaries; + /// such cells would be write-only. pub fn lookup(&self, name: &str) -> Option { // Walk `names` directly so we don't require the post-deserialise // `ensure_lookup()` rebuild before this method is callable. - // Callers usually own `&SsaBody` — interning was either done at - // lowering time or via `ensure_lookup` post-deserialise — so the + // Callers usually own `&SsaBody`, interning was either done at + // lowering time or via `ensure_lookup` post-deserialise, so the // hot path goes through the `lookup` table; the linear walk is // a fallback for the (small) deserialised-but-not-rebuilt case. if let Some(&id) = self.lookup.get(name) { @@ -168,7 +158,7 @@ pub enum SsaOp { Call { callee: String, /// Original textual full path when SSA decomposed a chained receiver. - /// `None` when the callee was not rewritten — `callee` already holds + /// `None` when the callee was not rewritten, `callee` already holds /// the source-level textual form.
/// /// **Debug / display only.** Analysis code must walk the SSA receiver @@ -188,7 +178,7 @@ pub enum SsaOp { /// Models member-access expressions (`obj.field`) as a first-class SSA /// op. Lowering walks the receiver tree so chained accesses like /// `c.writer.header` produce a chain of `FieldProj` ops with explicit - /// per-step receivers — eliminating the textual-prefix parsing that + /// per-step receivers, eliminating the textual-prefix parsing that /// previously misclassified deep receivers (the gin/context.go FP). /// /// `field` is interned in the owning [`SsaBody`]'s [`FieldInterner`]. @@ -223,7 +213,7 @@ pub enum SsaOp { /// /// Emitted by SSA lowering as a synthesized instruction in the entry /// block and referenced from phi operands whose incoming edge does - /// not carry a definition of the phi's variable — e.g. a try/catch + /// not carry a definition of the phi's variable, e.g. a try/catch /// rejoin where a variable is only defined on the normal path, or /// an early-return branch on a later-defined variable. /// @@ -269,7 +259,7 @@ pub enum Terminator { /// `targets` lists the per-case successor blocks (order matches the /// source-order of cases in the switch); `default` is the fallback /// branch taken when no case matches. Block `succs` remain the - /// authoritative flow set — the terminator is a structured summary. + /// authoritative flow set, the terminator is a structured summary. /// /// Emitted only for switch-like dispatch whose semantics are /// guaranteed-exclusive across cases (e.g. Go `switch`, Java @@ -285,11 +275,11 @@ pub enum Terminator { /// /// `Some(c)` records the constant value the scrutinee must equal for /// the corresponding target to be taken. `None` means the literal is - /// unknown — emitted for synthetic ≥3-way CFG fanouts or for case + /// unknown, emitted for synthetic ≥3-way CFG fanouts or for case /// patterns that aren't plain literals (OR-patterns, ranges, guards). 
/// /// When omitted/empty (length zero), all targets behave as "unknown - /// literal" — preserves backward compatibility with consumers that + /// literal", preserves backward compatibility with consumers that /// only inspect `targets`/`default`. #[serde(default)] case_values: SmallVec<[Option; 4]>, @@ -342,19 +332,17 @@ pub struct SsaBody { pub exception_edges: Vec<(BlockId, BlockId)>, /// Per-body interner for [`SsaOp::FieldProj`] field names. /// - /// Empty until the lowering phase emits FieldProj ops (Phase 2 of the - /// field-projections rollout). Cross-body callers (cross-file - /// summaries, debug serialization) MUST resolve interned ids through - /// this interner before transporting field references to other bodies. + /// Empty until lowering emits FieldProj ops. Cross-body callers + /// (cross-file summaries, debug serialization) MUST resolve interned + /// ids through this interner before transporting field references + /// to other bodies. #[serde(default)] pub field_interner: FieldInterner, - /// Pointer-Phase 3 / W1: side-table mapping a synthetic base-update - /// [`SsaOp::Assign`]'s defined value back to the `(receiver, field)` - /// pair it represents. Populated by SSA lowering at the - /// `obj.f = rhs` synthesis point so the taint engine can recognise - /// the synthetic assign as a structural field WRITE — the assigned - /// value is the new "obj" value, the use is the rhs, and the side- - /// table records `(prior_obj_value, FieldId("f"))`. + /// Side-table mapping a synthetic base-update [`SsaOp::Assign`]'s + /// defined value back to the `(receiver, field)` pair it + /// represents. Populated by lowering at the `obj.f = rhs` synthesis + /// point so the taint engine can treat the synthetic assign as a + /// structural field WRITE. /// /// Empty by default; only synthetic assigns whose enclosing source /// statement was a dotted-path assignment (`a.b.c = …`) appear here. 
@@ -505,10 +493,10 @@ mod tests { assert_eq!(uses, vec![SsaValue(1)]); } - /// Pointer-Phase 4 / A6 audit: the [`FieldId::ELEM`] sentinel is + /// The [`FieldId::ELEM`] sentinel is /// reserved for "any element of a container". The interner assigns /// IDs monotonically from `0`, so the sentinel `u32::MAX` can only - /// collide if the body declares ~4 billion fields — a corner case + /// collide if the body declares ~4 billion fields, a corner case /// no realistic codebase reaches. Pin the contract with a stress /// loop so future implementation drift can't silently shift IDs to /// the sentinel value. @@ -526,7 +514,7 @@ mod tests { // Lookup of the sentinel name (used by W3 to round-trip // container-element flow through summary) must NOT match a // real interned name even when the same name is interned. - // The wire-format keeps `` as a *string marker* — it + // The wire-format keeps `` as a *string marker*, it // never goes through `intern`. Instead, callers compare // explicitly against `FieldId::ELEM`. assert_ne!(interner.intern(""), FieldId::ELEM); diff --git a/src/ssa/lower.rs b/src/ssa/lower.rs index 720a59f7..9ab2ca86 100644 --- a/src/ssa/lower.rs +++ b/src/ssa/lower.rs @@ -29,16 +29,16 @@ use super::ir::*; /// - Construct the `Call` op with `callee = bare_method_name`, /// `callee_text = Some(original_callee.to_string())`, /// `receiver = Some(final_receiver_value)`. -/// - Use the returned receiver as the implicit method receiver — do NOT +/// - Use the returned receiver as the implicit method receiver, do NOT /// add the chain root or any intermediate field name to `args`. /// -/// **Decomposition rules** (Phase 2 of the field-projections rollout): +/// **Decomposition rules**: /// - Skip when the callee contains zero `.` characters (no member access) /// or only one `.` (single-dot case is handled by the existing /// `info.call.receiver` channel without needing a `FieldProj` op).
-/// - Bail when any "complex" token appears in the callee — `(`, `)`, +/// - Bail when any "complex" token appears in the callee: `(`, `)`, /// `[`, `]`, `::`, `->`, `?`, `<`, `>`, `*`, `&`, `:` (other than `::` -/// already filtered), or whitespace — signaling the callee text isn't +/// already filtered), or whitespace, signaling the callee text isn't /// a clean `....` chain we can safely split on `.`. /// - The first segment must be a known SSA variable in `var_stacks`; /// otherwise the chain root is unresolvable and we bail. @@ -221,7 +221,7 @@ fn lower_to_ssa_inner( // 4b. For per-function scope: identify external variables (used but not defined) // and inject synthetic Param defs at entry block so rename can find them. // When formal_params is supplied, reorder so formal params come first in - // declaration order — this makes Param indices correspond to call-site positions. + // declaration order, this makes Param indices correspond to call-site positions. // let external_vars = if scope.is_some() && !scope_all && !scope_nop { let raw = identify_external_uses(cfg, &blocks_nodes, &var_defs); @@ -277,7 +277,7 @@ fn lower_to_ssa_inner( } // 7b. Debug assertions: verify structural invariants. - // The helper body is `debug_assert!` only, so it's a no-op in release — + // The helper body is `debug_assert!` only, so it's a no-op in release; // call unconditionally to avoid a dead_code warning when the lib is // built without `--tests`. debug_assert_bfs_ordering(&block_preds); @@ -451,10 +451,10 @@ fn collect_reachable( /// Form basic blocks from filtered CFG nodes.
/// /// Returns: -/// - blocks_nodes: Vec> — nodes per block (in order) -/// - block_of_node: HashMap — node → block index -/// - block_succs: Vec> — successors per block -/// - block_preds: Vec> — predecessors per block +/// - blocks_nodes: Vec>, nodes per block (in order) +/// - block_of_node: HashMap, node → block index +/// - block_succs: Vec>, successors per block +/// - block_preds: Vec>, predecessors per block fn form_blocks( cfg: &Cfg, entry: NodeIndex, @@ -537,7 +537,7 @@ fn form_blocks( // Discover leaders in BFS order over `cfg`, but skip edges whose // source is a terminating (Return / Throw) node. Walking the raw // `cfg` directly here would re-introduce the bookkeeping - // Return/Throw → fn_exit edges we just stripped — fn_exit (or any + // Return/Throw → fn_exit edges we just stripped, fn_exit (or any // post-return join) would be discovered through them and assigned a // block ID before its true block-level predecessors, breaking the // BFS-forward-pred invariant (`debug_assert_bfs_ordering`). @@ -546,7 +546,7 @@ fn form_blocks( // exception edges entirely (collect_reachable strips them and records // them separately in `exception_edges`). Catch-block nodes are still // in `reachable` and must be discoverable as leaders via the - // try-body → catch path — only the terminating-source bookkeeping + // try-body → catch path, only the terminating-source bookkeeping // edges are bogus. { let mut bfs_queue: VecDeque = VecDeque::new(); @@ -572,7 +572,7 @@ fn form_blocks( // Belt-and-braces: any leader still unvisited gets appended in // CFG-node-index order so block-ID assignment remains // deterministic. We do NOT include the synthetic function-exit - // node when it is unreachable through filtered edges — that + // node when it is unreachable through filtered edges, that // happens whenever every path in the body terminates explicitly // (e.g. a function whose only return is `return buf.toString()` // at the tail). 
Including it would emit an orphan SSA block @@ -760,19 +760,19 @@ pub(crate) fn is_receiver_name(name: &str) -> bool { /// on to emit one [`SsaOp::SelfParam`] (for the leading receiver slot, when /// present) followed by a contiguous run of [`SsaOp::Param { index }`] values /// whose indices 0..N correspond exactly to positional call-site argument -/// positions — no receiver offset required anywhere downstream. +/// positions, no receiver offset required anywhere downstream. /// /// W1.b: every formal parameter gets a Param op even when the body never /// references it directly. Without this, the *first* `obj.f = rhs` on a /// formal `obj` whose body never reads `obj` produces no W1 -/// `field_writes` entry — `var_stacks["obj"]` is empty when the synth +/// `field_writes` entry, `var_stacks["obj"]` is empty when the synth /// Assign runs because no external-use path interned `obj`. Subsequent /// writes work because the synth Assign itself defines `obj`, so the /// gap is exactly the FIRST write. Always emitting a formal Param at /// block 0 closes that gap. fn reorder_external_vars(external: Vec, formal_params: &[String]) -> Vec { if formal_params.is_empty() { - return external; // no reordering — preserve existing alphabetical sort + return external; // no reordering, preserve existing alphabetical sort } let ext_set: HashSet<&str> = external.iter().map(|s| s.as_str()).collect(); let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect(); @@ -789,7 +789,7 @@ fn reorder_external_vars(external: Vec, formal_params: &[String]) -> Vec } // Formal positional params next (declaration order), skipping any // receiver that was already emitted above. 
W1.b: include EVERY - // formal regardless of whether the body uses it externally — an + // formal regardless of whether the body uses it externally, an // unused formal that gets field-written via `obj.cache = rhs` still // needs a Param op so the synth Assign loop sees its prior reaching // def in `var_stacks`. @@ -865,7 +865,7 @@ fn collect_var_defs( /// Returns a `BTreeSet` per block so downstream consumers that iterate /// the set (notably `rename_variables`) observe a deterministic, alphabetical /// order regardless of the underlying hasher state. The Cytron algorithm -/// itself is order-independent — only its observers are. +/// itself is order-independent, only its observers are. fn insert_phis( var_defs: &BTreeMap>, dom_frontiers: &[HashSet], @@ -882,7 +882,7 @@ fn insert_phis( for &f in &dom_frontiers[b] { if has_phi.insert(f) { phi_placements[f].insert(var.clone()); - // Phi is a new definition — add to worklist + // Phi is a new definition, add to worklist if !def_blocks.contains(&f) { worklist.push_back(f); } @@ -945,7 +945,7 @@ fn rename_variables( // empty otherwise so existing per-statement Call lowering is // bit-for-bit unchanged. let mut field_interner = crate::ssa::ir::FieldInterner::new(); - // Pointer-Phase 3 / W1: side-table mapping each synthetic base-update + // Side-table mapping each synthetic base-update // [`SsaOp::Assign`]'s defined value to its `(receiver, field)` pair. // Populated below at the synthetic-Assign emission site. Read by // the taint engine to lift the assign into a structural field WRITE. @@ -968,7 +968,7 @@ fn rename_variables( // `BTreeMap` guarantees a deterministic (alphabetical) iteration order when // pushing phi values onto `var_stacks` and when filling operands on - // successor phis — both sites are observable in SSA numbering if they + // successor phis, both sites are observable in SSA numbering if they // reordered between runs.
let mut phi_values: Vec> = vec![BTreeMap::new(); num_blocks]; @@ -1118,14 +1118,14 @@ fn rename_variables( .any(|l| matches!(l, crate::labels::DataLabel::Source(_))) && info.call.callee.is_none() { - // Pure source (e.g. $_GET, env var) — no callee, so no args to track. + // Pure source (e.g. $_GET, env var), no callee, so no args to track. // Source-labeled calls (e.g. file_get_contents) fall through to Call // so argument taint and sink detection still work. SsaOp::Source } else if info.call.callee.is_some() { let callee = info.call.callee.as_deref().unwrap_or("").to_string(); let (mut args, mut receiver) = build_call_args(info, var_stacks); - // Phase 2: try decomposing chained-receiver method calls + // try decomposing chained-receiver method calls // (`a.b.c()`) into a FieldProj chain plus a bare-method Call // so downstream consumers can read the receiver structure // without re-parsing the callee text. Bails to None on any @@ -1145,7 +1145,7 @@ fn rename_variables( Some((recv_v, bare_method)) => { receiver = Some(recv_v); // Strip any positional arg group that exactly matches the - // chain root identifier — it has been replaced by the + // chain root identifier, it has been replaced by the // FieldProj chain receiver, and re-listing it as an // argument would inflate arity / double-taint. if let Some(base_ident) = callee.split('.').next() { @@ -1175,7 +1175,7 @@ fn rename_variables( // Reassignment kill: a node that defines a variable but has no // uses (operands) and is not a source is a constant/literal // assignment. SSA rename allocates a fresh SsaValue, so - // downstream references see this new (untainted) value — the + // downstream references see this new (untainted) value, the // prior tainted definition is implicitly dead. SsaOp::Const(info.taint.const_text.clone()) } else if info.taint.defines.is_some() { @@ -1217,12 +1217,12 @@ fn rename_variables( // `Assign(uses)` so the SSA carries an explicit pass-through // for the returned/thrown value. 
Without this, the Return // node was lowered as a `Nop` and the terminator-setup - // "last non-Nop body inst" search returned None — producing + // "last non-Nop body inst" search returned None, producing // `Terminator::Return(None)` for a function that visibly // returns an identifier. That broke per-return-path // PathFact narrowing for non-Rust languages where the // returned identifier wasn't computed in the same block - // (e.g. Python `def f(s): return s` — `s` is a Param in + // (e.g. Python `def f(s): return s`, `s` is a Param in // block 0, the Return block itself has no body insts). let uses: SmallVec<[SsaValue; 4]> = info .taint @@ -1250,8 +1250,8 @@ fn rename_variables( } else if info.call.callee.is_some() { let callee = info.call.callee.as_deref().unwrap_or("").to_string(); let (mut args, mut receiver) = build_call_args(info, var_stacks); - // Phase 2: same FieldProj-chain decomposition as the primary - // Call branch above — kept in sync because this fallback + // same FieldProj-chain decomposition as the primary + // Call branch above, kept in sync because this fallback // path also constructs SSA Call ops (used for control-flow // wrapper calls that landed past the earlier match arms). let (final_callee, callee_text) = match try_lower_field_proj_chain( @@ -1342,9 +1342,9 @@ fn rename_variables( // overwrites properly kill taint: if obj.data is re-assigned to a // constant, the base `obj` no longer carries that field's taint. // - // Pointer-Phase 3 / W1: each synthetic Assign also records its - // structural identity into `field_writes` — `(receiver_old_value, - // FieldId(field_name))` — so the taint engine can recognise the + //each synthetic Assign also records its + // structural identity into `field_writes`, `(receiver_old_value, + // FieldId(field_name))`, so the taint engine can recognise the // synthetic assign as a field WRITE and mirror the rhs taint // into the matching `(loc, field)` cell on `SsaTaintState`. 
// The "old" parent value is the reaching def of `parent` BEFORE @@ -1427,9 +1427,9 @@ fn rename_variables( ssa_blocks[block_idx].terminator = if succs.is_empty() { // A block with no successors at the block level is one of: - // (1) a block containing a Throw — terminates with an + // (1) a block containing a Throw, terminates with an // exception; no normal fall-through. - // (2) a block containing a Return — terminates with a value + // (2) a block containing a Return, terminates with a value // (or void). After form_blocks strips the bookkeeping // Seq edge from Return → fn_exit, every explicit-return // block lands here, including `if cond { return X; }` @@ -1458,7 +1458,7 @@ fn rename_variables( let return_info = &cfg[rn]; // Return-value resolution. Mirror the legacy // `has_const_return` path so callers see exactly the same - // SSA shape they did before the merged-return fix — only + // SSA shape they did before the merged-return fix, only // the *terminator* changes (Goto(exit) → Return(_)), not // the value selection. // @@ -1468,7 +1468,7 @@ fn rename_variables( // Emit a synthetic Const inst so taint never leaks // from an unrelated inst earlier in the same block // (regression guard: C-1 inline-return precision). - // (b) Computed / passthrough return — last non-Nop body + // (b) Computed / passthrough return, last non-Nop body // inst. Covers `return foo()` (Call sits before the // Return Nop), `return x + y` (Assign), and the // implicit tail expression collapsed into a single @@ -1476,9 +1476,9 @@ fn rename_variables( // Return carries identifier uses (`return req`, // `return { req.session, ... }`), the SSA defs for // those identifiers are already on the body as - // Param / Assign / Source insts — picking the last + // Param / Assign / Source insts, picking the last // one matches pre-fix behaviour exactly. - // (c) Void / unresolved — `Return(None)`. + // (c) Void / unresolved, `Return(None)`. 
if return_info.taint.uses.is_empty() { let const_text = return_info.taint.const_text.clone(); let const_v = SsaValue(*next_value); @@ -1507,7 +1507,7 @@ fn rename_variables( Terminator::Return(from_body) } } else { - // (3) fn_exit / true fall-off — no Return CFG node in this + // (3) fn_exit / true fall-off, no Return CFG node in this // block. Use the last non-Nop body instruction as the // implicit return value (e.g. the function's tail-position // expression in Rust). @@ -1575,7 +1575,7 @@ fn rename_variables( condition, } } else { - // More than 2 successors — model as a multi-way Switch. + // More than 2 successors, model as a multi-way Switch. // // This replaces the previous `Goto(first)` collapse: the // structured terminator now enumerates every target instead @@ -1594,7 +1594,7 @@ fn rename_variables( // // Scrutinee: use the primary SSA value defined at the last // node in this block when one exists; fall back to - // `SsaValue(0)` (a valid index — SSA numbering is 1-based + // `SsaValue(0)` (a valid index, SSA numbering is 1-based // only conceptually, and value 0 is always present in a // non-empty body) when no value is defined. Downstream // consumers that care about the scrutinee (abstract interp, @@ -1604,7 +1604,7 @@ fn rename_variables( let targets: SmallVec<[BlockId; 4]> = succs.iter().skip(1).map(|&s| BlockId(s as u32)).collect(); let default = BlockId(succs[0] as u32); - // Synthetic ≥3-way fanouts have no per-case literal metadata — + // Synthetic ≥3-way fanouts have no per-case literal metadata; // every entry is None (unknown), so the executor falls back to // first-reachable behavior on this terminator. let case_values: SmallVec<[Option; 4]> = @@ -1815,7 +1815,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec]) { /// predecessor of the block.
/// /// Runs in release builds because phi-operand mismatches are -/// load-bearing for soundness — downstream taint, const, and abstract +/// load-bearing for soundness, downstream taint, const, and abstract /// analyses iterate phi operands by `(pred_blk, value)` pairs, and /// either a missing operand (silent "no contribution" on that edge) /// or a phantom operand (garbage into the join) corrupts analysis @@ -1824,7 +1824,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec]) { /// The invariant is strict equality. Predecessors that carry no /// reaching definition for the phi's variable are filled with the /// [`SsaOp::Undef`] sentinel in `fill_undef_phi_operands`, rather than -/// being dropped — so consumers that look up by `(pred_blk, value)` +/// being dropped, so consumers that look up by `(pred_blk, value)` /// see a real operand for every control-flow edge. fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec]) { use std::collections::HashSet; @@ -1887,7 +1887,7 @@ fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec] /// single shared sentinel instruction ([`SsaOp::Undef`]) synthesized /// at the end of block 0's body. Consumers iterate phi operands by /// `(pred_blk, value)` and therefore see a real operand on every -/// control-flow edge — no implicit "missing = empty" semantics. +/// control-flow edge, no implicit "missing = empty" semantics. 
/// /// The Undef instruction is created lazily (only when at least one phi /// has a gap) so functions with fully-dominating definitions pay zero @@ -1931,7 +1931,7 @@ fn fill_undef_phi_operands( block: BlockId(0), }); // Place the Undef instruction at the end of block 0's body so it - // appears after any synthetic Param / SelfParam emissions — its + // appears after any synthetic Param / SelfParam emissions, its // only role is to anchor the SsaValue; ordering relative to other // body instructions is cosmetic (no consumer depends on its // position, only on the value lookup). @@ -2181,7 +2181,7 @@ mod tests { #[test] fn bfs_ordering_holds_for_linear_cfg() { - // Entry → A → B → Exit — all blocks should satisfy BFS ordering + // Entry → A → B → Exit, all blocks should satisfy BFS ordering let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_node(StmtKind::Entry)); let a = cfg.add_node(NodeInfo { @@ -2409,7 +2409,7 @@ mod tests { /// predecessor and a normal control-flow predecessor must lower to a /// consistent phi. For variables defined before the try (live on /// *both* edges), the phi at the catch block has exactly two operands - /// — one per predecessor — and the release assertion accepts it. + /// (one per predecessor), and the release assertion accepts it. #[test] fn catch_block_join_phi_has_operand_per_live_predecessor() { // Entry → defines `x` → Try → (Seq) → Join ← (Exception via body) Catch @@ -2456,7 +2456,7 @@ mod tests { cfg.add_edge(catch, join, EdgeKind::Seq); cfg.add_edge(join, exit, EdgeKind::Seq); - // Lowering must succeed — the assertion is active in release. + // Lowering must succeed, the assertion is active in release. let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Locate the block containing a phi for `x`; it must be the join @@ -2498,7 +2498,7 @@ mod tests { /// Regression guard for the Undef fill pass. When a variable is /// only defined on one branch of a join (e.g.
a catch-only binding /// rejoining the normal path), the lowering must still emit one - /// phi operand per predecessor — the missing edge becoming a + /// phi operand per predecessor, the missing edge becoming a /// reference to the synthesized `SsaOp::Undef` sentinel rather /// than being dropped. #[test] @@ -2633,7 +2633,7 @@ mod tests { #[should_panic(expected = "SSA phi operand count does not match predecessor count")] fn phi_assertion_helper_rejects_more_operands_than_preds() { // A phi with MORE operands than preds references a nonexistent - // predecessor — unsound because downstream consumers either + // predecessor, unsound because downstream consumers either // panic on the lookup or silently feed garbage taint into the // join. Strict-equality invariant catches this. let dummy_node = NodeIndex::new(0); @@ -2859,7 +2859,7 @@ mod tests { /// to a synthetic exit block. Previously, the bookkeeping /// `Return → fn_exit` `Seq` edge made early-return blocks fall into /// the single-successor `Goto` arm, and the fall-through tail - /// expression's body got merged into the shared exit block — every + /// expression's body got merged into the shared exit block, every /// early-return path therefore appeared to also execute the tail. /// Mirrors the `if cond { return X; } Y` shape that motivated the fix. #[test] @@ -2876,7 +2876,7 @@ mod tests { }); // True branch: return constant. uses=[] + const_text=Some triggers // the literal-return path, ensuring the block emits a synthetic - // Const + Return(Some(_)) — the same shape `return None` / + // Const + Return(Some(_)), the same shape `return None` / // `return String::new()` produces in real Rust code. 
let early_ret = cfg.add_node(NodeInfo { taint: TaintMeta { @@ -2901,7 +2901,7 @@ mod tests { cfg.add_edge(if_node, early_ret, EdgeKind::True); cfg.add_edge(if_node, tail, EdgeKind::False); // Bookkeeping wire-up the real CFG construction performs in - // `build_cfg` — Return / Throw → fn_exit via Seq — so the SSA + // `build_cfg`, Return / Throw → fn_exit via Seq, so the SSA // lowering has to handle it. cfg.add_edge(early_ret, exit, EdgeKind::Seq); cfg.add_edge(tail, exit, EdgeKind::Seq); @@ -2909,7 +2909,7 @@ mod tests { let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap(); // Locate the block containing the early-return CFG node and - // assert it terminates with Return — not Goto(_) into the + // assert it terminates with Return, not Goto(_) into the // shared exit block. let early_block = ssa .blocks @@ -2936,7 +2936,7 @@ mod tests { // The fall-through (tail) block must NOT have the early-return // block as a predecessor. Pre-fix, both the early-return path // and the tail path merged into the shared fn_exit block, so the - // tail's body was reachable from the early-return path — that's + // tail's body was reachable from the early-return path, that's // the merged-return defect. let tail_block = ssa .blocks @@ -2963,7 +2963,7 @@ mod tests { /// `if a || b || c { return X; } Y` must have its rejection body emit a /// `Terminator::Return(_)` and have `succs.is_empty()`. Pre-fix the /// rejection body's String::new() Call shared a block whose only - /// successor was the merged tail — losing the early-return semantics + /// successor was the merged tail, losing the early-return semantics /// entirely and diluting per-return-path PathFact narrowing. 
#[test] fn or_chain_rejection_block_terminates_with_return() { @@ -3093,7 +3093,7 @@ mod tests { } // ───────────────────────────────────────────────────────────────── - // Phase 2: FieldProj chain lowering tests + // FieldProj chain lowering tests // ───────────────────────────────────────────────────────────────── // // These tests pin the contract that `try_lower_field_proj_chain` @@ -3426,7 +3426,7 @@ mod tests { assert!(blocks[0].body.is_empty()); } - // ── End-to-end Phase 2 tests via real tree-sitter parsing ────────── + // ── End-to-end SSA decomposition tests via real tree-sitter parsing ────────── // // These exercise the integration between CFG construction (which sets // `info.call.callee = "c.mu.Lock"`) and SSA lowering. We assert that @@ -3451,7 +3451,7 @@ mod tests { }; // Mirror the production lowering path: function bodies use // lower_to_ssa_with_params so formal parameters get synthetic - // Param/SelfParam injections at block 0 — without them, the + // Param/SelfParam injections at block 0, without them, the // FieldProj chain helper has no SSA root to anchor to. if body.meta.name.is_some() { let func_name = body.meta.name.clone().unwrap_or_default(); @@ -3506,7 +3506,7 @@ mod tests { #[test] fn phase2_e2e_go_chained_receiver_emits_field_proj() { - // Go: `c.writer.header.set(k, v)` — 3-segment receiver, 2 FieldProjs. + // Go: `c.writer.header.set(k, v)`, 3-segment receiver, 2 FieldProjs. // Chain root `c` is a function parameter so it is resolvable. let src = b"package p\nfunc f(c *T, k string, v string) { c.writer.header.set(k, v) }\n"; let body = parse_to_first_body( @@ -3549,7 +3549,7 @@ mod tests { #[test] fn phase2_e2e_python_chained_receiver_emits_field_proj() { - // Python: `obj.client.session.send(p)` — 3-segment receiver. + // Python: `obj.client.session.send(p)`, 3-segment receiver. 
let src = b"def f(obj, p):\n obj.client.session.send(p)\n"; let body = parse_to_first_body( src, @@ -3574,7 +3574,7 @@ mod tests { #[test] fn phase2_e2e_javascript_chained_receiver_emits_field_proj() { - // JS: `obj.foo.bar.baz()` — 3-segment receiver. + // JS: `obj.foo.bar.baz()`, 3-segment receiver. let src = b"function f(obj) { obj.foo.bar.baz(); }"; let body = parse_to_first_body( src, @@ -3592,10 +3592,10 @@ mod tests { #[test] fn phase2_e2e_java_chained_receiver_emits_field_proj() { - // Java: `obj.config.handler.run()` — 3-segment receiver chain through + // Java: `obj.config.handler.run()`, 3-segment receiver chain through // a parameter `obj`. We avoid `this.…` because `this` is a Java // keyword (not an identifier_node) so it isn't extracted as an - // external use — outside Phase 2's scope. + // external use, outside SSA decomposition's scope. let src = b"class C { void f(Object obj) { obj.config.handler.run(); } }"; let body = parse_to_first_body( src, @@ -3620,7 +3620,7 @@ mod tests { #[test] fn phase2_e2e_simple_receiver_no_field_proj() { - // REGRESSION: `obj.foo()` — single-dot receiver. Phase 2 must NOT + // REGRESSION: `obj.foo()`, single-dot receiver. SSA lowering must NOT // decompose this into a FieldProj chain (existing receiver channel // already covers it). Verify the body has zero FieldProj ops and // the Call's callee_text stays None. @@ -3664,7 +3664,7 @@ mod tests { fn phase2_e2e_global_root_chain_still_emits_field_proj() { // REGRESSION-NEGATIVE: when the chain root is a global identifier // (`Math.foo.bar()`), the lowerer's external-var synthesis makes - // `Math` available as a synthetic Param — the chain still + // `Math` available as a synthetic Param, the chain still // decomposes, treating `Math` as the SSA receiver. This is the // semantically correct outcome even for global-rooted chains: the // FieldProj op precisely captures the field-access structure.
@@ -3685,7 +3685,7 @@ mod tests { #[test] fn phase2_e2e_rust_method_call_through_field_emits_field_proj() { - // Rust: `c.mu.lock()` — `c` is a function parameter, `mu` is a field, + // Rust: `c.mu.lock()`, `c` is a function parameter, `mu` is a field, // `lock` is the method. Verifies we generate FieldProj for `mu`. // (Rust paths like `std::env::var` use `::` and are excluded by // the helper's complex-token check.) @@ -3782,16 +3782,11 @@ mod tests { ); } - /// Pointer-Phase 3 / W1 end-to-end: lowering an `obj.f = rhs` - /// statement populates `SsaBody.field_writes` with the synthetic - /// base-update Assign's `(receiver, FieldId)` mapping. - /// - /// W1.b: a SINGLE-write shape — `function f(obj) { obj.cache = 42 }` - /// — also populates `field_writes` because every formal gets a - /// Param op at block 0 regardless of whether it's read by the - /// body. Pre-W1.b this required two writes (the second's prior - /// reaching def came from the first synth Assign); now the first - /// write already finds the formal's Param in `var_stacks`. + /// End-to-end: lowering an `obj.f = rhs` statement populates + /// `SsaBody.field_writes` with the synthetic base-update Assign's + /// `(receiver, FieldId)` mapping. A single-write shape suffices: + /// every formal gets a Param op at block 0 so the first write + /// finds the formal in `var_stacks`. #[test] fn w1_end_to_end_field_write_records_side_table_when_parent_has_prior_def() { // Single write to `obj.cache`: the formal `obj` provides the @@ -3816,7 +3811,7 @@ mod tests { } } - /// W1.b: Python — single `obj.cache = 42` on a formal also + /// W1.b: Python, single `obj.cache = 42` on a formal also /// populates `field_writes` thanks to the formal Param op.
#[test] fn w1b_single_write_records_field_write_python() { @@ -3835,7 +3830,7 @@ mod tests { ); } - /// W1.b: Rust — single `obj.cache = 42` on a method-style formal + /// W1.b: Rust, single `obj.cache = 42` on a method-style formal /// (`fn f(obj: &mut O)`) also populates `field_writes`. #[test] fn w1b_single_write_records_field_write_rust() { @@ -3880,11 +3875,11 @@ mod tests { // ───────────────────────────────────────────────────────────────── /// Loop induction variable: `x = x + 1` inside a loop is the - /// canonical SSA challenge — the body uses `x` then redefines it, + /// canonical SSA challenge, the body uses `x` then redefines it, /// and the join with the entry definition must produce a phi that /// distinguishes the entry value from the body's redefinition. - /// Phase 5.2 (induction var pruning) depends on this shape being - /// lowered correctly. + /// Induction-var pruning depends on this shape being lowered + /// correctly. #[test] fn loop_self_assignment_induction_phi_is_distinct() { // Entry → x=0 → Loop header → [Body: use x; x = x_new] → Loop @@ -4101,7 +4096,7 @@ mod tests { /// Variable defined ONLY in one branch of a conditional must be /// undef on the other path. The phi at the join should include an - /// undef sentinel for the missing arm — guards against the + /// undef sentinel for the missing arm, guards against the /// renamer silently dropping the missing operand. #[test] fn conditional_define_only_one_arm_phi_has_undef_operand() { @@ -4137,7 +4132,7 @@ mod tests { // Find a phi for x and verify it has 2 operands. The "undef" // operand can manifest as a Nop-defined SsaValue or a sentinel - // — both are acceptable; the invariant is that arity == preds. + //, both are acceptable; the invariant is that arity == preds. 
let x_phi_ops = ssa .blocks .iter() diff --git a/src/ssa/mod.rs b/src/ssa/mod.rs index 5a489cce..9e2f693e 100644 --- a/src/ssa/mod.rs +++ b/src/ssa/mod.rs @@ -1,4 +1,4 @@ -#[allow(dead_code)] // IR types — fields used by Display impl, tests, and downstream analyses +#[allow(dead_code)] // IR types, fields used by Display impl, tests, and downstream analyses pub mod alias; pub mod const_prop; pub mod copy_prop; diff --git a/src/ssa/param_points_to.rs b/src/ssa/param_points_to.rs index f6a5c2f8..b20c18da 100644 --- a/src/ssa/param_points_to.rs +++ b/src/ssa/param_points_to.rs @@ -6,13 +6,13 @@ //! 1. **Param → Param field writes.** An `obj.field = val` where `obj` //! traces back to parameter `b` and `val` traces back to parameter `a` //! emits a `Param(a) → Param(b)` `MayAlias` edge. This captures the -//! `mutating_helper` pattern — the callee mutates a shared heap cell +//! `mutating_helper` pattern, the callee mutates a shared heap cell //! through one parameter and the caller observes the mutation through //! its argument for that parameter. //! //! 2. **Param → Return aliases.** `Terminator::Return(v)` where `v` //! traces back to a parameter emits a `Param(i) → Return` edge. This -//! captures the `returned_alias` pattern — the callee returns its +//! captures the `returned_alias` pattern, the callee returns its //! argument unchanged and the caller treats the result as aliasing the //! input. //! @@ -25,7 +25,7 @@ //! //! The analysis is **flow-insensitive** and **bounded**: it does not //! reason about path feasibility, and it stops adding edges once the -//! summary's [`MAX_ALIAS_EDGES`] cap is reached — the overflow flag is +//! summary's [`MAX_ALIAS_EDGES`] cap is reached, the overflow flag is //! the conservative fallback that callers honour. use std::collections::{HashMap, HashSet}; @@ -39,7 +39,7 @@ use super::ir::{SsaBody, SsaOp, SsaValue, Terminator}; /// Map an SSA value back to its defining instruction's op. 
/// -/// Local to this module — the taint engine has its own `build_inst_map` +/// Local to this module, the taint engine has its own `build_inst_map` /// that also carries receiver info we do not need, and duplicating it /// keeps this analysis independent of that private helper's shape. fn build_op_map(ssa: &SsaBody) -> HashMap { @@ -73,7 +73,7 @@ struct ParamHit { /// The `SsaOp::Param` index as lowered. ssa_index: usize, /// The parameter's variable name (from [`SsaInst::var_name`]). Used - /// to map back to the formal-declaration position — the caller's + /// to map back to the formal-declaration position, the caller's /// `args[i]` slot is keyed by declaration position, not by SSA /// index, and the two can disagree when a formal parameter is /// skipped from SSA lowering (e.g., pure-output params). @@ -83,7 +83,7 @@ struct ParamHit { /// Walk Assign/Phi chains to find a backing `Param { index }` SSA op. /// /// Returns the `SsaOp::Param`'s index *and* its var_name so callers can -/// resolve the formal-positional index via the name lookup table — the +/// resolve the formal-positional index via the name lookup table, the /// two indices can disagree when SSA lowering skips a formal parameter /// (never used as a read), shifting subsequent param indices down. fn trace_to_param_hit( @@ -144,7 +144,7 @@ fn param_hit_to_formal_index(hit: &ParamHit, params_by_name: &HashMap &str { let dot = name.find('.'); @@ -170,7 +170,7 @@ fn is_receiver_name_local(name: &str) -> bool { /// Returns `true` the first time a qualifying allocation is found. /// Parameter-terminated paths, `Call` ops that are not container /// constructors, and constants that are not container literals all -/// return `false` — soundly under-approximating, since the caller will +/// return `false`, soundly under-approximating, since the caller will /// simply fall back to the existing `Param(i) → Return` / store-into- /// heap channels when the flag is absent. 
fn trace_to_fresh_alloc( @@ -225,7 +225,7 @@ fn returns_fresh_allocation( /// /// `param_info` carries one `(param_index, param_name, param_ssa_value)` /// tuple per formal parameter that was emitted as [`SsaOp::Param`] in the -/// lowered body. The receiver is intentionally excluded — this table +/// lowered body. The receiver is intentionally excluded, this table /// captures positional parameters only. /// /// `formal_param_names`, when supplied, is the authoritative list of @@ -261,7 +261,7 @@ pub fn analyse_param_points_to( // container constructor for `lang` (`ArrayList`, `dict`, …). // // When at least one return path matches, the callee produces a - // caller-visible fresh heap identity on that path — callers + // caller-visible fresh heap identity on that path, callers // synthesise a `HeapObjectId` keyed on the call result so later // container operations have a stable heap cell. Traces that reach a // parameter are handled by the edge-based `Param(i) → Return` channel @@ -278,7 +278,7 @@ pub fn analyse_param_points_to( return summary; } // Build the name→positional-index map. Summary param indices are - // *positional* — they match the call-site `args[i]` position, which + // *positional*, they match the call-site `args[i]` position, which // excludes the receiver (`self`/`this`). When `formal_param_names` // contains a leading receiver, skip it so the remaining names align // with the SSA `SsaOp::Param { index }` convention. @@ -344,7 +344,7 @@ pub fn analyse_param_points_to( continue; } if src_idx == target_idx { - // Self-alias is uninformative — the caller's + // Self-alias is uninformative, the caller's // arg-to-itself propagation is already covered by // `param_to_return`/`param_to_sink`. continue; @@ -532,7 +532,7 @@ mod tests { (5usize, "capture".to_string(), SsaValue(0)), (1usize, "b".to_string(), SsaValue(1)), ]; - // formal_param_count = 2 — index 5 is out of range. + // formal_param_count = 2, index 5 is out of range. 
let s = analyse_param_points_to(&body, &pinfo, 2, None, None); assert!( s.is_empty(), @@ -570,7 +570,7 @@ mod tests { .map(|i| (i, format!("p{i}"), SsaValue(i as u32))) .collect(); // Only the first traced param is emitted (trace_to_param short- - // circuits on first match), so overflow is not expected — we + // circuits on first match), so overflow is not expected, we // instead verify the bounded behaviour: a single edge. let s = analyse_param_points_to(&body, &pinfo, n as usize, None, None); assert!(!s.overflow); diff --git a/src/ssa/pointsto.rs b/src/ssa/pointsto.rs index 669ba149..950c8b94 100644 --- a/src/ssa/pointsto.rs +++ b/src/ssa/pointsto.rs @@ -14,7 +14,7 @@ use smallvec::SmallVec; #[derive(Clone, Debug, PartialEq, Eq)] pub enum ContainerOp { /// Taint flows from the listed argument positions into the receiver - /// container (e.g. `arr.push(val)` — val taint merges into arr). + /// container (e.g. `arr.push(val)`, val taint merges into arr). /// /// `index_arg`: when `Some(pos)`, the argument at that logical position /// is the container index/key. If constant-propagation proves it a @@ -27,11 +27,11 @@ pub enum ContainerOp { /// Taint flows from the receiver container to the call's return value /// (e.g. `arr.pop()`, `items.join('')`). /// - /// `index_arg`: same semantics as `Store::index_arg` — when present and + /// `index_arg`: same semantics as `Store::index_arg`, when present and /// provably constant, loads from `HeapSlot::Index(n)`. Load { index_arg: Option }, /// Taint flows from the receiver container into the argument at - /// `dest_arg` — i.e. the "writeback" pattern where a method writes its + /// `dest_arg`, i.e. the "writeback" pattern where a method writes its /// decoded/loaded value into a caller-supplied destination rather than /// returning it. 
Used for the Go `*.Decode(&dest)` family /// (`json.Decoder.Decode`, `xml.Decoder.Decode`, `gob.Decoder.Decode`), @@ -121,16 +121,16 @@ fn classify_js(method: &str) -> Option { match method { // Array store "push" | "unshift" => store(0), - // Map/Set store: map.set(key, value) — key at 0, value at 1 + // Map/Set store: map.set(key, value), key at 0, value at 1 "set" => store_indexed(1, 0), "add" => store(0), // set.add(value) // Array/Map load "pop" | "shift" => load(), "join" | "flat" | "concat" | "slice" | "toString" => load(), - // map.get(key) — key at 0 + // map.get(key), key at 0 "get" => load_indexed(0), "values" | "keys" | "entries" => load(), - // Pointer-Phase 6 / W5: synthetic callees emitted by CFG + //synthetic callees emitted by CFG // lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`). "__index_get__" => load_indexed(0), "__index_set__" => store_indexed(1, 0), @@ -142,7 +142,7 @@ fn classify_python(method: &str) -> Option { match method { // List store "append" | "extend" => store(0), - "insert" => store_indexed(1, 0), // list.insert(index, value) — index at 0, value at 1 + "insert" => store_indexed(1, 0), // list.insert(index, value), index at 0, value at 1 // Set store "add" => store(0), // Dict store @@ -150,10 +150,10 @@ fn classify_python(method: &str) -> Option { "setdefault" => store2(0, 1), // dict.setdefault(key, default) // List/Dict load "pop" => load(), - "get" => load_indexed(0), // dict.get(key) / list index — key/index at 0 + "get" => load_indexed(0), // dict.get(key) / list index, key/index at 0 "items" | "values" | "keys" => load(), "join" => load(), - // Pointer-Phase 6 / W5: synthetic callees emitted by CFG + //synthetic callees emitted by CFG // lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`). 
"__index_get__" => load_indexed(0), "__index_set__" => store_indexed(1, 0), @@ -165,11 +165,11 @@ fn classify_java(method: &str) -> Option { match method { // Collection store "add" | "addAll" | "putAll" | "offer" | "push" => store(0), - // ArrayList.set(index, value) — index at 0, value at 1 + // ArrayList.set(index, value), index at 0, value at 1 "set" => store_indexed(1, 0), - // Map.put(key, value) — key at 0, value at 1 + // Map.put(key, value), key at 0, value at 1 "put" => store_indexed(1, 0), - // Collection load: ArrayList.get(index) — index at 0 + // Collection load: ArrayList.get(index), index at 0 "get" => load_indexed(0), "poll" | "peek" | "remove" | "pop" => load(), "stream" | "toArray" | "iterator" => load(), @@ -203,7 +203,7 @@ fn classify_go(method: &str, callee: &str) -> Option { // method-call form has the bytes carried via the receiver, not arg 0, // so it lines up with the writeback contract just like `Decode`. "Decode" | "Unmarshal" => Some(ContainerOp::Writeback { dest_arg: 0 }), - // Pointer-Phase 6 / W5: synthetic callees emitted by CFG + //synthetic callees emitted by CFG // lowering for Go index_expression reads/writes (`arr[i]`, // `m[k] = v`). "__index_get__" => load_indexed(0), @@ -222,7 +222,7 @@ fn classify_ruby(method: &str) -> Option { fn classify_php(method: &str) -> Option { match method { - "array_push" => store(1), // array_push(&$arr, $val) — arr is arg 0, val is arg 1 + "array_push" => store(1), // array_push(&$arr, $val), arr is arg 0, val is arg 1 "array_pop" | "array_shift" | "current" | "next" | "reset" => load(), _ => None, } @@ -232,11 +232,11 @@ fn classify_cpp(method: &str) -> Option { match method { // Mutating container operations. 
// `assign` overwrites the container's contents with the argument - // sequence — modeled as Store so the receiver inherits the argument + // sequence, modeled as Store so the receiver inherits the argument // taint, matching the runtime "the values now live inside this // container" semantics shared with `push_back`/`emplace_back`. "push_back" | "emplace_back" | "insert" | "emplace" | "push" | "assign" => store(0), - // Map/unordered_map insertion: `m.insert_or_assign(k, v)` — value at 1. + // Map/unordered_map insertion: `m.insert_or_assign(k, v)`, value at 1. "insert_or_assign" => store_indexed(1, 0), // Read-only container observers. `find`/`count` return iterators or // counts that carry the container's value taint when queried with a @@ -255,7 +255,7 @@ fn classify_rust(method: &str) -> Option { match method { "push" | "insert" | "extend" => store(0), "pop" | "first" | "last" | "iter" | "remove" => load(), - // vec.get(index) — index at 0 + // vec.get(index), index at 0 "get" => load_indexed(0), _ => None, } @@ -304,7 +304,7 @@ mod tests { } // CVE Hunt Session 2 (Owncast CVE-2023-3188 / CVE-2024-31450 family): - // Go `*.Decode(&dest)` is the canonical streaming-decoder writeback — + // Go `*.Decode(&dest)` is the canonical streaming-decoder writeback , // `json.NewDecoder(r.Body).Decode(&dest)`, `xml.NewDecoder(r).Decode(&out)`, // `gob.NewDecoder(buf).Decode(&v)`. The decoder receiver carries the // source taint and the destination is arg 0; the writeback rule is the @@ -394,7 +394,7 @@ mod tests { } } - // ── C++ Phase 1 additions ────────────────────────────────────── + // ── C++ extras ────────────────────────────────────── #[test] fn cpp_push_back_is_store() { @@ -413,7 +413,7 @@ mod tests { #[test] fn cpp_assign_is_store() { - // vector::assign(args) overwrites the container's contents — the + // vector::assign(args) overwrites the container's contents, the // receiver inherits argument taint just like push_back. 
let op = classify_container_op("v.assign", Lang::Cpp); assert!(matches!(op, Some(ContainerOp::Store { .. }))); @@ -421,7 +421,7 @@ mod tests { #[test] fn cpp_insert_or_assign_indexes_value() { - // map::insert_or_assign(key, value) — value is at arg 1, key at arg 0. + // map::insert_or_assign(key, value), value is at arg 1, key at arg 0. match classify_container_op("m.insert_or_assign", Lang::Cpp) { Some(ContainerOp::Store { value_args, @@ -456,7 +456,7 @@ mod tests { } /// W5: synthetic `__index_get__` is recognised as an indexed load - /// in JS/TS, Python, and Go — driving the index_arg=0 path so a + /// in JS/TS, Python, and Go, driving the index_arg=0 path so a /// constant-key subscript read flows through `HeapSlot::Index(n)`. #[test] fn synth_index_get_classified_as_indexed_load_js_py_go() { @@ -471,7 +471,7 @@ mod tests { } /// W5: synthetic `__index_set__` is recognised as an indexed store - /// in JS/TS, Python, and Go — value at arg 1, index at arg 0. + /// in JS/TS, Python, and Go, value at arg 1, index at arg 0. #[test] fn synth_index_set_classified_as_indexed_store_js_py_go() { for lang in [Lang::JavaScript, Lang::TypeScript, Lang::Python, Lang::Go] { diff --git a/src/ssa/static_map.rs b/src/ssa/static_map.rs index a4e26b68..6a558e9d 100644 --- a/src/ssa/static_map.rs +++ b/src/ssa/static_map.rs @@ -12,7 +12,7 @@ //! where every insert's *value* slot is a syntactic string literal and the //! final lookup is dereffed via a literal fallback (`.unwrap_or(LIT)`). The //! result `cmd` is then provably bounded to the finite set -//! `{V1, V2, …, "safe"}`, regardless of what `k` carries — taint-flavour or +//! `{V1, V2, …, "safe"}`, regardless of what `k` carries, taint-flavour or //! otherwise. Downstream sink suppression consumes this finite set to //! clear SHELL/FILE/SQL injection findings whose payload is proved to be //! metacharacter-free. @@ -24,7 +24,7 @@ //! (e.g. `"table.get(key).copied().unwrap_or"` for `table.get(key).copied() //! 
.unwrap_or("safe")`) and whose `receiver` is the root identifier's SSA //! value. We therefore do not need to walk SSA `.copied()` / `.unwrap_or` -//! instructions as separate hops — pattern-matching on the callee text is +//! instructions as separate hops, pattern-matching on the callee text is //! the source of truth. String-literal arguments that the callee text //! elides (e.g. the fallback `"safe"`) are read from the CFG node's //! `arg_string_literals`, populated during CFG construction. @@ -33,7 +33,7 @@ //! literal-valued inserts, no escape beyond recognised mutate/read methods. //! Any deviation (dynamic insert, callee not in the allow-list, map used as //! a plain argument, map returned, map joined across a phi) invalidates the -//! candidate. Missed detection is safe — it just falls through to existing +//! candidate. Missed detection is safe, it just falls through to existing //! behaviour. use std::collections::{HashMap, HashSet}; @@ -73,15 +73,15 @@ fn is_rust_map_constructor(callee: &str) -> bool { /// Classification of a Call whose receiver is a candidate map. #[derive(Clone, Debug, PartialEq, Eq)] enum MapUse { - /// `{var}.insert(K, V)` — value contributes to the finite domain. + /// `{var}.insert(K, V)`, value contributes to the finite domain. Insert, /// `{var}.get(K)[.copied()|.cloned()|.as_deref()|.as_ref()]*.unwrap_or` - /// — lookup result is bounded by the inserted values plus the fallback + ///, lookup result is bounded by the inserted values plus the fallback /// literal on the CFG node. StaticLookup, /// Whitelisted read-only method (no reference leak). ReadOnly, - /// Anything else — invalidates the map candidate. + /// Anything else, invalidates the map candidate. Escape, } @@ -138,7 +138,7 @@ fn scan_past_balanced_parens(s: &str) -> Option<&str> { /// Return `true` when `s` is a sequence of zero or more identity chain /// methods (`.copied()`, `.cloned()`, `.as_deref()`, `.as_ref()`) followed /// by `.unwrap_or` (and nothing else). 
The trailing arg list of -/// `.unwrap_or` is elided in the callee text — it appears in the CFG node's +/// `.unwrap_or` is elided in the callee text, it appears in the CFG node's /// `arg_string_literals` instead. fn is_identity_chain_ending_in_unwrap_or(mut s: &str) -> bool { const IDENTS: &[&str] = &[".copied()", ".cloned()", ".as_deref()", ".as_ref()"]; @@ -171,7 +171,7 @@ fn resolve_alias(v: SsaValue, aliases: &HashMap) -> SsaValue cur } -/// Run the analysis. Bails out immediately for non-Rust bodies — the current +/// Run the analysis. Bails out immediately for non-Rust bodies, the current /// pattern set only models Rust `std::collections::HashMap`. pub fn analyze( body: &SsaBody, @@ -382,7 +382,7 @@ mod tests { #[test] fn classify_static_lookup_without_identity_chain() { - // `.unwrap_or` directly after `.get(...)` also qualifies — Rust + // `.unwrap_or` directly after `.get(...)` also qualifies, Rust // `HashMap::get` returns `Option<&V>`, so `.unwrap_or(&"safe")` is // syntactically valid and equally bounded. assert_eq!( @@ -401,7 +401,7 @@ mod tests { #[test] fn classify_rejects_unknown_terminator() { - // `.unwrap_or_else(|| …)` is not modelled — closure can return anything. + // `.unwrap_or_else(|| …)` is not modelled, closure can return anything. assert_eq!( classify_map_use("t.get(k).copied().unwrap_or_else", "t"), MapUse::Escape @@ -414,7 +414,7 @@ mod tests { #[test] fn classify_rejects_other_receiver() { - // `other.insert` does not belong to `table` — receiver mismatch. + // `other.insert` does not belong to `table`, receiver mismatch. assert_eq!(classify_map_use("other.insert", "table"), MapUse::Escape); } diff --git a/src/ssa/type_facts.rs b/src/ssa/type_facts.rs index ed761ecd..a5f1aa90 100644 --- a/src/ssa/type_facts.rs +++ b/src/ssa/type_facts.rs @@ -25,23 +25,21 @@ pub enum TypeKind { FileHandle, Url, HttpClient, - /// A local, in-memory collection (HashMap, HashSet, Vec, - /// BTreeMap, …). 
Consumed by the auth analysis sink gate so method - /// calls on variables of this type (`map.insert(...)`) are treated - /// as in-memory bookkeeping rather than cross-tenant sinks. Has no - /// `label_prefix` — it never participates in label-based callee + /// A local, in-memory collection (HashMap, HashSet, Vec, etc.). + /// The auth sink gate uses this so calls like `map.insert(...)` + /// are treated as bookkeeping rather than cross-tenant sinks. No + /// `label_prefix`, never participates in label-based callee /// resolution. LocalCollection, - /// Phase 6: a framework-injected DTO body whose field types are - /// known. Populated only when a parameter is recognised as a typed - /// extractor by a Phase 1-2 matcher AND the DTO class / struct / - /// Pydantic model is resolvable in the current scan scope. - /// Strictly additive — when no DTO definition is found, callers - /// fall through to today's pre-Phase-6 behaviour. + /// A framework-injected DTO body whose field types are known. + /// Populated when a parameter is recognised as a typed extractor and + /// the DTO class / struct / Pydantic model is resolvable in scope. + /// Strictly additive, without a DTO definition, callers fall back + /// to name-only resolution. Dto(DtoFields), } -/// Phase 6: structural carrier for a recognised DTO type. Maps +/// structural carrier for a recognised DTO type. Maps /// declared field names to their inferred [`TypeKind`]. Nested DTOs /// use [`TypeKind::Dto`] recursively. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -82,19 +80,11 @@ impl TypeKind { } } - /// Container name used by the typed call-graph devirtualisation - /// (`docs/typed-call-graph-prompt.md`, Phase 2). - /// - /// Returns the class / impl / module string under which an SSA - /// receiver value of this type would be looked up in - /// [`crate::callgraph::ClassMethodIndex`]. 
Mirrors - /// [`Self::label_prefix`] for the security-relevant abstract - /// types (HttpClient → `"HttpClient"`, DatabaseConnection → - /// `"DatabaseConnection"`, etc.) and additionally returns the DTO - /// class name for [`TypeKind::Dto`] receivers. - /// - /// Scalar / unknown types return `None` — they have no defining - /// container and would not narrow a method-call edge meaningfully. + /// Container name used by typed call-graph devirtualisation , + /// the class / impl / module under which a receiver of this type + /// would be looked up. Returns the DTO class name for `Dto` + /// receivers, label prefixes for known abstract types, `None` for + /// scalars. pub fn container_name(&self) -> Option { if let Some(prefix) = self.label_prefix() { return Some(prefix.to_string()); @@ -105,7 +95,7 @@ impl TypeKind { None } - /// Phase 6: convenience accessor for the inner `DtoFields` if this + /// convenience accessor for the inner `DtoFields` if this /// type is a recognised DTO. pub fn as_dto(&self) -> Option<&DtoFields> { match self { @@ -146,7 +136,7 @@ impl TypeFact { TypeFact { kind, nullable } } - /// Phase 6: factory used by the field-access propagation rule. + /// factory used by the field-access propagation rule. pub(crate) fn from_dto_field(receiver: &TypeKind, field: &str) -> Option { let dto = receiver.as_dto()?; let kind = dto.get(field)?.clone(); @@ -190,10 +180,10 @@ impl TypeFactResult { /// /// Suppression policy: /// * [`TypeKind::Int`] (and float, treated as numeric): suppresses -/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF` — +/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF` , /// numeric values cannot carry the metacharacters required to drive /// any of these injection classes. -/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit — +/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit , /// `true`/`false` cannot carry a payload of any kind. 
pub fn is_type_safe_for_sink( values: &[SsaValue], @@ -245,6 +235,18 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option { Lang::JavaScript | Lang::TypeScript => match suffix { "URL" => Some(TypeKind::Url), "Request" | "XMLHttpRequest" => Some(TypeKind::HttpClient), + // JS built-in collection constructors. `new Map()` / `new Set()` + // / `new WeakMap()` / `new WeakSet()` / `new Array()` produce + // in-memory collections; downstream `m.get(k)` / `m.set(k, v)` + // / `s.add(x)` / `s.has(x)` / `arr.find(p)` are container ops, + // not data-layer reads. Without this mapping the bare verb + // dispatch in `auth_analysis::config::classify_sink_class` + // matches the `get` / `find` / `add` read/mutation indicators + // and over-fires `js.auth.missing_ownership_check` on every + // Map lookup in pure data-manipulation code (excalidraw's + // `elementsMap.get(id)`, `origIdToDuplicateId.get(...)`, + // `groupIdMapForOperation.set(...)` shapes). + "Map" | "Set" | "WeakMap" | "WeakSet" | "Array" => Some(TypeKind::LocalCollection), _ => None, }, Lang::Python => { @@ -334,10 +336,9 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option { Some(TypeKind::DatabaseConnection) } else if is_rust_local_collection_constructor(base) { // Rust std/indexmap/smallvec/dashmap collection - // constructors map to a generic "local collection" type so - // the auth analysis sink gate can recognise - // `let x = factory_fn(); x.insert(..)` even when the RHS - // isn't a syntactic constructor call. + // constructors map to a generic "local collection" type + // so the auth sink gate recognises + // `let x = factory_fn(); x.insert(..)`. Some(TypeKind::LocalCollection) } else { None @@ -421,6 +422,15 @@ fn is_rust_local_collection_constructor(base: &str) -> bool { "FxHashSet", "DashMap", "DashSet", + // `roaring` crate, RoaringBitmap / RoaringTreemap are + // in-memory bitset / bitmap containers (set-of-u32 / + // set-of-u64). 
Used heavily by indexing systems + // (meilisearch's index-scheduler) for `task_ids`, + // `docids`, and similar local-collection bookkeeping. + // Mutations (`insert` / `remove` / `clear`) are container + // ops, not data-layer writes. + "RoaringBitmap", + "RoaringTreemap", ]; const VERBS: &[&str] = &[ "new", @@ -460,11 +470,73 @@ pub fn is_int_producing_callee(callee: &str) -> bool { | "Atoi" | "ParseInt" | "ParseFloat" // Go | "intval" | "floatval" // PHP | "to_i" | "to_f" // Ruby - | "parse" // Rust: `.parse::()` / `.parse().unwrap()` — conservative + | "parse" // Rust: `.parse::()` / `.parse().unwrap()`, conservative // (most Rust .parse() calls target numeric types) ) } +/// Polarity hint for a generic input-validator callee. +/// +/// Most validation idioms route attacker-controlled input through a +/// helper whose result the caller branches on: +/// +/// ```text +/// const err = validateUrlSsrf(child.webhookUrl); // ErrorReturning +/// if (err) throw new Error(err); // false branch → success +/// +/// if (isValid(input)) { use(input); } // BooleanTrueIsValid +/// // true branch → success +/// ``` +/// +/// Without modeling this pattern, a one-statement rewrite of a +/// `validate(x); if(x) ...` guard hides the semantic equivalence to +/// `if (validate(x)) ...` (already classified as ValidationCall). The +/// classifier discriminates only on the textual head of the bare call +///, strict-additive: callees that don't match any pattern return +/// `None` and the engine falls through to its existing behaviour. +/// +/// Motivated by Novu CVE GHSA-4x48-cgf9-q33f +/// (`const ssrfError = await validateUrlSsrf(child.webhookUrl); if (ssrfError) throw`). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InputValidatorPolarity { + /// Returns boolean, truthy means "valid". + BooleanTrueIsValid, + /// Returns null/undefined on success, error/message on failure , + /// truthy means "rejected". 
+ ErrorReturning, +} + +pub fn classify_input_validator_callee(callee: &str) -> Option { + let base = peel_identity_suffix(callee); + let suffix = base.rsplit(['.', ':']).next().unwrap_or(&base); + let lower = suffix.to_ascii_lowercase(); + + // Boolean returners, name typically reads as a predicate + // (`isValid…`, `is_valid_…`, `is_safe…`, `has_valid…`). Truthy + // result → input is valid → TRUE branch carries the validation. + if lower.starts_with("isvalid") + || lower.starts_with("is_valid") + || lower.starts_with("issafe") + || lower.starts_with("is_safe") + || lower.starts_with("hasvalid") + || lower.starts_with("has_valid") + { + return Some(InputValidatorPolarity::BooleanTrueIsValid); + } + + // Error-returning validators, name reads as a verb whose return + // value carries the error description. `validateXxx`, `verifyXxx` + // are the dominant idioms; we deliberately do NOT match `check…` + // here because a name like `checkPermissions` overlaps with auth + // checks (different semantic) and the suppression payoff isn't + // worth the precision risk. + if lower.starts_with("validate") || lower.starts_with("verify") { + return Some(InputValidatorPolarity::ErrorReturning); + } + + None +} + /// Analyze types for all SSA values. /// /// Uses constant propagation results to seed types from known constants, @@ -571,7 +643,7 @@ pub fn analyze_types_with_param_types( | BinOp::Gt | BinOp::GtEq, ) => TypeFact::from_kind(TypeKind::Int), - // Add could be string concatenation — defer to operand types + // Add could be string concatenation, defer to operand types _ => TypeFact::unknown(), } } @@ -587,7 +659,7 @@ pub fn analyze_types_with_param_types( Some(tk) => TypeFact::from_kind(tk.clone()), None => TypeFact::unknown(), }, - // Undef contributes no type information — phi joins + // Undef contributes no type information, phi joins // pick up the type from the other (defined) operand. 
SsaOp::Undef => TypeFact::unknown(), }; @@ -603,7 +675,7 @@ pub fn analyze_types_with_param_types( for block in &body.blocks { // Identity-preserving method calls: pass through receiver's type. - // E.g. `Connection::open(p).unwrap()` — the `.unwrap()` call's type + // E.g. `Connection::open(p).unwrap()`, the `.unwrap()` call's type // fact should mirror the receiver (Result). Only applies // when the current fact is still Unknown so explicit constructor // mappings win. @@ -618,7 +690,7 @@ pub fn analyze_types_with_param_types( continue; } // A numeric-length accessor pinned by the first pass is - // load-bearing for sink suppression — do not let identity- + // load-bearing for sink suppression, do not let identity- // method receiver propagation overwrite the Int fact. if cfg .node_weight(inst.cfg_node) @@ -644,7 +716,7 @@ pub fn analyze_types_with_param_types( } } - // Phase 6.3: FieldProj receiver-driven type narrowing. When + // FieldProj receiver-driven type narrowing. When // SSA lowering decomposed `a.b.c()` into a FieldProj chain, // intermediate FieldProj insts default to `projected_type = // None`. If the receiver value carries a Dto fact and the @@ -701,7 +773,7 @@ pub fn analyze_types_with_param_types( // Copy assignments and binary arithmetic for inst in &block.body { // Preserve the Int fact pinned by the numeric-length-access - // detector in the first pass — copy propagation would replace + // detector in the first pass, copy propagation would replace // it with the receiver's (usually Unknown) type and defeat the // whole point of the accessor rule. if cfg @@ -712,11 +784,11 @@ pub fn analyze_types_with_param_types( } if let SsaOp::Assign(uses) = &inst.op { if uses.len() == 1 { - // Phase 6.3: when the RHS is a single member-access + // when the RHS is a single member-access // expression and the receiver value carries a // `TypeKind::Dto(fields)` fact, route the assignment's // type to the field's declared `TypeKind`. 
Strictly - // additive — falls through to copy-prop when the + // additive, falls through to copy-prop when the // receiver isn't a DTO or the field isn't recorded. let dto_field_fact = cfg .node_weight(inst.cfg_node) @@ -777,7 +849,7 @@ pub fn analyze_types_with_param_types( /// Used for `instanceof` resolution and type-qualified method dispatch. pub struct TypeHierarchy; -/// (subtype, &[supertypes]) — sink-relevant framework types only. +/// (subtype, &[supertypes]), sink-relevant framework types only. static JAVA_HIERARCHY: &[(&str, &[&str])] = &[ ("HttpServletResponse", &["ServletResponse"]), ("HttpServletRequest", &["ServletRequest"]), @@ -853,7 +925,7 @@ impl TypeHierarchy { /// /// Conservative: unknown interfaces → `true` (could satisfy). /// Only [`definitely_not`](GoInterfaceTable::definitely_not) is used for -/// suppression — it returns `true` only when the type provably cannot +/// suppression, it returns `true` only when the type provably cannot /// implement the interface. pub struct GoInterfaceTable; @@ -1147,8 +1219,8 @@ mod tests { assert_eq!(result.get_type(SsaValue(99)), None); } - /// Phase 4: Int-typed values must suppress every type-suppressible - /// cap — including the freshly-added `SSRF` bit. Numeric IDs + /// Int-typed values must suppress every type-suppressible + /// cap, including the freshly-added `SSRF` bit. Numeric IDs /// cannot rewrite a URL host, cannot form path traversal sequences, /// cannot carry SQL/HTML/shell metacharacters. #[test] @@ -1183,7 +1255,7 @@ mod tests { )); } - /// Phase 4: Bool-typed values are even safer than ints — `true` / + /// Bool-typed values are even safer than ints, `true` / /// `false` cannot carry any payload and must suppress every /// type-suppressible cap. #[test] @@ -1207,7 +1279,7 @@ mod tests { } } - /// String-typed values must NOT trigger suppression — they are the + /// String-typed values must NOT trigger suppression, they are the /// canonical injection carrier. 
Regression guard so a future /// change to `is_type_safe_for_sink` does not silently silence /// real String-payload findings. @@ -1349,8 +1421,8 @@ mod tests { } } - /// Audit A3 (companion): mixed-type operand list — only one Int - /// among operands of unknown type — must NOT suppress. The + /// Audit A3 (companion): mixed-type operand list, only one Int + /// among operands of unknown type, must NOT suppress. The /// suppression rule requires every operand to be payload-incompatible. #[test] fn mixed_type_operands_do_not_suppress() { @@ -1366,7 +1438,7 @@ mod tests { )); } - /// Phase 3: Param values seeded from `param_types` must surface + /// Param values seeded from `param_types` must surface /// the right TypeKind for downstream sink suppression. An out-of- /// range index falls back to Unknown (the pre-Phase-3 default). #[test] @@ -1590,6 +1662,47 @@ mod tests { assert_eq!(constructor_type(Lang::Cpp, "printf"), None); } + #[test] + fn constructor_type_javascript_typescript_local_collections() { + // `new Map()` / `new Set()` / `new WeakMap()` / `new WeakSet()` / + // `new Array()` produce in-memory collections. Excalidraw's + // `elementsMap.get(id)` shape (which dominates the + // `js.auth.missing_ownership_check` cluster on JS data-manipulation + // libraries) is suppressed once the receiver type is known. + for lang in [Lang::JavaScript, Lang::TypeScript] { + assert_eq!( + constructor_type(lang, "Map"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + constructor_type(lang, "Set"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + constructor_type(lang, "WeakMap"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + constructor_type(lang, "WeakSet"), + Some(TypeKind::LocalCollection) + ); + assert_eq!( + constructor_type(lang, "Array"), + Some(TypeKind::LocalCollection) + ); + // Existing pre-fix mappings still resolve. 
+ assert_eq!(constructor_type(lang, "URL"), Some(TypeKind::Url)); + assert_eq!( + constructor_type(lang, "XMLHttpRequest"), + Some(TypeKind::HttpClient) + ); + // Negative: unrelated identifiers stay None. + assert_eq!(constructor_type(lang, "Object"), None); + assert_eq!(constructor_type(lang, "Promise"), None); + assert_eq!(constructor_type(lang, "Foo"), None); + } + } + #[test] fn constructor_type_ruby() { // HttpClient @@ -1680,7 +1793,7 @@ mod tests { constructor_type(Lang::Rust, "diesel::SqliteConnection::establish"), Some(TypeKind::DatabaseConnection) ); - // Bare `Connection::open` is accepted — Rust idiom + // Bare `Connection::open` is accepted, Rust idiom // `use rusqlite::Connection; Connection::open(…)` is common, and the // scanner sees the unqualified callee text after import resolution. // Accepting this matches the benchmark fixture `rs-sqli-001`. @@ -1938,9 +2051,9 @@ mod tests { ); } - // ── Phase 6 DTO field-level taint ───────────────────────────────────── + // ── DTO field-level taint ───────────────────────────────────────────── - /// Phase 6: `TypeFact::from_dto_field` returns `Some(field_kind)` + /// `TypeFact::from_dto_field` returns `Some(field_kind)` /// for a DTO receiver whose `fields` map contains the requested /// field, and `None` otherwise. #[test] @@ -1956,7 +2069,7 @@ mod tests { assert!(TypeFact::from_dto_field(&recv, "missing").is_none()); } - /// Phase 6: a non-DTO receiver kind never produces a field fact — + /// a non-DTO receiver kind never produces a field fact , /// `from_dto_field` falls through to the legacy copy-prop path. #[test] fn dto_field_lookup_on_non_dto_returns_none() { @@ -1974,10 +2087,9 @@ mod tests { } } - /// Phase 6: nested DTO — the parent DTO's field type is - /// `TypeKind::Dto`, and `from_dto_field` returns that nested DTO - /// fact directly. Phase 6.3 callers can recurse into the inner - /// fields by following the returned receiver's `as_dto()` chain. 
+ /// Nested DTO, the parent DTO's field type is `TypeKind::Dto`, + /// and `from_dto_field` returns that nested DTO fact directly. + /// Callers can recurse via `as_dto()`. #[test] fn dto_field_lookup_supports_nested_dto() { let mut inner = DtoFields::new("Address"); @@ -1990,7 +2102,7 @@ mod tests { assert_eq!(addr.kind, TypeKind::Dto(inner)); } - /// Phase 6: an empty DTO (class declared but with no inferred + /// an empty DTO (class declared but with no inferred /// fields) never resolves field reads. Documents the safe-fallback /// invariant so the legacy path runs when class fields couldn't be /// classified. @@ -2000,9 +2112,8 @@ mod tests { assert!(TypeFact::from_dto_field(&recv, "anything").is_none()); } - /// Phase 6: an `Int`-typed field in a DTO survives the - /// type-suppression matrix exactly the same way a freestanding - /// `Int` does — sanity-check the bridge between Phase 6 and Phase 4. + /// An `Int`-typed DTO field survives the type-suppression matrix + /// the same way a freestanding `Int` does. #[test] fn dto_int_field_suppresses_sql_query_via_matrix() { use crate::labels::Cap; diff --git a/src/state/domain.rs b/src/state/domain.rs index 572ab639..da59c87a 100644 --- a/src/state/domain.rs +++ b/src/state/domain.rs @@ -168,8 +168,8 @@ impl Lattice for AuthDomainState { /// (e.g. `"c.mu"`, `"c.writer.header"`) so distinct field projections /// of the same chain root are tracked independently. /// -/// Chain-keyed proxy state is the Phase 3 replacement for the single-dot -/// band-aid that conservatively dropped chain receivers entirely — chain +/// Chain-keyed proxy state is the DTO replacement for the single-dot +/// band-aid that conservatively dropped chain receivers entirely, chain /// receivers are now first-class, semantically distinct from their root. #[derive(Clone, Debug, PartialEq, Eq)] pub struct ChainProxyState { @@ -192,18 +192,12 @@ pub struct ProductState { /// operation (e.g., fs.openSync at line 7) rather than the proxy call. 
pub proxy_acquire_spans: HashMap, /// Per-chain-receiver proxy tracking, keyed by joined chain text - /// (`"c.mu"`, `"c.writer.header"`). Each chain receiver has its own - /// lifecycle, class group, and acquire span — independent of both the - /// chain root and any other chain. Phase 3 of the field-projections - /// rollout introduces this map; consumers that previously used - /// [`receiver_class_group`] for chain receivers (via the deleted - /// single-dot band-aid) now route through here for 2+ dot callees. + /// (`"c.mu"`, `"c.writer.header"`). Each chain receiver has its own + /// lifecycle, class group, and acquire span, independent of both + /// the chain root and any other chain. /// - /// Phase 3 ships chain_proxies in tracking-only mode: chain receivers - /// that remain OPEN at exit are NOT promoted to leak findings (so the - /// addition is strictly behaviour-preserving against the existing - /// benchmark). Phase 4 / a follow-up adds chain-rooted leak findings - /// once the receiver-class detection is broad enough to avoid new FPs. + /// Tracking-only: chain receivers that remain OPEN at exit are NOT + /// promoted to leak findings. pub chain_proxies: HashMap, } @@ -386,7 +380,7 @@ mod tests { // the laws also need to hold on the *actual* impls used by the // engine. A change to ResourceLifecycle's bitset semantics or to // AuthLevel's ordering could quietly break commutativity / - // associativity / idempotence — these tests pin those properties. + // associativity / idempotence, these tests pin those properties. #[test] fn resource_lifecycle_join_laws() { @@ -424,7 +418,7 @@ mod tests { /// `AuthLevel` satisfies idempotence, commutativity, and associativity /// of `join` (which is `min` of the privilege ordering). 
It does NOT - /// satisfy the `Lattice` trait's bot-identity law — see the explicit + /// satisfy the `Lattice` trait's bot-identity law, see the explicit /// `auth_level_bot_is_absorbing_not_identity` test below for a /// rationale and a regression guard. #[test] @@ -459,14 +453,14 @@ mod tests { /// * therefore `Admin.join(Unauthed) == Unauthed`, not `Admin` /// /// In other words, `Unauthed` is the *absorbing* element of the join, - /// not the identity — the algebraic dual of what the trait expects. + /// not the identity, the algebraic dual of what the trait expects. /// /// This is intentional for security: if any incoming path is unauthed, /// the merged state must be unauthed (the conservative baseline). The /// trait contract violation matters only if the dataflow engine ever /// joins `bot()` with a non-bot reachable state from a different path /// (e.g. for an unreachable predecessor); in the current engine such - /// nodes are skipped, so the violation is observably benign — but + /// nodes are skipped, so the violation is observably benign, but /// documenting it here prevents an accidental "fix" that flips /// `bot()` to `Admin` and silently elevates auth across all merges. #[test] @@ -506,7 +500,7 @@ mod tests { /// `AuthDomainState::join` keeps a variable as `validated` only if /// it was validated on *every* incoming path. A variable validated - /// on one branch but not the other must be dropped — otherwise an + /// on one branch but not the other must be dropped, otherwise an /// auth bypass on one path silently authorises sinks on the merge /// path. 
#[test] diff --git a/src/state/engine.rs b/src/state/engine.rs index f6826805..83b84940 100644 --- a/src/state/engine.rs +++ b/src/state/engine.rs @@ -89,7 +89,7 @@ pub fn run_forward>( converged = false; break; } - // Budget exceeded but transfer requested continuation — mark non-converged + // Budget exceeded but transfer requested continuation, mark non-converged converged = false; } @@ -100,7 +100,7 @@ pub fn run_forward>( let edges: Vec<_> = cfg.edges(node).map(|e| (*e.weight(), e.target())).collect(); - // No outgoing edges — nothing to propagate (exit/dead end). + // No outgoing edges, nothing to propagate (exit/dead end). if edges.is_empty() { continue; } @@ -159,7 +159,7 @@ pub fn run_forward>( let edges: Vec<_> = cfg.edges(node).map(|e| (*e.weight(), e.target())).collect(); if edges.is_empty() { - // Exit / dead end — apply transfer for event collection. + // Exit / dead end, apply transfer for event collection. let info = &cfg[node]; let (_out_state, new_events) = transfer.apply(node, info, None, node_state); events.extend(new_events); @@ -487,7 +487,7 @@ mod tests { assert!(in_wl.insert(n1)); wl.push_back(n1); - // Duplicate n0 — should not insert + // Duplicate n0, should not insert assert!(!in_wl.insert(n0)); // wl still has only 2 entries assert_eq!(wl.len(), 2); @@ -597,7 +597,7 @@ mod tests { } /// Self-loop on a single node: `entry → A → A → … → exit`. The - /// worklist must not livelock — once A's state is stable, the + /// worklist must not livelock, once A's state is stable, the /// back-edge stops re-enqueueing it. 
#[test] fn self_loop_terminates() { diff --git a/src/state/facts.rs b/src/state/facts.rs index 817b9f8b..e9c91320 100644 --- a/src/state/facts.rs +++ b/src/state/facts.rs @@ -15,7 +15,7 @@ fn sanitize_desc(s: &str) -> String { crate::fmt::normalize_snippet(s) } -/// Returns true if `idx` is the terminal exit of a function body — the +/// Returns true if `idx` is the terminal exit of a function body, the /// convergence node where all execution paths join before leaving the function. /// /// **Invariant:** Only terminal exits carry the complete merged lifecycle state @@ -143,7 +143,7 @@ pub fn extract_findings( for (idx, info) in cfg.node_references() { // File-level Exit (program termination, no enclosing function). let is_file_exit = info.kind == StmtKind::Exit && info.ast.enclosing_func.is_none(); - // Terminal function exit — the convergence node where all paths join. + // Terminal function exit, the convergence node where all paths join. // Return nodes are intermediate and carry only path-specific state; // only the terminal exit carries the complete merged lifecycle. let is_func_terminal = is_terminal_function_exit(idx, info, cfg); @@ -167,7 +167,7 @@ pub fn extract_findings( let acquire_node = find_acquire_node(cfg, sym, interner, scope); // At the file-level Exit, skip variables whose acquire site is - // inside a function — those are already handled by the per- + // inside a function, those are already handled by the per- // function exit checks above. Without this, the file-level Exit // would duplicate leak findings with a misleading acquire span // (the first global match instead of the correct function-local one). @@ -296,7 +296,7 @@ pub fn extract_findings( // **Language gate**: this heuristic is JS/TS-specific. Other // languages (Go, Java, C, C++, Python, Rust, Ruby, PHP) use // explicit error returns / try-catch with deterministic control - // flow — an intervening call does NOT silently bypass a release. 
+ // flow, an intervening call does NOT silently bypass a release. // Firing this on Go gave the gin/context.go FP where any method // calling another method (`c.Set`, `c.Get`) was flagged as a // possible leak on the receiver. Skip the section but continue @@ -374,7 +374,7 @@ pub fn extract_findings( // (PathFact `dotdot=No && absolute=No`). A web handler // reading a sanitised user-controlled path is not the // same shape as a handler reading any user-controlled - // path — the auth concern reduces once the data cannot + // path, the auth concern reduces once the data cannot // escape into a privileged location. Note this is per // CFG-node span, so co-located unrelated sinks are // unaffected. @@ -455,7 +455,7 @@ fn is_web_entrypoint_simple( ) -> bool { let name_lower = func_name.to_ascii_lowercase(); - // Skip bare "main" — it's typically a CLI entry + // Skip bare "main", it's typically a CLI entry if name_lower == "main" { return false; } @@ -695,7 +695,7 @@ mod tests { fn per_body_factory_returned_resource_no_finding() { // Per-body graph: Entry → fopen(f) → return f → Exit // All nodes have enclosing_func=Some("factory"). - // The resource is returned — no leak finding expected. + // The resource is returned, no leak finding expected. let func = "factory"; let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_func_node(StmtKind::Entry, func)); @@ -764,7 +764,7 @@ mod tests { fn per_body_non_returned_resource_leaks() { // Per-body graph: Entry → fopen(f) → return (no uses) → Exit // All nodes have enclosing_func=Some("leaker"). - // Resource is NOT returned — exactly one state-resource-leak expected. + // Resource is NOT returned, exactly one state-resource-leak expected. 
let func = "leaker"; let mut cfg: Cfg = Graph::new(); let entry = cfg.add_node(make_func_node(StmtKind::Entry, func)); diff --git a/src/state/mod.rs b/src/state/mod.rs index 8d646e8e..20dc32cc 100644 --- a/src/state/mod.rs +++ b/src/state/mod.rs @@ -1,3 +1,5 @@ +#![doc = include_str!(concat!(env!("OUT_DIR"), "/state.md"))] + pub mod domain; pub mod engine; pub mod facts; @@ -27,7 +29,7 @@ pub fn classify_auth_decorators(lang: Lang, decorators: &[String]) -> AuthLevel let mut level = AuthLevel::Unauthed; for dec in decorators { let d = dec.to_ascii_lowercase(); - // Admin patterns — match the same static list used by the call-site + // Admin patterns, match the same static list used by the call-site // transfer so decorators and runtime checks agree on privilege. if d.contains("admin") || d.contains("hasrole") || d.contains("superuser") { return AuthLevel::Admin; @@ -73,7 +75,7 @@ pub fn run_state_analysis( // PointsToFacts. When present, the proxy-acquire transfer suppresses // SymbolId attribution on field-aliased receivers (`m := c.mu; // m.Lock()`) and routes them through `chain_proxies` instead. Pass - // `None` to disable — strict-additive. + // `None` to disable, strict-additive. ptr_proxy_hints: Option<&std::collections::HashMap>, ) -> Vec { let _span = tracing::debug_span!("run_state_analysis").entered(); @@ -119,7 +121,7 @@ pub fn run_state_analysis( /// Build resource method summaries by pre-scanning all method bodies for known /// resource acquire/release operations. Only creates summaries for methods whose -/// bodies actually contain matching operations — never infers from names alone. +/// bodies actually contain matching operations, never infers from names alone. 
pub fn build_resource_method_summaries( bodies: &[crate::cfg::BodyCfg], lang: Lang, @@ -140,7 +142,7 @@ pub fn build_resource_method_summaries( }; for (_, info) in body.graph.node_references() { - // Check both Call and Seq (Assignment) nodes — resource operations + // Check both Call and Seq (Assignment) nodes, resource operations // can appear as RHS of assignments (e.g., `this.fd = fs.openSync(...)`). if !matches!( info.kind, diff --git a/src/state/symbol.rs b/src/state/symbol.rs index fd19c022..b666c40d 100644 --- a/src/state/symbol.rs +++ b/src/state/symbol.rs @@ -8,7 +8,7 @@ pub struct SymbolId(pub(crate) u32); /// Function-scope discriminator for symbol interning. /// -/// This provides **function-level isolation only** — not full lexical/block +/// This provides **function-level isolation only**, not full lexical/block /// scope modeling. Variables in different functions with the same name get /// distinct [`SymbolId`]s. Top-level / module-scope code uses `scope: None`. #[derive(Clone, Debug, PartialEq, Eq, Hash)] @@ -21,8 +21,8 @@ struct ScopedKey { /// /// Built once from CFG node `defines`/`uses`, reused throughout analysis. /// Two construction modes: -/// - [`from_cfg`](Self::from_cfg): flat (unscoped) interning — used by taint/SSA pipeline -/// - [`from_cfg_scoped`](Self::from_cfg_scoped): function-scoped interning — used by state analysis +/// - [`from_cfg`](Self::from_cfg): flat (unscoped) interning, used by taint/SSA pipeline +/// - [`from_cfg_scoped`](Self::from_cfg_scoped): function-scoped interning, used by state analysis #[derive(Default)] pub struct SymbolInterner { to_id: HashMap, @@ -43,7 +43,7 @@ impl SymbolInterner { /// scoped key. pub fn intern_scoped(&mut self, scope: Option<&str>, name: &str) -> SymbolId { // Member expressions (e.g. 
`this.fd`, `self.conn`) are shared class/ - // instance state — keep them in the global (None) scope so that + // instance state, keep them in the global (None) scope so that // `open()` and `close()` methods can track the same resource symbol. // Only plain local variables get function-scoped isolation. let effective_scope = if name.contains('.') { None } else { scope }; @@ -70,7 +70,7 @@ impl SymbolInterner { self.to_id.get(&key).copied() } - /// Intern a name (unscoped — equivalent to `intern_scoped(None, name)`). + /// Intern a name (unscoped, equivalent to `intern_scoped(None, name)`). /// /// Used by the taint/SSA pipeline and unit tests that don't need /// function-scope isolation. @@ -78,7 +78,7 @@ impl SymbolInterner { self.intern_scoped(None, name) } - /// Look up a name without interning it (unscoped — equivalent to + /// Look up a name without interning it (unscoped, equivalent to /// `get_scoped(None, name)`). pub fn get(&self, name: &str) -> Option { self.get_scoped(None, name) diff --git a/src/state/transfer.rs b/src/state/transfer.rs index 11665ba9..0b55c4cb 100644 --- a/src/state/transfer.rs +++ b/src/state/transfer.rs @@ -13,19 +13,16 @@ use petgraph::graph::NodeIndex; /// callee isn't a clean dotted member chain (parens, brackets, `::`, /// arrow operators, whitespace, or other complex tokens disqualify it). /// -/// Phase 3 of the field-projections rollout: this is the textual mirror -/// of `try_lower_field_proj_chain` in `src/ssa/lower.rs`. The state -/// engine doesn't yet read SSA bodies (would require threading SSA -/// through the lattice run), so the same parse rules are duplicated -/// here. Both helpers share the contract: a success here implies a -/// FieldProj chain at SSA level (or a direct receiver for the 1-dot -/// case). +/// Textual mirror of `try_lower_field_proj_chain` in +/// `src/ssa/lower.rs`. The state engine doesn't read SSA bodies, so +/// the parse rules are duplicated. 
A success here implies a FieldProj +/// chain at SSA level (or a direct receiver for the 1-dot case). /// -/// **Returns** `Some(("c", "Close"))` for `"c.Close"` (1 dot — the +/// **Returns** `Some(("c", "Close"))` for `"c.Close"` (1 dot, the /// receiver is a bare ident); `Some(("c.mu", "Lock"))` for -/// `"c.mu.Lock"` (2 dots — receiver is a 1-element chain); +/// `"c.mu.Lock"` (2 dots, receiver is a 1-element chain); /// `Some(("c.writer.header", "set"))` for `"c.writer.header.set"` -/// (3 dots — receiver is a 2-element chain). Returns `None` for any +/// (3 dots, receiver is a 2-element chain). Returns `None` for any /// callee shape we can't safely decompose textually. fn try_chain_decompose(callee: &str) -> Option<(&str, &str)> { for ch in callee.chars() { @@ -42,7 +39,7 @@ fn try_chain_decompose(callee: &str) -> Option<(&str, &str)> { return None; } // Reject if any segment in the receiver is empty (leading dot, - // double dots) — same discipline as the SSA-side helper. + // double dots), same discipline as the SSA-side helper. if receiver_text.split('.').any(str::is_empty) { return None; } @@ -50,7 +47,7 @@ fn try_chain_decompose(callee: &str) -> Option<(&str, &str)> { } /// Events emitted during transfer for illegal state transitions. -/// These are NOT lattice values — they become findings in `facts.rs`. +/// These are NOT lattice values, they become findings in `facts.rs`. #[derive(Debug, Clone)] pub struct TransferEvent { pub kind: TransferEventKind, @@ -159,7 +156,7 @@ pub struct ResourceMethodSummary { pub method_name: String, /// Whether this method acquires or releases a resource. pub effect: ResourceEffect, - /// `parent_body_id` of the declaring method — groups methods by class. + /// `parent_body_id` of the declaring method, groups methods by class. pub class_group: crate::cfg::BodyId, /// Span of the actual resource operation (e.g., fs.openSync at line 7). 
pub original_span: (usize, usize), @@ -171,7 +168,7 @@ pub struct DefaultTransfer<'a> { pub interner: &'a SymbolInterner, /// Resource method summaries for cross-body proxy resolution. pub resource_method_summaries: &'a [ResourceMethodSummary], - /// Optional per-body field-only points-to hints — names that resolve + /// Optional per-body field-only points-to hints, names that resolve /// to a value whose entire abstract heap identity is one or more /// [`crate::pointer::AbsLoc::Field`] locations (e.g. `m := c.mu`). /// @@ -225,21 +222,12 @@ impl DefaultTransfer<'_> { .get_scoped(info.ast.enclosing_func.as_deref(), name) } - /// Pointer-Phase 2 hook. Returns `true` when the call has been - /// fully handled as a field-aliased receiver proxy and the rest of - /// `apply_call` should bail. - /// - /// Activates only on single-dot calls (`.`) whose - /// receiver name is recorded with [`crate::pointer::PtrProxyHint::FieldOnly`] - /// in the per-body hint map AND for which a matching - /// [`ResourceMethodSummary`] exists. The acquire/release effect - /// is recorded against `state.chain_proxies` keyed by the receiver - /// name — chain_proxies is a tracking-only lattice today, so leak - /// detection (which only inspects `state.resource`) is suppressed - /// for the alias. Strict-additive: when no hint map is supplied, - /// when the receiver isn't `FieldOnly`, or when no method summary - /// matches, the function returns `false` and the legacy branches - /// run unchanged. + /// Returns `true` when the call was fully handled as a + /// field-aliased receiver proxy and the rest of `apply_call` + /// should bail. Activates on single-dot calls whose receiver is + /// `FieldOnly` in the hint map and that match a + /// [`ResourceMethodSummary`]. The acquire/release effect is + /// recorded against `state.chain_proxies` keyed by receiver name. 
fn try_apply_field_alias_proxy( &self, info: &NodeInfo, @@ -308,13 +296,13 @@ impl DefaultTransfer<'_> { None => return, }; - // ── Pointer-Phase 2: field-aliased receiver fast-path ─────────── + // ── field-aliased receiver fast-path ─────────── // When the receiver name resolves through points-to to a value // whose abstract heap identity is purely `Field(_, _)` (e.g. // `m := c.mu` followed by `m.Lock()`), the receiver is a // sub-object alias rather than a standalone resource handle. - // Routing the entire call into `chain_proxies` here — *before* - // the SymbolId-based direct-acquire/release/proxy branches — + // Routing the entire call into `chain_proxies` here, *before* + // the SymbolId-based direct-acquire/release/proxy branches, // suppresses the FP class where the local `m` would otherwise // be flagged as a leakable resource at function exit. // @@ -385,16 +373,16 @@ impl DefaultTransfer<'_> { // When no direct resource pair matched, check if the callee is a // method wrapper for a known resource operation. // - // Phase 3 (field-projections rollout, 2026-04-25): the previous + // The previous // single-dot band-aid (`callee.matches('.').count() == 1 && // !callee.contains('(')`) silently dropped chained receivers // because the original textual extractor took the chain root as - // receiver — collapsing `c.writer.header().set` to `c` and + // receiver, collapsing `c.writer.header().set` to `c` and // marking `c` as proxy-acquired (the gin/context.go FP class). // // The band-aid is now deleted. Chained-receiver method calls // are routed to a *separate* state map (`chain_proxies`) keyed by - // the joined receiver chain text — so `c.mu.Lock()` acquires + // the joined receiver chain text, so `c.mu.Lock()` acquires // `c.mu` (a chain-receiver entity), not `c`. The chain receiver // is independent of the chain root: leaks/double-closes are // tracked per chain, never propagated up to the root. 
@@ -443,7 +431,7 @@ impl DefaultTransfer<'_> { } else if !direct_acquire && !direct_release { // Single-dot receiver (`.`): existing // SymbolId-based path. Gated on direct_acquire/release - // because it shares state with the direct paths above — + // because it shares state with the direct paths above, // running both would double-transition. Honour the // explicit `info.call.receiver` when it's the same bare // ident, otherwise fall back to the parsed receiver text. @@ -544,7 +532,7 @@ impl DefaultTransfer<'_> { } fn apply_if(&self, info: &NodeInfo, edge: Option, state: &mut ProductState) { - // Determine the "positive edge" — the edge where the underlying + // Determine the "positive edge", the edge where the underlying // (de-negated) condition evaluates to true. // // For `if (is_authenticated(req))`: positive = True edge @@ -558,8 +546,8 @@ impl DefaultTransfer<'_> { // Resource null-check: `if (f)` or `if (!f)` where f is a tracked // resource currently in OPEN state. The "var is falsy" edge means - // the acquisition returned null/zero — no resource was actually - // produced — so subsequent close requirements do not apply on that + // the acquisition returned null/zero, no resource was actually + // produced, so subsequent close requirements do not apply on that // path. Clearing OPEN suppresses the spurious may-leak finding for // the canonical NULL-safe close idiom in C / C++ / similar: // @@ -572,7 +560,7 @@ impl DefaultTransfer<'_> { // // Heuristic conditions: // * condition is a single-variable truth check (no comparisons, - // no calls — `condition_vars.len() == 1` and the trimmed text + // no calls, `condition_vars.len() == 1` and the trimmed text // equals that variable name). // * the var has OPEN in its lifecycle bitset. // * the edge represents "var is falsy" (= !is_positive_edge). 
@@ -595,7 +583,7 @@ impl DefaultTransfer<'_> { if let Some(ref cond) = info.condition_text { let cond_lower = cond.to_ascii_lowercase(); - // Strip leading negation operator for pattern matching — + // Strip leading negation operator for pattern matching, // the edge selection above already encodes the semantics. let cond_inner = if info.condition_negated { cond_lower.trim_start_matches('!').trim_start() @@ -691,7 +679,7 @@ fn is_guard_like(callee: &str) -> bool { } /// True iff the condition is a single-variable truth check (no comparison, -/// no method call, no boolean composition) — the bare `if (f)` or `if (!f)` +/// no method call, no boolean composition), the bare `if (f)` or `if (!f)` /// shape used as a NULL-safe gate around resource access. /// /// Conservative: requires `condition_vars` to have exactly one entry, and @@ -1093,7 +1081,7 @@ mod tests { let mut state = ProductState::initial(); state.resource.set(sym_f, ResourceLifecycle::OPEN); - // `if (!f)` — condition_negated=true, true-edge means f is null + // `if (!f)`, condition_negated=true, true-edge means f is null let info = NodeInfo { kind: StmtKind::If, condition_text: Some("!f".into()), @@ -1232,7 +1220,7 @@ mod tests { #[test] fn auth_token_underscore_camel_boundary_cases() { - // Underscore-joined identifiers are single tokens — must not match interior. + // Underscore-joined identifiers are single tokens, must not match interior. assert!(!condition_contains_auth_token( "req.user_is_authenticated_flag", "is_authenticated" @@ -1259,12 +1247,12 @@ mod tests { "xmiddleware.auth()", "middleware.auth" )); - // Right boundary violation — "middleware.authz" extends past "middleware.auth". + // Right boundary violation, "middleware.authz" extends past "middleware.auth". assert!(!condition_contains_auth_token( "middleware.authz()", "middleware.auth" )); - // "middleware.auth.check" — matcher ends at '.', which is non-ident → matches. 
+ // "middleware.auth.check", matcher ends at '.', which is non-ident → matches. assert!(condition_contains_auth_token( "middleware.auth.check()", "middleware.auth" @@ -1332,7 +1320,7 @@ mod tests { #[test] fn auth_token_boolean_composition() { - // Compound conditions — each token should be individually matchable. + // Compound conditions, each token should be individually matchable. assert!(condition_contains_auth_token( "is_authenticated && is_admin", "is_authenticated" @@ -1352,7 +1340,7 @@ mod tests { } // ───────────────────────────────────────────────────────────────── - // Phase 3: chain-receiver decomposition + chain_proxies tracking + // chain-receiver decomposition + chain_proxies tracking // ───────────────────────────────────────────────────────────────── // // These tests pin the contract that: @@ -1360,12 +1348,12 @@ mod tests { // method, bailing on complex tokens. // 2. The proxy-method routing in `apply_call` records chained // receivers in `state.chain_proxies` (keyed by joined chain - // text) — independent from the chain root's `SymbolId`-based + // text), independent from the chain root's `SymbolId`-based // `state.receiver_class_group` entries. // 3. Single-dot callees still flow through the existing SymbolId // path (regression guard). // 4. The deleted single-dot band-aid no longer suppresses chain - // cases — `c.mu.Lock()` now fires the chain-proxies path + // cases, `c.mu.Lock()` now fires the chain-proxies path // instead of being silently dropped. #[test] @@ -1407,7 +1395,7 @@ mod tests { // the simple `....` shape; helper must bail to // preserve the conservative behaviour the band-aid established. 
for s in [ - "Foo::bar::baz", // Rust path — `::` rules it out + "Foo::bar::baz", // Rust path, `::` rules it out "ptr->field.f", // C arrow operator "obj.f().g", // intermediate call "vec[0].field", // index expression @@ -1431,7 +1419,7 @@ mod tests { #[test] fn chain_proxy_acquire_records_chain_text_not_root() { - // Phase 3 key behaviour: a chained-receiver acquire (`c.mu.Lock()`) + // Key behaviour: a chained-receiver acquire (`c.mu.Lock()`) // records `c.mu` in `state.chain_proxies` and DOES NOT touch the // SymbolId-keyed `receiver_class_group` for the chain root `c`. let mut interner = SymbolInterner::new(); @@ -1481,7 +1469,7 @@ mod tests { assert_eq!(entry.class_group, crate::cfg::BodyId(7)); assert_eq!(entry.acquire_span, (10, 20)); - // Root `c` is NOT marked in receiver_class_group — the gin/context FP + // Root `c` is NOT marked in receiver_class_group, the gin/context FP // the band-aid was guarding against can no longer reappear. assert!( state.receiver_class_group.is_empty(), @@ -1564,7 +1552,7 @@ mod tests { #[test] fn chain_proxy_distinct_chains_dont_collide() { // `c.mu.Lock()` and `c.other.Lock()` are independent chain - // receivers — each gets its own entry in chain_proxies. + // receivers, each gets its own entry in chain_proxies. let interner = SymbolInterner::new(); let class_group = crate::cfg::BodyId(3); @@ -1610,7 +1598,7 @@ mod tests { #[test] fn single_dot_proxy_acquire_uses_symbol_id_path() { // REGRESSION: single-dot callees keep the existing SymbolId-based - // path — `f.acquireMine()` records against + // path, `f.acquireMine()` records against // `receiver_class_group[sym_f]`, NOT `chain_proxies["f"]`. This // preserves all existing 1-dot proxy semantics (leak detection, // finding attribution). @@ -1716,7 +1704,7 @@ mod tests { fn chain_proxy_lattice_join_unions_keys() { // Sanity check: the lattice join unions chain_proxies keys. // Branch A: `c.mu` OPEN. Branch B: `c.other` OPEN. 
Join must - // contain both — this is the dataflow-correctness invariant + // contain both, this is the dataflow-correctness invariant // for chain tracking across branches. use crate::state::lattice::Lattice; let mut a = ProductState::initial(); @@ -1745,7 +1733,7 @@ mod tests { #[test] fn chain_proxy_lattice_join_merges_lifecycle() { - // Same chain key on two branches — the lifecycle is OR-joined + // Same chain key on two branches, the lifecycle is OR-joined // (OPEN ∪ CLOSED). Mirrors the `ResourceLifecycle::join` // bitflag-or semantics already used for SymbolId-based tracking. use crate::state::lattice::Lattice; @@ -1775,7 +1763,7 @@ mod tests { } // ───────────────────────────────────────────────────────────────── - // Pointer-analysis Phase 2: PtrProxyHint::FieldOnly routes + // Pointer-analysis: PtrProxyHint::FieldOnly routes // single-dot proxy-acquire to chain_proxies, suppressing the // SymbolId path that would otherwise mark the field-aliased local // as a leakable resource. @@ -1783,7 +1771,7 @@ mod tests { #[test] fn field_only_hint_routes_single_dot_acquire_to_chain_proxies() { - // Models `m := c.mu; m.Lock()` — `m`'s pt set is `{Field(SelfParam, mu)}`, + // Models `m := c.mu; m.Lock()`, `m`'s pt set is `{Field(SelfParam, mu)}`, // so PtrProxyHint::FieldOnly applies. The acquire must record // `m` in chain_proxies, NOT in receiver_class_group, so the // leak detector does not later flag `m` as an OPEN-at-exit @@ -1845,7 +1833,7 @@ mod tests { #[test] fn field_only_hint_release_transitions_chain_entry_to_closed() { // Acquire + Release pair on the field-aliased local both route - // through chain_proxies — the entry transitions OPEN → CLOSED + // through chain_proxies, the entry transitions OPEN → CLOSED // exactly as the existing chain-receiver path does. 
let mut interner = SymbolInterner::new(); let _sym_m = interner.intern_scoped(None, "m"); @@ -1909,7 +1897,7 @@ mod tests { #[test] fn no_hint_falls_through_to_existing_symbol_id_path() { // REGRESSION: when `ptr_proxy_hints` is `None`, the single-dot - // proxy-acquire branch behaves exactly as today — the SymbolId + // proxy-acquire branch behaves exactly as today, the SymbolId // path fires, `chain_proxies` stays empty. Strict-additive // contract: pointer analysis disabled ⇒ no behavioural change. let mut interner = SymbolInterner::new(); @@ -1951,7 +1939,7 @@ mod tests { fn empty_hint_map_does_not_redirect() { // REGRESSION: an empty hint map means "every name resolves to // PtrProxyHint::Other". The single-dot branch must fall - // through to the SymbolId path — not silently route to + // through to the SymbolId path, not silently route to // chain_proxies because the map happened to be empty. let mut interner = SymbolInterner::new(); let sym_f = interner.intern_scoped(None, "f"); diff --git a/src/summary/mod.rs b/src/summary/mod.rs index b647d7bf..9fb123e8 100644 --- a/src/summary/mod.rs +++ b/src/summary/mod.rs @@ -11,31 +11,10 @@ use std::hash::{Hash, Hasher}; // ── Sink site (primary sink-location attribution) ─────────────────────── -/// A single dangerous-instruction site recorded inside a function's body. -/// -/// `SinkSite` pairs a [`Cap`] (the bits this particular site consumes) with -/// the file-relative source location of the instruction that consumes them. -/// Carrying this alongside a summary's `param_to_sink` map lets cross-file -/// findings attribute the finding line to the actual dangerous call inside -/// the callee, rather than to the caller's call-site (which is all a -/// bare `(param_idx, Cap)` pair could support). -/// -/// Primary sink-location attribution stores this data in the summary so -/// `build_taint_diag()` can consume it and overwrite the caller-site -/// `Finding.line` when the sink was resolved via summary. 
-/// -/// Fields -/// ────── -/// * `file_rel` — the callee file's path relative to the workspace root -/// being scanned. Matches the `FuncKey::namespace` convention so the -/// site's origin is addressable without additional workspace context. -/// * `line` / `col` — 1-based source coordinates of the sink instruction. -/// `0` indicates the extractor could not resolve coordinates (e.g. a -/// pass-2 transient summary without tree access). -/// * `snippet` — the trimmed source line, capped at 120 characters, empty -/// when coordinates could not be resolved. -/// * `cap` — the [`Cap`] bits this specific site consumes. A parameter's -/// total sink caps is the union across every site associated with it. +/// A single dangerous-instruction site inside a function's body. +/// Pairs a [`Cap`] with the source location of the consuming +/// instruction so cross-file findings can attribute to the callee +/// rather than the caller call-site. #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] pub struct SinkSite { #[serde(default, skip_serializing_if = "String::is_empty")] @@ -50,19 +29,14 @@ pub struct SinkSite { } impl SinkSite { - /// Dedup key comparing the full identity of a site. Two sites with the - /// same `(file_rel, line, col, cap)` describe the same consumption of - /// the same bits at the same source location and should collapse when - /// summaries are merged. + /// Dedup key: two sites with the same `(file_rel, line, col, cap)` + /// describe the same consumption and collapse on merge. pub(crate) fn dedup_key(&self) -> (&str, u32, u32, u16) { (self.file_rel.as_str(), self.line, self.col, self.cap.bits()) } - /// Build a site that only carries a [`Cap`] — no resolved source - /// coordinates. Used by extraction paths that have no tree/bytes - /// context (e.g. pass-2 transient summaries), so downstream consumers - /// unioning caps across sites still see the correct bits even when - /// primary-location attribution is not available. 
+ /// Build a cap-only site for extraction paths with no tree/bytes + /// context (pass-2 transient summaries). pub fn cap_only(cap: Cap) -> Self { Self { file_rel: String::new(), @@ -75,13 +49,8 @@ impl SinkSite { } /// Tree/bytes context for resolving a CFG span to a [`SinkSite`]. -/// -/// Summary extraction runs deep inside the taint engine, far from the -/// `ParsedFile` that owns the tree; `SinkSiteLocator` is the narrow -/// reference bundle the extractor needs to populate `SinkSite.line`, -/// `col`, and `snippet`. The struct is intentionally plain references -/// so construction is free and threading it as `Option<&Locator>` is -/// cheap. +/// Threaded as `Option<&Locator>` so extraction paths without tree +/// access can pass `None` cheaply. pub struct SinkSiteLocator<'a> { pub tree: &'a tree_sitter::Tree, pub bytes: &'a [u8], @@ -89,10 +58,8 @@ pub struct SinkSiteLocator<'a> { } impl<'a> SinkSiteLocator<'a> { - /// Resolve a `(start_byte, end_byte)` span to a [`SinkSite`] with the - /// given `cap`. Coordinates fall back to `(0, 0)` and the snippet to - /// empty when the byte offset is out of range (should not happen for - /// spans that came from the same tree). + /// Resolve a span to a [`SinkSite`]. Coordinates fall back to + /// `(0, 0)` and the snippet to empty when out of range. pub fn site_for_span(&self, span: (usize, usize), cap: Cap) -> SinkSite { let byte = span.0; let point = self @@ -148,7 +115,7 @@ pub(crate) fn union_param_sink_sites( /// Real disambigs come from `tree_sitter::Node::start_byte` (see /// `cfg.rs:fn_disambig`), which is a byte offset into the source file. /// Source files in practice are far below 2 GiB, so bit 31 of a real -/// disambig is always zero — setting it marks a value as synthetic and +/// disambig is always zero, setting it marks a value as synthetic and /// keeps it in a disjoint namespace from byte-offset disambigs. 
const SYNTHETIC_DISAMBIG_BIT: u32 = 0x8000_0000; @@ -160,17 +127,17 @@ const SYNTHETIC_DISAMBIG_BIT: u32 = 0x8000_0000; /// to disambiguate same-name overloads and method calls at resolution time /// without having to re-parse the raw callee string. /// -/// * `name` — the raw callee text as it appeared in source +/// * `name`, the raw callee text as it appeared in source /// (`"obj.method"`, `"env::var"`, `"helper"`). Preserved for diagnostics. -/// * `arity` — number of positional arguments at the call site. `None` +/// * `arity`, number of positional arguments at the call site. `None` /// when splats / keyword-args / rest-params make the count unreliable. -/// * `receiver` — structured receiver identifier for method calls +/// * `receiver`, structured receiver identifier for method calls /// (e.g. `"obj"` in `obj.method()`). Carries the root receiver for /// chained calls; `None` for non-method or complex receivers. -/// * `qualifier` — the segment immediately before the leaf for non-method +/// * `qualifier`, the segment immediately before the leaf for non-method /// qualified calls (e.g. `"env"` in `env::var`). Extracted once at CFG /// time rather than re-parsed downstream. -/// * `ordinal` — the per-function call ordinal matching +/// * `ordinal`, the per-function call ordinal matching /// `CallMeta.call_ordinal`, allowing cross-file consumers to address a /// specific call site rather than just a callee name. #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] @@ -293,15 +260,15 @@ pub struct FuncSummary { // ── Taint behaviour ────────────────────────────────────────────────── // Stored as raw `u16` so serde doesn't need to know about `bitflags`. - /// Caps this function **introduces** — i.e. the return value carries + /// Caps this function **introduces**, i.e. the return value carries /// freshly‑tainted data even if no argument was tainted. 
pub source_caps: u16, - /// Caps this function **cleans** — passing tainted data through this + /// Caps this function **cleans**, passing tainted data through this /// function strips the corresponding bits. pub sanitizer_caps: u16, - /// Caps this function **consumes unsafely** — calling it with tainted + /// Caps this function **consumes unsafely**, calling it with tainted /// arguments that still carry these bits is a finding. pub sink_caps: u16, @@ -309,7 +276,7 @@ pub struct FuncSummary { #[serde(default)] pub propagating_params: Vec, - /// Legacy field — kept only for deserialising old JSON from SQLite. + /// Legacy field, kept only for deserialising old JSON from SQLite. /// New code should use `propagating_params` instead. #[serde(default, skip_serializing)] pub propagates_taint: bool, @@ -317,7 +284,7 @@ pub struct FuncSummary { /// Indices of parameters that flow to internal sinks (0‑based). pub tainted_sink_params: Vec, - /// Per-parameter [`SinkSite`] records — mirrors + /// Per-parameter [`SinkSite`] records, mirrors /// [`SsaFuncSummary::param_to_sink`] so the coarse legacy summary also /// carries primary sink-location attribution through the two-pass /// architecture. Empty when the extractor lacked tree access. @@ -394,7 +361,7 @@ pub struct FuncSummary { /// /// Empty for files with no declared inheritance / impl /// relationships and for Go (which uses implicit interface - /// satisfaction — Phase 6 does not try to compute it). + /// satisfaction, not computed). /// /// **Per-file duplication.** Every `FuncSummary` produced from a /// given file carries the **same** `hierarchy_edges` vector so the @@ -457,7 +424,7 @@ pub enum CalleeResolution { Resolved(FuncKey), /// No candidates found at all. NotFound, - /// Multiple candidates — ambiguous, cannot pick one. + /// Multiple candidates, ambiguous, cannot pick one. 
Ambiguous(Vec), } @@ -470,19 +437,19 @@ pub enum CalleeResolution { /// /// Hint categories, ordered from strongest to weakest: /// -/// * `receiver_type` — authoritative class/impl/module name (e.g. from +/// * `receiver_type`, authoritative class/impl/module name (e.g. from /// type inference or a `use ...` resolution). When set, the resolver /// *requires* the callee's container to equal this name and refuses to /// fall back to a leaf-name collision if the qualified lookup misses. -/// * `namespace_qualifier` — syntactic qualifier parsed from the callee +/// * `namespace_qualifier`, syntactic qualifier parsed from the callee /// (e.g. `"env"` in `env::var`, `"http"` in `http.Get`). Treated as a /// container hint but not authoritative: a miss falls through. -/// * `receiver_var` — syntactic receiver variable name (e.g. `"obj"` in +/// * `receiver_var`, syntactic receiver variable name (e.g. `"obj"` in /// `obj.method()`). Soft hint, used only to tie-break ambiguity. -/// * `caller_container` — caller's own enclosing container, used to +/// * `caller_container`, caller's own enclosing container, used to /// resolve bare self-calls inside a class/impl body. /// -/// `arity` is a hard filter — when `Some`, every candidate whose arity +/// `arity` is a hard filter, when `Some`, every candidate whose arity /// differs is excluded from consideration. #[derive(Debug, Clone)] pub struct CalleeQuery<'a> { @@ -502,7 +469,7 @@ pub struct CalleeQuery<'a> { /// `std::env::var` in Rust the caller passes `"env"`; for `http.Get` /// in Go, `"http"`. Left `None` for purely bare calls. pub namespace_qualifier: Option<&'a str>, - /// Syntactic receiver variable name. Used only as a tie-breaker — a + /// Syntactic receiver variable name. Used only as a tie-breaker, a /// variable name is a weak proxy for a class name. pub receiver_var: Option<&'a str>, /// Positional-argument count at the call site. Hard filter when set. 
@@ -527,14 +494,14 @@ impl<'a> CalleeQuery<'a> { /// /// Functions are partitioned by language + namespace + name + arity. Two /// functions with the same bare name but different languages or namespaces -/// are stored separately — no implicit cross-language merging occurs. +/// are stored separately, no implicit cross-language merging occurs. /// /// A secondary index `(Lang, name)` supports fast lookup by language + name /// for same-language resolution in the taint engine. #[derive(Default)] pub struct GlobalSummaries { by_key: HashMap, - /// Bare leaf-name index — kept for compatibility with callers that only + /// Bare leaf-name index, kept for compatibility with callers that only /// see an unqualified call string. A single name may map to many keys /// across containers / files / arities. by_lang_name: HashMap<(Lang, String), Vec>, @@ -548,7 +515,7 @@ pub struct GlobalSummaries { /// `module_path` set. Used by use-map driven resolution to look up /// candidates by their crate-relative module rather than their /// filesystem path. Same name / module / arity overloads land on the - /// same vector — arity narrowing happens at resolution time. + /// same vector, arity narrowing happens at resolution time. by_rust_module: HashMap<(String, String), Vec>, /// Precise SSA-derived per-parameter summaries, keyed by `FuncKey`. /// These take precedence over `FuncSummary` during callee resolution. @@ -562,14 +529,14 @@ pub struct GlobalSummaries { /// pass 1 and consumed by /// [`crate::auth_analysis::run_auth_analysis`] during pass 2. auth_by_key: HashMap, - /// Phase 6 type hierarchy index for runtime virtual-dispatch fan-out. + /// Type hierarchy index for runtime virtual-dispatch fan-out. /// /// Installed by [`Self::install_hierarchy`] after pass 1 from the /// merged `FuncSummary::hierarchy_edges` vectors. 
Consumed by /// [`Self::resolve_callee_widened`] during pass 2 so the taint /// engine sees every concrete implementer of a method when the /// receiver is statically typed as a super-class / trait / - /// interface — recovering the dispatch precision that today's + /// interface, recovering the dispatch precision that today's /// single-result [`Self::resolve_callee`] discards. /// /// `None` until installed: every consumer treats `None` as @@ -590,7 +557,7 @@ impl GlobalSummaries { /// Identity collisions are extraordinarily rare in practice (they /// require two structurally distinct functions to land on the same /// non-synthetic key, e.g. both with `disambig: None`). The loop - /// bound is defensive — if synthetic probing still collides after + /// bound is defensive, if synthetic probing still collides after /// 1024 attempts we fall through and let the caller merge, which /// degrades gracefully to the old behaviour rather than looping /// forever. @@ -619,12 +586,12 @@ impl GlobalSummaries { /// SSA-summary variant of [`Self::reconcile_func_summary_key`]. /// /// Distinctness signals for SSA summaries are weaker than for - /// coarse `FuncSummary`s — the summary itself carries no explicit + /// coarse `FuncSummary`s, the summary itself carries no explicit /// `param_count`, only references to parameter indices. We combine: /// - /// * **Key arity fit** — any parameter index referenced by the new + /// * **Key arity fit**, any parameter index referenced by the new /// summary that exceeds `key.arity` is a structural mismatch. - /// * **Existing-entry compare** — if an entry already lives at + /// * **Existing-entry compare**, if an entry already lives at /// this key and it disagrees on the set of referenced parameter /// indices, the two cannot both describe the same function. 
fn reconcile_ssa_summary_key(&self, mut key: FuncKey, summary: &SsaFuncSummary) -> FuncKey { @@ -856,7 +823,7 @@ impl GlobalSummaries { pub fn merge(&mut self, other: GlobalSummaries) { // `insert` rebuilds every secondary index (by_lang_name, by_lang_qualified, // by_rust_module) from the summary itself, so we do not need to copy - // `other.by_rust_module` explicitly — draining `other.by_key` is enough. + // `other.by_rust_module` explicitly, draining `other.by_key` is enough. for (key, summary) in other.by_key { self.insert(key, summary); } @@ -874,7 +841,7 @@ impl GlobalSummaries { } // Hierarchy index: invalidate after a merge so the next consumer // sees a freshly-built view that includes `other`'s edges. The - // alternative — point-merging two indexes — is racy when the + // alternative, point-merging two indexes, is racy when the // same `(lang, super)` key carries different sub-orderings in // each input; rebuild is O(n) over `by_key.iter()` and is the // single source of truth. @@ -889,9 +856,9 @@ impl GlobalSummaries { /// caller genuinely wants the new one to replace the old. /// /// When the existing entry is **incompatible** with the incoming - /// one — the key's `arity` disagrees with the new summary's referenced + /// one, the key's `arity` disagrees with the new summary's referenced /// parameter indices, or the two summaries would describe different - /// functions — we synthesize a disambig so both are kept. Silent + /// functions, we synthesize a disambig so both are kept. Silent /// replacement in that case would drop one function's cross-file /// taint signal entirely, which the caller cannot recover. 
/// @@ -906,23 +873,21 @@ impl GlobalSummaries { /// `ssa_summary_fits_arity` would reject the summary and /// `reconcile_ssa_summary_key` would synthesise a disambig that /// uncouples the SSA FuncKey from the matching FuncSummary FuncKey - /// (audit gap A.2.1.G1 — + /// (audit gap A.2.1.G1 , /// `project_typed_callgraph_audit_gap_ssa_disambig.md`). pub fn insert_ssa(&mut self, key: FuncKey, summary: SsaFuncSummary) { // The summary may reference a parameter index ≥ `key.arity` when // scoped SSA lowering synthesised `Param` ops for **external // captures** (free identifiers like `this`, module imports, - // unresolved method names) — see audit gap A.2.1.G1 + // unresolved method names), see audit gap A.2.1.G1 // (`project_typed_callgraph_audit_gap_ssa_disambig.md`). These // synthetic refs are useful inside the file they were extracted - // in (the caller's implicit-uses argument group at the same - // index aligns with the synthetic Param) and stay useful when - // resolved cross-file by name from this map (the same - // implicit-uses alignment applies). But they would trip - // [`ssa_summary_fits_arity`] inside [`reconcile_ssa_summary_key`], - // forcing a synthetic disambig that uncouples the SSA FuncKey - // from the matching FuncSummary FuncKey — and Phase 3's - // `summaries.get_ssa(caller_key)` lookup (consuming + // in (caller implicit-uses align with the synthetic Param) and + // stay useful when resolved cross-file by name. But they trip + // [`ssa_summary_fits_arity`] inside + // [`reconcile_ssa_summary_key`], forcing a synthetic disambig + // that uncouples the SSA FuncKey from the FuncSummary FuncKey + //, `summaries.get_ssa(caller_key)` (consuming // `typed_call_receivers` at the FuncSummary-aligned key) would // miss. // @@ -930,23 +895,22 @@ impl GlobalSummaries { // arity): // // * **No existing entry, or existing entry also has out-of-range - // refs** — keep the (untrimmed) summary at the original key, - // bypassing the disambig synthesis. 
Phase 3 finds the entry - // under the FuncSummary's own disambig; cross-file resolvers - // find the same entry with its full per-param signal - // (closures, lambdas, captured-var sinks). The "existing also + // refs**, keep the untrimmed summary at the original key, + // bypassing disambig synthesis. Resolution finds the entry + // under the FuncSummary's own disambig with its full + // per-param signal (closures, lambdas, captured-var sinks). The "existing also // has out-of-range refs" branch covers the iterative-rescan // case where round 2's incoming summary lands on top of round // 1's already-installed copy of the same function. // - // * **Existing entry fits arity (legit) but new doesn't** — fall + // * **Existing entry fits arity (legit) but new doesn't**, fall // back to the disambig synthesis. This preserves the // `insert_ssa_arity_overflow_rekeys` invariant: a structurally // incompatible incoming summary (different function sharing // name + container + arity, with param refs at indices that // don't even exist in the legitimate function) cannot // dethrone the existing entry by silent overwrite. Both - // summaries survive — the existing one at the original key, + // summaries survive, the existing one at the original key, // the new one at the synthesised disambig. let key = if key.arity.is_some() && !ssa_summary_fits_arity(&summary, key.arity) { let existing_also_overflows = self @@ -1044,7 +1008,7 @@ impl GlobalSummaries { } /// Count of cross-file bodies currently loaded. Exposed for - /// `tracing::debug!` observability — lets callers distinguish "no + /// `tracing::debug!` observability, lets callers distinguish "no /// bodies available" from "bodies available but inline didn't fire". pub fn bodies_len(&self) -> usize { self.bodies_by_key.len() @@ -1081,7 +1045,7 @@ impl GlobalSummaries { /// /// Returns `(source_caps, sanitizer_caps, sink_caps, propagating_params)` /// per key. 
Used by the SCC fixed-point loop to detect when an iteration - /// has not changed any summary — i.e. convergence. + /// has not changed any summary, i.e. convergence. pub fn snapshot_caps(&self) -> HashMap)> { self.by_key .iter() @@ -1127,7 +1091,7 @@ impl GlobalSummaries { /// `(wildcard_prefix, name)` in the module index. If across all /// wildcards exactly one arity-filtered candidate appears → resolved. /// 3. Otherwise fall through to [`resolve_callee_key_with_container`] - /// with no `container_hint` — meaning only the existing namespace / + /// with no `container_hint`, meaning only the existing namespace / /// arity disambiguation applies. /// /// A `None` use_map (non-Rust file or no `use` declarations) makes this @@ -1229,7 +1193,7 @@ impl GlobalSummaries { /// Resolve a callee name with an optional container hint. /// - /// Legacy entry point — kept so tests and older callers compile + /// Legacy entry point, kept so tests and older callers compile /// unchanged. `container_hint` is interpreted as a syntactic /// container qualifier (not an authoritative receiver type), so a /// miss is allowed to fall through to leaf-name lookup. New @@ -1261,35 +1225,35 @@ impl GlobalSummaries { /// **New resolution order** (qualified identity primary, leaf name /// fallback): /// - /// 1. **Receiver-type qualified** — if `receiver_type` is set, + /// 1. **Receiver-type qualified**, if `receiver_type` is set, /// consult `by_lang_qualified[{receiver_type}::{name}]` with the /// arity filter. Exactly-one → resolved; same-namespace /// tie-breaker if multiple. *Receiver types are authoritative*: /// a miss does not fall back to bare leaf lookup (that would be /// a silent reinterpretation). - /// 2. **Namespace-qualifier qualified** — if `namespace_qualifier` + /// 2. **Namespace-qualifier qualified**, if `namespace_qualifier` /// is set, try the qualified index with that container. /// Non-authoritative: a miss falls through. - /// 3. 
**Caller-self-container** — when the caller lives inside a + /// 3. **Caller-self-container**, when the caller lives inside a /// container (method body), try the qualified index against the /// caller's own container. Resolves bare `foo()` self-calls /// inside a class without collapsing into an unrelated same-leaf /// definition in another file. - /// 4. **Same-namespace unique leaf** — intra-file bare-leaf call: + /// 4. **Same-namespace unique leaf**, intra-file bare-leaf call: /// if the caller's namespace contains exactly one arity-matched /// candidate with this leaf, resolve to it. - /// 5. **Receiver-variable tie-break** — if the same-namespace + /// 5. **Receiver-variable tie-break**, if the same-namespace /// lookup misses but the raw call came with a receiver variable, /// try `{receiver_var}::{name}` as a last qualified attempt. /// - /// 5.5. **Bare-call free-function preference** — for a truly bare + /// 5.5. **Bare-call free-function preference**, for a truly bare /// call (no receiver type, no namespace qualifier, no receiver /// variable), if exactly one same-namespace arity-matched /// candidate has an empty container, resolve to it. A class /// method cannot be invoked with bare-call syntax from outside /// its class, so this disambiguation is safe even when same-name /// methods exist elsewhere in the file. - /// 6. **Leaf-name fallback** — arity-filtered same-language lookup. + /// 6. **Leaf-name fallback**, arity-filtered same-language lookup. /// Unique → resolved. Multiple + we had any qualified hint → /// Ambiguous (refuse to guess when a qualifier exists but /// missed). Multiple + no qualified hint → narrow by namespace, @@ -1411,7 +1375,7 @@ impl GlobalSummaries { // outside its own class (intra-class self-calls were already // resolved by step 3). 
When the same-namespace candidate set // contains exactly one empty-container entry, it is the - // unambiguous target — returning Ambiguous here would be a + // unambiguous target, returning Ambiguous here would be a // silent false negative whenever a top-level helper happens to // share a name with some method elsewhere in the file. let syntactic_bare = q.receiver_type.is_none() @@ -1434,7 +1398,7 @@ impl GlobalSummaries { } // Multiple arity-matched candidates remain. When a qualified - // hint was supplied but missed, refuse to guess — a silent + // hint was supplied but missed, refuse to guess, a silent // leaf-name pick would defeat the point of qualified-first // resolution. (`receiver_type` is handled in Step 1 and never // reaches here; `namespace_qualifier` / `caller_container` @@ -1443,7 +1407,7 @@ impl GlobalSummaries { return CalleeResolution::Ambiguous(arity_filtered.into_iter().cloned().collect()); } - // No qualified hints whatsoever — tolerate namespace narrowing. + // No qualified hints whatsoever, tolerate namespace narrowing. match same_ns.len() { 1 => CalleeResolution::Resolved(same_ns[0].clone()), 0 => CalleeResolution::Ambiguous(arity_filtered.into_iter().cloned().collect()), @@ -1452,11 +1416,11 @@ impl GlobalSummaries { } /// Install / refresh the type-hierarchy index from the currently - /// loaded summaries. Idempotent — calling twice rebuilds. + /// loaded summaries. Idempotent, calling twice rebuilds. /// /// Call this once after pass-1 merge (and again whenever /// summary state changes in a way that could affect virtual - /// dispatch — typically: after the call-graph is rebuilt mid-fixed-point). + /// dispatch, typically: after the call-graph is rebuilt mid-fixed-point). /// `merge()` automatically invalidates so a forgotten reinstall /// degrades to today's behaviour rather than a stale lookup. 
pub fn install_hierarchy(&mut self) { @@ -1469,7 +1433,7 @@ impl GlobalSummaries { self.hierarchy.as_ref() } - /// Hard cap on hierarchy fan-out from a single call site — see + /// Hard cap on hierarchy fan-out from a single call site, see /// [`Self::resolve_callee_widened`] for rationale. Public for tests /// that need to assert cap behaviour without hard-coding the value. pub const MAX_HIERARCHY_FANOUT: usize = 8; @@ -1494,14 +1458,14 @@ impl GlobalSummaries { /// /// Hard cap: at most [`Self::MAX_HIERARCHY_FANOUT`] keys are /// returned. When the cap fires, the cap-hit is logged at `debug` - /// and the tail impls are silently dropped — over-fanning is a + /// and the tail impls are silently dropped, over-fanning is a /// precision-tax knob, not a soundness one. /// /// Empty result + non-empty `subs` triggers a /// secondary fall-through to [`Self::resolve_callee`] so a /// type-fact misclassification (receiver typed as a super-class /// that has no method by this name on any sub) does not silently - /// regress to "no resolution at all" — the leaf-name path can still + /// regress to "no resolution at all", the leaf-name path can still /// pick up a match. This preserves the /// "subset of today's targets, never a superset" rule under /// hierarchy-aware resolution failure. @@ -1584,7 +1548,7 @@ impl GlobalSummaries { // Hierarchy widening produced nothing (e.g., none of the // recorded sub-types declare this method). Fall back to // today's qualified-first resolver so the misclassified- - // type case still finds a leaf match — the same + // type case still finds a leaf match, the same // "preserve today's behaviour on miss" rule the call-graph // builder applies. return single_fallback(); @@ -1615,15 +1579,15 @@ impl std::fmt::Debug for GlobalSummaries { /// /// Comparison rules /// ──────────────── -/// * **`param_count` / `kind` / `container`** — unconditional agreement. +/// * **`param_count` / `kind` / `container`**, unconditional agreement. 
/// Any mismatch is a hard collision between distinct functions. -/// * **`file_path`** — agree when both sides are populated. A blank path +/// * **`file_path`**, agree when both sides are populated. A blank path /// can come from synthetic summaries constructed in tests / interop /// configs and should not force a split. -/// * **`param_names`** — agree when both sides are populated. Legacy +/// * **`param_names`**, agree when both sides are populated. Legacy /// summaries may persist with empty names; treating empty as "unknown" /// avoids gratuitous splits while still catching real divergence. -/// * **`module_path`** — Rust-only. Agreed when both sides are `Some`. +/// * **`module_path`**, Rust-only. Agreed when both sides are `Some`. /// A missing module path on one side is legacy-compatible; two *distinct* /// `Some` values mean the two summaries belong to different crates' /// module trees. @@ -1653,7 +1617,7 @@ pub(crate) fn summaries_compatible(a: &FuncSummary, b: &FuncSummary) -> bool { /// Derive a deterministic synthetic disambiguator from the /// identity-relevant fields of a `FuncSummary`. /// -/// The top bit is **not** set here — the caller composes the final value +/// The top bit is **not** set here, the caller composes the final value /// via `SYNTHETIC_DISAMBIG_BIT | (hash & !SYNTHETIC_DISAMBIG_BIT)` so that /// (a) the caller can safely bump the low bits to probe for a free slot, /// and (b) the synthetic namespace stays disjoint from byte-offset @@ -1678,7 +1642,7 @@ pub(crate) fn synthesize_disambig(summary: &FuncSummary) -> u32 { /// `SsaFuncSummary` carries no explicit `param_count`; we approximate /// it via the maximum parameter index referenced by either summary. 
/// Two summaries are compatible when neither references a parameter -/// index the other cannot — an upward compatibility check, so a refined +/// index the other cannot, an upward compatibility check, so a refined /// summary that merely adds flows for previously-silent parameters is /// still considered compatible. fn ssa_summaries_compatible( diff --git a/src/summary/points_to.rs b/src/summary/points_to.rs index c466a80e..4bf24223 100644 --- a/src/summary/points_to.rs +++ b/src/summary/points_to.rs @@ -17,15 +17,15 @@ //! //! Edges are directed `AliasEdge { source, target, kind }`: //! -//! * `Source(Param(i)) → Target(Param(j))` — the callee stores data +//! * `Source(Param(i)) → Target(Param(j))`, the callee stores data //! derived from parameter `i` into a field/element of parameter `j`. //! Mutation is observable to the caller through its argument for `j`. -//! * `Source(Param(i)) → Target(Return)` — the return value aliases +//! * `Source(Param(i)) → Target(Return)`, the return value aliases //! parameter `i`'s heap identity. Adds heap-level precision on top of //! the coarser [`TaintTransform::Identity`] view already carried in //! [`crate::summary::ssa_summary::SsaFuncSummary::param_to_return`]. //! -//! `MustAlias` is intentionally omitted — the ROI on +//! `MustAlias` is intentionally omitted, the ROI on //! must-alias inference for cross-file summaries is low, and the soundness //! story for `MayAlias`-only application is straightforward ("take the //! union"). @@ -35,7 +35,7 @@ //! Edge count is capped at [`MAX_ALIAS_EDGES`]. When a callee's alias //! graph exceeds the cap the summary records `overflow = true` and //! callers treat the function as "any tainted parameter may spread to -//! every other parameter and to the return" — the conservative +//! every other parameter and to the return", the conservative //! greatest-lower-bound over the alias lattice. 
use serde::{Deserialize, Serialize}; @@ -46,7 +46,7 @@ use smallvec::SmallVec; /// Parameters are identified by their 0-based positional index as reported /// by [`crate::ssa::ir::SsaOp::Param`]; the implicit receiver (`self`/`this`) /// is handled outside this table and is deliberately not representable here. -/// `Return` denotes the function's return SSA value — one per function, so +/// `Return` denotes the function's return SSA value, one per function, so /// no further qualifier is needed. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] pub enum AliasPosition { @@ -57,7 +57,7 @@ pub enum AliasPosition { } /// Strength of an alias edge. Only [`AliasKind::MayAlias`] is emitted -/// — the analysis over-approximates identity-level aliasing rather than +///, the analysis over-approximates identity-level aliasing rather than /// proving must-alias. The variant is kept as an enum so a future /// extension that distinguishes the two can slot in without migrating /// on-disk data. @@ -94,7 +94,7 @@ pub const MAX_ALIAS_EDGES: usize = 8; /// Parameter-granularity alias summary persisted in /// [`crate::summary::ssa_summary::SsaFuncSummary`]. /// -/// The summary is empty by default — functions without any parameter / +/// The summary is empty by default, functions without any parameter / /// return aliasing (pure transformers, sinks that consume but don't /// mutate their arguments) carry no edges and cost nothing on disk. /// @@ -109,13 +109,13 @@ pub struct PointsToSummary { /// tracking deserialise cleanly (no edges). #[serde(default, skip_serializing_if = "SmallVec::is_empty")] pub edges: SmallVec<[AliasEdge; 4]>, - /// Conservative fallback flag — set when extraction hit + /// Conservative fallback flag, set when extraction hit /// [`MAX_ALIAS_EDGES`] and refused to drop any edge silently. When /// `true`, callers treat the callee as "every parameter may alias /// every other parameter and the return value". 
#[serde(default, skip_serializing_if = "core::ops::Not::not")] pub overflow: bool, - /// At least one return path produces a *fresh* container allocation — + /// At least one return path produces a *fresh* container allocation , /// a container literal (`[]`, `{}`) or a known container constructor /// call (`new Map()`, `list()`, …) that does not trace back to any /// parameter. When this is `true` the caller synthesises a fresh @@ -124,8 +124,8 @@ pub struct PointsToSummary { /// the call result (e.g. `bag[0]`, `fillBag(bag, …)`) can find a heap /// cell to read from or store into. /// - /// Closes the factory-pattern cross-file gap — `const bag = makeBag()` - /// followed by `fillBag(bag, env)` and `exec(bag[0])` — by giving the + /// Closes the factory-pattern cross-file gap, `const bag = makeBag()` + /// followed by `fillBag(bag, env)` and `exec(bag[0])`, by giving the /// caller's heap analysis a stable identity to attach stores to. /// Combines freely with `Param(i) → Return` edges: a mixed-return /// function (one branch returns a param, another returns a fresh @@ -136,7 +136,7 @@ pub struct PointsToSummary { } impl PointsToSummary { - /// Empty summary — no aliasing, no overflow. Equivalent to + /// Empty summary, no aliasing, no overflow. Equivalent to /// [`Self::default`] but explicit at call sites. pub fn empty() -> Self { Self::default() @@ -153,7 +153,7 @@ impl PointsToSummary { /// /// Returns `true` when the edge was added, `false` when it was a /// duplicate or when the cap triggered an overflow. The caller can - /// ignore the return — the summary always remains in a valid state. + /// ignore the return, the summary always remains in a valid state. 
pub fn insert(&mut self, source: AliasPosition, target: AliasPosition, kind: AliasKind) { if self.overflow { return; @@ -168,7 +168,7 @@ impl PointsToSummary { } if self.edges.len() >= MAX_ALIAS_EDGES { self.overflow = true; - // Keep the existing edge list — a consumer that still reads + // Keep the existing edge list, a consumer that still reads // the vector gets a strict *subset* of the sound over- // approximation conveyed by `overflow`. Correctness is // owned by the overflow flag; the residual edges are purely @@ -337,7 +337,7 @@ mod tests { } } -// ── Pointer-Phase 5: field-granularity points-to summary ────────────── +// ── field-granularity points-to summary ────────────── /// Maximum field names retained per parameter in [`FieldPointsToSummary`]. /// @@ -345,12 +345,12 @@ mod tests { /// while leaving room for typical helpers (a handful of fields each). pub const MAX_FIELDS_PER_PARAM: usize = 8; -/// Pointer-Phase 5: field-granularity per-parameter points-to summary. +/// field-granularity per-parameter points-to summary. /// /// Records, for each positional parameter index, the set of field /// **names** read from and written to inside the callee body. Names /// (not [`crate::ssa::ir::FieldId`]) are persisted because field IDs -/// are body-local — the per-body [`crate::ssa::ir::FieldInterner`] +/// are body-local, the per-body [`crate::ssa::ir::FieldInterner`] /// reassigns IDs across files. Callers re-intern through their own /// body's interner before consulting `field_taint` cells. /// @@ -359,23 +359,23 @@ pub const MAX_FIELDS_PER_PARAM: usize = 8; /// same indexing convention as `SsaFuncSummary::receiver_to_*` /// (separate channel). /// -/// Empty by default — functions that don't read or write any field on +/// Empty by default, functions that don't read or write any field on /// their parameters carry no entries and cost nothing on disk. 
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct FieldPointsToSummary { - /// `(param_index, field_names_read)` — the callee projected each + /// `(param_index, field_names_read)`, the callee projected each /// listed field on a value derived from `param_index` somewhere /// in its body. Sorted, deduped per-entry. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub param_field_reads: Vec<(u32, SmallVec<[String; 2]>)>, - /// `(param_index, field_names_written)` — the callee assigned to + /// `(param_index, field_names_written)`, the callee assigned to /// each listed field on a value derived from `param_index`. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub param_field_writes: Vec<(u32, SmallVec<[String; 2]>)>, /// Set when the read/write graph hit /// [`MAX_FIELDS_PER_PARAM`] for any parameter. Callers seeing /// `overflow=true` treat each parameter as reading/writing every - /// field on every other parameter — the conservative greatest + /// field on every other parameter, the conservative greatest /// lower bound that preserves soundness. #[serde(default, skip_serializing_if = "core::ops::Not::not")] pub overflow: bool, @@ -441,7 +441,7 @@ impl FieldPointsToSummary { } /// Union with `other`. Overflow propagates per - /// [`PointsToSummary::merge`]'s semantics — once a callee is + /// [`PointsToSummary::merge`]'s semantics, once a callee is /// "any field on any parameter", merging cannot recover precision. pub fn merge(&mut self, other: &Self) { if other.overflow { diff --git a/src/summary/ssa_summary.rs b/src/summary/ssa_summary.rs index cf832bb9..67e79348 100644 --- a/src/summary/ssa_summary.rs +++ b/src/summary/ssa_summary.rs @@ -17,61 +17,35 @@ pub enum TaintTransform { AddBits(Cap), } -/// Maximum [`ReturnPathTransform`] entries retained per parameter. -/// -/// Most functions have one or two return paths; eight is a generous bound -/// that still keeps per-summary memory O(1). 
Beyond the cap, extraction -/// joins the overflow into a single Top-predicate entry so the caller-side -/// application always sees a bounded vector. +/// Cap on per-parameter return-path entries. Overflow is joined into +/// a single Top-predicate entry so callers always see a bounded vec. pub const MAX_RETURN_PATHS: usize = 8; -/// A single return-path entry in a per-parameter summary. -/// -/// Per-return-path decomposition preserves callee-internal path splits that -/// the aggregate [`TaintTransform`] would erase. Each entry records the -/// path predicate under which this return is reached, the behavioural -/// transform on that path, and (optionally) an abstract-domain contribution. -/// -/// Callers carry their own path-state at the call site and apply only -/// entries whose predicate is consistent with the caller's validated set; -/// the remainder are skipped. Applicable entries are joined to produce -/// the effective transform at the call site. -/// -/// When a callee has a single return path, `param_return_paths` stays empty -/// and the caller falls back to `param_to_return`'s union view. +/// One return-path entry in a per-parameter summary. Records the path +/// predicate, the transform on that path, and optionally an abstract +/// contribution. Callers apply only entries consistent with their +/// caller-side path state. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ReturnPathTransform { /// Behavioural kind on this path (Identity / StripBits / AddBits). pub transform: TaintTransform, - /// Deterministic hash of the path-predicate gate at this return. - /// - /// `0` is reserved for "no predicate gate" — a return reached under - /// no known predicate. Two return blocks whose path predicates are - /// observationally equivalent hash to the same value and are joined. + /// Deterministic hash of the path-predicate gate. `0` = no gate. + /// Equivalent predicates collide and are joined. 
pub path_predicate_hash: u64, - /// `PredicateSummary::known_true` bits that must hold on every path - /// into this return. Encoded using [`crate::taint::domain::predicate_kind_bit`]: - /// bit 0 = NullCheck, 1 = EmptyCheck, 2 = ErrorCheck. + /// `known_true` predicate bits (bit 0 = NullCheck, 1 = EmptyCheck, + /// 2 = ErrorCheck) that hold on every path into this return. pub known_true: u8, - /// `PredicateSummary::known_false` bits at this return (same encoding - /// as [`Self::known_true`]). + /// `known_false` bits at this return. pub known_false: u8, - /// Abstract contribution for this return path, when non-Top. - /// - /// Callers combine this with their own abstract fact on the call - /// site's argument using `AbstractValue::meet` to recover bounds that - /// survive a specific return. + /// Abstract contribution when non-Top. Callers `meet` it with the + /// caller-side abstract fact. #[serde(default, skip_serializing_if = "Option::is_none")] pub abstract_contribution: Option, } impl ReturnPathTransform { - /// Dedup key combining the semantic fields of a path entry. Two entries - /// with the same `(path_predicate_hash, transform, known_true, known_false)` - /// describe the same behaviour on paths gated by the same predicate and - /// can collapse without losing information. `abstract_contribution` is - /// deliberately ignored — the dedup path joins the two entries' - /// abstract facts rather than dropping one. + /// Dedup key. `abstract_contribution` is intentionally excluded + ///, colliding entries join their abstract facts. pub fn dedup_key(&self) -> (u64, &TaintTransform, u8, u8) { ( self.path_predicate_hash, @@ -234,7 +208,7 @@ pub struct SsaFuncSummary { /// abstract value. 
At cross-file call sites the caller applies each /// transfer to the corresponding argument's abstract state and joins /// the results (then `meet`s with [`Self::return_abstract`]) to - /// synthesise the return abstract value — recovering interval bounds + /// synthesise the return abstract value, recovering interval bounds /// and string prefixes that would otherwise be lost to the summary's /// Top-seeded baseline. /// @@ -254,8 +228,8 @@ pub struct SsaFuncSummary { /// consistent with the caller's validated set, joining the applicable /// set into the effective call-site transform. /// - /// Empty when the callee has a single return path — the aggregate - /// [`param_to_return`] is already precise — or when extraction + /// Empty when the callee has a single return path, the aggregate + /// [`param_to_return`] is already precise, or when extraction /// could not derive per-return state (e.g. early-exit probes). #[serde(default, skip_serializing_if = "Vec::is_empty")] pub param_return_paths: Vec<(usize, SmallVec<[ReturnPathTransform; 2]>)>, @@ -268,7 +242,7 @@ pub struct SsaFuncSummary { /// each other or the return value. #[serde(default, skip_serializing_if = "PointsToSummary::is_empty")] pub points_to: PointsToSummary, - /// Pointer-Phase 5: field-granularity per-parameter points-to + /// field-granularity per-parameter points-to /// summary. Records which fields the callee reads from / writes /// to on each parameter, so cross-file resolution can spread /// taint through field-level mutations the callee performs on @@ -295,7 +269,7 @@ pub struct SsaFuncSummary { /// Empty for callees whose return blocks produce no non-Top fact, /// or whose single return path makes the aggregate already precise. /// Cross-file callers that cannot pick a specific path fall back to - /// joining the entries — equivalent to the pre-decomposition + /// joining the entries, equivalent to the pre-decomposition /// behaviour. 
#[serde(default, skip_serializing_if = "SmallVec::is_empty")] pub return_path_facts: SmallVec<[PathFactReturnEntry; 2]>, @@ -307,7 +281,7 @@ pub struct SsaFuncSummary { /// non-empty [`crate::ssa::type_facts::TypeKind::container_name`]. /// /// Consumed by [`crate::callgraph::build_call_graph`] to feed - /// `CalleeQuery.receiver_type` for the matching ordinal — letting + /// `CalleeQuery.receiver_type` for the matching ordinal, letting /// the call graph narrow indirect method-call edges to only those /// targets whose defining container matches the inferred type. /// Strictly additive: an empty map means today's name-only diff --git a/src/summary/tests.rs b/src/summary/tests.rs index 556e759b..9b13dd23 100644 --- a/src/summary/tests.rs +++ b/src/summary/tests.rs @@ -580,7 +580,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() { gs.insert_ssa(key.clone(), v1.clone()); assert_eq!(gs.get_ssa(&key), Some(&v1)); - // Replace with a different summary — exact replacement, not union + // Replace with a different summary, exact replacement, not union let v2 = SsaFuncSummary { param_to_return: vec![(0, TaintTransform::StripBits(Cap::HTML_ESCAPE))], param_to_sink: vec![(0, cap_sites(Cap::SQL_QUERY))], @@ -1492,7 +1492,7 @@ fn free_function_and_method_with_same_name_resolve_separately() { assert_eq!(method, CalleeResolution::Resolved(km)); // Without any qualifier, receiver, or receiver_type, a bare - // `process()` call is syntactically a free-function invocation — a + // `process()` call is syntactically a free-function invocation, a // method cannot be invoked that way from outside its class. The // resolver's bare-call preference (step 5.5) picks the sole // empty-container candidate deterministically. @@ -1709,7 +1709,7 @@ fn legacy_callees_string_array_deserializes() { #[test] fn mixed_callee_form_deserializes() { // Interop / partial-migration rows may mix legacy strings with - // structured entries in the same array — deserializer accepts both. 
+ // structured entries in the same array, deserializer accepts both. let json = r#"{ "name": "mixed", "file_path": "m.rs", @@ -1936,7 +1936,7 @@ fn rust_wildcard_import_resolves_uniquely() { #[test] fn rust_use_map_fallback_when_absent() { - // No use_map entry — falls through to generic same-language resolution, + // No use_map entry, falls through to generic same-language resolution, // which for an unqualified caller in the same namespace still works. let helper = rust_summary_with_mod("helper", "/proj/src/lib.rs", 0, Some(""), &[], &[], vec![]); let caller = rust_summary_with_mod( @@ -1960,7 +1960,7 @@ fn rust_use_map_fallback_when_absent() { #[test] fn rust_use_map_ambiguous_stays_ambiguous_without_hint() { - // Two modules define `validate`; no use-map on the caller — resolution + // Two modules define `validate`; no use-map on the caller, resolution // should remain Ambiguous rather than silently picking one. let token = rust_summary_with_mod( "validate", @@ -2135,7 +2135,7 @@ fn query_prefers_receiver_type_over_leaf_collision() { // Old behaviour-parity regression: `resolve_callee_key_with_container` // (now a thin wrapper) used to treat `MessageQueue` as an authoritative // qualifier that *only* picked on exact match. The new resolver must - // still do that — swap to `MessageQueue` and we get its method back. + // still do that, swap to `MessageQueue` and we get its method back. let resolved_queue = gs.resolve_callee(&CalleeQuery { name: "send", caller_lang: Lang::Java, @@ -2164,7 +2164,7 @@ fn query_prefers_receiver_type_over_leaf_collision() { fn query_authoritative_receiver_miss_does_not_fall_through_to_leaf() { // When `receiver_type = HttpClient` is supplied but no // `HttpClient::send` exists, the resolver MUST NOT silently pick a - // same-leaf collision in another container — that would be the + // same-leaf collision in another container, that would be the // classic "resolved by leaf name" bug the refactor aims to prevent. 
let mut gs = GlobalSummaries::new(); let (k_queue, s_queue) = method_summary("src/queue.java", "MessageQueue", "send", 1, 0x02); @@ -2326,7 +2326,7 @@ fn query_caller_container_resolves_self_call() { fn query_leaf_same_namespace_still_resolves_intra_file_calls() { // Two definitions share a leaf name but live in different files. // A same-namespace call (intra-file) must resolve to the local one - // without requiring any structured hint — this is the common case + // without requiring any structured hint, this is the common case // for bare top-level function calls. let mut gs = GlobalSummaries::new(); let (k_a, s_a) = free_summary("src/a.js", "helper", 1, 0x01); @@ -2369,7 +2369,7 @@ fn query_leaf_same_namespace_still_resolves_intra_file_calls() { #[test] fn query_arity_filter_is_hard() { - // Same container and leaf, different arities — resolution must + // Same container and leaf, different arities, resolution must // honour the arity filter before any qualifier-based tie-break. let mut gs = GlobalSummaries::new(); let (k_1arg, s_1arg) = method_summary("src/svc.py", "Svc", "render", 1, 0x01); @@ -2402,7 +2402,7 @@ fn query_arity_filter_is_hard() { assert_eq!(two, CalleeResolution::Resolved(k_2arg)); // With a non-existent arity, arity filter prunes everything and we - // get NotFound — not a "closest match" guess. + // get NotFound, not a "closest match" guess. let mismatched = gs.resolve_callee(&CalleeQuery { name: "render", caller_lang: Lang::Java, @@ -2427,7 +2427,7 @@ fn query_receiver_var_is_soft_tiebreak_not_primary() { // happens to also be called "obj". The old resolver used the // variable name as container_hint #1, which could mis-pick when // the qualified index had a coincidental hit. The new resolver - // treats `receiver_var` as a *soft* tie-break — it only fires + // treats `receiver_var` as a *soft* tie-break, it only fires // after same-namespace unique-leaf resolution fails. 
let mut gs = GlobalSummaries::new(); let (k_same_ns, s_same_ns) = free_summary("src/app.js", "method", 1, 0xAA); @@ -2514,7 +2514,7 @@ fn legacy_wrapper_preserves_test_contract() { gs.insert(k_a.clone(), s_a); // container_hint doesn't match any container, but the leaf name has - // exactly one candidate — the wrapper should still resolve. + // exactly one candidate, the wrapper should still resolve. let resolved = gs.resolve_callee_key_with_container( "only", Lang::Java, @@ -2530,7 +2530,7 @@ fn legacy_wrapper_preserves_test_contract() { // These tests target the most error-prone identity cases: two or more // definitions that share `(lang, namespace, name, arity)` but differ in // `container`. The resolver must either resolve to the exact container -// target or refuse to guess — silently falling back to a same-leaf +// target or refuse to guess, silently falling back to a same-leaf // collision in a different container is a correctness bug, and mis- // ordering the resolution steps can cause either false positives (wrong // summary picked) or false negatives (missed flow because Ambiguous @@ -2542,7 +2542,7 @@ fn same_file_two_classes_same_method_typed_receiver_picks_exact() { // incompatible security behaviour: `Safe::run` is a sanitizer-ish // passthrough (no sink bits) while `Unsafe::run` is a shell sink. // When the caller has a typed receiver (via type inference), the - // resolver must pick the exact class — the wrong pick would either + // resolver must pick the exact class, the wrong pick would either // miss the Unsafe sink or wrongly flag the Safe path. let mut gs = GlobalSummaries::new(); let (k_safe, s_safe) = method_summary("src/app.java", "Safe", "run", 1, 0x00); @@ -2595,7 +2595,7 @@ fn same_file_two_classes_same_method_typed_receiver_picks_exact() { #[test] fn same_file_two_classes_same_method_untyped_receiver_is_ambiguous_not_wrong() { // Same setup as above, but the caller only has a variable-name - // receiver (no type facts). 
`receiver_var` is a SOFT hint — and in + // receiver (no type facts). `receiver_var` is a SOFT hint, and in // the common case `s`/`u` don't match any container. The resolver // MUST refuse to pick one arbitrarily; returning `Safe::run` when // the call was `u.run(...)` would be a silent false negative of the @@ -2635,8 +2635,8 @@ fn same_file_two_classes_same_method_untyped_receiver_is_ambiguous_not_wrong() { #[test] fn same_file_free_function_and_method_bare_call_prefers_free_function() { // Classic "I wrote a top-level helper AND a method with the same - // name in the same file" trap. A bare `process()` call — no - // receiver, no qualifier, caller outside any container — is + // name in the same file" trap. A bare `process()` call, no + // receiver, no qualifier, caller outside any container, is // syntactically a FREE function call; the method cannot be invoked // this way. The resolver MUST resolve to the free function, not // return Ambiguous. @@ -2682,7 +2682,7 @@ fn same_file_method_calling_sibling_free_function_resolves_to_free() { // Variant of the previous test with the caller LIVING INSIDE a // class whose own container does NOT define `process`. Bare // `process()` inside `Runner::kick()` must still resolve to the - // file-local free function — not get lost in Ambiguous because the + // file-local free function, not get lost in Ambiguous because the // caller_container hint (`Runner`) misses both candidates. let mut gs = GlobalSummaries::new(); let (k_free, s_free) = free_summary("src/app.java", "process", 1, 0x0F); @@ -2727,7 +2727,7 @@ fn same_file_method_calling_own_container_sibling_prefers_self_class() { // Inverse of the previous: caller is INSIDE `Worker::other()` and // calls bare `process()`. Both a free `process` AND `Worker::process` // exist in the file. 
The caller's own container resolution (step 3) - // must prefer `Worker::process` — otherwise intra-class self calls + // must prefer `Worker::process`, otherwise intra-class self calls // would get misresolved to a free function with possibly different // security behaviour. let mut gs = GlobalSummaries::new(); @@ -2804,7 +2804,7 @@ fn same_file_nested_container_same_method_disambiguates_by_container() { "`Outer` receiver_type must pick only Outer::foo — not Outer::Inner::foo via prefix match" ); - // Exact cap pinning — guards against merge_summaries accidentally + // Exact cap pinning, guards against merge_summaries accidentally // unioning caps across the two nested keys. assert_eq!(gs.get(&k_inner).unwrap().sink_caps, 0x02); } @@ -2814,7 +2814,7 @@ fn same_file_same_name_different_security_behaviour_no_cap_leak() { // Three `validate/1` entries in the same file: a sanitizer // passthrough (free function), an HTML-escape sanitizer in one // class, and a shell-exec sink in another class. These must end - // up as three distinct keys with their caps preserved exactly — + // up as three distinct keys with their caps preserved exactly: // no merge of sink caps into the sanitizer entry, no cross-leak // via `by_lang_name` fallback. let mut gs = GlobalSummaries::new(); @@ -2873,7 +2873,7 @@ fn same_file_same_name_different_security_behaviour_no_cap_leak() { // (typically `disambig: None` from legacy/interop/DB-loaded summaries) where // the old code silently collapsed structurally distinct functions. -/// Build a minimal `FuncSummary` with `disambig: None` — mirrors the shape +/// Build a minimal `FuncSummary` with `disambig: None`, mirrors the shape /// produced by legacy JSON rows / interop configs that don't know byte /// offsets. `file_path` is left blank so namespace normalisation doesn't /// separate the two otherwise-identical keys.
@@ -2956,7 +2956,7 @@ fn insert_mismatched_module_path_does_not_silently_merge() { .find(|(_, s)| s.module_path.as_deref() == Some("billing::invoice")) .expect("billing::invoice summary preserved"); // Cross-contamination guard: the two crates must not have their - // caps unioned — that's the observable failure mode of a silent + // caps unioned, that's the observable failure mode of a silent // merge. assert_eq!(auth.1.sink_caps, Cap::SHELL_ESCAPE.bits()); assert_eq!(billing.1.sink_caps, Cap::SQL_QUERY.bits()); @@ -2967,7 +2967,7 @@ fn insert_mismatched_module_path_does_not_silently_merge() { #[test] fn insert_mismatched_kind_does_not_silently_merge() { // A free function and a method with the same name, arity, namespace, - // and container ("" vs "") can't actually occur — but kind alone + // and container ("" vs "") can't actually occur, but kind alone // mismatching does happen in interop configs where a getter is // described as a function. Make sure the two end up distinct. let mut gs = GlobalSummaries::new(); @@ -2996,7 +2996,7 @@ fn insert_mismatched_kind_does_not_silently_merge() { let hits = gs.lookup_same_lang(Lang::Java, "size"); assert_eq!(hits.len(), 2); // The getter's sink caps must not have been unioned into the - // function — that would be a security-relevant leak. + // function, that would be a security-relevant leak. let func_hit = hits .iter() .find(|(k, _)| k.kind == FuncKind::Function) @@ -3010,7 +3010,7 @@ fn insert_mismatched_kind_does_not_silently_merge() { #[test] fn insert_mismatched_param_names_does_not_silently_merge() { // Two overloads in Java/C++ with the same arity but different - // parameter types/names — a classic case where arity-only identity + // parameter types/names, a classic case where arity-only identity // collapses distinct functions. Neither summary ships a disambig // because it was loaded from legacy JSON. 
let mut gs = GlobalSummaries::new(); @@ -3052,7 +3052,7 @@ fn insert_mismatched_param_names_does_not_silently_merge() { #[test] fn insert_synthetic_disambig_bit_set_only_for_collisions() { // A single legacy-style insert with `disambig: None` must NOT gain a - // synthetic disambig — we only rekey to resolve collisions, never + // synthetic disambig, we only rekey to resolve collisions, never // speculatively. This prevents downstream lookups keyed with // `disambig: None` from spuriously missing legitimately-single // summaries. @@ -3075,7 +3075,7 @@ fn insert_synthetic_disambig_bit_set_only_for_collisions() { #[test] fn insert_compatible_refinement_still_unions() { // Two summaries describing the same function (structurally identical - // head, differing only on behaviour fields) must still union — the + // head, differing only on behaviour fields) must still union, the // tightened check doesn't regress the classic parallel-fold merge. let mut gs = GlobalSummaries::new(); let a = FuncSummary { @@ -3109,7 +3109,7 @@ fn insert_compatible_refinement_still_unions() { let merged = gs.get(&k).expect("compatible summaries still merge"); assert_eq!(merged.source_caps, Cap::ENV_VAR.bits()); assert_eq!(merged.sink_caps, Cap::SHELL_ESCAPE.bits()); - // Single entry — no accidental split for the compatible case. + // Single entry, no accidental split for the compatible case. let hits = gs.lookup_same_lang(Lang::Rust, "f"); assert_eq!(hits.len(), 1); } @@ -3129,7 +3129,7 @@ fn insert_body_param_count_mismatch_rekeys() { ..Default::default() }; gs.insert_body(key.clone(), make_callee_body(2, 2)); - // Incoming body with a different param_count — must not overwrite. + // Incoming body with a different param_count, must not overwrite. 
gs.insert_body(key.clone(), make_callee_body(5, 4)); // Invariant 1: the original body stays at the original key (not @@ -3164,7 +3164,7 @@ fn insert_body_param_count_mismatch_rekeys() { #[test] fn insert_ssa_arity_overflow_rekeys() { // Key claims arity 1, but the incoming SSA summary references - // param index 3 — structurally impossible for the same function. + // param index 3, structurally impossible for the same function. // The fix must split so the key arity invariant is preserved. let mut gs = GlobalSummaries::new(); let key = FuncKey { @@ -3185,7 +3185,7 @@ fn insert_ssa_arity_overflow_rekeys() { vec![(0, TaintTransform::Identity)] ); - // Bad-arity incoming summary — must not overwrite the legitimate one. + // Bad-arity incoming summary, must not overwrite the legitimate one. let overflowing = SsaFuncSummary { param_to_return: vec![(3, TaintTransform::Identity)], param_to_sink: vec![(2, cap_sites(Cap::SQL_QUERY))], @@ -3207,10 +3207,10 @@ fn insert_ssa_arity_overflow_rekeys() { /// /// This is the case `lower_to_ssa` produces for Java instance/static /// methods that reference free identifiers (e.g. `f.close()` where -/// `close` is treated as an external capture — the synthetic Param 0 +/// `close` is treated as an external capture, the synthetic Param 0 /// then leaks into `param_to_return`/`param_to_sink`). Without the /// audit-gap fix, `reconcile_ssa_summary_key` would synthesise a -/// disambig and Phase 3's `summaries.get_ssa(caller_key)` lookup +/// disambig and the analysis's `summaries.get_ssa(caller_key)` lookup /// (consuming `typed_call_receivers` at the FuncSummary-aligned key) /// would miss. #[test] @@ -3229,7 +3229,7 @@ fn insert_ssa_arity_overflow_keeps_original_key_when_no_collision() { }; let summary = SsaFuncSummary { // Synthetic Param-0 for the external `close` identifier inside - // the static `read()` body — `param_count == 0` per the source- + // the static `read()` body, `param_count == 0` per the source- // level signature. 
param_to_return: vec![(0, TaintTransform::Identity)], typed_call_receivers: vec![(1, "FileHandle".to_string())], @@ -3241,7 +3241,7 @@ fn insert_ssa_arity_overflow_keeps_original_key_when_no_collision() { .get_ssa(&key) .expect("Reader::read SSA must be reachable at the FuncSummary-aligned key"); assert_eq!(kept.typed_call_receivers, summary.typed_call_receivers); - // The synthetic Param-0 reference is preserved verbatim — pass-2 + // The synthetic Param-0 reference is preserved verbatim, pass-2 // analysis still aligns it with the caller's implicit-uses // argument group at the same index. assert_eq!(kept.param_to_return, summary.param_to_return); @@ -3288,7 +3288,7 @@ fn insert_ssa_arity_overflow_iterative_rescan_stays_at_original_key() { assert_eq!(kept.param_to_return, round2.param_to_return); } -// ── Primary sink-location attribution — SinkSite round-trips ──────────── +// ── Primary sink-location attribution, SinkSite round-trips ──────────── #[test] fn sink_site_serde_round_trip_solo() { @@ -3549,7 +3549,7 @@ fn cf4_merge_return_paths_caps_at_max() { "overflow collapses to a single Top-predicate entry" ); // Joined entry has no predicate gate (hash=0) and conservatively takes - // the intersection of all strip bits — which here is HTML_ESCAPE. + // the intersection of all strip bits, which here is HTML_ESCAPE. let joined = &existing[0]; assert_eq!(joined.path_predicate_hash, 0); assert!(matches!( @@ -3626,11 +3626,11 @@ fn cf4_union_param_return_paths_by_index() { #[test] fn cf4_ssa_summary_fits_arity_keeps_out_of_range_path_idx_at_original_key() { // A path whose param index exceeds the key's arity is treated as a - // synthetic external-capture artefact (audit gap A.2.1.G1 — see + // synthetic external-capture artefact (audit gap A.2.1.G1, see // `project_typed_callgraph_audit_gap_ssa_disambig.md`). 
When no // existing entry sits at the key, `insert_ssa` keeps the (untrimmed) // summary at the original key so the SSA FuncKey stays aligned with - // the matching FuncSummary FuncKey — Phase 3's + // the matching FuncSummary FuncKey, the analysis's // `summaries.get_ssa(caller_key)` lookup (consuming // `typed_call_receivers`) depends on this alignment. let bad = SsaFuncSummary { @@ -3641,7 +3641,7 @@ fn cf4_ssa_summary_fits_arity_keeps_out_of_range_path_idx_at_original_key() { lang: Lang::Rust, namespace: "test.rs".into(), name: "helper".into(), - arity: Some(2), // too small for idx 5 — synthetic-Param marker + arity: Some(2), // too small for idx 5, synthetic-Param marker ..Default::default() }; let mut gs = GlobalSummaries::new(); @@ -3730,9 +3730,9 @@ fn cf6_ssa_summary_fits_arity_keeps_out_of_range_points_to_idx_at_original_key() assert_eq!(kept.points_to.max_param_index(), Some(7)); } -/// Phase 4 (typed call-graph devirtualisation): two `findById` +/// two `findById` /// definitions on different containers must remain structurally -/// disjoint after [`merge_summaries`] — no cap union may leak +/// disjoint after [`merge_summaries`], no cap union may leak /// across them. The FuncKey identity model already keys on /// `(lang, namespace, container, name, arity, ...)` so this is /// supposed to be true today; the test pins it down so a future @@ -3741,7 +3741,7 @@ fn cf6_ssa_summary_fits_arity_keeps_out_of_range_points_to_idx_at_original_key() /// Concretely: `Repository::findById` is parameterised (no /// `SQL_QUERY` sink cap), `UnsafeCache::findById` runs a string- /// concatenated query (carries `Cap::SQL_QUERY`). After merge, -/// each FuncKey must own only its own caps — Repository must NOT +/// each FuncKey must own only its own caps, Repository must NOT /// inherit Cache's `SQL_QUERY` bit. 
#[test] fn cross_file_devirt_does_not_union_unrelated_findbyids() { @@ -3777,7 +3777,7 @@ fn cross_file_devirt_does_not_union_unrelated_findbyids() { let gs = merge_summaries(vec![safe_repo, unsafe_cache], None); - // Two distinct keys must coexist — no merge collision. + // Two distinct keys must coexist, no merge collision. let repo_key = FuncKey { lang: Lang::Rust, namespace: "src/repo.rs".into(), @@ -3798,7 +3798,7 @@ fn cross_file_devirt_does_not_union_unrelated_findbyids() { let repo_sum = gs.get(&repo_key).expect("Repository::findById missing"); let cache_sum = gs.get(&cache_key).expect("UnsafeCache::findById missing"); - // Sink caps stay on their own owner — the whole point of + // Sink caps stay on their own owner, the whole point of // devirtualisation. Repository must not have inherited the // SQL_QUERY bit from UnsafeCache. assert_eq!( @@ -3812,7 +3812,7 @@ fn cross_file_devirt_does_not_union_unrelated_findbyids() { Cap::SQL_QUERY.bits(), "UnsafeCache::findById lost its own sink cap during merge" ); - // Same invariant on tainted_sink_params — must not bleed across. + // Same invariant on tainted_sink_params, must not bleed across. assert!( repo_sum.tainted_sink_params.is_empty(), "Repository::findById inherited tainted_sink_params from UnsafeCache: {:?}", @@ -3821,7 +3821,7 @@ fn cross_file_devirt_does_not_union_unrelated_findbyids() { assert_eq!(cache_sum.tainted_sink_params, vec![0]); } -// ── Phase 6 hierarchy fan-out at runtime resolution ──────────────────── +// ── Hierarchy fan-out at runtime resolution ──────────────────── // // `GlobalSummaries::resolve_callee_widened` is the runtime counterpart of // the call-graph builder's `TypeHierarchyIndex::resolve_with_hierarchy`. @@ -3855,7 +3855,7 @@ mod hierarchy_widened_tests { (key, summary) } - /// A1 — no hierarchy installed. Widening collapses to today's + /// A1, no hierarchy installed. Widening collapses to today's /// single-result behaviour: one key in / one key out.
#[test] fn widened_without_hierarchy_returns_single_resolved() { @@ -3877,7 +3877,7 @@ mod hierarchy_widened_tests { assert_eq!(widened, vec![k]); } - /// A2 — hierarchy installed but the receiver type has no recorded + /// A2, hierarchy installed but the receiver type has no recorded /// sub-types. Falls through to today's single-result behaviour. #[test] fn widened_no_subtypes_returns_single() { @@ -3899,7 +3899,7 @@ mod hierarchy_widened_tests { assert_eq!(widened, vec![k]); } - /// A3 — hierarchy with one sub-type implementer. Widening returns + /// A3, hierarchy with one sub-type implementer. Widening returns /// both the direct receiver match and the sub-type's match. #[test] fn widened_one_subtype_returns_two_keys() { @@ -3938,14 +3938,14 @@ mod hierarchy_widened_tests { assert!(widened.contains(&k_impl)); } - /// A4 — hierarchy with multiple sub-types: every implementer's + /// A4, hierarchy with multiple sub-types: every implementer's /// matching method is in the result, deduplicated. #[test] fn widened_multiple_subtypes_returns_all() { let mut gs = GlobalSummaries::new(); // Three impls + one interface. The interface itself has no // body so we omit a method on it (that is the more common - // shape — a pure interface plus concrete classes). + // shape, a pure interface plus concrete classes). let edges = vec![ ("FileLogger".to_string(), "ILogger".to_string()), ("NetLogger".to_string(), "ILogger".to_string()), @@ -3984,7 +3984,7 @@ mod hierarchy_widened_tests { assert!(widened.contains(&k_std)); } - /// A5 — the arity filter must apply across the whole fan-out, not + /// A5, the arity filter must apply across the whole fan-out, not /// just the direct-receiver leg. An implementer with a different /// arity must not leak into the result. #[test] @@ -4013,10 +4013,10 @@ mod hierarchy_widened_tests { assert_eq!(widened, vec![k_one], "arity-2 impl must be filtered out"); } - /// A6 — fan-out is bounded at `MAX_HIERARCHY_FANOUT`. 
Build a + /// A6, fan-out is bounded at `MAX_HIERARCHY_FANOUT`. Build a /// hierarchy with more impls than the cap allows and assert the /// result is exactly capped (and that early impls are preserved - /// — the cap drops the *tail*, not the head). + /// because the cap drops the *tail*, not the head). #[test] fn widened_caps_at_max_hierarchy_fanout() { let cap = GlobalSummaries::MAX_HIERARCHY_FANOUT; @@ -4030,7 +4030,7 @@ mod hierarchy_widened_tests { .map(|i| (format!("Impl{i:02}"), "IBase".to_string())) .collect(); - // Carrier — first impl carries every edge so the index is + // Carrier, first impl carries every edge so the index is // populated in one shot. let (k0, s0) = java_method("src/impl00.java", "Impl00", "run", 0, 0x01, edges); gs.insert(k0.clone(), s0); @@ -4065,18 +4065,18 @@ mod hierarchy_widened_tests { ); } - /// A7 — when hierarchy widening produces no candidates AND the + /// A7, when hierarchy widening produces no candidates AND the /// receiver_type lookup is authoritative (Step 1), the secondary /// fall-through goes through `resolve_callee` which returns /// Ambiguous/NotFound rather than silently picking an unrelated - /// leaf — exactly the "subset of today's targets, never a + /// leaf, exactly the "subset of today's targets, never a /// superset" rule. Test asserts the empty result is preserved. #[test] fn widened_empty_does_not_silently_pick_unrelated_leaf() { let mut gs = GlobalSummaries::new(); // Edge: IUnused has a sub Used, but neither declares // `something`. An unrelated free function `something` exists - // in the same namespace — under today's authoritative + // in the same namespace, under today's authoritative // receiver_type rules, that function MUST NOT be picked when // the call is annotated with receiver_type "IUnused".
let edges = vec![("Used".to_string(), "IUnused".to_string())]; @@ -4104,7 +4104,7 @@ mod hierarchy_widened_tests { ); } - /// A7b — when hierarchy widening produces nothing AND today's + /// A7b, when hierarchy widening produces nothing AND today's /// `resolve_callee` *does* resolve (no receiver_type, just bare /// leaf or qualifier hint), the fallback returns the single key. /// This pins the secondary-fallback contract on the path where it @@ -4131,7 +4131,7 @@ mod hierarchy_widened_tests { assert_eq!(widened, vec![k_free]); } - /// A8 — receiver_type is None → no widening; behaves identically + /// A8, receiver_type is None → no widening; behaves identically /// to `resolve_callee` (single-result wrap). #[test] fn widened_no_receiver_type_collapses_to_resolve_callee() { @@ -4153,7 +4153,7 @@ mod hierarchy_widened_tests { assert_eq!(widened, vec![k_free]); } - /// A9 — `merge()` must invalidate the cached hierarchy index so a + /// A9, `merge()` must invalidate the cached hierarchy index so a /// post-merge call to `resolve_callee_widened` doesn't look up a /// stale view. Since `install_hierarchy` is required after merges, /// the test asserts: post-merge, before reinstall, fan-out must @@ -4180,7 +4180,7 @@ mod hierarchy_widened_tests { }); assert_eq!(pre_merge.len(), 2); - // Merge in an empty `gs_b` — should invalidate the cached + // Merge in an empty `gs_b`, should invalidate the cached // hierarchy. gs_a.merge(GlobalSummaries::new()); assert!( diff --git a/src/suppress/mod.rs b/src/suppress/mod.rs index 5197d909..180b8fa4 100644 --- a/src/suppress/mod.rs +++ b/src/suppress/mod.rs @@ -1,8 +1,8 @@ //! Inline per-finding suppression via source-code comments. //! //! Supports two directive forms: -//! - `nyx:ignore [, …]` — suppress findings on the same line -//! - `nyx:ignore-next-line [, …]` — suppress findings on the next line +//! - `nyx:ignore [, …]`, suppress findings on the same line +//!
- `nyx:ignore-next-line [, …]`, suppress findings on the next line //! //! Comments are detected for all supported languages without tree-sitter, //! using a lightweight string/comment state machine. @@ -34,7 +34,7 @@ pub struct SuppressionMeta { // Internal types // ───────────────────────────────────────────────────────────────────────────── -/// A single rule matcher — either exact or wildcard-suffix (`foo.*`). +/// A single rule matcher, either exact or wildcard-suffix (`foo.*`). #[derive(Debug)] enum RuleMatcher { Exact(String), @@ -120,11 +120,11 @@ pub fn canonical_rule_id(id: &str) -> &str { #[derive(Clone, Copy)] enum CommentStyle { - /// `//` and `/* */` — Rust, C, C++, Java, Go, JS, TS + /// `//` and `/* */`, Rust, C, C++, Java, Go, JS, TS CStyle, - /// `#` only — Python, Ruby + /// `#` only, Python, Ruby Hash, - /// `//`, `#`, and `/* */` — PHP + /// `//`, `#`, and `/* */`, PHP PhpStyle, } @@ -189,7 +189,7 @@ pub fn parse_inline_suppressions(path: &std::path::Path, source: &str) -> Suppre if in_block_comment { // Check for block comment end. if let Some(end_pos) = line.find("*/") { - // Extract text before `*/` — may contain a directive. + // Extract text before `*/`, may contain a directive. let block_text = &line[..end_pos]; if let Some(dir) = try_parse_directive(block_text, line_num) { let target = target_line(&dir, line_num, total_lines); @@ -208,7 +208,7 @@ pub fn parse_inline_suppressions(path: &std::path::Path, source: &str) -> Suppre } } } else { - // Still inside block comment — check for directive. + // Still inside block comment, check for directive. 
if let Some(dir) = try_parse_directive(line, line_num) { let target = target_line(&dir, line_num, total_lines); if let Some(t) = target { @@ -220,7 +220,7 @@ pub fn parse_inline_suppressions(path: &std::path::Path, source: &str) -> Suppre continue; } - // Not in a block comment — scan the line character by character + // Not in a block comment, scan the line character by character // tracking string state. if let Some(dir) = scan_line_for_directive(line, line_num, style, &mut in_block_comment) { let target = target_line(&dir, line_num, total_lines); @@ -237,7 +237,7 @@ pub fn parse_inline_suppressions(path: &std::path::Path, source: &str) -> Suppre } /// Compute the target line for a directive. Returns `None` if the directive -/// is `NextLine` but on the last line (EOF — no-op). +/// is `NextLine` but on the last line (EOF, no-op). fn target_line(dir: &LineDirective, line_num: usize, total_lines: usize) -> Option { match dir.kind { SuppressionKind::SameLine => Some(line_num), @@ -245,7 +245,7 @@ fn target_line(dir: &LineDirective, line_num: usize, total_lines: usize) -> Opti if line_num < total_lines { Some(line_num + 1) } else { - None // EOF — no next line + None // EOF, no next line } } } @@ -304,7 +304,7 @@ fn scan_line_for_directive( if ch == b'r' && i + 1 < len { let next = bytes[i + 1]; if next == b'"' { - // r"..." — skip to closing " + // r"...", skip to closing " i += 2; while i < len && bytes[i] != b'"' { i += 1; diff --git a/src/symbol/mod.rs b/src/symbol/mod.rs index b3764b44..b85b810d 100644 --- a/src/symbol/mod.rs +++ b/src/symbol/mod.rs @@ -4,7 +4,7 @@ use std::fmt; /// Supported source-code languages. /// /// `Default` is provided only so that [`FuncKey`] can derive `Default` for -/// test ergonomics — production code always constructs a `Lang` explicitly +/// test ergonomics, production code always constructs a `Lang` explicitly /// (via `from_slug` / `from_extension`). 
`Rust` was chosen as the default /// purely because it is the host language of the scanner; tests that rely /// on lang-specific behaviour should set `lang` explicitly, not rely on the @@ -147,18 +147,18 @@ impl FuncKind { /// plus a structural `kind` tag. Every field is deliberately narrow so /// legitimately-distinct definitions never collide: /// -/// * `lang` — prevents cross-language aliasing. -/// * `namespace` — project-relative source file path. -/// * `container` — enclosing class / impl / module / namespace / outer function +/// * `lang`, prevents cross-language aliasing. +/// * `namespace`, project-relative source file path. +/// * `container`, enclosing class / impl / module / namespace / outer function /// (qualified with `::` for nested containers). Empty string for free /// top-level functions. -/// * `name` — leaf identifier as written in source. -/// * `arity` — parameter count (including `self`/`this`) for languages that +/// * `name`, leaf identifier as written in source. +/// * `arity`, parameter count (including `self`/`this`) for languages that /// discriminate by arity. `None` for unknown. -/// * `disambig` — numeric discriminator for same-name definitions in the same +/// * `disambig`, numeric discriminator for same-name definitions in the same /// container (closure byte offset, nested-function occurrence index). /// `None` for the common case of a single definition. -/// * `kind` — structural role (see [`FuncKind`]). Separates e.g. a getter +/// * `kind`, structural role (see [`FuncKind`]). Separates e.g. a getter /// named `size` from a method `size()`. /// /// Backward-compat: `container`, `disambig`, and `kind` all have serde @@ -180,7 +180,7 @@ pub struct FuncKey { /// Typically the function node's start byte offset. #[serde(default)] pub disambig: Option, - /// Structural role — Function, Method, Constructor, Closure, etc. + /// Structural role, Function, Method, Constructor, Closure, etc. 
#[serde(default)] pub kind: FuncKind, } diff --git a/src/symex/executor.rs b/src/symex/executor.rs index 5d02ae69..479e04e9 100644 --- a/src/symex/executor.rs +++ b/src/symex/executor.rs @@ -54,7 +54,7 @@ const MAX_TOTAL_STEPS: usize = 500; /// A single exploration path in flight. /// /// The executor advances this one block at a time via successor transitions. -/// No pre-computed block sequence — successor choice happens at each terminator. +/// No pre-computed block sequence, successor choice happens at each terminator. struct ExplorationState { /// Current symbolic state (cloned at fork points). sym_state: SymbolicState, @@ -71,7 +71,7 @@ struct ExplorationState { /// Constraints checked on this path. constraints_checked: u32, /// Per-block visit count for bounded loop unrolling. - /// Inherited at fork points — both branches share the visit history. + /// Inherited at fork points, both branches share the visit history. visit_counts: HashMap, /// When `Some`, this path entered via an exception edge. /// Moved into `sym_state.exception_context` immediately before block @@ -110,7 +110,7 @@ pub(super) struct ExplorationResult { /// Compute the set of blocks on some CFG path from source to sink. /// /// This is **CFG source-to-sink reachability pruning**, NOT a taint slice. -/// It does not prove that tainted data flows through these blocks — only that +/// It does not prove that tainted data flows through these blocks, only that /// control flow can reach the sink from the source through them. Used to /// prevent exploring branches structurally disconnected from the /// source-to-sink span. @@ -521,7 +521,7 @@ fn run_path( steps_taken: state.steps_taken, constraints_checked: state.constraints_checked, visit_counts: state.visit_counts.clone(), - // Taint carrier — not a faithful thrown-value model. + // Taint carrier, not a faithful thrown-value model. // CatchParam transfer will mark the catch parameter tainted. 
exception_context: Some(SymbolicValue::Unknown), }; @@ -545,7 +545,7 @@ fn run_path( match (true_reachable, false_reachable) { (false, false) => { - // Dead end — neither successor reaches sink. + // Dead end, neither successor reaches sink. // Still try to extract a witness: the path may have // already walked past the sink node. let witness = try_extract_witness(state, finding, ssa, cfg); @@ -594,7 +594,7 @@ fn run_path( state.current_block = *false_blk; } (true, true) => { - // Both successors reachable — fork candidate + // Both successors reachable, fork candidate let can_fork = state.forks_used < MAX_FORKS_PER_FINDING && outcomes.len() + work_queue.len() + 1 < MAX_PATHS_PER_FINDING && *total_steps < MAX_TOTAL_STEPS; @@ -617,7 +617,7 @@ fn run_path( smt_ctx, ); } else { - // Budget exhausted — follow original path + // Budget exhausted, follow original path *search_exhausted = false; let preferred_polarity = if on_path.contains(true_blk) { true @@ -882,7 +882,7 @@ fn step_switch( let _ = planned_remaining; if !any_enqueued { - // All paths were pruned by infeasibility — record the current + // All paths were pruned by infeasibility, record the current // state's witness so the caller can decide. let witness = try_extract_witness(state, finding, ssa, cfg); return Some(PathOutcome { @@ -917,7 +917,7 @@ fn apply_branch_constraint( }; if matches!(cond_expr, constraint::ConditionExpr::Unknown) { - // No useful constraint — continue without recording + // No useful constraint, continue without recording return None; } @@ -938,7 +938,7 @@ fn apply_branch_constraint( }); } - // SMT escalation — check with Z3 when PathEnv says SAT but + // SMT escalation, check with Z3 when PathEnv says SAT but // accumulated constraints have cross-variable shape. 
#[cfg(feature = "smt")] if let Some(smt) = smt_ctx { @@ -1073,7 +1073,7 @@ fn fork_at_branch( work_queue.push_back(false_state); } - // Original state consumed by fork — no outcome from this path + // Original state consumed by fork, no outcome from this path None } @@ -1328,6 +1328,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), } } @@ -1717,6 +1718,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ctx = super::SymexContext { @@ -1861,6 +1863,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ctx = super::SymexContext { @@ -2140,6 +2143,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let cfg_graph = crate::cfg::Cfg::new(); diff --git a/src/symex/heap.rs b/src/symex/heap.rs index f3a234af..e95b95cf 100644 --- a/src/symex/heap.rs +++ b/src/symex/heap.rs @@ -8,7 +8,7 @@ //! Design: #![allow(clippy::collapsible_if, clippy::new_without_default)] //! - `FieldSlot::Named` for object properties (per-field precision). -//! - `FieldSlot::Elements` for container contents (flow-insensitive union — +//! - `FieldSlot::Elements` for container contents (flow-insensitive union, //! deliberately lower precision than named fields). //! - Bounded: `MAX_HEAP_ENTRIES` total, `MAX_FIELDS_PER_OBJECT` per object. //! Overflow silently drops the store (conservative: subsequent load → `Unknown`). @@ -313,7 +313,7 @@ impl SymbolicHeap { /// Count non-index fields stored for a specific object. 
/// - /// Excludes `Index(*)` entries — those are bounded separately by + /// Excludes `Index(*)` entries, those are bounded separately by /// [`MAX_TRACKED_INDICES`] via [`count_indices_for`]. fn fields_for_object(&self, object: HeapObjectId) -> usize { self.fields @@ -425,7 +425,7 @@ pub fn resolve_receiver_ssa( /// Resolve an SSA value to a singleton `HeapObjectId` via points-to analysis. /// /// Returns `Some` only when the points-to set contains exactly one object. -/// May-alias (set size > 1) or unknown (not in result) returns `None` — +/// May-alias (set size > 1) or unknown (not in result) returns `None`; /// the caller should fall through to existing behavior (sound: never pick /// among ambiguous options). pub fn resolve_singleton_object( @@ -675,7 +675,7 @@ mod tests { heap.load(&index_key(0, 1)), SymbolicValue::ConcreteStr("safe".to_string()) ); - // Taint: conservative — Elements taint poisons Index(1). + // Taint: conservative, Elements taint poisons Index(1). assert!(heap.is_tainted(&index_key(0, 1))); } diff --git a/src/symex/interproc.rs b/src/symex/interproc.rs index 863f9fb8..132c2a65 100644 --- a/src/symex/interproc.rs +++ b/src/symex/interproc.rs @@ -2,8 +2,8 @@ //! //! When a callee's `CalleeSsaBody` is available, the symbolic executor walks //! the callee's SSA blocks as a nested frame instead of treating it as an -//! opaque `mk_call`. Full symbolic state — return values, heap mutations, -//! taint, and path constraints — is propagated back to the caller. +//! opaque `mk_call`. Full symbolic state (return values, heap mutations, +//! taint, and path constraints) is propagated back to the caller. //! //! Resolution order in `transfer_inst` Call arm: //! container ops → string methods → **interprocedural execution** → summary → opaque mk_call. @@ -247,7 +247,7 @@ pub struct InterprocCtx<'a> { /// Pre-lowered intra-file function bodies, keyed by canonical `FuncKey`. pub callee_bodies: &'a HashMap, /// The top-level caller's body CFG. 
Callees have their own per-body graphs - /// (see `CalleeSsaBody::body_graph`) — `execute_callee` must swap this for + /// (see `CalleeSsaBody::body_graph`), `execute_callee` must swap this for /// the callee's own graph before indexing by `SsaInst::cfg_node`. pub cfg: &'a Cfg, /// Source language. @@ -373,7 +373,7 @@ impl CallOutcome { /// Create a cutoff outcome with conservative return. /// /// Returns `Unknown` with taint preserved if any argument was tainted. - /// This ensures cutoffs never silently drop taint — conservative soundness. + /// This ensures cutoffs never silently drop taint, conservative soundness. fn cutoff(reason: CutoffReason, any_arg_tainted: bool) -> Self { CallOutcome { exit_states: if any_arg_tainted { @@ -478,7 +478,7 @@ pub fn select_merge_policy(exit_count: usize, has_cutoffs: bool) -> MergePolicy /// - bits 1-4: SymbolicValue discriminant /// - bits 5-15: hash of concrete value (if Concrete/ConcreteStr) /// -/// Richer than taint-only — captures concrete string/int identity. +/// Richer than taint-only, captures concrete string/int identity. #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct ArgAbstraction(SmallVec<[(usize, u16); 4]>); @@ -561,14 +561,14 @@ impl<'a> Drop for ReentryGuard<'a> { /// reasons and conservative return values (taint preserved). 
/// /// # Arguments -/// * `ctx` — shared interprocedural context -/// * `callee_name` — raw callee name from `SsaOp::Call` -/// * `arg_values` — per-argument (caller SsaValue, SymbolicValue, tainted) -/// * `caller_heap` — caller's current symbolic heap (for callee reads) -/// * `depth` — current call depth (0 = top-level caller) -/// * `call_chain` — function names from outermost caller to current -/// * `summary_ctx` — summary context for nested calls that can't be inlined -/// * `heap_ctx` — heap context for nested calls +/// * `ctx`, shared interprocedural context +/// * `callee_name`, raw callee name from `SsaOp::Call` +/// * `arg_values`, per-argument (caller SsaValue, SymbolicValue, tainted) +/// * `caller_heap`, caller's current symbolic heap (for callee reads) +/// * `depth`, current call depth (0 = top-level caller) +/// * `call_chain`, function names from outermost caller to current +/// * `summary_ctx`, summary context for nested calls that can't be inlined +/// * `heap_ctx`, heap context for nested calls pub fn execute_callee( ctx: &InterprocCtx, callee_name: &str, @@ -616,7 +616,7 @@ pub fn execute_callee( } } - // Resolve callee by leaf name — finds first FuncKey with matching name + // Resolve callee by leaf name, finds first FuncKey with matching name // (optionally agreeing on arity). Symex preserves its existing leaf-name // semantics; disambiguation happens upstream in the taint engine. 
let normalized = callee_leaf_name(callee_name); @@ -642,7 +642,7 @@ pub fn execute_callee( gs.resolve_callee_body(ctx.lang, normalized, arity_hint, ctx.caller_namespace) }) { Some(b) => (b, true), - None => return None, // No body — fall through to summary + None => return None, // No body, fall through to summary } } }; @@ -825,7 +825,7 @@ pub fn execute_callee( } else { None }; - // `inst.cfg_node` indices are body-local — refer to `body.body_graph`, + // `inst.cfg_node` indices are body-local, refer to `body.body_graph`, // not `ctx.cfg` (the caller's graph). Fall back to `ctx.cfg` only for // cross-file bodies, where `node_meta` is populated and the graph is // never indexed directly. @@ -838,7 +838,7 @@ pub fn execute_callee( path.predecessor, summary_ctx, heap_ctx, - // Pass None for interproc_ctx — we handle nested calls directly below. + // Pass None for interproc_ctx, we handle nested calls directly below. None, Some(ctx.lang), xfile_meta, @@ -1033,7 +1033,7 @@ fn detect_internal_sinks( ) { for inst in block.body.iter() { let labels: &[DataLabel] = if let Some(meta) = node_meta { - // cross-file body — use embedded metadata + // cross-file body, use embedded metadata meta.get(&(inst.cfg_node.index() as u32)) .map(|m| m.info.taint.labels.as_slice()) .unwrap_or(&[]) @@ -1282,7 +1282,7 @@ fn compute_heap_delta(initial: &SymbolicHeap, final_heap: &SymbolicHeap) -> Vec< /// /// Full structural equality is expensive for deep trees. This checks the /// common cases (Concrete, ConcreteStr, Symbol, Unknown) and returns false -/// for complex expressions (conservative — will over-report heap mutations). +/// for complex expressions (conservative, will over-report heap mutations). 
fn sym_value_structurally_eq(a: &SymbolicValue, b: &SymbolicValue) -> bool { match (a, b) { (SymbolicValue::Concrete(x), SymbolicValue::Concrete(y)) => x == y, diff --git a/src/symex/loops.rs b/src/symex/loops.rs index acebd2f3..4534009d 100644 --- a/src/symex/loops.rs +++ b/src/symex/loops.rs @@ -94,11 +94,11 @@ impl LoopInfo { match (true_in, false_in) { (true, false) => Some(*false_blk), (false, true) => Some(*true_blk), - (false, false) => Some(*true_blk), // both exit — deterministic pick + (false, false) => Some(*true_blk), // both exit, deterministic pick (true, true) => None, // nested: no clear exit } } - _ => None, // Goto or Return — no branching exit + _ => None, // Goto or Return, no branching exit } } diff --git a/src/symex/mod.rs b/src/symex/mod.rs index b8a40633..bcd0bf1c 100644 --- a/src/symex/mod.rs +++ b/src/symex/mod.rs @@ -174,7 +174,7 @@ fn analyse_finding_path(finding: &Finding, ctx: &SymexContext) -> SymbolicVerdic } if path_blocks.len() < 2 { - // Short path (single block, no branches) — skip the multi-path + // Short path (single block, no branches), skip the multi-path // explorer but still run a linear transfer to extract a witness. 
let witness = linear_witness(finding, ctx, &path_blocks); return SymbolicVerdict { @@ -411,6 +411,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let blocks = extract_path_blocks(&finding, &ssa); @@ -483,6 +484,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ctx = SymexContext { @@ -541,6 +543,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ssa = SsaBody { @@ -567,7 +570,7 @@ mod tests { cross_file_bodies: None, }; annotate_findings(std::slice::from_mut(&mut finding), &ctx); - // Should remain None — skipped due to path_validated + // Should remain None, skipped due to path_validated assert!(finding.symbolic.is_none()); } @@ -600,6 +603,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ssa = SsaBody { @@ -626,7 +630,7 @@ mod tests { cross_file_bodies: None, }; annotate_findings(std::slice::from_mut(&mut finding), &ctx); - // Should remain None — only 1 flow step + // Should remain None, only 1 flow step assert!(finding.symbolic.is_none()); } } diff --git a/src/symex/smt.rs b/src/symex/smt.rs index e0934bda..0feb62b1 100644 --- a/src/symex/smt.rs +++ b/src/symex/smt.rs @@ -99,7 +99,7 @@ enum VarSort { Str, } -/// Polymorphic Z3 variable — either integer or string sort. +/// Polymorphic Z3 variable, either integer or string sort. enum Z3Var { Int(Z3Int), Str(Z3Str), @@ -140,7 +140,7 @@ impl SmtContext { /// proves the constraints are contradictory. /// /// Constraints that cannot be fully translated (unknown sorts, sort - /// conflicts, etc.) 
are silently skipped — this is sound because omitting + /// conflicts, etc.) are silently skipped, this is sound because omitting /// a constraint can only make Z3 return `Sat` when the actual result /// might be `Unsat`, never the reverse. pub fn check_path_feasibility( @@ -366,7 +366,7 @@ fn seed_from_path_env(solver: &Solver, var_map: &mut VarMap, env: &PathEnv) { (Some(Z3Var::Str(r)), Some(Z3Var::Str(vi))) => { solver.assert(&vi.eq(r)); } - _ => {} // Sort mismatch or missing — skip. + _ => {} // Sort mismatch or missing, skip. } } } @@ -380,7 +380,7 @@ fn seed_from_path_env(solver: &Solver, var_map: &mut VarMap, env: &PathEnv) { (Some(Z3Var::Str(za)), Some(Z3Var::Str(zb))) => { solver.assert(&za.ne(zb)); } - _ => {} // Sort mismatch or missing — skip. + _ => {} // Sort mismatch or missing, skip. } } @@ -402,7 +402,7 @@ fn seed_from_path_env(solver: &Solver, var_map: &mut VarMap, env: &PathEnv) { /// Translate a single path constraint into a Z3 assertion. /// /// Skips constraints that cannot be fully translated (unknown sort, sort -/// conflict, etc.). This is sound — see module-level docs. +/// conflict, etc.). This is sound, see module-level docs. fn assert_path_constraint( solver: &Solver, var_map: &mut VarMap, @@ -440,7 +440,7 @@ fn assert_path_constraint( } } } - // NullCheck, TypeCheck, Unknown — skip (not modeled). + // NullCheck, TypeCheck, Unknown, skip (not modeled). ConditionExpr::NullCheck { .. } | ConditionExpr::TypeCheck { .. } | ConditionExpr::Unknown => {} @@ -450,7 +450,7 @@ fn assert_path_constraint( /// Infer a sort hint from a constant operand. /// /// When one side of a comparison is a known constant, it hints the sort of -/// the other side (a `Value`). This is the lowest-priority evidence — used +/// the other side (a `Value`). This is the lowest-priority evidence, used /// only when var_map and PathEnv provide no information. 
fn operand_sort_hint(op: &Operand) -> Option { match op { @@ -487,7 +487,7 @@ fn translate_operand(var_map: &mut VarMap, op: &Operand, env: &PathEnv) -> Optio if is_known_int(*v, env) { return force_int_var(var_map, *v).map(Z3Expr::Int); } - // 3. Unknown sort — return None; caller may apply hint. + // 3. Unknown sort, return None; caller may apply hint. None } Operand::Const(ConstValue::Null) | Operand::Unknown => None, @@ -723,7 +723,7 @@ mod tests { #[test] fn cross_variable_contradiction() { // x > y AND y > x → Unsat - // PathEnv cannot detect this — it tracks per-variable intervals. + // PathEnv cannot detect this, it tracks per-variable intervals. let constraints = vec![ comparison_constraint(val(0), CompOp::Gt, val(1), true), comparison_constraint(val(1), CompOp::Gt, val(0), true), diff --git a/src/symex/state.rs b/src/symex/state.rs index 3e1554c8..bf10584a 100644 --- a/src/symex/state.rs +++ b/src/symex/state.rs @@ -34,14 +34,14 @@ pub struct SymbolicState { values: HashMap, /// Branch constraints collected along the path. path_constraints: Vec, - /// SSA values known to carry taint. Eagerly propagated during transfer — + /// SSA values known to carry taint. Eagerly propagated during transfer, /// no recursive expression-tree walking needed. tainted_roots: HashSet, /// Field-sensitive symbolic heap. heap: SymbolicHeap, /// Exception context for catch-path symbolic execution. /// When `Some`, the next `CatchParam` instruction consumes this value and - /// marks itself tainted. This is NOT a faithful model of the thrown value — + /// marks itself tainted. This is NOT a faithful model of the thrown value; /// it is a taint carrier that signals "this CatchParam was reached via an /// exception edge and should be treated as tainted." The symbolic value is /// `Unknown` because we do not model the exception object's structure. 
@@ -143,7 +143,7 @@ impl SymbolicState { let block_data = &ssa.blocks[block.0 as usize]; for phi in &block_data.phis { self.values.insert(phi.value, SymbolicValue::Unknown); - // PRESERVE taint — do NOT remove from tainted_roots. + // PRESERVE taint, do NOT remove from tainted_roots. } // Widen heap: degrade field symbolic precision, preserve taint. self.heap.widen(); @@ -163,7 +163,7 @@ impl SymbolicState { ConstLattice::Str(s) => { self.values.insert(v, SymbolicValue::ConcreteStr(s.clone())); } - _ => {} // Bool, Null, Top, Varying — not modeled + _ => {} // Bool, Null, Top, Varying, not modeled } } } @@ -343,6 +343,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ssa = SsaBody { blocks: vec![], @@ -382,6 +383,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ssa = SsaBody { blocks: vec![], @@ -418,6 +420,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; let ssa = SsaBody { blocks: vec![], diff --git a/src/symex/strings.rs b/src/symex/strings.rs index ecb80862..4cae38cd 100644 --- a/src/symex/strings.rs +++ b/src/symex/strings.rs @@ -34,9 +34,9 @@ pub enum StringMethod { /// Where the string operand comes from in the call. #[derive(Clone, Copy, Debug, PartialEq)] pub enum StringOperandSource { - /// `receiver.method()` — JS, Java, Ruby, Rust + /// `receiver.method()`, JS, Java, Ruby, Rust Receiver, - /// `func(string, ...)` — Python `len()`, Go `strings.*`, PHP `strlen()` + /// `func(string, ...)`, Python `len()`, Go `strings.*`, PHP `strlen()` FirstArg, } @@ -68,7 +68,7 @@ pub struct SanitizerInfo { /// - **Representation transforms** (non-protective): witness-only, never /// used for mismatch reasoning. 
/// -/// Symex `Encode`/`Decode` nodes preserve taint unconditionally — this enum +/// Symex `Encode`/`Decode` nodes preserve taint unconditionally, this enum /// carries no sanitization authority. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum TransformKind { @@ -79,7 +79,7 @@ pub enum TransformKind { UrlEncode, /// Shell quoting: single-quote wrapping with internal quote escaping. ShellEscape, - /// SQL string escaping: `'` → `''`. Witness-only — no label rule yet, + /// SQL string escaping: `'` → `''`. Witness-only, no label rule yet, /// so `verified_cap()` returns `Cap::empty()`. SqlEscape, // ── Representation transforms (non-protective) ─────────────────────── @@ -87,7 +87,7 @@ pub enum TransformKind { Base64Encode, /// Base64 decoding. Base64Decode, - /// URL percent-decoding (reverses URL encoding — anti-protective). + /// URL percent-decoding (reverses URL encoding, anti-protective). UrlDecode, } @@ -119,7 +119,7 @@ impl TransformKind { TransformKind::HtmlEscape => Cap::HTML_ESCAPE, TransformKind::UrlEncode => Cap::URL_ENCODE, TransformKind::ShellEscape => Cap::SHELL_ESCAPE, - // SqlEscape: no verified label rule — witness-only + // SqlEscape: no verified label rule, witness-only TransformKind::SqlEscape => Cap::empty(), // Representation transforms: not protective TransformKind::Base64Encode @@ -149,7 +149,7 @@ pub struct TransformMethodInfo { /// /// Returns `None` for unrecognized methods (fall through to opaque `Call`). /// For `Replace`, only classifies when pattern and replacement args are concrete -/// strings — dynamic patterns produce `None`. +/// strings, dynamic patterns produce `None`. 
pub fn classify_string_method( callee: &str, args: &[SymbolicValue], @@ -217,7 +217,7 @@ fn classify_python(method: &str, callee: &str, args: &[SymbolicValue]) -> Option use StringMethod::*; use StringOperandSource::*; - // Python builtins: len(s) — no receiver + // Python builtins: len(s), no receiver if callee == "len" { return Some(StringMethodInfo { method: StrLen, @@ -396,7 +396,7 @@ fn classify_php(method: &str, callee: &str, args: &[SymbolicValue]) -> Option { - // PHP: str_replace($search, $replace, $subject) — string is arg[2] + // PHP: str_replace($search, $replace, $subject), string is arg[2] // But in our callee model, receiver is not present for free functions. // args[0] = pattern, args[1] = replacement, args[2] = subject let (pat, rep) = extract_replace_args(args, 0)?; @@ -488,7 +488,7 @@ fn classify_c(method: &str) -> Option { /// Classify a callee as a recognized encoding/decoding transform. /// /// Returns `None` for unrecognized methods. Rich sanitizers (DOMPurify, -/// bleach, markupsafe, etc.) are intentionally NOT classified here — they +/// bleach, markupsafe, etc.) are intentionally NOT classified here, they /// are complex library-level sanitizers, not simple character-level escapes. pub fn classify_transform_method(callee: &str, lang: Lang) -> Option { match lang { @@ -605,7 +605,7 @@ fn classify_transform_php(callee: &str) -> Option { kind: ShellEscape, operand_source: FirstArg, }), - // SQL escaping (witness-only — no verified label rule) + // SQL escaping (witness-only, no verified label rule) "addslashes" => Some(TransformMethodInfo { kind: SqlEscape, operand_source: FirstArg, @@ -624,7 +624,7 @@ fn classify_transform_java(callee: &str) -> Option { // examine the dotted callee for receiver-qualified disambiguation. let method = bare_method_name(callee); - // URL encoding/decoding — `java.net.URLEncoder.encode` / `URLDecoder.decode`. + // URL encoding/decoding, `java.net.URLEncoder.encode` / `URLDecoder.decode`. 
if callee.ends_with("URLEncoder.encode") { return Some(TransformMethodInfo { kind: UrlEncode, @@ -639,7 +639,7 @@ fn classify_transform_java(callee: &str) -> Option { } // Apache commons-text / commons-lang `StringEscapeUtils.escapeHtml4`, - // `escapeXml11`, `escapeXml10`. These are character-level entity escapes — + // `escapeXml11`, `escapeXml10`. These are character-level entity escapes , // NOT rich sanitizers like OWASP ESAPI's `Encoder`. if callee.ends_with("StringEscapeUtils.escapeHtml4") || callee.ends_with("StringEscapeUtils.escapeHtml") @@ -653,7 +653,7 @@ fn classify_transform_java(callee: &str) -> Option { }); } - // Base64 — `Base64.getEncoder().encodeToString(bytes)` (and the URL-safe + // Base64, `Base64.getEncoder().encodeToString(bytes)` (and the URL-safe // / MIME variants). Match by leaf method name; the encoder/decoder chain // before it is opaque to symex, but the operand is still the first arg. match method { @@ -661,7 +661,7 @@ fn classify_transform_java(callee: &str) -> Option { kind: Base64Encode, operand_source: FirstArg, }), - // `Base64.getDecoder().decode(s)` — the leaf `decode` collides with + // `Base64.getDecoder().decode(s)`, the leaf `decode` collides with // `URLDecoder.decode` (handled above) so this only matches when the // URLDecoder branch did not. "decode" if callee.contains("Base64") => Some(TransformMethodInfo { @@ -677,7 +677,7 @@ fn classify_transform_go(callee: &str) -> Option { use TransformKind::*; match callee { - // URL encoding/decoding — `net/url` package. + // URL encoding/decoding, `net/url` package. 
"url.QueryEscape" | "url.PathEscape" => Some(TransformMethodInfo { kind: UrlEncode, operand_source: FirstArg, @@ -686,7 +686,7 @@ fn classify_transform_go(callee: &str) -> Option { kind: UrlDecode, operand_source: FirstArg, }), - // HTML entity escaping — `html` package (NOT `template.HTMLEscapeString`, + // HTML entity escaping, `html` package (NOT `template.HTMLEscapeString`, // which is a context-aware sanitizer). `html.UnescapeString` is // intentionally NOT classified: TransformKind has no `HtmlUnescape` // variant, and reusing UrlDecode would label the witness wrongly. @@ -694,7 +694,7 @@ fn classify_transform_go(callee: &str) -> Option { kind: HtmlEscape, operand_source: FirstArg, }), - // Base64 — `encoding/base64` package, `StdEncoding`/`URLEncoding`/ + // Base64, `encoding/base64` package, `StdEncoding`/`URLEncoding`/ // `RawStdEncoding`/`RawURLEncoding` all expose `EncodeToString`. "base64.StdEncoding.EncodeToString" | "base64.URLEncoding.EncodeToString" @@ -723,7 +723,7 @@ fn classify_transform_ruby(callee: &str) -> Option { let normalised = callee.replace("::", "."); match normalised.as_str() { // URL percent-encoding. Note: `CGI.escape` in Ruby is percent-encoding - // (NOT HTML escape — that's `CGI.escapeHTML`). + // (NOT HTML escape, that's `CGI.escapeHTML`). "CGI.escape" | "URI.encode_www_form_component" => Some(TransformMethodInfo { kind: UrlEncode, operand_source: FirstArg, @@ -732,7 +732,7 @@ fn classify_transform_ruby(callee: &str) -> Option { kind: UrlDecode, operand_source: FirstArg, }), - // HTML entity escaping (character-level — NOT Rails `sanitize` or + // HTML entity escaping (character-level, NOT Rails `sanitize` or // `strip_tags` which are rich sanitizers). "ERB::Util.html_escape" | "ERB.Util.html_escape" | "CGI.escapeHTML" => { Some(TransformMethodInfo { @@ -740,7 +740,7 @@ fn classify_transform_ruby(callee: &str) -> Option { operand_source: FirstArg, }) } - // Base64 — `Base64.strict_encode64` / `encode64` / `urlsafe_encode64`. 
+ // Base64, `Base64.strict_encode64` / `encode64` / `urlsafe_encode64`. "Base64.strict_encode64" | "Base64.encode64" | "Base64.urlsafe_encode64" => { Some(TransformMethodInfo { kind: Base64Encode, @@ -763,7 +763,7 @@ fn classify_transform_ruby(callee: &str) -> Option { /// Apply encoding for witness rendering. /// -/// **NOT a spec-complete codec.** These are witness-quality helpers only — +/// **NOT a spec-complete codec.** These are witness-quality helpers only , /// not suitable for security decisions, not reusable outside witness display. pub fn encode_concrete_for_witness(kind: TransformKind, input: &str) -> Option { match kind { @@ -980,7 +980,7 @@ pub fn evaluate_string_op_concrete(method: &StringMethod, receiver: &str) -> Opt )), StringMethod::StrLen => Some(SymbolicValue::Concrete(receiver.len() as i64)), StringMethod::Substr => { - // Substr needs index args — concrete evaluation handled in smart constructor + // Substr needs index args, concrete evaluation handled in smart constructor None } } @@ -993,7 +993,7 @@ pub fn evaluate_string_op_concrete(method: &StringMethod, receiver: &str) -> Opt /// Detect whether a Replace operation acts as a security sanitizer. /// /// Returns `None` if the pattern is not security-relevant. This is conservative: -/// the symbolic string theory does NOT clear taint via Replace — detection is +/// the symbolic string theory does NOT clear taint via Replace, detection is /// informational only for witness quality. 
pub fn detect_replace_sanitizer( pattern: &str, @@ -1271,7 +1271,7 @@ mod tests { #[test] fn test_evaluate_substr_returns_none() { - // Substr needs index args — concrete eval handled in smart constructor + // Substr needs index args, concrete eval handled in smart constructor let result = evaluate_string_op_concrete(&StringMethod::Substr, "hello"); assert_eq!(result, None); } @@ -1363,7 +1363,7 @@ mod tests { #[test] fn test_classify_transform_js_rich_sanitizer_not_matched() { - // DOMPurify.sanitize is a rich sanitizer — NOT a simple escape + // DOMPurify.sanitize is a rich sanitizer, NOT a simple escape assert!(classify_transform_method("DOMPurify.sanitize", Lang::JavaScript).is_none()); assert!(classify_transform_method("sanitizeHtml", Lang::JavaScript).is_none()); assert!(classify_transform_method("xss", Lang::JavaScript).is_none()); @@ -1499,7 +1499,7 @@ mod tests { #[test] fn test_classify_transform_ruby_cgi_escape() { let info = classify_transform_method("CGI.escape", Lang::Ruby).unwrap(); - // CGI.escape is percent-encoding in Ruby (not HTML escape — that's + // CGI.escape is percent-encoding in Ruby (not HTML escape, that's // CGI.escapeHTML). assert_eq!(info.kind, TransformKind::UrlEncode); let info = classify_transform_method("CGI::escape", Lang::Ruby).unwrap(); @@ -1532,7 +1532,7 @@ mod tests { #[test] fn test_classify_transform_ruby_rich_sanitizer_not_matched() { - // Rails `sanitize` / `strip_tags` are rich library sanitizers — NOT + // Rails `sanitize` / `strip_tags` are rich library sanitizers, NOT // simple character-level escapes. 
assert!(classify_transform_method("sanitize", Lang::Ruby).is_none()); assert!(classify_transform_method("strip_tags", Lang::Ruby).is_none()); @@ -1642,7 +1642,7 @@ mod tests { #[test] fn test_verified_cap_sql_escape_is_empty() { - // SqlEscape has no verified label rule — witness-only + // SqlEscape has no verified label rule, witness-only assert_eq!(TransformKind::SqlEscape.verified_cap(), Cap::empty()); assert!(!TransformKind::SqlEscape.is_protective()); } diff --git a/src/symex/transfer.rs b/src/symex/transfer.rs index 84121dc7..3a18cd2f 100644 --- a/src/symex/transfer.rs +++ b/src/symex/transfer.rs @@ -102,18 +102,18 @@ pub fn transfer_inst( } SsaOp::SelfParam => { - // Implicit method receiver — symbolic input, not tainted by default. + // Implicit method receiver, symbolic input, not tainted by default. state.set(inst.value, SymbolicValue::Symbol(inst.value)); } SsaOp::CatchParam => { if let Some(exc_val) = state.take_exception_context() { - // On an exception path — seed from exception context + // On an exception path, seed from exception context // and mark tainted (matches taint engine: CatchParam gets Cap::all()) state.set(inst.value, exc_val); state.mark_tainted(inst.value); } else { - // Normal path or no explicit exception context — still mark tainted + // Normal path or no explicit exception context, still mark tainted // to match taint engine behavior (ssa_transfer.rs CatchParam gets Cap::all()) state.set(inst.value, SymbolicValue::Symbol(inst.value)); state.mark_tainted(inst.value); @@ -121,7 +121,7 @@ pub fn transfer_inst( } SsaOp::Nop => { - // Nop does not define a meaningful value — skip. + // Nop does not define a meaningful value, skip. } SsaOp::Undef => { @@ -136,10 +136,10 @@ pub fn transfer_inst( // receiver's taint to the result so flat root-set tracking // continues to flow taint through chained accesses. 
// - // Phase 4 deliberately keeps the opaque-Symbol model: without + // This pass deliberately keeps the opaque-Symbol model: without // a field-sensitive heap, a dedicated `Field { receiver, name }` // SymbolicValue variant cannot soundly carry concrete reads - // across method boundaries — the witness pipeline already + // across method boundaries, the witness pipeline already // reconstructs `obj.field` text from `ValueDef.var_name` // (populated by lower.rs to `"base.f1.f2"` for chain projections). // The structured variant is deferred to the field-sensitive @@ -166,7 +166,7 @@ pub fn transfer_inst( // When RHS is a member expression, SSA produces 2 uses: // uses[0] = dotted-path SSA value (e.g., v for "user.name") // uses[1] = base variable SSA value (e.g., v for "user") - // The first operand IS the field value — use it directly. + // The first operand IS the field value, use it directly. if let Some(def) = ssa.value_defs.get(uses_slice[0].0 as usize) { if def.var_name.as_ref().is_some_and(|n| n.contains('.')) { let sym = state.get(uses_slice[0]); @@ -200,13 +200,13 @@ pub fn transfer_inst( let sym = mk_binop(Op::from(bin_op), lhs, rhs); state.set(inst.value, sym); } else { - // No structural info — conservative Unknown + // No structural info, conservative Unknown state.set(inst.value, SymbolicValue::Unknown); } state.propagate_taint(inst.value, uses_slice); } _ => { - // 3+ operands — complex expression + // 3+ operands, complex expression state.set(inst.value, SymbolicValue::Unknown); state.propagate_taint(inst.value, uses_slice); } @@ -306,7 +306,7 @@ pub fn transfer_inst( // Fall through to normal Call } ContainerOp::Writeback { .. } => { - // Symex doesn't model writeback yet — taint + // Symex doesn't model writeback yet, taint // engine handles the destination-arg taint // directly. Fall through to normal Call. } @@ -338,7 +338,7 @@ pub fn transfer_inst( } // Interprocedural symbolic execution. 
- // Execute callee body when available — full state propagation. + // Execute callee body when available, full state propagation. if let Some(ictx) = interproc_ctx { let mut callee_args: Vec<(crate::ssa::ir::SsaValue, SymbolicValue, bool)> = Vec::new(); @@ -550,7 +550,7 @@ fn try_heap_alias_load( /// Transfer a single SSA instruction with optional predecessor context. /// -/// ONLY phi instructions use predecessor-sensitive selection — when +/// ONLY phi instructions use predecessor-sensitive selection, when /// `predecessor` is `Some(bid)`, the phi resolves to the operand from /// that specific predecessor block instead of building a `Phi(...)` /// expression. All non-phi instructions delegate to [`transfer_inst`]. @@ -579,7 +579,7 @@ pub fn transfer_inst_with_predecessor( return; } } - // Predecessor not found among operands — propagate from all (fallback) + // Predecessor not found among operands, propagate from all (fallback) let operand_vals: Vec<_> = operands.iter().map(|(_, v)| *v).collect(); state.propagate_taint(inst.value, &operand_vals); } @@ -715,7 +715,7 @@ fn try_string_method( // If receiver was prepended to arg_syms, it's at index 0; // otherwise first explicit arg is at index 0. if let Some(recv) = receiver { - // Receiver was prepended — it IS the string operand + // Receiver was prepended, it IS the string operand (state.get(*recv), *recv) } else if let Some(&first_op) = all_operands.first() { ( @@ -764,7 +764,7 @@ fn try_string_method( /// Recognize encoding/decoding transforms and build structured /// `Encode`/`Decode` nodes instead of opaque `Call`. /// -/// Taint is always propagated from the operand — encoding preserves taint +/// Taint is always propagated from the operand, encoding preserves taint /// unconditionally. This function does NOT sanitize. 
fn try_transform_method( state: &SymbolicState, @@ -902,7 +902,7 @@ fn model_from_summary( /// /// When a receiver has a known type via type facts, tries type-qualified /// callee name (e.g., `"HttpClient.send"`) before bare-name resolution. This -/// improves summary-based modeling only — not general virtual dispatch. +/// improves summary-based modeling only, not general virtual dispatch. fn resolve_callee_symbolically( ctx: &SymexSummaryCtx, callee: &str, @@ -913,7 +913,7 @@ fn resolve_callee_symbolically( receiver: Option, ) -> Option { // Type-qualified symbolic resolution when receiver has a known type. - // Improves summary-based modeling only — not general virtual dispatch. + // Improves summary-based modeling only, not general virtual dispatch. // Precedence: exact qualified > type-aided disambiguation > bare-name fallback. if let (Some(tf), Some(recv)) = (ctx.type_facts, receiver) && let Some(receiver_type) = tf.get_type(recv) @@ -935,7 +935,7 @@ fn resolve_callee_symbolically( // Attempt 2: Disambiguate among ambiguous bare-name candidates. // Only select when a candidate's FuncKey.name EXACTLY equals the - // qualified name — no substring matching, never guess. + // qualified name, no substring matching, never guess. 
let bare_resolution = ctx.global_summaries .resolve_callee_key(method, ctx.lang, ctx.namespace, None); @@ -1632,7 +1632,7 @@ mod tests { state.mark_tainted(SsaValue(0)); state.set(SsaValue(1), SymbolicValue::Concrete(42)); - // Two Identity entries — should fall back to mk_call, NOT pick one + // Two Identity entries, should fall back to mk_call, NOT pick one let mut gs = GlobalSummaries::new(); insert_summary( &mut gs, @@ -2131,7 +2131,7 @@ mod tests { }, ); - // Empty type facts — no receiver type info + // Empty type facts, no receiver type info let tf = make_type_facts(vec![]); let ctx = SymexSummaryCtx { global_summaries: &gs, @@ -2170,7 +2170,7 @@ mod tests { #[test] fn transfer_call_type_qualified_disambiguation() { // Two summaries both named "send" in different namespaces. - // One named "HttpClient.send" — type disambiguation picks it. + // One named "HttpClient.send", type disambiguation picks it. let (cfg, node) = cfg_with_node(None); let ssa = empty_ssa(); let mut state = SymbolicState::new(); @@ -2180,7 +2180,7 @@ mod tests { state.set(SsaValue(1), SymbolicValue::Symbol(SsaValue(1))); let mut gs = GlobalSummaries::new(); - // First "send" — generic, in ns A (Identity: passes through) + // First "send", generic, in ns A (Identity: passes through) insert_java_summary( &mut gs, "send", @@ -2209,7 +2209,7 @@ mod tests { typed_call_receivers: vec![], }, ); - // Second "send" — in ns B, also with same arity → ambiguous bare-name + // Second "send", in ns B, also with same arity → ambiguous bare-name insert_java_summary( &mut gs, "send", @@ -2247,7 +2247,7 @@ mod tests { SsaFuncSummary { param_to_return: vec![], param_to_sink: vec![], - source_caps: Cap::ENV_VAR, // Source — distinct signal + source_caps: Cap::ENV_VAR, // Source, distinct signal param_to_sink_param: vec![], param_container_to_return: vec![], param_to_container_store: vec![], @@ -2276,7 +2276,7 @@ mod tests { type_facts: Some(&tf), }; - // v2 = v1.send(v0) — receiver v1 is HttpClient + // v2 
= v1.send(v0), receiver v1 is HttpClient let inst = make_inst( 2, SsaOp::Call { @@ -2316,7 +2316,7 @@ mod tests { state.set(SsaValue(1), SymbolicValue::Symbol(SsaValue(1))); let mut gs = GlobalSummaries::new(); - // Summary under "DatabaseConnection.send" — wrong type + // Summary under "DatabaseConnection.send", wrong type insert_java_summary( &mut gs, "DatabaseConnection.send", @@ -2346,7 +2346,7 @@ mod tests { }, ); - // Receiver typed as HttpClient — constructs "HttpClient.send", not "DatabaseConnection.send" + // Receiver typed as HttpClient, constructs "HttpClient.send", not "DatabaseConnection.send" let tf = make_type_facts(vec![(SsaValue(1), TypeKind::HttpClient)]); let ctx = SymexSummaryCtx { global_summaries: &gs, @@ -2396,7 +2396,7 @@ mod tests { state.set(SsaValue(1), SymbolicValue::Symbol(SsaValue(1))); let mut gs = GlobalSummaries::new(); - // Two "send" summaries — different namespaces → ambiguous + // Two "send" summaries, different namespaces → ambiguous insert_java_summary( &mut gs, "send", @@ -2453,7 +2453,7 @@ mod tests { typed_call_receivers: vec![], }, ); - // No "HttpClient.send" summary registered — disambiguation has 0 exact matches + // No "HttpClient.send" summary registered, disambiguation has 0 exact matches let tf = make_type_facts(vec![(SsaValue(1), TypeKind::HttpClient)]); let ctx = SymexSummaryCtx { diff --git a/src/symex/value.rs b/src/symex/value.rs index 4df2993e..bd0e84c5 100644 --- a/src/symex/value.rs +++ b/src/symex/value.rs @@ -5,6 +5,7 @@ use std::fmt; use crate::cfg; use crate::ssa::ir::{BlockId, SsaValue}; +use crate::utils::snippet::truncate_at_char_boundary; /// Maximum expression tree depth before collapsing to `Unknown`. pub const MAX_EXPR_DEPTH: u32 = 32; @@ -120,7 +121,7 @@ pub enum SymbolicValue { StrLen(Box), // ── Encoding/decoding transforms ─────────────────────────── /// Protective or representation transform applied to inner value. - /// Preserves taint unconditionally — does NOT sanitize in symex. 
+ /// Preserves taint unconditionally, does NOT sanitize in symex. Encode(super::strings::TransformKind, Box), /// Decoding/reverse transform applied to inner value. Decode(super::strings::TransformKind, Box), @@ -189,7 +190,7 @@ impl SymbolicValue { } // ───────────────────────────────────────────────────────────────────────────── -// Smart constructors — all tree-building goes through these +// Smart constructors, all tree-building goes through these // ───────────────────────────────────────────────────────────────────────────── /// Build a binary arithmetic expression with concrete folding and depth bounding. @@ -218,11 +219,11 @@ pub fn mk_binop(op: Op, lhs: SymbolicValue, rhs: SymbolicValue) -> SymbolicValue a.checked_rem(*b) } } - // Bitwise — &, |, ^ cannot overflow on i64 + // Bitwise, &, |, ^ cannot overflow on i64 Op::BitAnd => Some(*a & *b), Op::BitOr => Some(*a | *b), Op::BitXor => Some(*a ^ *b), - // Shifts — bounds-checked to 0..=63 (i64 width) + // Shifts, bounds-checked to 0..=63 (i64 width) Op::LeftShift => { if *b < 0 || *b > 63 { None @@ -237,7 +238,7 @@ pub fn mk_binop(op: Op, lhs: SymbolicValue, rhs: SymbolicValue) -> SymbolicValue a.checked_shr(*b as u32) } } - // Comparisons — produce 1 (true) or 0 (false) + // Comparisons, produce 1 (true) or 0 (false) Op::Eq => Some(if *a == *b { 1 } else { 0 }), Op::NotEq => Some(if *a != *b { 1 } else { 0 }), Op::Lt => Some(if *a < *b { 1 } else { 0 }), @@ -397,7 +398,7 @@ pub fn mk_substr( let result = cs.get(i..).unwrap_or(""); return SymbolicValue::ConcreteStr(result.to_owned()); } - _ => {} // end is Some but not concrete — can't fold + _ => {} // end is Some but not concrete, can't fold } } } @@ -458,7 +459,7 @@ pub fn mk_decode(kind: super::strings::TransformKind, s: SymbolicValue) -> Symbo } // ───────────────────────────────────────────────────────────────────────────── -// Display — human-readable witness strings +// Display, human-readable witness strings // 
───────────────────────────────────────────────────────────────────────────── /// Maximum length for the Display output before truncation. @@ -468,10 +469,12 @@ const MAX_STR_DISPLAY_LEN: usize = 64; impl fmt::Display for SymbolicValue { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // Use an internal formatter, then truncate if needed. + // Use an internal formatter, then truncate if needed. UTF-8-safe + // truncation, `ConcreteStr` may carry localised text from source + // (e.g. Cyrillic / Gurmukhi regex literals). let s = display_inner(self); if s.len() > MAX_DISPLAY_LEN { - write!(f, "{}...", &s[..MAX_DISPLAY_LEN]) + write!(f, "{}...", truncate_at_char_boundary(&s, MAX_DISPLAY_LEN)) } else { write!(f, "{}", s) } @@ -483,7 +486,10 @@ fn display_inner(val: &SymbolicValue) -> String { SymbolicValue::Concrete(n) => format!("{}", n), SymbolicValue::ConcreteStr(s) => { if s.len() > MAX_STR_DISPLAY_LEN { - format!("\"{}...\"", &s[..MAX_STR_DISPLAY_LEN]) + format!( + "\"{}...\"", + truncate_at_char_boundary(s, MAX_STR_DISPLAY_LEN) + ) } else { format!("\"{}\"", s) } @@ -675,7 +681,7 @@ mod tests { #[test] fn depth_bounding() { - // Build a chain of depth 33 — should collapse to Unknown + // Build a chain of depth 33, should collapse to Unknown let mut val = SymbolicValue::Symbol(SsaValue(0)); for _ in 0..MAX_EXPR_DEPTH { val = mk_binop(Op::Add, val, SymbolicValue::Concrete(1)); @@ -702,7 +708,7 @@ mod tests { #[test] fn concat_no_int_coercion() { - // ConcreteStr + Concrete(int) should NOT fold — no type coercion + // ConcreteStr + Concrete(int) should NOT fold, no type coercion let result = mk_concat( SymbolicValue::ConcreteStr("val=".into()), SymbolicValue::Concrete(42), @@ -980,7 +986,7 @@ mod tests { #[test] fn left_shift_amount_63() { - // Max valid shift — should not panic + // Max valid shift, should not panic let result = mk_binop(Op::LeftShift, c(1), c(63)); assert_eq!(result, c(1i64 << 63)); } @@ -1286,7 +1292,7 @@ mod tests { /// `mk_phi` must 
not fold when operands have differing types /// (e.g. one branch returns a Concrete int, another returns - /// ConcreteStr). The result is genuinely uncertain — a Phi node + /// ConcreteStr). The result is genuinely uncertain, a Phi node /// must be preserved to expose the type-conflict to downstream /// witness logic, not collapse to one operand. #[test] diff --git a/src/symex/witness.rs b/src/symex/witness.rs index 417eea21..57a0fe99 100644 --- a/src/symex/witness.rs +++ b/src/symex/witness.rs @@ -1,7 +1,7 @@ //! Witness generation for confirmed symbolic findings. //! //! When the multi-path explorer confirms a finding as feasible, this module -//! generates a concrete proof witness — an actual input value that would +//! generates a concrete proof witness, an actual input value that would //! trigger the vulnerability. Witnesses are best-effort: if the expression //! is not string-renderable or constraints are too complex, a generic //! description is produced instead. @@ -44,7 +44,7 @@ pub fn extract_witness( } // 1b. When the sink is a Call node, the return value is typically opaque. - // Look for the best tainted argument instead — that's where injected + // Look for the best tainted argument instead, that's where injected // data actually flows into the sink. let sym = unwrap_sink_call_arg(&sym, state); @@ -85,7 +85,7 @@ pub fn extract_witness( // 6. Branch on string-renderability if tainted.is_empty() { - // No tainted symbols — expression is fully concrete or opaque + // No tainted symbols, expression is fully concrete or opaque let concrete = evaluate_concrete(&sym); Some(format!( "input '{}' flows to {}(\"{}\")", @@ -125,7 +125,7 @@ pub fn extract_witness( /// When the sink expression is a `Call`, find the most informative tainted /// argument to use for witness generation instead of the opaque return value. 
/// -/// Scores each tainted arg by structural richness — args containing protective +/// Scores each tainted arg by structural richness, args containing protective /// transforms (`Encode`/`Decode`), string composition (`Concat`/`BinOp(Add)`), /// or string methods (`Replace`/`Substr`/etc.) outrank bare `Call(...)` /// wrappers (which typically come from prepended receivers or opaque property @@ -242,7 +242,7 @@ fn is_string_renderable(expr: &SymbolicValue) -> bool { SymbolicValue::Substr(s, _, _) => is_string_renderable(s), // Encoding/decoding transforms produce strings SymbolicValue::Encode(_, s) | SymbolicValue::Decode(_, s) => is_string_renderable(s), - // StrLen returns integer — not string-renderable + // StrLen returns integer, not string-renderable SymbolicValue::StrLen(_) => false, // BinOp(Add) on string-renderable operands is string concatenation // in languages where + is overloaded (JS, Python, etc.) @@ -253,7 +253,7 @@ fn is_string_renderable(expr: &SymbolicValue) -> bool { // pass-through for witness purposes (covers property access, simple // wrappers). Multi-arg calls or calls with non-renderable args are opaque. 
SymbolicValue::Call(_, args) if args.len() == 1 => is_string_renderable(&args[0]), - // Other arithmetic, opaque calls, phis, integers, unknown — not string-renderable + // Other arithmetic, opaque calls, phis, integers, unknown, not string-renderable SymbolicValue::Concrete(_) | SymbolicValue::BinOp(_, _, _) | SymbolicValue::Call(_, _) @@ -290,7 +290,7 @@ fn collect_tainted_inner(expr: &SymbolicValue, state: &SymbolicState, out: &mut collect_tainted_inner(v, state, out); } } - // String operations — recurse into operands + // String operations, recurse into operands SymbolicValue::ToLower(s) | SymbolicValue::ToUpper(s) | SymbolicValue::Trim(s) @@ -352,7 +352,7 @@ fn substitute_tainted( .collect(); SymbolicValue::Phi(new_ops) } - // String operations — recurse into operands + // String operations, recurse into operands SymbolicValue::Trim(s) => { SymbolicValue::Trim(Box::new(substitute_tainted(s, tainted, payload))) } @@ -376,14 +376,14 @@ fn substitute_tainted( end.as_ref() .map(|e| Box::new(substitute_tainted(e, tainted, payload))), ), - // Encoding/decoding transforms — preserve structure + // Encoding/decoding transforms, preserve structure SymbolicValue::Encode(kind, s) => { SymbolicValue::Encode(*kind, Box::new(substitute_tainted(s, tainted, payload))) } SymbolicValue::Decode(kind, s) => { SymbolicValue::Decode(*kind, Box::new(substitute_tainted(s, tainted, payload))) } - // Leaf nodes that are not tainted symbols — return unchanged + // Leaf nodes that are not tainted symbols, return unchanged other => other.clone(), } } @@ -407,7 +407,7 @@ fn evaluate_concrete(expr: &SymbolicValue) -> String { let right = evaluate_concrete(r); format!("{}{}", left, right) } - // String operations — apply to recursively evaluated inner + // String operations, apply to recursively evaluated inner SymbolicValue::Trim(s) => evaluate_concrete(s).trim().to_owned(), SymbolicValue::ToLower(s) => evaluate_concrete(s).to_lowercase(), SymbolicValue::ToUpper(s) => 
evaluate_concrete(s).to_uppercase(), @@ -439,7 +439,7 @@ fn evaluate_concrete(expr: &SymbolicValue) -> String { format!("{}", expr) } } - // Encoding/decoding — apply transform to recursively evaluated inner + // Encoding/decoding, apply transform to recursively evaluated inner SymbolicValue::Encode(kind, s) => { let inner = evaluate_concrete(s); super::strings::encode_concrete_for_witness(*kind, &inner) @@ -465,7 +465,7 @@ fn evaluate_concrete(expr: &SymbolicValue) -> String { /// the sink's vulnerability class? /// /// Returns a human-readable note if a transform's `verified_cap()` is -/// non-empty AND does NOT intersect the sink's cap — indicating the +/// non-empty AND does NOT intersect the sink's cap, indicating the /// transform does not match the sink's neutralization class. /// /// This is a **heuristic witness annotation**, not a proof. Representation @@ -485,7 +485,7 @@ fn detect_transform_mismatch(expr: &SymbolicValue, sink_cap: Cap) -> Option { /// Callee's SSA body. pub ssa: &'a SsaBody, @@ -178,7 +178,7 @@ impl<'a> BackwardsCtx<'a> { /// One step of the backwards transfer: given a demand on `value`, compute /// the demand on its immediate SSA operands. Returns the list of -/// `(operand, demand)` pairs — possibly empty if the defining op terminates +/// `(operand, demand)` pairs, possibly empty if the defining op terminates /// the walk (Source/Const/Param). /// /// This is a pure function over the op and demand; cycle detection and @@ -224,7 +224,7 @@ pub fn backward_transfer( SsaOp::CatchParam => (BackwardStep::ReachedCatchParam, SmallVec::new()), SsaOp::Nop => (BackwardStep::Unknown, SmallVec::new()), // Undef is a phi-operand sentinel on edges with no reaching - // definition — nothing to trace backwards through. + // definition, nothing to trace backwards through. 
SsaOp::Undef => (BackwardStep::ReachedConst, SmallVec::new()), SsaOp::Phi(operands) => { // Demand fans out to every incoming value: the runtime value of @@ -254,7 +254,7 @@ pub fn backward_transfer( .. } => { // For Call ops the full demand transfer depends on callee - // metadata (summary or body). The driver handles that — + // metadata (summary or body). The driver handles that , // return a `BackwardStep::Call` carrying the receiver + args // so the driver can consult [`GlobalSummaries`] / bodies_by_key. let mut flat: SmallVec<[(SsaValue, DemandState); 4]> = SmallVec::new(); @@ -276,7 +276,7 @@ pub fn backward_transfer( SsaOp::FieldProj { receiver, .. } => { // Field projection: demand for `obj.f` flows to `obj`. Treated // structurally like a single-operand Assign for the backwards - // walk — sufficient until Phase 4 introduces field-sensitive + // walk, sufficient until future passes will introduce field-sensitive // demand discrimination. let mut next: SmallVec<[(SsaValue, DemandState); 4]> = SmallVec::new(); next.push((*receiver, demand.clone())); @@ -290,12 +290,12 @@ pub fn backward_transfer( /// resolution. #[derive(Clone, Debug, PartialEq, Eq)] pub enum BackwardStep { - /// Defining op is a tainted [`SsaOp::Source`] — walk terminates with a + /// Defining op is a tainted [`SsaOp::Source`], walk terminates with a /// confirmed flow. ReachedSource(NodeIndex), - /// Defining op is a [`SsaOp::Const`] — walk terminates without a source. + /// Defining op is a [`SsaOp::Const`], walk terminates without a source. ReachedConst, - /// Defining op is an [`SsaOp::Param`] / [`SsaOp::SelfParam`] — walk may + /// Defining op is an [`SsaOp::Param`] / [`SsaOp::SelfParam`], walk may /// continue by resolving the parameter against the caller's arguments /// (requires reverse call-graph expansion, which is out of scope for /// the current cut and is handled as a terminal step). 
@@ -305,13 +305,13 @@ pub enum BackwardStep { /// the actual exception source requires exception-edge traversal not /// performed here. ReachedCatchParam, - /// Phi node — driver fans out to predecessors. + /// Phi node, driver fans out to predecessors. Phi, - /// Arithmetic / copy / cast — driver fans out to operands. + /// Arithmetic / copy / cast, driver fans out to operands. Assign, - /// Call op — driver consults summaries and/or callee bodies. + /// Call op, driver consults summaries and/or callee bodies. Call { callee: String }, - /// Defining op could not be located or was a [`SsaOp::Nop`] — walk + /// Defining op could not be located or was a [`SsaOp::Nop`], walk /// terminates as inconclusive. Unknown, } @@ -321,7 +321,7 @@ pub enum BackwardStep { /// Walk backwards from `sink_value` in `ctx.ssa`, producing at most one /// [`BackwardFlow`] per reached source (phi fan-outs can produce multiple). /// -/// Does not consult forward findings — the caller is responsible for +/// Does not consult forward findings, the caller is responsible for /// matching the returned flows against its finding set. pub fn analyse_sink_backwards( ctx: &BackwardsCtx<'_>, @@ -385,7 +385,7 @@ fn walk_dfs( // Before dispatching on the SSA op kind, consult the defining CFG node's // label set. Many Source-labelled callables in the CFG lower to an // `SsaOp::Call` rather than `SsaOp::Source` (request.args.get, - // os.getenv, …) — recognising the label here keeps the walk in + // os.getenv, …), recognising the label here keeps the walk in // sync with the forward engine's source model. let def_cfg_node = ctx.ssa.def_of(value).cfg_node; if def_cfg_node.index() < ctx.cfg.node_count() { @@ -429,7 +429,7 @@ fn walk_dfs( }); } BackwardStep::ReachedConst => { - // Constants never supply taint — treat as a silent prune. + // Constants never supply taint, treat as a silent prune. 
} BackwardStep::ReachedParam { index: _, node } => { // Reverse-call-graph expansion is intentionally left out of the @@ -452,7 +452,7 @@ fn walk_dfs( }); } BackwardStep::ReachedCatchParam => { - // Exception-borne taint — record but don't confirm. Marked + // Exception-borne taint, record but don't confirm. Marked // non-confirmatory so unit tests can distinguish "walk reached // catch-param" from "walk reached source". } @@ -514,7 +514,7 @@ fn walk_dfs( } } // Prevent an unused-variable warning while still accepting - // the key in the matcher — the key is useful for debug + // the key in the matcher, the key is useful for debug // logging in bigger expansions. let _ = callee_key; return; @@ -539,7 +539,7 @@ fn walk_dfs( } } BackwardStep::Unknown => { - // No information — terminate silently. + // No information, terminate silently. } } } @@ -632,12 +632,12 @@ pub const NOTE_BUDGET: &str = "backwards-budget-exhausted"; /// Classification for a forward finding after backwards post-processing. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum FindingVerdict { - /// Backwards reached a matching source — finding corroborated. + /// Backwards reached a matching source, finding corroborated. Confirmed, /// Backwards was inconclusive (no source, not infeasible). Finding /// keeps its forward-assigned confidence. Inconclusive, - /// Backwards proved the flow infeasible — finding confidence must drop. + /// Backwards proved the flow infeasible, finding confidence must drop. Infeasible, /// Budget exhausted before a verdict was reached. BudgetExhausted, @@ -658,7 +658,7 @@ pub fn aggregate_verdict(flows: &[BackwardFlow]) -> FindingVerdict { } /// Apply a verdict as a note on a [`Finding`]. No-ops when the verdict is -/// [`FindingVerdict::Inconclusive`] — the forward finding retains its +/// [`FindingVerdict::Inconclusive`], the forward finding retains its /// original metadata. 
pub fn annotate_finding(finding: &mut Finding, verdict: FindingVerdict) { // `Finding` does not own an Evidence struct directly (that lives on @@ -1079,6 +1079,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; annotate_finding(&mut f, FindingVerdict::Confirmed); let sv = f.symbolic.as_ref().expect("symbolic verdict created"); @@ -1116,6 +1117,7 @@ mod tests { path_hash: 0, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), }; annotate_finding(&mut f, FindingVerdict::Inconclusive); assert!(f.symbolic.is_none()); diff --git a/src/taint/domain.rs b/src/taint/domain.rs index d8bb2bbb..6728f0f8 100644 --- a/src/taint/domain.rs +++ b/src/taint/domain.rs @@ -13,7 +13,7 @@ pub struct VarTaint { pub uses_summary: bool, } -/// A single taint origin — the node and classification of where taint came from. +/// A single taint origin, the node and classification of where taint came from. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct TaintOrigin { pub node: NodeIndex, @@ -30,7 +30,7 @@ pub struct TaintOrigin { /// # Capacity limit /// /// `SmallBitSet` is a fixed-size 64-slot bitset backed by a single `u64`. -/// Inserting a `SymbolId` with ordinal ≥ 64 is a no-op — the bit is silently +/// Inserting a `SymbolId` with ordinal ≥ 64 is a no-op, the bit is silently /// dropped. 
This is a deliberate precision-over-completeness trade: the /// bitset underpins predicate / validation tracking in the SSA taint engine, /// and functions with more than 64 distinct predicate-relevant variables are diff --git a/src/taint/mod.rs b/src/taint/mod.rs index 79d76594..25df5e7b 100644 --- a/src/taint/mod.rs +++ b/src/taint/mod.rs @@ -1,4 +1,5 @@ #![allow(clippy::collapsible_if, clippy::too_many_arguments)] +#![doc = include_str!(concat!(env!("OUT_DIR"), "/taint.md"))] pub mod backwards; pub mod domain; @@ -84,7 +85,7 @@ fn js_ts_pass2_cap() -> usize { // // Active only when the slot is `Some`. Production code path leaves it // `None`, making instrumentation cost a single thread-local borrow + a -// `match Option::None` per measured chunk — sub-nanosecond. +// `match Option::None` per measured chunk, sub-nanosecond. thread_local! { static PERF_LOWER_TIMINGS: std::cell::Cell> = const { std::cell::Cell::new(None) }; @@ -112,10 +113,10 @@ fn perf_lower_record(slot: usize, micros: u128) { /// Test-only override for the Gauss-Seidel toggle. Values: /// -/// * `0` — respect `NYX_JS_GAUSS_SEIDEL` env var (default production +/// * `0`, respect `NYX_JS_GAUSS_SEIDEL` env var (default production /// behaviour). -/// * `1` — force Jacobi (env ignored). -/// * `2` — force Gauss-Seidel (env ignored). +/// * `1`, force Jacobi (env ignored). +/// * `2`, force Gauss-Seidel (env ignored). /// /// Used exclusively by integration tests that need to assert both /// variants produce equal findings without per-test process isolation. @@ -209,7 +210,7 @@ pub struct Finding { /// The kind of source that originated the taint. pub source_kind: SourceKind, /// Whether all tainted sink variables are guarded by a validation - /// predicate on this path (metadata only — does not change severity). + /// predicate on this path (metadata only, does not change severity). pub path_validated: bool, /// The kind of validation guard protecting this path, if any. 
pub guard_kind: Option, @@ -233,7 +234,7 @@ pub struct Finding { /// sink was resolved via a function summary carrying a /// [`crate::summary::SinkSite`] with concrete coordinates for primary /// sink-location attribution. `None` for: - /// * intra-procedural / label-based sinks — the caller's `cfg[sink]` + /// * intra-procedural / label-based sinks, the caller's `cfg[sink]` /// span already names the dangerous instruction; /// * summary-resolved sinks whose `SinkSite` was cap-only (no tree or /// bytes context at extraction time). @@ -245,7 +246,7 @@ pub struct Finding { /// the scan root is the file itself (every namespace normalizes to /// `""`); consumers resolve empty `file_rel` against the file under /// analysis. Enforced at `ssa_events_to_findings` by a - /// `debug_assert!` — upstream filters drop cap-only sites before + /// `debug_assert!`, upstream filters drop cap-only sites before /// they reach this field. /// /// Deliberately independent of `uses_summary`: that flag tracks whether @@ -255,13 +256,13 @@ pub struct Finding { /// `primary_location`. pub primary_location: Option, /// Engine provenance notes recorded during the analysis that produced - /// this finding. Populated when an internal budget/cap was hit — see + /// this finding. Populated when an internal budget/cap was hit, see /// [`crate::engine_notes::EngineNote`]. Empty for the typical /// under-budget finding. pub engine_notes: SmallVec<[EngineNote; 2]>, /// Stable hash of the intermediate-variable sequence between `source` /// and `sink`. Used to keep distinct paths through different - /// variables as separate findings during deduplication — two + /// variables as separate findings during deduplication, two /// `(body_id, sink, source)` siblings with different `path_hash` /// values represent flows along different data paths and are /// preserved as alternatives rather than collapsed. 
@@ -289,6 +290,13 @@ pub struct Finding { /// formatters can present them as "this flow … and N alternative /// path(s)" rather than silently dropping one. pub alternative_finding_ids: SmallVec<[String; 2]>, + /// Sink-cap mask that this specific finding fired against. Carries the + /// per-event `sink_caps` from the multi-gate dispatch (e.g. + /// `Cap::SSRF` for a URL-flow finding on `fetch`, `Cap::DATA_EXFIL` + /// for a body-flow finding on the same call). Used by `ast.rs` to + /// route the finding to a cap-specific rule id rather than the + /// generic `taint-unsanitised-flow` bucket. + pub effective_sink_caps: crate::labels::Cap, } impl Finding { @@ -425,7 +433,7 @@ pub(crate) fn analyse_file_with_lowered( // 3. Unified multi-body analysis with lexical containment propagation. // - // `max_iterations` is the safety cap, not an expected depth — the + // `max_iterations` is the safety cap, not an expected depth, the // pass-2 loop breaks on seed equality (monotone lattice, finite // height) and only rides the cap when convergence legitimately // needs more rounds than the cap allows. See @@ -481,7 +489,7 @@ pub(crate) fn analyse_file_with_lowered( // dedup_by_key(|f| (body_id, sink, source)); // // which silently collapsed an *unguarded* flow reaching the same - // `(sink, source)` as a guarded flow — the `!path_validated` sort + // `(sink, source)` as a guarded flow, the `!path_validated` sort // ordered `path_validated == true` first, so the exploitable // branch was the one that got dropped. // @@ -541,7 +549,7 @@ fn make_finding_id(f: &Finding) -> String { /// Cross-link findings that share `(body_id, sink, source)` but differ /// on `path_validated` or `path_hash`. After this call each such /// finding's `alternative_finding_ids` lists every sibling's -/// [`Finding::finding_id`] — so a guarded flow links to the unguarded +/// [`Finding::finding_id`], so a guarded flow links to the unguarded /// sibling and vice versa. 
Isolated findings (no sibling) get an /// empty list. fn link_alternative_paths(findings: &mut [Finding]) { @@ -576,7 +584,7 @@ fn link_alternative_paths(findings: &mut [Finding]) { /// Compute containment-topological order: parent bodies before children. /// /// Uses BFS from roots (bodies with no parent), ensuring a body is always -/// processed after its parent — required for lexical seed propagation. +/// processed after its parent, required for lexical seed propagation. /// Returns indices into `file_cfg.bodies` in processing order. fn containment_order(bodies: &[BodyCfg]) -> Vec { let mut children: HashMap> = HashMap::new(); @@ -637,7 +645,7 @@ fn analyse_body_with_seed( // Per-body graphs contain only the body's own nodes. // For non-toplevel bodies, use lower_to_ssa_with_params with scope to // create SsaOp::Param ops for external/captured variables and formal - // parameters — required for global_seed to inject taint from the parent. + // parameters, required for global_seed to inject taint from the parent. // Top-level bodies use lower_to_ssa with scope_all=true (no Param ops). let is_toplevel = body.meta.parent_body_id.is_none(); // JS/TS function bodies always use scoped lowering to create Param ops @@ -708,12 +716,9 @@ fn analyse_body_with_seed( } else { Some(static_map) }; - // Pointer-Phase 3 / W1+W2+W3: per-body field-sensitive points-to - // facts. Computed only when `NYX_POINTER_ANALYSIS=1`; the - // per-body `analyse_body` cost is amortised across the three - // hooks (W1 field-write read-back, W2 container ELEM cells, - // W3 cross-call resolver). Strict-additive: `None` keeps - // pointer-disabled behaviour bit-identical. + // Per-body field-sensitive points-to facts. Cost is + // amortised across field-write read-back, container ELEM + // cells, and the cross-call resolver. 
let pointer_facts = if crate::pointer::is_enabled() { Some(crate::pointer::analyse_body(&ssa_body, body.meta.id)) } else { @@ -836,7 +841,7 @@ fn analyse_body_with_seed( Err(e) => { // SSA lowering produced no analyzable body. We still surface // the event so downstream tooling can tell "we tried and gave - // up" from "we ran clean" — a TRACE-level log records the + // up" from "we ran clean", a TRACE-level log records the // reason (no synthetic Finding is manufactured because a // diag pointing at no source location would be misleading). tracing::trace!( @@ -948,7 +953,7 @@ fn analyse_multi_body( let top_cfg = &top.graph; // Collect top-level binding keys for seed filtering. Always - // keyed under `BodyId(0)` — `filter_seed_to_toplevel` matches + // keyed under `BodyId(0)`, `filter_seed_to_toplevel` matches // by name and re-keys every surviving entry to `BodyId(0)` // anyway, so the body_id on the probe keys is informational. let toplevel_keys: HashSet = { @@ -969,7 +974,7 @@ fn analyse_multi_body( // re-analysis when a name it reads via Param or via the // global_seed ancestor-lookup path has actually changed in // the combined seed. `reads` is a superset of the body's - // top-level dependencies — we err on the side of over-running + // top-level dependencies, we err on the side of over-running // (false dirty) rather than missing a dependency. let body_reads: HashMap> = { let mut m: HashMap> = HashMap::new(); @@ -1060,7 +1065,7 @@ fn analyse_multi_body( // Re-run non-toplevel bodies with updated seed. body_exit_states.insert(BodyId(0), current_seed.clone()); - // Phase-C: Gauss-Seidel variant — as each body is + // Phase-C: Gauss-Seidel variant, as each body is // re-analysed, merge its new exit into `current_seed` // immediately so subsequent bodies in the same round see // the fresh value. Order matters here; we pin to @@ -1137,7 +1142,7 @@ fn analyse_multi_body( // Record observability counter. 
`iters_used == 0` covers the // non-JS/TS path (`max_iterations == 1`) and the JS/TS case where - // the convergence loop did not enter — report `1` so the counter + // the convergence loop did not enter, report `1` so the counter // always reflects "at least the lexical-containment pass ran". let reported_iters = if iters_used == 0 { 1 } else { iters_used }; LAST_JS_TS_PASS2_ITERATIONS.store(reported_iters, Ordering::Relaxed); @@ -1287,7 +1292,7 @@ fn lookup_formal_params(local_summaries: &FuncSummaries, func_name: &str) -> Vec /// When exactly one `(name, arity)`-matching entry exists we use its full /// identity (container / disambig / kind preserved). When zero or multiple /// match we fall back to a free-function key so the caller still has a -/// well-formed key — this can only happen in legacy discovery paths that +/// well-formed key, this can only happen in legacy discovery paths that /// cannot see through same-name siblings, and those paths were already /// collision-prone before this refactor. New intra-file analysis code /// should prefer [`BodyMeta::func_key`]. @@ -1300,7 +1305,7 @@ fn lookup_canonical_func_key( ) -> FuncKey { // `local_summaries` is file-local, so every entry's namespace agrees with // whatever `build_cfg` wrote (raw file path). We match by lang + name + - // arity and fall back to name-only — the caller's `namespace` argument is + // arity and fall back to name-only, the caller's `namespace` argument is // only used when we have to synthesise a key as a last resort. let mut matches = local_summaries .keys() @@ -1372,7 +1377,7 @@ pub(crate) fn extract_intra_file_ssa_summaries( .count() }; - // Zero-param helpers are normally elided — a fixture with no + // Zero-param helpers are normally elided, a fixture with no // parameters cannot carry per-parameter taint transforms. 
But // zero-arg factories (`function makeBag() { return []; }`) do // have one observable cross-file effect: the return is a fresh @@ -1409,7 +1414,7 @@ pub(crate) fn extract_intra_file_ssa_summaries( // must survive this filter so summary application at cross-file // call sites can replay the alias edges. Zero-param factories // are kept via the `returns_fresh_alloc` leg of - // `points_to.is_empty()` — `is_empty()` returns false when the + // `points_to.is_empty()`, `is_empty()` returns false when the // fresh-alloc flag is set. if !summary.param_to_return.is_empty() || !summary.param_to_sink.is_empty() @@ -1436,7 +1441,7 @@ pub(crate) fn extract_intra_file_ssa_summaries( } /// Lower all function bodies from `FileCfg` to produce SSA summaries + cached -/// bodies. Each body's own graph is used directly — no scope filtering needed. +/// bodies. Each body's own graph is used directly, no scope filtering needed. /// /// Both returned maps are keyed by each body's canonical [`FuncKey`] (carried /// on [`crate::cfg::BodyMeta::func_key`]). This is the most collision- @@ -1503,7 +1508,7 @@ pub(crate) fn lower_all_functions_from_bodies( // `build_cfg` wrote. The caller passes `namespace` already normalized // against `scan_root`, which is what FuncSummary keys use on the // cross-file side (`FuncSummary::func_key`). Overriding the namespace - // here keeps both sides of `GlobalSummaries` agreement — otherwise + // here keeps both sides of `GlobalSummaries` agreement, otherwise // `resolve_callee` resolves to the normalized FuncSummary key and // misses the raw-path SSA entry. let mut key = body.meta.func_key.clone().unwrap_or_else(|| { @@ -1542,7 +1547,7 @@ pub(crate) fn lower_all_functions_from_bodies( // Always insert the summary, even when all fields are empty/default. 
// An empty summary tells resolve_callee "this function exists and has - // no taint effects" — preventing fallthrough to the less precise old + // no taint effects", preventing fallthrough to the less precise old // FuncSummary which may report false source_caps from internal sources. // For zero-param functions we only insert when the summary carries // the fresh-container signal (the only observable effect worth @@ -1563,34 +1568,23 @@ pub(crate) fn lower_all_functions_from_bodies( perf_lower_record(2, _t_opt.elapsed().as_micros()); let _t_typed = std::time::Instant::now(); - // Phase 2 (typed call-graph devirtualisation): walk every SSA - // method call in this body, look up the receiver SSA value's - // [`crate::ssa::type_facts::TypeKind`] in the just-computed - // `opt.type_facts`, and record `(call_ordinal, container_name)` - // on the matching summary so Phase 3 in `build_call_graph` can - // narrow the indirect-method-call edge to the receiver-typed - // container. Free-function calls (`receiver: None`) and - // unknown receiver types are silently skipped — the bare-name - // resolution path applies unchanged in that case. + // For every SSA method call, look up the receiver's TypeKind + // and record `(call_ordinal, container_name)` so devirtualisation + // in `build_call_graph` can narrow the edge to the receiver-typed + // container. Free-function calls and unknown types fall back to + // bare-name resolution. let typed_receivers = collect_typed_call_receivers(&func_ssa, &body.graph, &opt.type_facts); if !typed_receivers.is_empty() { - // The summary may not have been inserted above (zero-param, - // no-fresh-alloc bodies are skipped). Force-insert in that - // case so the receiver-type info reaches Phase 3 — without - // it, the cross-file devirtualisation signal would be lost - // for any method invoked inside a parameterless caller. 
+ // Zero-param/no-fresh-alloc bodies are skipped above; + // force-insert so receiver-type info still reaches + // build_call_graph. let entry = summaries.entry(key.clone()).or_default(); entry.typed_call_receivers = typed_receivers; } - // Pointer-Phase 5 / W3: populate `field_points_to` from the - // body's pointer facts when the analysis is enabled. Strict - // opt-in via `NYX_POINTER_ANALYSIS=1`; off-by-default keeps - // bit-for-bit identity with the pre-W3 behaviour. - // - // `extract_field_points_to` covers both reads (via - // `SsaOp::FieldProj` walks) and writes (via the W1 - // `field_writes` side-table on the body) in a single pass. + // Populate `field_points_to` from the body's pointer facts. + // `extract_field_points_to` covers both reads (FieldProj walks) + // and writes (`field_writes` side-table) in one pass. if crate::pointer::is_enabled() { let facts = crate::pointer::analyse_body(&func_ssa, body.meta.id); let fpt = crate::pointer::extract_field_points_to(&func_ssa, &facts); @@ -1621,7 +1615,7 @@ pub(crate) fn lower_all_functions_from_bodies( // Lift child-body sinks into the parent's `param_to_sink` for // every parent body with lexically contained children. This // handles the direct-wrapper case - // `f(x) { return new Promise((res, rej) => sink(x)) }` — the + // `f(x) { return new Promise((res, rej) => sink(x)) }`, the // executor's gated http.get sink becomes visible to callers of // `f` via `f.summary.param_to_sink`. // @@ -1635,8 +1629,8 @@ pub(crate) fn lower_all_functions_from_bodies( // propagation at summary-extraction time so cross-call // resolution sees the sink at every caller of `f`. // - // Strict-additive: only ADDs `param_to_sink` entries — never - // removes or modifies existing data — so it cannot regress + // Strict-additive: only ADDs `param_to_sink` entries, never + // removes or modifies existing data, so it cannot regress // detection. Bounded: each parent-param probe runs each child // body's analysis exactly once. 
let _t_aug = std::time::Instant::now(); @@ -1665,7 +1659,7 @@ pub(crate) fn lower_all_functions_from_bodies( // OR-merge: only adds `param_to_sink` / `param_to_sink_param` // entries to existing summaries. Existing entries (return // transforms, source caps, augment-populated sinks, etc.) are - // preserved. Strict-additive — cannot regress detection. + // preserved. Strict-additive, cannot regress detection. let _t_rerun = std::time::Instant::now(); rerun_extraction_with_augmented_summaries( file_cfg, @@ -1919,7 +1913,7 @@ fn augment_summaries_with_child_sinks( let parent_interner = crate::state::symbol::SymbolInterner::from_cfg(parent_cfg); // Collect (formal_param_idx, var_name, ssa_value) for the parent's - // formal params — mirrors `extract_ssa_func_summary`'s param scan. + // formal params, mirrors `extract_ssa_func_summary`'s param scan. let mut parent_param_info: Vec<(usize, String)> = Vec::new(); for block in &parent_ssa.blocks { for inst in block.phis.iter().chain(block.body.iter()) { @@ -2055,7 +2049,7 @@ fn augment_summaries_with_child_sinks( } // Aggregate sink caps across all child events into one - // entry per parent param (cap-only SinkSite — the + // entry per parent param (cap-only SinkSite, the // exact location lives in the child body's CFG and is // not directly addressable from the parent's summary). let mut union_caps = Cap::empty(); @@ -2088,7 +2082,7 @@ fn augment_summaries_with_child_sinks( // engine's primary sink-site picker uses // `param_to_sink_param` for arg-position filtering) // sees this captured-flow sink. Position 0 is a - // best-effort placeholder — the actual filtering at + // best-effort placeholder, the actual filtering at // the caller is by SSRF cap, not arg position, when // the wrapper is itself non-gated. if !entry @@ -2109,7 +2103,7 @@ fn augment_summaries_with_child_sinks( /// non-empty [`crate::ssa::type_facts::TypeKind::container_name`]. 
/// /// Free-function calls (`receiver: None`) and unknown receiver types -/// are skipped — the cross-file call-graph builder will fall back to +/// are skipped, the cross-file call-graph builder will fall back to /// today's name-only resolution for those, preserving the /// "subset of today's targets, never a superset" invariant from /// `docs/typed-call-graph-prompt.md`. @@ -2135,13 +2129,13 @@ fn collect_typed_call_receivers( continue; }; let Some(receiver_val) = receiver else { - continue; // free-function call — no devirtualisation possible + continue; // free-function call, no devirtualisation possible }; let Some(kind) = type_facts.get_type(*receiver_val) else { - continue; // type unknown — fall back to name-only resolution + continue; // type unknown, fall back to name-only resolution }; let Some(container) = kind.container_name() else { - continue; // scalar/unknown type — no useful container + continue; // scalar/unknown type, no useful container }; let Some(node_info) = cfg.node_weight(inst.cfg_node) else { continue; @@ -2150,7 +2144,7 @@ fn collect_typed_call_receivers( // A single SSA call instruction maps 1:1 with a CFG call // node, so each ordinal should appear at most once. The // dedup guard exists in case lowering ever introduces a - // second SSA Call sharing a cfg_node — first wins. + // second SSA Call sharing a cfg_node, first wins. if !seen.insert(ordinal) { continue; } @@ -2211,7 +2205,7 @@ pub(crate) fn build_eligible_bodies( continue; } // Populate node metadata against the per-body graph whose NodeIndex - // space the SSA was produced on — otherwise cross-file replay can't + // space the SSA was produced on, otherwise cross-file replay can't // find the original CFG nodes. 
// // `key.namespace` was already normalised against `scan_root` in diff --git a/src/taint/path_state.rs b/src/taint/path_state.rs index 5b7a3027..692dca3d 100644 --- a/src/taint/path_state.rs +++ b/src/taint/path_state.rs @@ -35,13 +35,13 @@ pub enum PredicateKind { /// Commonly paired with [`ShellMetaValidated`] in OR-chain rejection /// idioms (`if x.len() > MAX || x.contains(";") { reject }`). Counts as /// a dominator guard for `cfg-unguarded-sink` purposes, but intentionally - /// does **not** mark variables as validated — the rejection direction is + /// does **not** mark variables as validated, the rejection direction is /// ambiguous from the condition alone (a `.len() > 5 { sink(x) }` /// gate is a precondition, not a rejection). BoundedLength, /// Comparison operators: `x == 5`, `x > threshold` Comparison, - /// Generic boolean test — cannot classify further. + /// Generic boolean test, cannot classify further. Unknown, } @@ -50,7 +50,7 @@ pub enum PredicateKind { /// /// Presence of any of these in user input is sufficient to enable shell /// injection, so rejecting input that contains them is a real sanitizer. -/// `"foo"` or other non-metachar needles don't qualify — a rejection of +/// `"foo"` or other non-metachar needles don't qualify, a rejection of /// those is business logic, not security. const SHELL_METACHARS: &[&str] = &[";", "|", "&", "`", "$", ">", "<", "\n", "\r", "\0"]; @@ -65,7 +65,7 @@ const SHELL_METACHARS: &[&str] = &[";", "|", "&", "`", "$", ">", "<", "\n", "\r" /// character class containing only metacharacters. /// /// Returns `false` if the needle is a non-metachar literal or cannot be -/// extracted — falls through to broader classification. +/// extracted, falls through to broader classification. 
fn is_shell_metachar_rejection(text: &str) -> bool { // Method-call form: `.contains(…)` / `.includes(…)` / `.include?(…)` for method in [".contains(", ".includes(", ".include?("] { @@ -134,7 +134,7 @@ fn extract_first_string_arg(after_open: &str) -> Option { } /// For Python `"" in x` (needle on the left side of ` in `), return -/// the needle. Returns `None` for `x in ALLOWED` (identifier on the left) — +/// the needle. Returns `None` for `x in ALLOWED` (identifier on the left); /// that is an allowlist check, not a rejection. fn extract_python_in_needle(text: &str) -> Option { let pos = text.find(" in ")?; @@ -155,7 +155,7 @@ fn extract_python_in_needle(text: &str) -> Option { /// Detect regex character classes that contain only shell metacharacters: /// `[;|&]`, `[;&`$]`, etc. Missing: escape-class metacharacters inside the -/// class (e.g. `[\n]`) — conservative, returns false there. +/// class (e.g. `[\n]`), conservative, returns false there. fn is_metachar_regex_class(text: &str) -> bool { // Find `[` followed by content and `]`, anywhere in the text. let mut rest = text; @@ -180,7 +180,7 @@ fn is_metachar_regex_class(text: &str) -> bool { /// Check whether `text` looks like a bounded-length rejection: /// `x.len() > N`, `x.len() < N`, `x.length >= N`, etc. where `N` is an -/// integer literal >= 2. Excludes `> 0` / `>= 1` / `< 1` — those are +/// integer literal >= 2. Excludes `> 0` / `>= 1` / `< 1`, those are /// non-empty checks, which are not length-bound validations. fn is_bounded_length_check(lower: &str) -> bool { const PROBES: &[&str] = &[ @@ -290,7 +290,7 @@ pub fn classify_condition(text: &str) -> PredicateKind { // Matched BEFORE AllowlistCheck so that `x.contains(";")` is recognized // as a rejection idiom rather than a membership test. Checked on the // raw (non-lowercased) text so metacharacter comparisons stay - // case-accurate — `;` / `|` / `&` have no case. + // case-accurate, `;` / `|` / `&` have no case.
if is_shell_metachar_rejection(text) { return PredicateKind::ShellMetaValidated; } @@ -409,7 +409,7 @@ pub fn classify_condition(text: &str) -> PredicateKind { /// validator's effect is opaque: we can't tell which argument is being /// checked. Returning the original kind with `None` target would cause /// upstream code to over-validate (mark every `condition_var` as validated). -/// Instead, we fall back to `PredicateKind::Unknown` — safer to assume the +/// Instead, we fall back to `PredicateKind::Unknown`, safer to assume the /// validator did nothing than to assume it validated every variable in the /// condition. Single-argument calls retain `(kind, None)` so downstream code /// can still use the predicate-summary bit tracking. @@ -442,7 +442,7 @@ pub fn classify_condition_with_target(text: &str) -> (PredicateKind, Option { - // `x === '/login'`, `x == 5`, `null != obj` — when exactly one + // `x === '/login'`, `x == 5`, `null != obj`, when exactly one // side is a literal, extract the identifier side as the target. // Downstream `apply_branch_predicates` uses this to mark the // variable as `validated_may` on the true (equal) branch. @@ -464,7 +464,7 @@ pub fn classify_condition_with_target(text: &str) -> (PredicateKind, Option Option { let trimmed = text.trim(); @@ -537,7 +537,7 @@ fn is_comparison_literal(s: &str) -> bool { /// `Some(0)` for a call with empty argument list. Respects paren/bracket/brace /// nesting so `f(g(a, b), c)` counts as 2 top-level args. /// -/// Best-effort — operates on source text, not an AST. Used by +/// Best-effort, operates on source text, not an AST. Used by /// `classify_condition_with_target` to distinguish single-arg vs multi-arg /// validator calls when target extraction fails. 
fn count_call_args(text: &str) -> Option { @@ -592,7 +592,7 @@ fn extract_validation_target(text: &str) -> Option { } } - // Function call pattern: `func(x, ...)` — extract first argument + // Function call pattern: `func(x, ...)`, extract first argument // Strip closing paren if present let args_inner = args_part.trim_end().strip_suffix(')').unwrap_or(args_part); // Take text up to first comma (first argument) @@ -653,7 +653,7 @@ fn extract_allowlist_target(text: &str) -> Option { // Python `in` operator: `cmd in ALLOWED` / `cmd not in ALLOWED` if lower.contains(" in ") { - // Find the leftmost ` in ` — everything before it is the target expression + // Find the leftmost ` in `, everything before it is the target expression // Handle `not in` by looking for ` not in ` first let target_part = if let Some(pos) = lower.find(" not in ") { &trimmed[..pos] @@ -857,7 +857,7 @@ mod tests { #[test] fn classify_validation_requires_paren() { - // `x_valid == true` should NOT be ValidationCall — no `(` call syntax. + // `x_valid == true` should NOT be ValidationCall, no `(` call syntax. assert_eq!( classify_condition("x_valid == true"), PredicateKind::Comparison @@ -978,7 +978,7 @@ mod tests { #[test] fn target_multi_arg_fallback_opaque_expr_is_unknown() { - // `validate(x + 1, y)` — first arg is an expression, not an identifier. + // `validate(x + 1, y)`, first arg is an expression, not an identifier. // Target extraction fails. Multi-arg call, so fall back to Unknown // rather than letting upstream validate every condition var. let (kind, target) = classify_condition_with_target("validate(x + 1, y)"); diff --git a/src/taint/ssa_transfer/events.rs b/src/taint/ssa_transfer/events.rs index 3f9e4388..df9c0cf7 100644 --- a/src/taint/ssa_transfer/events.rs +++ b/src/taint/ssa_transfer/events.rs @@ -1,9 +1,9 @@ //! Taint event emission and conversion to [`crate::taint::Finding`]. //! //! Extracted from the monolithic `ssa_transfer.rs`. Contains: -//! 
* [`SsaTaintEvent`] — the raw event struct produced by the block-level +//! * [`SsaTaintEvent`], the raw event struct produced by the block-level //! worklist each time a tainted value reaches a sink. -//! * [`ssa_events_to_findings`] — event → `Finding` conversion with the +//! * [`ssa_events_to_findings`], event → `Finding` conversion with the //! `primary_location` invariant and dedup. //! * Flow-path reconstruction helpers ([`reconstruct_flow_path`] and //! operand pickers). @@ -38,14 +38,14 @@ pub struct SsaTaintEvent { /// `sink_caps`. When multiple [`SinkSite`]s for the same `(param_idx, /// cap mask)` match, the emission site produces one event per /// [`SinkSite`] so each downstream [`crate::taint::Finding`] carries a - /// single primary attribution — the multi-primary case collapses to + /// single primary attribution, the multi-primary case collapses to /// multiple single-primary events. /// /// `None` for: /// * intra-procedural sinks (`uses_summary == false`), where the /// caller's sink span already names the dangerous instruction; /// * summary-resolved sinks whose callee summary carried only cap-only - /// [`SinkSite`]s (no source coordinates — e.g. pass-2 transient + /// [`SinkSite`]s (no source coordinates, e.g. pass-2 transient /// summaries or local `LocalFuncSummary`-only callees). pub primary_sink_site: Option, } @@ -79,7 +79,7 @@ pub(super) fn block_distance(ssa: &SsaBody, source_node: NodeIndex, sink_node: N } } } - 0 // unreachable or not connected — conservative default + 0 // unreachable or not connected, conservative default } // ── Flow Path Reconstruction ───────────────────────────────────────────── @@ -204,7 +204,7 @@ pub(super) fn reconstruct_flow_path( SsaOp::FieldProj { receiver, .. } => { // Treat field projection as a one-step assignment for // flow-step reconstruction: taint reaching `obj.f` came - // from `obj`. Phase 4 will refine the witness rendering + // from `obj`. 
The analysis may refine the witness rendering // to include the field name in the step. steps.push(FlowStepRaw { cfg_node: inst.cfg_node, @@ -270,7 +270,7 @@ fn pick_tainted_operand_call( /// /// Note: this invariant is intentionally independent of `uses_summary`. /// The taint-chain flag tracks summary-propagated *taint*, not summary- -/// resolved *sinks* — a local source can reach a cross-file sink, so +/// resolved *sinks*; a local source can reach a cross-file sink, so /// `primary_location.is_some()` does not imply `uses_summary == true`. pub fn ssa_events_to_findings( events: &[SsaTaintEvent], @@ -329,7 +329,7 @@ pub fn ssa_events_to_findings( // Data-integrity invariant: a populated primary_location must at least // carry resolved line coordinates. `file_rel` may legitimately be - // empty — when the scan root is the caller file itself (single-file + // empty: when the scan root is the caller file itself (single-file // scans), every namespace normalizes to `""` and the callee's site // inherits that empty path; consumers resolve it against the file // under analysis. Line==0 is the only filter-worthy invariant. @@ -340,7 +340,7 @@ pub fn ssa_events_to_findings( // Dedup key includes primary location so multi-site events that // share a single (source, sink) pair still produce distinct findings - // — one per resolved callee-internal site. + // (one per resolved callee-internal site). let loc_key = primary_location .as_ref() .map(|l| (l.file_rel.clone(), l.line, l.col)); @@ -374,6 +374,11 @@ pub fn ssa_events_to_findings( path_hash, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + // Per-event mask from the multi-gate dispatch, picks + // exactly the cap that fired (e.g. `Cap::DATA_EXFIL` + // for a `fetch` body-flow finding versus `Cap::SSRF` + // for a URL-flow finding on the same call). 
+ effective_sink_caps: event.sink_caps & *caps, }); } } diff --git a/src/taint/ssa_transfer/inline.rs b/src/taint/ssa_transfer/inline.rs index 65a46ce3..c3c6d4df 100644 --- a/src/taint/ssa_transfer/inline.rs +++ b/src/taint/ssa_transfer/inline.rs @@ -1,34 +1,10 @@ -//! Context-sensitive inline analysis — cache, body, and attribution types. +//! Context-sensitive inline analysis, cache, body, and attribution types. //! -//! Extracted from the monolithic `ssa_transfer.rs`. Contains: -//! * [`ArgTaintSig`] — compact per-arg cap signature used as a cache key. -//! * [`InlineResult`] / [`CachedInlineShape`] / [`ReturnShape`] — the -//! callsite-adapted and callsite-agnostic inline-analysis result types. -//! * [`InlineCache`] — the shared cache map keyed by -//! `(FuncKey, ArgTaintSig)`. -//! * [`CrossFileNodeMeta`] / [`CalleeSsaBody`] — the serde-able bodies -//! persisted to SQLite for cross-file context-sensitive analysis. -//! * [`populate_node_meta`] / [`rebuild_body_graph`] — bookkeeping for -//! cross-file body proxy CFGs. -//! -//! The implementation functions (`inline_analyse_callee`, -//! `apply_cached_shape`, `extract_inline_return_taint`) remain in the -//! parent `mod.rs` because they depend tightly on the block worklist, the -//! `run_ssa_taint_full` entry point, and the callee-resolution pipeline. -//! -//! # Cache key scope and origin attribution -//! -//! The inline-analysis cache below ([`InlineCache`]) is keyed by -//! `(FuncKey, ArgTaintSig)`, where [`ArgTaintSig`] encodes **per-arg -//! capability bits only** — not the identity of the source -//! [`crate::taint::domain::TaintOrigin`]s that produced those caps. The -//! stored value ([`CachedInlineShape`]) captures **only the structural** -//! shape of the callee's return taint: return caps, callee-internal -//! origins (from `Source` ops inside the callee body), and per-parameter -//! provenance flags that record which formal parameters contributed to -//! the return. 
Caller-specific origin identity is *not* stored — it is -//! re-attributed at cache-apply time from the current call site's -//! argument taint. +//! The cache ([`InlineCache`]) is keyed by `(FuncKey, ArgTaintSig)`, +//! where [`ArgTaintSig`] is per-arg cap bits only (not origin identity). +//! Stored values ([`CachedInlineShape`]) capture the structural shape of +//! the callee's return taint; caller-specific origins are re-attributed +//! at apply time. use crate::labels::Cap; use crate::ssa::ir::{SsaBody, Terminator}; @@ -42,61 +18,30 @@ use std::collections::HashMap; /// Maximum SSA blocks in a callee body before skipping inline analysis. pub(super) const MAX_INLINE_BLOCKS: usize = 500; -/// Compact cache key: per-arg-position cap bits (sorted, non-empty only). -/// -/// Two calls with identical `ArgTaintSig` produce identical inline results -/// for soundness purposes (return caps, callee-internal sink activations). -/// Origin identity is **not** part of the key — see the module-level note -/// above on origin-attribution non-determinism. +/// Compact cache key: per-arg-position cap bits (sorted, non-empty +/// only). Origin identity is not part of the key. #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub(crate) struct ArgTaintSig(pub(super) SmallVec<[(usize, u16); 4]>); -/// Call-site-adapted result of inline-analyzing a callee. -/// -/// Constructed fresh per call site by `apply_cached_shape` from a stored -/// [`CachedInlineShape`]; carries origins that point to the *current* -/// caller's source chain, not to whichever caller first populated the -/// cache entry. +/// Call-site-adapted result of inline-analyzing a callee. Built fresh +/// per call site so origins point to the current caller's chain. #[derive(Clone, Debug)] pub(crate) struct InlineResult { - /// Taint on the return value after inline analysis. pub(super) return_taint: Option, - /// PathFact on the return value after inline analysis. 
- /// - /// Non-top when the callee's body provably narrows the - /// [`crate::abstract_interp::PathFact`] of the value it returns (for - /// example, a `sanitize_path(s) -> Option` helper that - /// early-returns on `s.contains("..")` / `s.starts_with('/')`). At - /// apply time the caller sets its call-result SSA value's PathFact to - /// this narrowed fact, so downstream FILE_IO sinks see the sanitised - /// axis regardless of whether a named label-rule exists for the - /// helper. Top when the callee produces no narrowing — matches - /// pre-PathFact behaviour exactly. + /// PathFact on the return value. Non-top when the callee body + /// provably narrows it (e.g. a `sanitize_path` early-returning on + /// `s.contains("..")`). pub(super) return_path_fact: crate::abstract_interp::PathFact, - /// Per-return-path decomposition of [`Self::return_path_fact`]. - /// - /// Non-empty when the callee has ≥2 distinct return blocks whose - /// predicate gates differ. Match-arm-sensitive callers pick the - /// entry whose `variant_inner_fact` matches the arm binding's - /// variant; path-resolvable callers may refuse infeasible entries. - /// Callers unable to distinguish paths still consult - /// [`Self::return_path_fact`] (the join of all entries) and see - /// pre-decomposition behaviour. + /// Per-return-path decomposition of `return_path_fact`. Non-empty + /// when the callee has ≥2 return blocks with different predicate + /// gates. #[allow(dead_code)] pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>, } -/// Structural (callsite-agnostic) summary of an inline-analyzed callee. -/// -/// Stored in [`InlineCache`] in place of a fully-attributed `InlineResult`. -/// Origin-identity information that depends on the caller's argument chain -/// is *not* kept here; instead, [`ReturnShape::param_provenance`] -/// records which callee parameter positions contributed seed taint to the -/// return, and the actual caller origins are re-unioned in at apply time. 
-/// -/// `None` means "this callee produced no return taint for the given -/// argument shape". A cached `None` is still a meaningful result — it -/// short-circuits re-analysis on subsequent calls with matching caps. +/// Structural (callsite-agnostic) summary of an inline-analyzed +/// callee. `None` means "no return taint for this arg shape", still +/// meaningful, short-circuits subsequent calls with matching caps. #[derive(Clone, Debug)] pub(crate) struct CachedInlineShape(pub(super) Option); @@ -107,7 +52,7 @@ pub(crate) struct CachedInlineShape(pub(super) Option); /// origins. See the module-level note above on origin attribution. #[derive(Clone, Debug)] pub(crate) struct ReturnShape { - /// Return value caps (cap bits only — structural). + /// Return value caps (cap bits only, structural). pub(super) caps: Cap, /// Origins produced **inside the callee body** (e.g. `Source` op fired /// in the callee). `node` is set to a placeholder; at apply time the @@ -115,31 +60,19 @@ pub(crate) struct ReturnShape { /// stable (from the callee CFG) and preserved as-is. pub(super) internal_origins: SmallVec<[TaintOrigin; 2]>, /// Bit i set = callee's `Param(i)` seed taint reached the return value. - /// At apply time, caller's argument origins at matching positions are - /// unioned into the applied `VarTaint`. Params beyond index 63 are - /// dropped (matching `SmallBitSet` semantics); the capped case is rare - /// and still yields cap-correct results. + /// At apply time, caller arg origins at matching positions are + /// unioned into the applied `VarTaint`. Params beyond 63 are + /// dropped (matches `SmallBitSet`); rare and still cap-correct. pub(super) param_provenance: u64, - /// Whether the receiver (`SelfParam`) seed taint flowed to the return. + /// Whether the receiver (`SelfParam`) seed taint flowed to return. pub(super) receiver_provenance: bool, - /// Whether the applied `VarTaint` should be tagged `uses_summary`. 
pub(super) uses_summary: bool, - /// PathFact of the return value observed from the callee's exit - /// abstract state. Cache-safe because the callee is inline-analysed - /// with [`crate::abstract_interp::PathFact::top`] Param seeds — the - /// resulting fact describes the callee's intrinsic narrowing (e.g. - /// the `Some` arm of a `sanitize(..) -> Option` body - /// proves `dotdot = No`) and does not depend on caller-side - /// narrowing of the argument's PathFact. Top when the callee does - /// not narrow. + /// PathFact of the return value, observed from the callee exit + /// state under Top-seeded Params. Describes the callee's intrinsic + /// narrowing. pub(super) return_path_fact: crate::abstract_interp::PathFact, - /// Per-return-path [`PathFact`] decomposition of the return value. - /// - /// Populated alongside [`Self::return_path_fact`] when the callee - /// has ≥2 distinct return blocks with different predicate gates. - /// Cache-safe for the same reason as `return_path_fact`: entries - /// describe callee-intrinsic narrowing under Top-seeded Params. - /// Empty when no per-path distinction was observed. + /// Per-return-path decomposition of the return value. Populated + /// when the callee has ≥2 return blocks with different predicates. pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>, } @@ -151,50 +84,21 @@ impl CachedInlineShape { } } -/// Cache for context-sensitive inline analysis results. -/// -/// Keyed by the callee's canonical [`FuncKey`] rather than a bare function -/// name so that same-name definitions (e.g. two `process/1` methods on -/// different classes in the same file) never share or overwrite each -/// other's cache entries. Values are stored as [`CachedInlineShape`]; see -/// the module-level note above for why origins are stripped from the -/// cache value and re-attributed at apply time. 
+/// Cache for context-sensitive inline analysis results, keyed by +/// canonical [`FuncKey`] so same-name definitions in different scopes +/// never collide. pub(crate) type InlineCache = HashMap<(FuncKey, ArgTaintSig), CachedInlineShape>; -/// Drop every entry from an inline cache, marking the start of a new -/// convergence epoch. -/// -/// Cross-file SCC fixed-point iteration runs pass 2 repeatedly until the -/// merged summaries stop changing. Between iterations the callee-summary -/// inputs to inline analysis may have changed, so results cached under a -/// stale snapshot must not leak into the next iteration — otherwise the -/// engine could converge to a non-fixed-point (reporting a taint result -/// that would not reproduce on a fresh run of the same file order). -/// -/// The per-file inline cache is already reconstructed fresh at the top of -/// each [`crate::taint::analyse_file`] call, so in the current code this -/// call is effectively a no-op plumbing hook. Keeping the method (instead -/// of relying on ambient re-construction) makes the lifecycle explicit for -/// any future refactor that moves the cache up into the SCC orchestrator. -#[allow(dead_code)] // semantic hook; used by tests and future shared-cache refactor +/// Drop every entry from the inline cache between SCC fixpoint +/// iterations so stale results don't leak forward. +#[allow(dead_code)] pub(crate) fn inline_cache_clear_epoch(cache: &mut InlineCache) { cache.clear(); } -/// Set-equal fingerprint of an inline cache, used by the SCC orchestrator -/// to detect when cross-file inline analysis has reached a fixed point -/// alongside summary convergence. -/// -/// Returns a `HashMap` mapping each `(FuncKey, ArgTaintSig)` cache key to -/// the return-value capability bits of its inline result. `HashMap` -/// equality is set-equal (unordered), so two caches with the same entries -/// compare equal regardless of insertion order. 
-/// -/// Origins are intentionally omitted — they are non-deterministic across -/// callers with identical caps (see the module-level note on origin -/// attribution) and would cause the fingerprint to oscillate without -/// reflecting a real precision change. -#[allow(dead_code)] // observability hook; used by tests and future shared-cache refactor +/// Set-equal fingerprint of the inline cache, used by the SCC +/// orchestrator to detect convergence. +#[allow(dead_code)] pub(crate) fn inline_cache_fingerprint( cache: &InlineCache, ) -> HashMap<(FuncKey, ArgTaintSig), u16> { @@ -206,24 +110,11 @@ pub(crate) fn inline_cache_fingerprint( /// CFG node metadata embedded in cross-file callee bodies. /// -/// ## Why a full [`crate::cfg::NodeInfo`] lives here -/// -/// An earlier variant carried only the two fields the symex executor reads -/// (`bin_op`, `labels`). That was sufficient for symex but not for the -/// taint engine, which reads ~20 fields off `cfg[inst.cfg_node]` across -/// `transfer_inst`, `collect_block_events`, `compute_succ_states`, and -/// helpers (callee name, `arg_uses`, `arg_callees`, `call_ordinal`, -/// `outer_callee`, `kwargs`, `arg_string_literals`, `ast.span`, -/// `ast.enclosing_func`, `condition_*`, `all_args_literal`, `catch_param`, -/// `parameterized_query`, `in_defer`, `cast_target_type`, `string_prefix`, -/// `taint.uses`, `taint.defines`, `taint.extra_defines`, -/// `taint.const_text`, …). Rather than shuttling each of those through a -/// `CfgView` accessor at every callsite, we store a full serde-able -/// [`crate::cfg::NodeInfo`] snapshot here so the indexed-scan path can -/// rehydrate an equivalent `Cfg` on load (see [`rebuild_body_graph`]). -/// Both scan paths then feed the same `&Cfg` into the taint engine, and -/// cross-file inline fires regardless of whether the body came from pass -/// 1 or from SQLite. 
+/// Stores a full serde-able [`crate::cfg::NodeInfo`] snapshot rather +/// than projecting individual fields, so the indexed-scan path can +/// rehydrate an equivalent `Cfg` (see [`rebuild_body_graph`]) and feed +/// the same `&Cfg` into the taint engine regardless of whether the +/// body came from pass 1 or SQLite. #[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct CrossFileNodeMeta { /// Full `NodeInfo` snapshot for this body-local NodeIndex. @@ -268,7 +159,7 @@ pub fn populate_node_meta(body: &mut CalleeSsaBody, cfg: &crate::cfg::Cfg) -> bo // `compute_succ_states` via `cfg[*cond]`, so without it the synthesized // cross-file proxy CFG (`rebuild_body_graph`) ends up too small whenever // the callee body has any conditional branch whose `cond` index sits - // past the maximum `inst.cfg_node` index — inline analysis then panics + // past the maximum `inst.cfg_node` index, inline analysis then panics // with an out-of-bounds index. let mut referenced: Vec = Vec::new(); for block in &body.ssa.blocks { @@ -320,7 +211,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool { // index. We fill any unreferenced intermediate indices with // `NodeInfo::default()`. // - // Walks both instruction `cfg_node`s and `Terminator::Branch.cond` — + // Walks both instruction `cfg_node`s and `Terminator::Branch.cond` , // the latter is read by `compute_succ_states` via `cfg[*cond]`, so // missing it produces an OOB panic when a conditional branch's cond // node has a higher index than any `inst.cfg_node` in the body. @@ -339,7 +230,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool { } } } - // Also consider node_meta keys — they should be a subset of the + // Also consider node_meta keys, they should be a subset of the // SSA-referenced indices, but be defensive. 
for &k in body.node_meta.keys() { if k > max_idx { diff --git a/src/taint/ssa_transfer/mod.rs b/src/taint/ssa_transfer/mod.rs index 59eec29a..f844e7ea 100644 --- a/src/taint/ssa_transfer/mod.rs +++ b/src/taint/ssa_transfer/mod.rs @@ -46,6 +46,7 @@ use crate::interop::InteropEdge; use crate::labels::{Cap, DataLabel, RuntimeLabelRule, SourceKind}; use crate::ssa::heap::{HeapObjectId, HeapSlot, PointsToResult, PointsToSet}; use crate::ssa::ir::*; +use crate::ssa::type_facts::InputValidatorPolarity; use crate::state::lattice::Lattice; use crate::state::symbol::SymbolInterner; use crate::summary::{CalleeQuery, CalleeResolution, GlobalSummaries, SinkSite}; @@ -95,7 +96,7 @@ pub struct SsaTaintTransfer<'a> { pub param_seed: Option<&'a [Option]>, /// Per-call-site receiver seed for context-sensitive inline /// analysis. Mirrors [`Self::param_seed`] for [`SsaOp::SelfParam`] - /// reads — seeds the callee's implicit `this` / `self` slot with + /// reads, seeds the callee's implicit `this` / `self` slot with /// the caller's method-receiver taint. pub receiver_seed: Option<&'a VarTaint>, /// Per-SSA-value constant lattice from constant propagation. @@ -107,13 +108,13 @@ pub struct SsaTaintTransfer<'a> { /// Precise per-function SSA summaries for intra-file callee resolution. /// Checked before legacy FuncSummary resolution. /// - /// Keyed by canonical [`FuncKey`] — never bare function name — so + /// Keyed by canonical [`FuncKey`], never bare function name, so /// same-name functions in the same file cannot silently overwrite one /// another. pub ssa_summaries: Option<&'a HashMap>, /// Extra label rules from user config (custom sources/sanitizers/sinks). /// Used as fallback when `resolve_callee` finds no summary for an inner - /// arg callee — so label-only sanitizers still reduce sink caps. + /// arg callee, so label-only sanitizers still reduce sink caps. pub extra_labels: Option<&'a [RuntimeLabelRule]>, /// Pre-lowered + optimized SSA bodies for intra-file functions. 
/// When present, enables context-sensitive inline analysis at call sites. @@ -181,7 +182,7 @@ pub struct SsaTaintTransfer<'a> { /// non-cross-file behaviour for unit tests and non-cross-file /// construction sites. pub cross_file_bodies: Option<&'a HashMap>, - /// Pointer-Phase 3: per-body field-sensitive points-to facts. + /// per-body field-sensitive points-to facts. /// Populated only when [`crate::pointer::is_enabled()`]. When /// present, [`SsaOp::FieldProj`] reads consult /// [`SsaTaintState::field_taint`] for each `loc ∈ pt(receiver)`, @@ -292,7 +293,7 @@ fn run_ssa_taint_internal( // Static-map seeding is intentionally NOT fused into the // AbstractState here. A blanket `StringFact::finite_set` would // compose with `StringFact::exact` facts emitted by - // `transfer_abstract` for every string literal — and downstream + // `transfer_abstract` for every string literal, and downstream // suppression logic can't distinguish "single-literal exact" // from "multi-literal bounded lookup". Instead the sink check // consults `transfer.static_map` directly via the dedicated @@ -445,7 +446,7 @@ fn run_ssa_taint_internal( // Post-hoc origin-truncation detection. If any converged block state // has a `VarTaint` whose origin list reached the cap, assume at least // one origin was dropped during the fixed-point iteration. Coarse - // but useful signal — `merge_origins` already emits the precise-count + // but useful signal, `merge_origins` already emits the precise-count // note on the merge path; this complements push sites inside transfer. 
let cap = effective_max_origins(); let mut saturated = 0u32; @@ -657,7 +658,7 @@ fn is_simple_increment(ssa: &SsaBody, inc_val: SsaValue, phi_val: SsaValue) -> b for inst in &block.body { if inst.value == inc_val { if let SsaOp::Assign(ref uses) = inst.op { - // Pattern: assign([phi_val, const_val]) — simple binary op + // Pattern: assign([phi_val, const_val]), simple binary op if uses.len() == 2 && uses.contains(&phi_val) { let other = if uses[0] == phi_val { uses[1] } else { uses[0] }; // Check if the other operand is a constant @@ -960,7 +961,7 @@ fn compute_succ_states( transfer.interner, ); - // PathFact branch narrowing — language-agnostic. The + // PathFact branch narrowing, language-agnostic. The // text-level rejection patterns recognised by // `classify_path_rejection_atom` cover the common idioms // across all 10 supported languages: @@ -1012,13 +1013,42 @@ fn compute_succ_states( ); } + // Generic input-validator branch narrowing. Recognises the + // two-statement idiom + // `const err = validate(x); if (err) throw …;` + // (also `if (!isValid(x)) throw`), kinds the predicate + // classifier returns Unknown / NullCheck / ErrorCheck for + // because the if-condition is a bare result variable, not a + // direct call expression. The narrowing only fires when + // the condition has exactly one variable and that + // variable's reaching SSA def is a Call to a callee + // recognised by `classify_input_validator_callee`. + // + // Motivated by Novu CVE GHSA-4x48-cgf9-q33f + // (`const ssrfError = await validateUrlSsrf(child.webhookUrl); + // if (ssrfError) throw …;`). 
+ if matches!( + kind, + PredicateKind::Unknown | PredicateKind::NullCheck | PredicateKind::ErrorCheck + ) { + apply_input_validator_branch_narrowing( + &mut true_state, + &mut false_state, + cond_text, + &cond_info.condition_vars, + ssa, + block.id, + transfer.interner, + ); + } + // Constraint refinement // // `lower_condition` returns a ConditionExpr that represents the // full semantic condition (it already applies `condition_negated` // internally). The true branch is where the condition holds // (polarity=true), the false branch is where it doesn't - // (polarity=false). We do NOT reuse `effective_negated` here — + // (polarity=false). We do NOT reuse `effective_negated` here , // that variable incorporates `has_semantic_negation` which is a // predicate-system concern, not a constraint-system concern. if true_state.path_env.is_some() || false_state.path_env.is_some() { @@ -1056,13 +1086,13 @@ fn compute_succ_states( // Contradiction pruning. // // Two sources of contradiction: - // (a) `predicates` — a known_true and known_false bit + // (a) `predicates`, a known_true and known_false bit // set for the same predicate kind on the same // symbol. This is genuine: prior branches asserted // conflicting truth values about the same predicate, // so the joined branch is unreachable. Reset the // branch state to bot. - // (b) `path_env.is_unsat()` — the constraint solver's + // (b) `path_env.is_unsat()`, the constraint solver's // interval / nullability domain proved the branch // infeasible. Empirically the constraint refinement // can over-prune branches whose feasibility hinges @@ -1072,7 +1102,7 @@ fn compute_succ_states( // caps land on the destination). In those cases // resetting the data state to bot drops legitimate // taint flow that travels through the surviving - // branch — see CVE-2024-31450's + // branch, see CVE-2024-31450's // `if err := …Decode(emoji); err != nil { return }` // shape. 
// @@ -1080,7 +1110,7 @@ fn compute_succ_states( // reset to bot when the contradiction is in `predicates`. // For path_env-only unsat, drop path_env (treat as Top // for downstream path-sensitive reasoning) and keep the - // rest of the state — values, field_taint, heap, + // rest of the state, values, field_taint, heap, // predicates, validated_*, abstract_state. let true_pred_contra = true_state .predicates @@ -1103,7 +1133,7 @@ fn compute_succ_states( smallvec::smallvec![(*true_blk, true_state), (*false_blk, false_state),] } else { - // Non-If condition or no condition vars — uniform propagation + // Non-If condition or no condition vars, uniform propagation smallvec::smallvec![ (*true_blk, exit_state.clone()), (*false_blk, exit_state.clone()), @@ -1113,7 +1143,7 @@ fn compute_succ_states( Terminator::Goto(_) => { // `block.succs` is authoritative. The terminator target records // the single logical successor (or the first of a collapsed - // ≥3-way fanout — see src/ssa/lower.rs `three_successor_collapse`). + // ≥3-way fanout, see src/ssa/lower.rs `three_successor_collapse`). // Propagating only the terminator target would drop flow to the // other successors; iterate `succs` instead so every downstream // block receives the exit state. @@ -1173,7 +1203,7 @@ fn apply_branch_predicates( } } - // ShellMetaValidated: inverted polarity — the FALSE branch (no metachar + // ShellMetaValidated: inverted polarity, the FALSE branch (no metachar // found) is the validated path; the TRUE branch is the rejection path. 
if kind == PredicateKind::ShellMetaValidated && !polarity { for var in condition_vars { @@ -1221,8 +1251,8 @@ fn apply_branch_predicates( /// /// Walks `cond_info.condition_vars` to locate the SSA value bound to the /// condition's `err`/result variable, finds the SsaInst that defined that -/// value, and — if the defining op is a [`SsaOp::Call`] to a -/// [`crate::ssa::type_facts::is_int_producing_callee`] — copies the call's +/// value, and, if the defining op is a [`SsaOp::Call`] to a +/// [`crate::ssa::type_facts::is_int_producing_callee`], copies the call's /// argument variable names into `validated_must` / `validated_may` on the /// `err == null` branch. /// @@ -1292,7 +1322,7 @@ fn apply_validation_err_check_narrowing( return; } // Collect candidate input arg variable names: every SSA value across - // every positional arg group, looked up by var_name. Conservative — + // every positional arg group, looked up by var_name. Conservative , // we mark *all* of them validated rather than guessing which arg the // validator narrows. The validators we recognise here // (`strconv.Atoi`, `parseInt`, `ParseFloat`, …) all take exactly one @@ -1327,6 +1357,136 @@ fn apply_validation_err_check_narrowing( } } +/// Mark the input arguments of a generic input-validator helper as +/// validated on the success branch of a downstream truthiness check. 
+/// +/// Recognised idioms: +/// +/// ```text +/// // ErrorReturning (Novu CVE GHSA-4x48-cgf9-q33f) +/// const err = validateUrlSsrf(child.webhookUrl); +/// if (err) throw …; +/// // → child.webhookUrl is validated on the falsy (false) branch +/// +/// // BooleanTrueIsValid +/// const ok = isValidPath(p); +/// if (!ok) throw …; +/// // → p is validated on the !ok==false (true value of ok) branch +/// ``` +/// +/// Resolves `condition_vars[0]` to its reaching SSA def, checks that +/// the def is a [`SsaOp::Call`] to a callee classified by +/// [`classify_input_validator_callee`], and copies the call's input +/// argument variable names into `validated_must`/`validated_may` on +/// the branch the validator's polarity says succeeded. +/// +/// The branch direction starts from `cond_text` (uses the same +/// `success_branch_is_true` heuristics as +/// [`apply_validation_err_check_narrowing`]) and is then flipped for +/// `BooleanTrueIsValid` validators (a truthy result means "valid", so +/// the *true* branch carries the validation). +/// +/// Strict-additive: when no condition var matches, the def isn't a +/// Call, the callee isn't a recognised validator, or no arg has an +/// SSA-level var_name, the function is a no-op. +fn apply_input_validator_branch_narrowing( + true_state: &mut SsaTaintState, + false_state: &mut SsaTaintState, + cond_text: &str, + condition_vars: &[String], + ssa: &SsaBody, + block: BlockId, + interner: &SymbolInterner, +) { + if condition_vars.len() != 1 { + return; + } + + let result_name = condition_vars[0].as_str(); + let result_val = match resolve_var_to_ssa_value(result_name, ssa, block) { + Some(v) => v, + None => return, + }; + + let def_inst = ssa + .blocks + .iter() + .flat_map(|b| b.body.iter()) + .find(|i| i.value == result_val); + let Some(def_inst) = def_inst else { return }; + + let SsaOp::Call { + ref callee, + ref args, + .. 
+ } = def_inst.op + else { + return; + }; + + let polarity = match crate::ssa::type_facts::classify_input_validator_callee(callee.as_str()) { + Some(p) => p, + None => return, + }; + + // Determine the success branch. + // + // Default: bare `if (X)` truthy-test → success is the FALSE branch + // for ErrorReturning (X truthy means "error"), and the TRUE branch + // for BooleanTrueIsValid (X truthy means "valid"). + // + // Equality checks (`X === null`, `X == null`, etc.) flip the + // truthiness sense, match the same set of patterns + // `apply_validation_err_check_narrowing` uses for the `err == nil` + // family. + let lower = cond_text.to_ascii_lowercase(); + let cond_text_says_null_branch_is_true = lower.contains("== nil") + || lower.contains("== none") + || lower.contains("is none") + || lower.contains("is_ok") + || lower.contains("=== null") + || lower.contains("== null"); + + let success_branch_is_true = match polarity { + InputValidatorPolarity::ErrorReturning => cond_text_says_null_branch_is_true, + InputValidatorPolarity::BooleanTrueIsValid => !cond_text_says_null_branch_is_true, + }; + + // Collect candidate input-arg variable names. Conservative, every + // SSA value across every positional arg group, looked up by + // var_name, OR'd into validated_*. Validators usually take one + // primary arg so this collects ≤ 1 name in practice. 
+ let mut arg_names: SmallVec<[String; 2]> = SmallVec::new(); + for arg_group in args { + for &v in arg_group { + if let Some(name) = ssa + .value_defs + .get(v.0 as usize) + .and_then(|vd| vd.var_name.as_deref()) + { + if !arg_names.iter().any(|s: &String| s == name) { + arg_names.push(name.to_string()); + } + } + } + } + if arg_names.is_empty() { + return; + } + + let success_state = if success_branch_is_true { + true_state + } else { + false_state + }; + for name in &arg_names { + if let Some(sym) = interner.get(name) { + success_state.validated_may.insert(sym); + success_state.validated_must.insert(sym); + } + } +} + /// Find the latest reaching SSA definition for `var_name` at the end of /// `block`. Mirrors `crate::constraint::lower::resolve_single_var` but /// avoids the cross-module privacy leak: callers in this module need it @@ -1405,7 +1565,7 @@ fn apply_path_fact_branch_narrowing_with_interner( // pattern fires. Mirrors the AllowlistCheck quirk that already // marks validated on the rejection-arm via `apply_branch_predicates` // for languages whose `.contains(...)` / membership idiom hits the - // AllowlistCheck classifier — but normalises behaviour for shapes + // AllowlistCheck classifier, but normalises behaviour for shapes // like C `strstr(path, "..") != NULL` that hit the NullCheck arm // first and never get a chance to mark validation through the // allowlist path. Once the path-rejection classifier has accepted @@ -1425,7 +1585,7 @@ fn apply_path_fact_branch_narrowing_with_interner( // Collect SSA values whose `var_name` appears in `effective_vars`. We // pick the *highest-index* matching value (latest definition by SSA - // ordering — closest to the current program point). Absent an + // ordering, closest to the current program point). Absent an // explicit name table, iterating `ssa.value_defs` is the only way to // recover the mapping from name → SsaValue. 
let mut targets: smallvec::SmallVec<[SsaValue; 2]> = smallvec::SmallVec::new(); @@ -1447,7 +1607,7 @@ fn apply_path_fact_branch_narrowing_with_interner( // Apply rejection: true branch = reject (widen to Top / leave alone), // false branch = narrow the axis. The plan's polarity rule about // whether the enclosing block inherits the narrowing when the true - // branch terminates is enforced by the existing CFG successor graph — + // branch terminates is enforced by the existing CFG successor graph: // when the true branch returns/panics, only the false state reaches // subsequent blocks and the narrowed fact propagates naturally. let narrow_false = |fact: &mut PathFact| { @@ -1566,7 +1726,7 @@ fn inline_analyse_callee( // Resolve the call site to a canonical FuncKey and the body to inline. // Step 1: intra-file. Step 2: cross-file. // - // Without a resolved key we cannot inline safely — bare-name lookup could + // Without a resolved key we cannot inline safely, bare-name lookup could // pick the wrong same-name sibling (e.g. `A::process/1` vs `B::process/1`). let normalized = callee_leaf_name(callee); let container_raw = callee_container_hint(callee); @@ -1630,7 +1790,7 @@ fn inline_analyse_callee( // synthesizes a proxy `Cfg` from `node_meta` so the taint // engine can index `cfg[inst.cfg_node]` uniformly. A // body that still has neither a real graph nor any - // rehydrated metadata is structurally unusable — skip it. + // rehydrated metadata is structurally unusable, skip it. if body.body_graph.is_none() { tracing::debug!( callee = %normalized, @@ -1667,7 +1827,7 @@ fn inline_analyse_callee( let sig = build_arg_taint_sig(args, receiver, state); // Check cache (keyed by FuncKey + arg signature). The cached value - // is a structural shape — re-attribute origins to the current call + // is a structural shape, re-attribute origins to the current call // site before returning so two callers with matching caps but // different origins see their own source chains.
{ @@ -1690,7 +1850,7 @@ fn inline_analyse_callee( // `Param { index }` read picks up slot `index` directly via // `SsaTaintTransfer::param_seed`. Receiver taint is carried on a // separate channel (`SsaTaintTransfer::receiver_seed`) consumed by - // `SelfParam`. Name-based keying is not needed here — the callee + // `SelfParam`. Name-based keying is not needed here, the callee // analysis is scoped to this one call site and cannot merge with // another callee's param seed. @@ -1700,7 +1860,7 @@ fn inline_analyse_callee( // own `cfg_node` and preserves only `source_span`, so without this // pre-fill cross-file inline would lose the caller's source line // entirely (finding emission in `ast.rs` uses `source_span` first, - // falls back to indexing the caller's CFG at `node` — which is now + // falls back to indexing the caller's CFG at `node`, which is now // the callee's NodeIndex and resolves to a wrong or missing span). let populate_span = |mut o: TaintOrigin| -> TaintOrigin { if o.source_span.is_none() { @@ -1804,8 +1964,8 @@ fn inline_analyse_callee( // per-file [`InlineCache`] is reused across all iterations of the // pass-2 convergence loop in `taint::mod::analyse_multi_body`; the // cache is keyed by `(FuncKey, ArgTaintSig)` only, so if the - // inlined callee could read from a caller's `global_seed` — which - // is refined each round — the same cache key could map to two + // inlined callee could read from a caller's `global_seed`, which + // is refined each round, the same cache key could map to two // different return shapes across rounds, producing a // non-reproducible fixed point. // @@ -1848,8 +2008,8 @@ fn inline_analyse_callee( cross_file_bodies: transfer.cross_file_bodies, // Inline analysis re-lowers the callee in its own body-local // location space; pointer facts are body-relative, so we don't - // forward the caller's facts. Phase 5's `PointsToSummary` is - // the cross-call substitute. + // forward the caller's facts. 
`PointsToSummary` is the + // cross-call substitute. pointer_facts: None, }; @@ -1920,7 +2080,7 @@ struct CalleeParamNodeBits { /// (propagated through a `Param`/`SelfParam` op; its `node` points at the /// callee's Param NodeIndex). /// -/// Caller-seeded origins are *not* baked into the cached shape — their +/// Caller-seeded origins are *not* baked into the cached shape, their /// identity depends on the caller's argument chain, which varies across call /// sites with matching cap signatures. Instead, the origin position is /// recorded as a bit in [`ReturnShape::param_provenance`] (or the @@ -1974,7 +2134,7 @@ fn extract_inline_return_taint( } // Callee-internal origins carry their span from the callee CFG (lazily - // filled when missing) but have `node` set to a placeholder — the + // filled when missing) but have `node` set to a placeholder, the // applying call site fills in its own call-site NodeIndex via // `apply_cached_shape`. // @@ -2104,7 +2264,7 @@ fn extract_inline_return_taint( // return blocks the entry state's AbstractState has already // been diluted by the join, so we additionally replay // `transfer_block` once per predecessor seeded from that - // predecessor's `block_exit_states` entry — yielding a + // predecessor's `block_exit_states` entry, yielding a // predecessor-specific exit whose PathFact on `rv` still // carries that path's narrowing. The per-predecessor facts // are then joined to describe the callee-intrinsic @@ -2161,7 +2321,7 @@ fn extract_inline_return_taint( // `return_path_fact`. When the rv is a one-arg variant // constructor (structurally: upper-camel-case leaf, 1 arg, // no receiver), the *inner* fact is what a destructuring - // caller would see on the match-bound variable — the outer + // caller would see on the match-bound variable, the outer // variant-wrapper fact is semantically irrelevant because // `Option` / `Result` / `Box` // values are not themselves path values. 
Summary-level @@ -2266,7 +2426,7 @@ fn extract_inline_return_taint( // Only keep per-return-path entries when at least one entry carries // meaningful signal (non-Top path_fact or a variant_inner_fact). A // list of all-Top entries adds bytes on disk without helping a - // caller pick a path. Additionally require ≥2 distinct entries — + // caller pick a path. Additionally require ≥2 distinct entries: // a single-entry list is no finer than the joined `return_path_fact`. let return_path_facts = if per_return_path_entries.len() >= 2 && per_return_path_entries @@ -2303,20 +2463,20 @@ fn extract_inline_return_taint( })) } -/// Structural predicate: does `rv` represent a "non-data" return — +/// Structural predicate: does `rv` represent a "non-data" return, /// a value that cannot carry path-typed content on this return path? /// /// Recognises the common failure-arm idioms without hard-coding /// specific identifier names: /// * [`SsaOp::Const`] whose text is a recognised nullary tag -/// (`None`, `null`, `nil`, `NULL`, `()`, `Err`, `Nothing`, …) — +/// (`None`, `null`, `nil`, `NULL`, `()`, `Err`, `Nothing`, …): /// tree-sitter-rust emits `None` as a constant path identifier /// rather than a call; across other languages `null` / `nil` /// cover the equivalents. /// * [`SsaOp::Call`] with *zero* arguments and no receiver whose /// callee leaf segment looks like a Rust-grammar variant / /// struct constructor (ASCII upper-case start, alphanumeric / -/// underscore body) — covers user-defined nullary variants like +/// underscore body), covers user-defined nullary variants like /// `Nothing` or `Default` without naming them. Zero-arg /// constructors carry no attacker-controlled content by /// definition, so they are provably not a path-typed payload. @@ -2334,7 +2494,7 @@ fn is_non_data_return(rv: SsaValue, ssa: &SsaBody) -> bool { match &inst.op { SsaOp::Const(Some(text)) => { // Match the nullary sentinels used across the - // supported languages.
Intentionally narrow — + // supported languages. Intentionally narrow: // any non-sentinel constant may be a path // literal that must participate in the join. let trimmed = text.trim(); @@ -2379,7 +2539,7 @@ fn is_non_data_return(rv: SsaValue, ssa: &SsaBody) -> bool { /// * `rv` is defined by [`SsaOp::Call`] in `ssa`; /// * the call's callee leaf segment is a Rust-grammar variant / type /// constructor (upper-camel-case start, alphanumeric/underscore -/// tail — see +/// tail, see /// [`crate::abstract_interp::path_domain::is_structural_variant_ctor`]); /// * the call has no receiver and exactly one positional argument /// group whose size is 1 (a single SSA value); @@ -2416,7 +2576,7 @@ pub(super) fn detect_variant_inner_fact( // Single positional argument in the first group. SSA // lowering appends an implicit chained-call uses group // after the positional ones, so we cannot read positional - // arity from `args.len()` alone — however the *first* + // arity from `args.len()` alone, however the *first* // group still captures the positional arg 0's contributing // SsaValues. Join PathFacts across every value in that // group so chained inner calls (`Some(s.to_string())` @@ -2560,8 +2720,8 @@ fn apply_cached_shape( } } -/// Pointer-Phase 5 / W3: apply a callee's [`FieldPointsToSummary`] field -/// writes at a caller call site. +/// Apply a callee's [`FieldPointsToSummary`] field writes at a caller +/// call site. /// /// For each `(param_idx, field_names)` in /// [`FieldPointsToSummary::param_field_writes`], substitute the callee @@ -2569,20 +2729,20 @@ fn apply_cached_shape( /// argument's taint into each `(loc, field_id)` cell on the caller's /// `field_taint`. /// -/// * `param_idx == u32::MAX` is the receiver sentinel — resolve via +/// * `param_idx == u32::MAX` is the receiver sentinel, resolve via /// the call's `receiver` SsaValue rather than positional args.
/// * `field_name == ""` translates to [`FieldId::ELEM`] without -/// going through the caller's interner — matches the wire-format +/// going through the caller's interner, matches the wire-format /// convention from /// [`crate::pointer::extract_field_points_to`]. /// * Any other field name is *looked up* (read-only) in the caller's /// [`FieldInterner`]. Names the caller never referenced are skipped -/// — no FieldProj read in the caller could observe such a cell. +/// because no FieldProj read in the caller could observe such a cell. /// * `pt(arg)` saturated to `{Top}` is conservatively skipped (matches /// the W1/W2 hooks' over-approximation policy). /// /// Strict-additive: when [`FieldPointsToSummary::overflow`] is `true` -/// the helper does nothing — the conservative interpretation is "every +/// the helper does nothing, the conservative interpretation is "every /// param touches every field on every other param", which would /// require a body-wide field cell flood the lattice cannot /// efficiently represent. The bit is informational; consumers @@ -2675,7 +2835,7 @@ fn apply_field_points_to_writes( /// cells and: /// /// * Unions their `taint.caps` into the call result's value taint -/// (additive — preserves any caps already set by upstream +/// (additive, preserves any caps already set by upstream /// `try_container_propagation` / heap analysis). /// * AND-intersects the cells' `validated_must`; OR-unions /// `validated_may`; seeds the call result's symbol-level bits @@ -2858,29 +3018,21 @@ pub(super) fn transfer_inst( .. } => { // Excluded callees (e.g. router.get, app.post) should not propagate - // taint through their return value — they are framework scaffolding, + // taint through their return value, they are framework scaffolding, // not data-flow operations. if crate::labels::is_excluded(transfer.lang.as_str(), callee.as_bytes()) { return; } - // Pointer-Phase 4 / W2: container element-write hook. + // Container element-write hook.
Runs before other Call-arm + // processing so `try_container_propagation`'s early-return + // can't bypass us. Writes only into `(loc, ELEM)` cells on + // `field_taint`, strictly additive. // - // Run before any other Call-arm processing so the existing - // `try_container_propagation` early-return path (which fires on - // recognised container ops and `return`s once handled) cannot - // bypass us. Strict-additive: the hook only writes into the - // `(loc, ELEM)` cells on `SsaTaintState.field_taint`, never - // touches the existing per-SSA-value taint or the call result. - // - // Pointer-Phase 4 / W4: each pushed value's symbol-level - // `validated_must` / `validated_may` flow through to the - // cell. The cell records `must = AND` over args (intersect: - // every writer must be must-validated), `may = OR` over - // args. When an arg has no var_name (anonymous SSA temp), - // it contributes `false / false` and breaks the must - // invariant — matching the symbol-keyed lattice's "absent - // entry = un-validated" semantics. + // Each pushed value's `validated_must`/`validated_may` flow + // through: cell `must = AND` over args (every writer must be + // must-validated), `may = OR` over args. Anonymous SSA temps + // contribute `false/false` and break the `must` invariant. if let (Some(pf), Some(rcv)) = (transfer.pointer_facts, *receiver) { if crate::pointer::is_container_write_callee(callee) { let pt = pf.pt(rcv); @@ -2940,7 +3092,7 @@ pub(super) fn transfer_inst( // Python `requests.get`, JS `axios.get`). 
When invoked with // a hardcoded URL whose prefix passes `is_string_safe_for_ssrf` // (a fully-formed `scheme://host/path`), the developer has - // explicitly bound the endpoint at compile time — the SSRF + // explicitly bound the endpoint at compile time, the SSRF // sink suppression already trusts this prefix-lock to // silence the SSRF concern, and the same trust applies on // the source side: the response body is developer-chosen, @@ -2960,12 +3112,12 @@ pub(super) fn transfer_inst( .iter() .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SSRF))); // Detect a hardcoded URL via three channels: - // 1. `info.string_prefix` — populated by the JS/TS template- + // 1. `info.string_prefix`, populated by the JS/TS template- // literal extractor and inline call shapes. - // 2. AbstractState `StringFact` on the first positional arg — + // 2. AbstractState `StringFact` on the first positional arg, // populated by const propagation for plain string literals. // 3. As a last resort when `info.call.first_arg_text` is - // populated with a hardcoded literal — extracted at CFG + // populated with a hardcoded literal, extracted at CFG // construction time for network-fetch primitive callees. let url_prefix_safe_via_node = info .string_prefix @@ -3002,7 +3154,7 @@ pub(super) fn transfer_inst( for lbl in &info.taint.labels { if let DataLabel::Source(bits) = lbl { if url_is_hardcoded_safe { - // Skip Source propagation — see network-fetch + // Skip Source propagation, see network-fetch // source suppression rationale above. continue; } @@ -3052,7 +3204,7 @@ pub(super) fn transfer_inst( } } - // Resolve callee summary — always attempt, even when explicit + // Resolve callee summary, always attempt, even when explicit // labels are present. Labels take precedence for source caps, but // summary propagation and sanitizer behaviour must still apply // (matches legacy `apply_call()` semantics).
@@ -3068,7 +3220,7 @@ pub(super) fn transfer_inst( // Context-sensitive inline analysis: attempt before summary fallback. // Only for intra-file calls when context sensitivity is enabled. // Only claims resolution when the inline result produces non-empty - // return taint — otherwise falls through to summary for cases like + // return taint, otherwise falls through to summary for cases like // receiver-only method calls where summary propagation is needed. if transfer.inline_cache.is_some() && transfer.context_depth < 1 { if let Some(result) = @@ -3113,7 +3265,7 @@ pub(super) fn transfer_inst( let mut resolved_container_store: Vec<(usize, usize)> = Vec::new(); // Captured alongside container fields because the // callee_summary gets moved when the main taint branch takes it - // below. We only need the points_to summary itself — clone it + // below. We only need the points_to summary itself, clone it // out before the move so application can still read it. let mut resolved_points_to: crate::summary::points_to::PointsToSummary = crate::summary::points_to::PointsToSummary::empty(); @@ -3151,25 +3303,18 @@ pub(super) fn transfer_inst( resolved_container_store = resolved.param_to_container_store.clone(); resolved_points_to = resolved.points_to.clone(); - // Pointer-Phase 5 / W3: cross-call field-points-to - // application. Walk the callee's - // `field_points_to.param_field_writes`; for each - // `(param_idx, field_names)`, substitute `Param(callee, i)` - // with the caller's `pt(arg_i)` and union the caller's - // argument taint into each `(loc, field_id)` cell on the - // caller's `SsaTaintState.field_taint`. + // Cross-call field-points-to application: walk the + // callee's `field_points_to.param_field_writes`; for + // each `(param_idx, field_names)` substitute the + // callee's param with the caller's `pt(arg_i)` and + // union the caller's argument taint into each + // `(loc, field_id)` cell on `field_taint`. 
// - // Receiver flow uses sentinel `param_idx == u32::MAX`; - // resolve via the call's receiver SsaValue instead of - // positional args. Field names are looked up against the - // *caller's* `field_interner`; names the caller never - // referenced are skipped — no FieldProj read in the caller - // could observe such a cell, so writing it is wasteful. - // - // The container-element sentinel `""` translates - // to [`FieldId::ELEM`] without going through interner - // lookup, mirroring the wire-format convention - // established in `summary::points_to::FieldPointsToSummary`. + // Receiver flow uses sentinel `param_idx == u32::MAX`. + // Field names are looked up in the *caller's* + // `field_interner`, names the caller never referenced + // are skipped. The `""` sentinel translates to + // [`FieldId::ELEM`]. if let Some(pf) = transfer.pointer_facts { apply_field_points_to_writes( &resolved.field_points_to, @@ -3239,7 +3384,7 @@ pub(super) fn transfer_inst( // Fall back to whichever side is non-bottom // (meet can contradict when the callee's // baseline and the caller-side transfer - // describe disjoint facts — rare, but sound + // describe disjoint facts, rare, but sound // to widen back to the less restrictive). if m.is_bottom() { Some(synth.join(&base)) @@ -3337,7 +3482,7 @@ pub(super) fn transfer_inst( // Per-parameter application: each propagating param // contributes taint narrowed by its own per-path // sanitizer. Origins are still aggregated across - // params — they name source anchors, not transforms. + // params, they name source anchors, not transforms. let mut any_origin_added = false; for &param_idx in effective_params { let arg_caps_origins = @@ -3425,7 +3570,7 @@ pub(super) fn transfer_inst( // any cross-procedural sanitization (e.g. an interprocedural // path-traversal sanitizer whose caller also carries a label-only // sanitizer matching on callee name).
Only collect `use_caps` - // when no summary applied — that is the original pure-label + // when no summary applied, that is the original pure-label // sanitizer-wrapper code path. if !sanitizer_bits.is_empty() { if !resolved_callee { @@ -3437,17 +3582,13 @@ pub(super) fn transfer_inst( } return_bits &= !sanitizer_bits; - // Phase C auth-as-taint: the UNAUTHORIZED_ID cap models a - // caller-supplied identifier that must clear an ownership or - // membership guard before a state-changing sink. Sanitizer - // calls for this cap (e.g. `authz::require_group_member(db, - // group_id, user.id)?`) do not pass their validated inputs - // through a return value — the ownership proof is the side - // effect. So when a sanitizer carries the UNAUTHORIZED_ID - // bit, additionally strip it from each argument's SSA value - // so downstream uses see the cap cleared. Kept isolated to - // UNAUTHORIZED_ID to preserve existing return-only semantics - // for every other cap. + // UNAUTHORIZED_ID models a caller-supplied id that must + // clear an ownership/membership guard. Sanitizers for + // this cap don't pass inputs through a return value; + // the ownership proof is the side effect. Strip the bit + // from each argument's SSA value so downstream uses see + // it cleared. Isolated to UNAUTHORIZED_ID; other caps + // keep return-only sanitizer semantics. if sanitizer_bits.contains(Cap::UNAUTHORIZED_ID) { strip_cap_from_call_args(args, receiver, state, Cap::UNAUTHORIZED_ID); } @@ -3455,7 +3596,7 @@ pub(super) fn transfer_inst( // Container operation propagation (push/pop/get/set/etc.) // Try the primary callee first, then fall back to outer_callee // (set when find_classifiable_inner_call overrides the callee, - // e.g. `parts.add(req.getParameter("input"))` — callee is + // e.g. `parts.add(req.getParameter("input"))`, callee is // "req.getParameter" but outer_callee is "parts.add").
let mut container_handled = try_container_propagation( inst, info, args, receiver, state, transfer, callee, ssa, @@ -3490,14 +3631,10 @@ pub(super) fn transfer_inst( } // Fall through to write return_bits to inst.value if non-empty if return_bits.is_empty() { - // Pointer-Phase 4 / W4: container ELEM read - // counterpart fires here for container_handled - // calls with no source label of their own — - // e.g. `cmd := arr.shift()` — whose taint and - // validation come from the cell rather than - // any inline source. The post-match hook - // would otherwise be skipped by this early - // return. + // Container ELEM read counterpart fires for + // container_handled calls with no source label + // (e.g. `cmd := arr.shift()`) whose taint comes + // from the cell rather than an inline source. apply_container_elem_read_w4(inst, ssa, transfer, state); return; } @@ -3545,7 +3682,7 @@ pub(super) fn transfer_inst( } } - // No labels and no summary — default propagation (gen/kill) + // No labels and no summary, default propagation (gen/kill) let (use_caps, use_origins) = collect_args_taint(args, receiver, state, &[]); if return_bits.is_empty() { return_bits = use_caps; @@ -3636,7 +3773,7 @@ pub(super) fn transfer_inst( } // When the primary callee is a Source (e.g. req.query.input // overrode storeInto as the callee), the source taint is - // produced as the call's return — not yet in args. Use + // produced as the call's return, not yet in args. Use // return_bits as the source taint for the container store. if src_caps.is_empty() && !return_bits.is_empty() { src_caps = return_bits; @@ -3662,12 +3799,12 @@ pub(super) fn transfer_inst( // parameter positions and the return; at the call site we replay // each edge against the caller's taint state. // - // * `Param(src) → Param(dst)` — union caller-arg[src]'s taint + // * `Param(src) → Param(dst)`, union caller-arg[src]'s taint // into caller-arg[dst]'s heap slot. 
Sound because the // callee *may* have stored data derived from arg[src] into // an alias of arg[dst]; the caller must assume any later // read from arg[dst] could surface that taint. - // * `Param(src) → Return` — union caller-arg[src]'s points-to + // * `Param(src) → Return`, union caller-arg[src]'s points-to // set into the call's return value, giving the result the // same heap identity as its input argument. Overlaps with // `param_container_to_return`; both channels are idempotent @@ -3678,7 +3815,7 @@ pub(super) fn transfer_inst( // (container literal or known constructor not tracing to any // parameter), synthesise a `HeapObjectId` keyed on the call's // SSA value and seed it into `dynamic_pts`. This closes the - // factory-pattern cross-file gap — `const bag = makeBag()` + // factory-pattern cross-file gap, `const bag = makeBag()` // gives `bag` a stable heap identity so subsequent // `fillBag(bag, …)` / `bag[0]` operations have a heap cell // to store into or read from. @@ -3812,7 +3949,7 @@ pub(super) fn transfer_inst( // Apply Param → Return edges: the call result inherits the // source argument's points-to set. Re-runs the same // channel `resolved_container_to_return` drives a few - // lines above — safe (idempotent union), and catches + // lines above, safe (idempotent union), and catches // cases where the callee returned a param through a // non-identity chain (e.g. `return Box::new(x)`). if !param_to_return_edges.is_empty() @@ -3869,7 +4006,7 @@ pub(super) fn transfer_inst( // produces return_bits. Check if the wrapper function blocks taint: // if its SSA summary shows no propagation, no source_caps, and no // container identity return, the return value is independent of its - // arguments — clear return_bits. + // arguments, clear return_bits. 
if !return_bits.is_empty() && has_source_label { if let Some(ref oc) = info.call.outer_callee { if let Some(ref oc_sum) = resolve_callee_hinted( @@ -3937,7 +4074,7 @@ pub(super) fn transfer_inst( // Synthetic field-write inheritance. When SSA lowering emits // `u_new = Assign(rhs)` to model `u.f = rhs` (an obj-update // synth), `u_new` represents the same logical object after the - // field write — it retains every other field's taint. The + // field write, it retains every other field's taint. The // base-only Assign uses include only the rhs, so without this // step a clean rhs (`u.Path = "/foo"`) would zero out every // tainted field on the prior `u`. Owncast CVE-2023-3188 hit @@ -4015,14 +4152,11 @@ pub(super) fn transfer_inst( ); } - // Pointer-Phase 3 / W1: synthetic base-update Assign emitted by - // SSA lowering for `obj.f = rhs`. The side-table on the body - // maps this synth assign's value → (prior_receiver, FieldId) so - // we can lift the assign into a structural field WRITE: union - // the rhs taint into every `(loc, field)` cell for `loc ∈ - // pt(prior_receiver)` that isn't `Top`. Skip when the receiver - // pt set saturates to `Top` — over-approximating every field - // cell would amplify rather than localise the taint. + // Synthetic base-update Assign emitted by SSA lowering for + // `obj.f = rhs`. The side-table maps this synth assign's + // value → (prior_receiver, FieldId), so we lift it into a + // field WRITE: union rhs taint into every `(loc, field)` + // cell for non-Top `loc ∈ pt(prior_receiver)`. if let Some(pf) = transfer.pointer_facts { if let Some((receiver, fid)) = ssa.field_writes.get(&inst.value).copied() { let pt = pf.pt(receiver); @@ -4067,7 +4201,7 @@ pub(super) fn transfer_inst( } SsaOp::Const(_) | SsaOp::Nop => { - // No taint — this is the kill mechanism for `x = "literal"` after + // No taint, this is the kill mechanism for `x = "literal"` after // `x = source()`. The fresh SsaValue carries zero caps. 
} @@ -4178,7 +4312,7 @@ pub(super) fn transfer_inst( // registered caller (typical for controller methods, handler // dispatch functions, and stream lambda bodies). Skipped in // summary-extraction mode so baseline probes keep their - // intrinsic-source contract. Gate is set by the caller — e.g. + // intrinsic-source contract. Gate is set by the caller, e.g. // always-on for JS/TS, only AnonymousFunction bodies for Java. if transfer.auto_seed_handler_params && !seeded_from_scope @@ -4209,7 +4343,7 @@ pub(super) fn transfer_inst( } SsaOp::Phi(_) => { - // Phis processed separately above — shouldn't appear in body + // Phis processed separately above, shouldn't appear in body } SsaOp::Undef => { @@ -4223,14 +4357,9 @@ pub(super) fn transfer_inst( SsaOp::FieldProj { receiver, field, .. } => { - // Field projection: propagate the receiver's full taint - // record to the projected value. Phase 1 keeps the simple - // pass-through behaviour — `obj.f` carries `obj`'s caps and - // origins; Phase 4 will introduce field-sensitive narrowing. - // - // Strict pass-through: if the receiver is untainted, the - // projection stays untainted (no entry inserted), preserving - // the existing block-state semantics. + // Field projection: pass the receiver's full taint record + // through to the projected value. Untainted receiver → + // untainted projection (no entry inserted). let mut combined: Option = state.get(*receiver).cloned(); // W4: collect cell validation channels alongside taint. @@ -4241,12 +4370,9 @@ pub(super) fn transfer_inst( let mut cell_must_all: Option = None; let mut cell_may_any = false; - // Pointer-Phase 3 read: when per-body PointsToFacts are - // available, also union taint from each `(loc, field)` cell - // for `loc ∈ pt(receiver)`. This carries cross-method field - // flow within a single body — method A writes `this.cache = - // req.body` (recorded into the field cell), method B's - // `this.cache` projection picks the taint up here. 
+ // When per-body PointsToFacts are available, also union + // taint from each `(loc, field)` cell for `loc ∈ pt(receiver)`. + // Carries cross-method field flow within a single body. if let Some(pf) = transfer.pointer_facts { let pt = pf.pt(*receiver); if !pt.is_empty() && !pt.is_top() { @@ -4254,15 +4380,14 @@ pub(super) fn transfer_inst( // Read the specific `(loc, *field)` cell first // (per-field-name flow from cross-call writes). // When it's absent, fall back to the - // `(loc, ANY_FIELD)` wildcard — populated by the + // `(loc, ANY_FIELD)` wildcard, populated by the // [`ContainerOp::Writeback`] handler for sinks // like `json.NewDecoder(r.Body).Decode(&dest)` // that taint every field of the destination // wholesale. The fallback is gated on // specific-field absence so existing field-cell - // semantics (Pointer-Phase 3 / W3) are - // bit-identical when the writer used a named - // field. ANY_FIELD is intentionally distinct + // semantics are bit-identical when the writer + // used a named field. ANY_FIELD is distinct // from `ELEM` (container-element wildcard) to // avoid a struct-with-`length`-field reading // taint from a sibling array's `push` writes. @@ -4365,16 +4490,8 @@ pub(super) fn transfer_inst( } } - // Pointer-Phase 4 / W4 read counterpart for container reads. - // - // Lives outside the SsaOp::Call match arm so it fires after - // non-container Calls reach this point. The container-read path - // can also early-return inside the match arm (when - // `try_container_propagation` claims the call), so the hook is - // *additionally* invoked from inside the arm before those early - // returns — see the call to `apply_container_elem_read_w4` - // adjacent to the container_handled branch. This post-match - // invocation covers the call-fall-through cases. + // Container read counterpart, post-match. Also invoked inline + // before container-handled early-returns inside the Call arm. if matches!(&inst.op, SsaOp::Call { .. 
}) { apply_container_elem_read_w4(inst, ssa, transfer, state); } @@ -4395,12 +4512,12 @@ pub(super) fn transfer_inst( // narrow the destination value's type in PathEnv. // // Semantics vary by language: - // - Java casts: runtime-checked — type is reliably narrowed + // - Java casts: runtime-checked, type is reliably narrowed // - TypeScript `as`: compile-time assertion only, not runtime proof // - Go type assertions: runtime-checked (direct form) // // In ALL cases: taint is preserved. Narrowing the type does NOT - // erase taint — a tainted value cast to String is still tainted. + // erase taint, a tainted value cast to String is still tainted. let node_info = &cfg[inst.cfg_node]; if let Some(ref cast_type) = node_info.cast_target_type { if let Some(kind) = crate::constraint::solver::parse_type_name(cast_type) { @@ -4480,7 +4597,7 @@ pub(super) fn transfer_inst( /// unknown operations (calls, sources, params). /// /// `lang` is consulted only for language-specific transfer rules (currently -/// Rust path primitives — `fs::canonicalize`, `.starts_with`, etc.); `None` +/// Rust path primitives, `fs::canonicalize`, `.starts_with`, etc.); `None` /// disables them and matches the pre-PathFact behaviour exactly. fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: Option) { use crate::abstract_interp::{AbstractValue, BitFact, IntervalFact, PathFact, StringFact}; @@ -4505,7 +4622,7 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O let s = strip_string_quotes(trimmed); // String literal: derive PathFact axes from the *literal* // content. An empty string has no `..` segment and no - // absolute root — both axes proven safe — so a Const `""` + // absolute root, both axes proven safe, so a Const `""` // (Python / JS / TS / Java rejection-arm sentinel) carries a // path-safe fact even without a per-language allocator // recogniser like Rust's `String::new()`. 
Non-empty @@ -4722,7 +4839,7 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O ); } - // Path-primitive calls — per-language classifiers map known stdlib + // Path-primitive calls, per-language classifiers map known stdlib // sanitisers (`fs::canonicalize`, `os.path.normpath`, // `path.normalize`, `filepath.Clean`, `Path.normalize()`, // `File.expand_path`, `realpath`, `std::filesystem::canonical`) @@ -4753,7 +4870,7 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O .map(|v| abs.get(v).path) .unwrap_or_else(PathFact::top); - // Primary path-producing primitives — per-language dispatch. + // Primary path-producing primitives, per-language dispatch. let lang_unwrapped = lang.expect("guard ensures lang.is_some()"); if let Some(pf) = crate::abstract_interp::path_domain::classify_path_primitive_for_lang( lang_unwrapped, @@ -4803,13 +4920,13 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O // Structural variant-wrapper transparency. When a call is // a one-positional-argument variant / type constructor // (receiver-less; callee leaf begins with ASCII upper-case - // — the + //, the // [`crate::abstract_interp::path_domain::is_structural_variant_ctor`] // gate), its result inherits the joined PathFact of every // SSA value the lowering recorded for that single // positional argument. Covers `Some(s)`, `Ok(s)`, // `Err(s)`, `Box::new(s)`, and user-defined single-field - // variants / tuple structs alike — the classification is + // variants / tuple structs alike, the classification is // deliberately name-agnostic, so a freshly introduced // wrapper variant participates without code change. // @@ -4819,7 +4936,7 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O // group of chained-call uses after the positional // groups, so `args.len()` over-counts. 
For the // positional group itself we join the PathFacts across - // all contributing SsaValues — chained calls inside the + // all contributing SsaValues, chained calls inside the // argument (`Some(s.to_string())`) surface every uses' // value; the join picks the most precise axis each // value proves. @@ -4851,7 +4968,7 @@ fn transfer_abstract(inst: &SsaInst, cfg: &Cfg, abs: &mut AbstractState, lang: O // Callee is a Rust scoped identifier (contains `::`) whose // parent segment (e.g. `String` in `String::new`) begins // with ASCII upper-case, the call has no receiver and no - // arguments, and the node carries no Source label — + // arguments, and the node carries no Source label, // i.e. the helper is a fresh-allocation entry point, not // an external-input read. Zero inputs ⇒ the result // carries no attacker-controlled path content and is @@ -4891,14 +5008,14 @@ fn is_int_producing_callee(callee: &str) -> bool { /// /// Used by the zero-argument-allocator arm of `transfer_abstract` to /// recognise `Type::new` / `Type::default` / `Type::with_capacity` / -/// `Type::empty` — and any user-defined associated allocator — as a +/// `Type::empty`, and any user-defined associated allocator, as a /// fresh-allocation site without hard-coding the leaf name. The check /// is deliberately conservative: /// /// * Must contain at least one `::` separator. /// * The segment *before* the final leaf must start with an ASCII /// upper-case letter and contain only ASCII alphanumeric / `_` -/// characters — Rust's grammar for type identifiers. (Module-only +/// characters, Rust's grammar for type identifiers. (Module-only /// paths like `std::env` don't qualify; the gate fires only on /// type paths like `String::new`.) /// @@ -5069,7 +5186,7 @@ fn collect_block_events( } // Replay abstract value phi join (from predecessor exit states).
- // Mirrors the same logic in transfer_block() — without this, abstract + // Mirrors the same logic in transfer_block(), without this, abstract // values for phi-defined SSA values would be stale during sink suppression. if state.abstract_state.is_some() { for phi in &block.phis { @@ -5120,7 +5237,7 @@ fn collect_block_events( continue; } - // Parameterized SQL queries are safe — skip sink detection. + // Parameterized SQL queries are safe, skip sink detection. if info.parameterized_query { continue; } @@ -5304,9 +5421,9 @@ fn collect_block_events( } // Same-node Sanitizer subtraction. When the CFG node carries both - // Sink and Sanitizer labels for overlapping caps — the shape-based + // Sink and Sanitizer labels for overlapping caps, the shape-based // synthesis pattern used by Ruby AR safe-arg-0 detection - // (`src/cfg/mod.rs`) and the Java JPA parameterised-execute chain — + // (`src/cfg/mod.rs`) and the Java JPA parameterised-execute chain, // the sanitizer reflexively dominates the sink and the cap should // not surface as a taint-flow finding. The SSA Call arm already // applies same-node sanitizer to the *return* value @@ -5365,7 +5482,7 @@ fn collect_block_events( // SSA-level literal suppression: if all argument SSA values are known // constants (from const propagation), skip sink detection. - // Only applies to non-Call instructions (Assign to a sink) — for Call + // Only applies to non-Call instructions (Assign to a sink), for Call // instructions, the CFG-level `all_args_literal` check already handles // chained calls more accurately. if !matches!(inst.op, SsaOp::Call { .. }) { @@ -5434,17 +5551,57 @@ fn collect_block_events( } } - // Collect tainted SSA values that flow into this sink - let tainted = collect_tainted_sink_values( - inst, - info, - &state, - sink_caps, - ssa, - transfer, - &sink_info.param_to_sink, - ); - if !tainted.is_empty() { + // Per-gate-filter dispatch. When the call site carries multiple + // gated-sink classes (e.g.
`fetch` is both an SSRF gate on the URL + // arg and a `DATA_EXFIL` gate on the body / headers / json arg), + // each filter contributes its own sink-cap mask, payload positions, + // and destination-uses. Iterating per-filter keeps cap attribution + // exact: a body-only taint surfaces as a `DATA_EXFIL` event with no + // SSRF bit, and vice versa. + // + // The single-filter / no-filter case takes one trip through the + // loop with the legacy `(sink_caps, info.call.sink_payload_args, + // info.call.destination_uses)` triple, preserving prior behavior + // for every non-multi-gate site. + let multi_gate = info.call.gate_filters.len() > 1; + type FilterEntry<'a> = (Cap, Option<&'a [usize]>, Option<&'a [String]>); + let filter_iter: smallvec::SmallVec<[FilterEntry<'_>; 2]> = if multi_gate { + info.call + .gate_filters + .iter() + .map(|f| { + ( + sink_caps & f.label_caps, + Some(f.payload_args.as_slice()), + f.destination_uses.as_deref(), + ) + }) + .collect() + } else { + smallvec::smallvec![(sink_caps, None, None)] + }; + + for (filter_caps, positions_override, destination_override) in filter_iter { + if filter_caps.is_empty() { + continue; + } + + // Collect tainted SSA values that flow into this sink + let tainted = collect_tainted_sink_values( + inst, + info, + &state, + filter_caps, + ssa, + transfer, + &sink_info.param_to_sink, + positions_override, + destination_override, + ); + if tainted.is_empty() { + continue; + } + // Compute all_validated: check if all tainted vars are validated let all_validated = tainted.iter().all(|(val, _, _)| { let var_name = ssa @@ -5471,13 +5628,17 @@ fn collect_block_events( // Pick primary sink sites (if any) from the resolved callee // summary. Multi-site cases emit one event per matching // [`SinkSite`] so each downstream Finding carries one attribution. 
- let primary_sites = - pick_primary_sink_sites(inst, &tainted, sink_caps, &sink_info.param_to_sink_sites); + let primary_sites = pick_primary_sink_sites( + inst, + &tainted, + filter_caps, + &sink_info.param_to_sink_sites, + ); emit_ssa_taint_events( events, inst.cfg_node, tainted, - sink_caps, + filter_caps, all_validated, guard_kind, any_uses_summary, @@ -5496,12 +5657,12 @@ fn collect_block_events( /// 1. `param_idx` appears in the call's positional `args` and contains one /// of the `tainted` SSA values (proves this site's parameter actually /// carried the tainted flow), AND -/// 2. [`SinkSite`] carries resolved coordinates (`line != 0` — cap-only +/// 2. [`SinkSite`] carries resolved coordinates (`line != 0`, cap-only /// sites are ignored), AND /// 3. [`SinkSite::cap`] intersects `sink_caps` (the propagated cap mask). /// /// Returns the deduped list of matching sites (`dedup_key` identity). -/// Empty ⇒ no primary attribution — caller emits a single event with +/// Empty ⇒ no primary attribution, caller emits a single event with /// `primary_sink_site = None`. fn pick_primary_sink_sites( inst: &SsaInst, @@ -5585,9 +5746,9 @@ fn pick_primary_sink_sites_from_resolved( /// /// Every [`SinkSite`] in `primary_sites` must have been filtered at the /// pick-site to satisfy: -/// * `site.line != 0` — cap-only sites carry no primary attribution and +/// * `site.line != 0`, cap-only sites carry no primary attribution and /// must not reach the event stream. -/// * `(site.cap & sink_caps).is_empty() == false` — the site's cap +/// * `(site.cap & sink_caps).is_empty() == false`, the site's cap /// intersects the propagated cap mask (it's the dangerous-bit /// justification for the finding). 
/// @@ -5595,7 +5756,7 @@ fn pick_primary_sink_sites_from_resolved( /// The taint-chain `uses_summary` flag tracks whether a callee summary /// propagated taint along the source→sink chain, whereas a primary /// [`SinkSite`] only requires that the *sink* itself was resolved via a -/// callee summary — an intra-file source can still reach a cross-file +/// callee summary, an intra-file source can still reach a cross-file /// sink, producing `uses_summary == false` alongside a populated primary. fn emit_ssa_taint_events( events: &mut Vec, @@ -5646,7 +5807,7 @@ fn emit_ssa_taint_events( /// Collect taint from call arguments. /// -/// `args` contains **positional arguments only** — the receiver is a separate +/// `args` contains **positional arguments only**, the receiver is a separate /// channel and is passed via `receiver`. `propagating_params` indexes directly /// into `args` using callee positional-parameter indices (no receiver offset). /// @@ -5682,7 +5843,7 @@ fn collect_args_taint( } } } else { - // Collect only from propagating param positions. Positional only — + // Collect only from propagating param positions. Positional only, // receiver-to-return propagation is handled by `receiver_to_return` on // the summary, not by this path. for &param_idx in propagating_params { @@ -5702,13 +5863,11 @@ fn collect_args_taint( (combined_caps, combined_origins) } -/// Phase C auth-as-taint helper: strip a capability bit from every argument -/// SSA value of a call. Used by the [`DataLabel::Sanitizer`] arm in -/// [`transfer_inst`] when the sanitizer covers [`Cap::UNAUTHORIZED_ID`] — -/// ownership / membership guards model their proof as a side effect on the -/// inputs rather than a cap stripped from the return value, so downstream -/// uses of those SSA values should see the cap cleared. Leaves origins and -/// other caps untouched; purely a cap mask. +/// Strip a capability bit from every argument SSA value of a call.
+/// Used by the [`DataLabel::Sanitizer`] arm when the sanitizer covers +/// [`Cap::UNAUTHORIZED_ID`], ownership/membership guards prove on +/// inputs rather than the return value. Other caps and origins are +/// untouched. fn strip_cap_from_call_args( args: &[SmallVec<[SsaValue; 2]>], receiver: &Option, @@ -5834,9 +5993,9 @@ fn try_curl_url_propagation( /// determine whether the index is a provably non-negative integer constant /// within `MAX_TRACKED_INDICES`. /// -/// - Intraprocedural: guaranteed — each function's own const propagation +/// - Intraprocedural: guaranteed, each function's own const propagation /// results are used. -/// - Inline callee analysis (k=1): guaranteed — `inline_analyse_callee()` +/// - Inline callee analysis (k=1): guaranteed, `inline_analyse_callee()` /// sets `const_values: Some(&callee_body.opt.const_values)` on the child /// transfer, so callee-local constants are resolved. /// - Unknown / non-integer / out-of-bounds: falls back to `HeapSlot::Elements`. @@ -5997,7 +6156,7 @@ fn try_container_propagation( // Fallback: direct SSA value taint (no pts info for this container) merge_taint_into(state, container_val, val_caps, &val_origins); - // For Go append, the result is the new slice — propagate merged taint + // For Go append, the result is the new slice, propagate merged taint if lang == Lang::Go && receiver.is_none() { if let Some(merged) = state.get(container_val) { state.set(inst.value, merged.clone()); @@ -6045,20 +6204,20 @@ fn try_container_propagation( } ContainerOp::Writeback { dest_arg } => { // Receiver carries the source taint (e.g. - // `json.NewDecoder(r.Body).Decode(&dest)` — the decoder's + // `json.NewDecoder(r.Body).Decode(&dest)`, the decoder's // receiver chain is tainted by `r.Body`). Propagate that taint // into the call's destination argument so downstream sinks see // the flow through the decoded struct. 
// // Go method calls lower to `Kind::CallFn` with the receiver - // implicit in the dotted callee text (`d.Decode`) — there's no + // implicit in the dotted callee text (`d.Decode`), there's no // explicit `receiver` channel and no slice-as-arg-0 convention // (unlike Go's `append`), so the existing `resolve_container` // helper either returns the wrong value or `None` here. Look // up the receiver SSA value by var-name from the callee prefix. // Detect a chained-call receiver shape (`a.b(c).d(e)`) where // the receiver of the writeback method is itself a call - // expression — so its return value never gets a separate SSA + // expression, so its return value never gets a separate SSA // value and there is no `var_name` to look up. // // For `json.NewDecoder(r.Body).Decode(emoji)` the callee text @@ -6108,7 +6267,7 @@ fn try_container_propagation( t.clone() } else if chain_shape { // Receiver SSA value found but carries no direct - // taint — fall through to chain-shape arg union. + // taint, fall through to chain-shape arg union. let mut caps = Cap::empty(); let mut origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new(); for (idx, arg_group) in args.iter().enumerate() { @@ -6179,7 +6338,7 @@ fn try_container_propagation( // and (3) the field-cell channel via `pointer_facts.pt(v)` with // [`FieldId::ELEM`] as a tainted-at-all-fields wildcard so // subsequent `dest.Field` projections (which read through the - // higher-tier `pointer_facts.pt(receiver)` channel — see the + // higher-tier `pointer_facts.pt(receiver)` channel, see the // `SsaOp::FieldProj` arm) inherit the taint. 
Without (3), CVE // shapes like `json.NewDecoder(r.Body).Decode(&dest)` followed // by `os.Remove(filepath.Join(_, dest.Name))` left the dest @@ -6326,7 +6485,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { let caller_func = info.ast.enclosing_func.as_deref().unwrap_or(""); // The sink-label path needs an arity hint so we do not match a // same-name/different-arity overload in another namespace. - // `arg_uses.len()` is the positional-argument count — the receiver is a + // `arg_uses.len()` is the positional-argument count, the receiver is a // separate channel on `info.call.receiver`, not prepended to `arg_uses`. let arity_hint = if info.call.arg_uses.is_empty() { None @@ -6380,7 +6539,14 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { /// Collect tainted SSA values at a sink instruction. /// /// When `param_to_sink` is non-empty, only arguments at those positions are -/// checked — enables per-parameter sink precision from cross-file summaries. +/// checked, enables per-parameter sink precision from cross-file summaries. +/// +/// `positions_override` and `destination_override`, when `Some`, supersede +/// `info.call.sink_payload_args` and `info.call.destination_uses` for this +/// call. Used by the multi-gate sink dispatch in [`collect_block_events`] +/// to attribute taint per-cap when a callee carries several gates (e.g. +/// `fetch` SSRF on the URL position vs `DATA_EXFIL` on the body position). 
+#[allow(clippy::too_many_arguments)] fn collect_tainted_sink_values( inst: &SsaInst, info: &NodeInfo, @@ -6389,6 +6555,8 @@ fn collect_tainted_sink_values( ssa: &SsaBody, transfer: &SsaTaintTransfer, param_to_sink: &[(usize, Cap)], + positions_override: Option<&[usize]>, + destination_override: Option<&[String]>, ) -> Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> { let mut result = Vec::new(); @@ -6409,19 +6577,21 @@ fn collect_tainted_sink_values( // Collect SSA values used by this instruction let used_values = inst_use_values(inst); - // Priority 1: gated sink filtering (CFG-level sink_payload_args). - // `sink_payload_args` indexes into positional args (no receiver offset); - // the receiver is a separate channel via `SsaOp::Call.receiver`. + // Priority 1: gated sink filtering (CFG-level sink_payload_args, or a + // multi-gate per-filter override). The position list indexes into + // positional args (no receiver offset); the receiver is a separate + // channel via `SsaOp::Call.receiver`. // - // Destination-aware narrowing: when `destination_uses` is also set by - // the CFG (outbound HTTP gate with an object-literal destination arg), + // Destination-aware narrowing: when a destination filter is set, // restrict sink-taint checks to SSA values whose `var_name` matches one // of the listed destination field identifiers. This silences // `fetch({url: fixed, body: tainted})` while still firing on // `fetch({url: tainted, body: fixed})`. - if let Some(ref positions) = info.call.sink_payload_args { + let positions: Option<&[usize]> = positions_override.or(info.call.sink_payload_args.as_deref()); + let destination_filter: Option<&[String]> = + destination_override.or(info.call.destination_uses.as_deref()); + if let Some(positions) = positions { if let SsaOp::Call { args, .. 
} = &inst.op { - let destination_filter = info.call.destination_uses.as_deref(); for &pos in positions { if let Some(arg_vals) = args.get(pos) { for &v in arg_vals { @@ -6477,7 +6647,7 @@ fn collect_tainted_sink_values( } } - // Priority 3: aggregate fallback — check all used values + // Priority 3: aggregate fallback, check all used values for v in used_values { if let Some(taint) = state.get(v) { if (taint.caps & sink_caps) != Cap::empty() { @@ -6533,7 +6703,7 @@ fn apply_field_aware_suppression( // as uses; treating `u.String` as a clean field of `u` suppressed // the SSRF. But JS object-field FP guards (e.g. // `db.query(obj.safeField)` with `obj.unsafeField` tainted) need - // the opposite — `obj.safeField` is a real field access and SHOULD + // the opposite, `obj.safeField` is a real field access and SHOULD // count as a clean field. The CFG distinguishes the two via // `arg_callees`: when an argument expression is itself a call, its // callee text is recorded; pure member-access args leave the slot @@ -6739,7 +6909,7 @@ fn propagate_sanitization_to_aliases( // the aliased field path. for alias_base in alias_bases { if alias_base == base { - continue; // skip self — already sanitized + continue; // skip self, already sanitized } let target = if suffix.is_empty() { // Plain ident: look for exact match on alias base @@ -6794,7 +6964,7 @@ fn propagate_sanitization_to_aliases( /// copy propagation), this function also taints `alias.data` in the taint state. /// For plain idents (no dot), tainting `obj` also taints `alias`. /// -/// Uses only the existing `BaseAliasResult` alias groups — no new alias inference. +/// Uses only the existing `BaseAliasResult` alias groups, no new alias inference. 
fn propagate_taint_to_aliases( inst: &SsaInst, state: &mut SsaTaintState, @@ -6829,7 +6999,7 @@ fn propagate_taint_to_aliases( let vdef_name = vdef.var_name.as_deref()?; for alias_base in alias_bases { if alias_base == base { - continue; // skip self — already tainted + continue; // skip self, already tainted } if suffix.is_empty() { // Plain ident: look for exact match on alias base @@ -6867,7 +7037,7 @@ fn propagate_taint_to_aliases( }, ); } else { - // No existing taint — set fresh + // No existing taint, set fresh state.set( v, VarTaint { @@ -6981,19 +7151,15 @@ fn resolve_type_qualified_labels( /// chains to collect candidate SSA values for type-fact lookup. /// /// Two motivating shapes: -/// - Rust chained methods: `conn.execute(x).unwrap()` is one outer call whose -/// receiver is itself a call. The stable base identifier (`conn`) is -/// several `Call.receiver` hops up. -/// - Phase 2 `FieldProj` decomposition (all languages): `c.client.send(req)` -/// lowers to `v_client = FieldProj(v_c, "client")`, `Call("send", [v_client])`. -/// The typed root (`c`, of e.g. `RouterContext` type) sits one -/// `FieldProj.receiver` hop above `v_client`. Walking through FieldProj -/// lets `resolve_type_qualified_labels` discover the typed root regardless -/// of intermediate field accesses. +/// - Rust chained methods: `conn.execute(x).unwrap()` is one outer call +/// whose receiver is itself a call. The stable base identifier +/// (`conn`) is several `Call.receiver` hops up. +/// - `FieldProj` decomposition: `c.client.send(req)` lowers through +/// `v_client = FieldProj(v_c, "client")`, so the typed root (`c`) +/// sits one `FieldProj.receiver` hop above `v_client`. /// -/// FieldProj walking runs for every language (it's the universal Phase 2 -/// decomposition). Call-receiver walking remains Rust-only — other -/// languages have method-call nesting handled at AST level. +/// FieldProj walking runs for every language. 
Call-receiver walking is +/// Rust-only, other languages handle method nesting at AST level. fn receiver_candidates_for_type_lookup( start: SsaValue, ssa: Option<&SsaBody>, @@ -7012,7 +7178,7 @@ fn receiver_candidates_for_type_lookup( for inst in block.phis.iter().chain(block.body.iter()) { if inst.value == current { match &inst.op { - // Phase 2: FieldProj receiver chain — universal. + // FieldProj receiver chain, universal. SsaOp::FieldProj { receiver, .. } => { next_receiver = Some(*receiver); } @@ -7114,7 +7280,7 @@ fn is_type_safe_for_sink( /// /// Returns `true` if the type cannot carry the payload required by the sink. /// Policy: Int/Bool values cannot carry injection payloads (SQL, code, path). -/// String-typed values CAN carry injection payloads — casts to String do NOT +/// String-typed values CAN carry injection payloads, casts to String do NOT /// make a value safe. fn type_safe_for_taint_sink(kind: &crate::ssa::type_facts::TypeKind, cap: Cap) -> bool { use crate::ssa::type_facts::TypeKind; @@ -7130,7 +7296,7 @@ fn type_safe_for_taint_sink(kind: &crate::ssa::type_facts::TypeKind, cap: Cap) - /// /// Returns the Cap bits that should be REMOVED because the receiver type /// proves the sink doesn't apply. For example, `HTML_ESCAPE` sinks require -/// an HTTP-response-like receiver — if the receiver is known to be +/// an HTTP-response-like receiver, if the receiver is known to be /// Int/Bool/String, `HTML_ESCAPE` doesn't apply. fn receiver_incompatible_sink_caps(kind: &crate::ssa::type_facts::TypeKind, sink_caps: Cap) -> Cap { use crate::ssa::type_facts::TypeKind; @@ -7174,7 +7340,7 @@ fn is_path_type_safe_for_sink(inst: &SsaInst, sink_caps: Cap, env: &constraint:: /// Check if abstract domain facts prove a sink is safe. /// /// SSRF: string prefix with locked host. -/// SQL_QUERY / FILE_IO: dual gate — type-proven Int AND bounded interval on all +/// SQL_QUERY / FILE_IO: dual gate, type-proven Int AND bounded interval on all /// tainted leaf values. 
Traces back through Assign chains to find original /// tainted data (e.g., `parseInt(x)` inside `"SELECT ..." + parseInt(x) * 10`). /// @@ -7197,7 +7363,7 @@ fn is_abstract_safe_for_sink( return false; } - // SSRF — string prefix with locked host + // SSRF, string prefix with locked host if sink_caps.intersects(Cap::SSRF) { // Inline template-literal prefix attached to the CFG node directly // (covers sinks whose URL is a template literal argument without an @@ -7217,12 +7383,12 @@ fn is_abstract_safe_for_sink( } } - // SHELL_ESCAPE — static-map finite-domain safety. When every tainted + // SHELL_ESCAPE, static-map finite-domain safety. When every tainted // payload value is proved by the static-HashMap-lookup analysis to come // from a bounded set of metacharacter-free literals, the call cannot // carry shell injection regardless of how the attacker influenced the // lookup key. Only fires when the value appears in `static_map.finite_ - // string_values`, not for arbitrary single-literal exact facts — those + // string_values`, not for arbitrary single-literal exact facts, those // already have their own constant-argument suppression path and we // must not over-apply shell-safety to unrelated const-prop bare-string // artefacts (e.g. Python `commands = []`). @@ -7233,8 +7399,8 @@ fn is_abstract_safe_for_sink( // HTML_ESCAPE type-only gate: an integer's decimal representation is // always digits (with optional leading `-`), which never contain HTML // metacharacters (`<`, `>`, `"`, `'`, `&`, `/`, `:`) in either text or - // attribute context. The interval bound is irrelevant here — a large - // magnitude doesn't introduce metachars — so HTML_ESCAPE uses a + // attribute context. The interval bound is irrelevant here, a large + // magnitude doesn't introduce metachars, so HTML_ESCAPE uses a // type-only leaf check rather than the SQL/FILE/SHELL dual gate below. 
if sink_caps.intersects(Cap::HTML_ESCAPE) { if let Some(tf) = type_facts { @@ -7299,7 +7465,7 @@ fn is_path_safe_for_sink( if safe { // Publish the suppression to the file-level set so the // state-analysis pass can suppress `state-unauthed-access` on - // the same sink — once the taint engine has proved the + // the same sink, once the taint engine has proved the // user-controlled input cannot escape into a privileged // location, the auth concern is structurally reduced. let span = cfg[inst.cfg_node].ast.span; @@ -7320,11 +7486,11 @@ fn is_call_abstract_safe( ssa: &SsaBody, cfg: &Cfg, ) -> bool { - // SSRF — check if the URL argument (first arg) has a safe prefix. + // SSRF, check if the URL argument (first arg) has a safe prefix. if sink_caps.intersects(Cap::SSRF) { // Inline template-literal prefix from the call AST itself // (e.g. `axios.get(\`https://host/…${x}\`)` has no intermediate Assign - // to seed a StringFact — check the node-attached prefix directly). + // to seed a StringFact, check the node-attached prefix directly). let node_info = &cfg[inst.cfg_node]; if let Some(prefix) = node_info.string_prefix.as_deref() { let synthetic = crate::abstract_interp::StringFact::from_prefix(prefix); @@ -7343,7 +7509,7 @@ fn is_call_abstract_safe( } } - // SHELL_ESCAPE — static-map finite-domain safety on every non-empty arg + // SHELL_ESCAPE, static-map finite-domain safety on every non-empty arg // group. Mirrors the non-Call path so suppression fires regardless of // which branch the sink detector took. 
if sink_caps.intersects(Cap::SHELL_ESCAPE) && !args.is_empty() { @@ -7431,7 +7597,7 @@ fn trace_single_leaf( let inst = match block.body.iter().find(|i| i.value == v) { Some(i) => i, None => { - // Phi or not found in body — treat as leaf + // Phi or not found in body, treat as leaf leaves.push(v); return; } @@ -7452,7 +7618,7 @@ fn trace_single_leaf( match &inst.op { SsaOp::Assign(uses) if uses.len() >= 2 => { // Numeric binary operations (bitwise, arithmetic except Add, comparisons) - // always produce integers — treat the result as a leaf rather than tracing + // always produce integers, treat the result as a leaf rather than tracing // through to the string-typed operands. Add is excluded because it may be // string concatenation. let bin_op = cfg.node_weight(inst.cfg_node).and_then(|ni| ni.bin_op); @@ -7515,7 +7681,7 @@ fn trace_single_leaf( // the arguments to find the upstream tainted leaves. The Call's // return taint is a function of its args under this // classification, so the leaves are the Call's inputs. Source- - // labeled Calls keep the default leaf behavior — tracing past + // labeled Calls keep the default leaf behavior, tracing past // them would erase the Source and over-suppress. let is_source = cfg .node_weight(inst.cfg_node) @@ -7527,15 +7693,15 @@ fn trace_single_leaf( }) .unwrap_or(false); // PathFact-proven sanitisation: when the abstract state has - // recorded a non-Top [`PathFact`] on this Call's result — + // recorded a non-Top [`PathFact`] on this Call's result, // typically because cross-function inline analysis narrowed - // the return path's `dotdot` / `absolute` axis — the Call + // the return path's `dotdot` / `absolute` axis, the Call // is the *proof point*. Tracing past it would land on the // upstream source (whose PathFact is still Top) and defeat // the narrowing. Push the Call result as a leaf so // `is_path_safe_for_sink` reads the proven fact directly.
// - // Strictly additive — only fires when the abstract domain + // Strictly additive, only fires when the abstract domain // proves a non-Top fact, so source-labeled Calls (already // caught above) and unrelated calls fall back to the // existing trace-through-args behaviour. @@ -7564,7 +7730,7 @@ fn trace_single_leaf( // Single-use Assign: pass through to the source value's leaf. // Covers the common pattern where SSA lowering emits both a Call // form carrying a sink expression and an outer Assign that binds - // the Call's value to the defined variable — without this, the + // the Call's value to the defined variable, without this, the // Assign's tracing stops at the wrapped Call (String-typed by // default) and loses the Int / bounded leaf already known through // the Call's args. @@ -7599,7 +7765,7 @@ fn is_stringify_callee(callee: &str) -> bool { /// Return `true` when every value in `values` was proved by the static-map /// analysis to be drawn from a finite set of metacharacter-free literals. /// Returns `false` when `static_map` is `None`, when any value is missing, -/// or when any value's bounded set contains a shell metacharacter — the +/// or when any value's bounded set contains a shell metacharacter, the /// predicate is conservative, so a missing entry never suppresses. fn is_static_map_shell_safe( values: &[SsaValue], @@ -7629,7 +7795,7 @@ fn is_string_safe_for_ssrf(sf: &crate::abstract_interp::StringFact) -> bool { Some(p) => p.as_str(), None => return false, }; - // Absolute-path prefix (e.g. "/projects/...") — internal redirect, not open redirect. + // Absolute-path prefix (e.g. "/projects/..."), internal redirect, not open redirect. // The leading "/" locks the path to the same origin; the attacker cannot control the scheme // or host, so this is not an SSRF vector. 
if prefix.starts_with('/') { @@ -7649,32 +7815,32 @@ fn is_string_safe_for_ssrf(sf: &crate::abstract_interp::StringFact) -> bool { /// /// Resolution is deliberately identity-aware: /// -/// 1. Filter by `(lang, namespace, name)` — these always participate in the +/// 1. Filter by `(lang, namespace, name)`, these always participate in the /// identity hash, so the candidate set is guaranteed to be the /// same-file same-leaf-name definitions. /// 2. If `container_hint` is supplied (e.g. the `obj` in `obj.method`), /// narrow to candidates whose [`FuncKey::container`] matches. /// 3. If exactly one candidate remains, return its key. /// -/// Returns `None` when zero or multiple candidates remain — callers should +/// Returns `None` when zero or multiple candidates remain, callers should /// then fall through to their own ambiguity policy instead of accidentally /// picking an arbitrary definition. /// Split a raw callee string into a `(namespace_qualifier, receiver_var)` /// pair. /// /// * `"env::var"` → `(Some("env"), None)` -/// * `"std::io::File::open"` → `(Some("File"), None)` — leaf's immediate +/// * `"std::io::File::open"` → `(Some("File"), None)`, leaf's immediate /// container is kept so qualified lookup can match /// `File::open`. Deeper module prefixes are discarded here; the call /// graph's Rust-specific resolver handles full paths via the use map. /// * `"obj.method"` → `(None, Some("obj"))` -/// * `"a.b.method"` → `(None, Some("b"))` — immediate object hop. +/// * `"a.b.method"` → `(None, Some("b"))`, immediate object hop. /// * `"foo"` → `(None, None)` /// /// `::` is treated as a namespace separator and produces a /// `namespace_qualifier`; `.` is treated as a method receiver and /// produces a `receiver_var`. When both separators appear, the -/// last-used one wins — matching the leaf-extraction rule in +/// last-used one wins, matching the leaf-extraction rule in /// [`callee_leaf_name`]. 
fn split_qualifier(raw: &str) -> (Option<&str>, Option<&str>) { if let Some(pos) = raw.rfind("::") { @@ -7757,7 +7923,7 @@ pub(crate) fn resolve_local_func_key_query( if let Some(k) = pick_with_container(rt) { return Some(k); } - // Authoritative miss — do not silently pick a different container. + // Authoritative miss, do not silently pick a different container. return None; } @@ -7784,7 +7950,7 @@ pub(crate) fn resolve_local_func_key_query( } } - // Bare-call free-function preference — mirrors + // Bare-call free-function preference, mirrors // `GlobalSummaries::resolve_callee` step 5.5. When the call is // syntactically bare (no receiver, no namespace qualifier, no // authoritative receiver type) and exactly one arity-matched local @@ -7850,7 +8016,7 @@ struct ResolvedSummary { sanitizer_caps: Cap, sink_caps: Cap, /// Per-parameter sink caps: (param_index, caps). When non-empty, only - /// arguments at these positions flow to internal sinks — enables positional + /// arguments at these positions flow to internal sinks, enables positional /// and capability-aware filtering instead of aggregate-only detection. param_to_sink: Vec<(usize, Cap)>, /// Per-parameter [`SinkSite`] records mirroring `param_to_sink` by index. @@ -7895,7 +8061,7 @@ struct ResolvedSummary { /// When present, summary application at the call site consults the /// caller's [`SsaTaintState::predicates`] and applies only entries /// whose predicate gate is consistent with the caller's validated - /// set — recovering callee-internal path splits that the aggregate + /// set, recovering callee-internal path splits that the aggregate /// [`Self::sanitizer_caps`] / [`Self::propagating_params`] view /// otherwise erases. Empty for non-SSA resolution paths. param_return_paths: Vec<( @@ -7907,16 +8073,13 @@ struct ResolvedSummary { /// Populated only via `convert_ssa_to_resolved`; other resolution /// paths leave it empty (they do not derive alias edges). 
Empty / /// default means "no aliasing beyond what param_to_container_store - /// already captures" — the caller treats the call as a pure + /// already captures", the caller treats the call as a pure /// taint-through-signature edge. points_to: crate::summary::points_to::PointsToSummary, - /// Pointer-Phase 5 / W3: field-granularity per-parameter points-to - /// summary. Populated only via `convert_ssa_to_resolved` when the - /// underlying SSA summary carries `field_points_to` records; other - /// resolution paths leave it empty. Applied at the caller-side - /// call site by `apply_field_points_to_writes` to spread argument - /// taint into matching `(loc, field)` cells when - /// [`crate::pointer::is_enabled()`] is set. + /// Field-granularity per-parameter points-to summary. Populated + /// only via `convert_ssa_to_resolved` when the SSA summary carries + /// `field_points_to` records. Applied at the caller call site by + /// `apply_field_points_to_writes`. field_points_to: crate::summary::points_to::FieldPointsToSummary, } @@ -7933,7 +8096,7 @@ fn resolve_callee( /// candidate set to functions with a matching parameter count. /// /// Used by the call-graph / SSA-transfer paths when the caller knows the -/// number of positional arguments at this site — this eliminates false +/// number of positional arguments at this site, this eliminates false /// resolution to same-name siblings with different arities (e.g. /// `encode(x)` vs `encode(x, opts)` in the same namespace). fn resolve_callee_hinted( @@ -8144,7 +8307,7 @@ fn resolve_callee_full( // while `local_summaries` keys keep the raw file path that // `build_cfg` wrote. 
When the exact-key lookup misses, fall back // to a namespace-tolerant scan that matches every other FuncKey - // field (lang/container/name/arity/disambig/kind) — this recovers + // field (lang/container/name/arity/disambig/kind), this recovers // intra-file SSA summary lookups in single-file or non-indexed // scans where the two namespaces disagree by construction. if let Some(ssa_sums) = transfer.ssa_summaries { @@ -8173,7 +8336,7 @@ fn resolve_callee_full( // type has recorded sub-types whose `method` overrides exist, the // taint engine sees ALL implementers, not just the super-type's // own definition. This is the runtime counterpart of the - // call-graph builder's `resolve_with_hierarchy` step — without + // call-graph builder's `resolve_with_hierarchy` step, without // it, virtual dispatch through a super-type silently lost // sub-type sources / sinks. if let Some(gs) = transfer.global_summaries { @@ -8214,14 +8377,14 @@ fn resolve_callee_full( ); return accum; } - // None of the widened keys had SSA summaries — fall + // None of the widened keys had SSA summaries, fall // through to step 2 (FuncSummary path) which may have // hierarchy-widened FuncSummary entries. } } } - // 1) Local (same-file) — lookup via canonical FuncKey using the + // 1) Local (same-file), lookup via canonical FuncKey using the // same qualified-first policy as the global resolver. if let Some(key) = resolve_local_func_key_query(transfer.local_summaries, &build_query()) { if let Some(ls) = transfer.local_summaries.get(&key) { @@ -8270,7 +8433,7 @@ fn resolve_callee_full( // 2) Global same-language (FuncSummary path) with Phase-6 hierarchy // fan-out. Same semantics as step 0.5 but on coarse FuncSummary - // entries — the SSA path missed because no implementer had an SSA + // entries, the SSA path missed because no implementer had an SSA // summary, so we widen the FuncSummary lookup symmetrically. 
if let Some(gs) = transfer.global_summaries { let widened = gs.resolve_callee_widened(&build_query()); @@ -8434,7 +8597,7 @@ fn effective_param_sanitizer( } if compatible.is_empty() { - // No path applies — the caller's predicate state contradicts every + // No path applies, the caller's predicate state contradicts every // recorded return. Fall back to the aggregate rather than // synthesise a sanitiser from zero data. return resolved.sanitizer_caps; @@ -8504,7 +8667,7 @@ fn convert_ssa_to_resolved_for_caller( // attribute cross-file findings to the callee-internal sink. Sites // with coordinates of `(0, 0)` (cap-only, no tree/bytes context at // extraction time) remain in the list but contribute no primary - // location — the emission site filters by `SinkSite::line != 0`. + // location, the emission site filters by `SinkSite::line != 0`. // // Strip same-file sites when `caller_namespace` is supplied: the // caller's own taint analysis already produces a finding at the @@ -8559,29 +8722,29 @@ fn convert_ssa_to_resolved_for_caller( /// receiver static type fans out to multiple concrete implementers via /// [`crate::callgraph::TypeHierarchyIndex`]. /// -/// Semantics — designed to keep the engine sound under fan-out: +/// Semantics, designed to keep the engine sound under fan-out: /// /// * **Caps that *grow* the taint signal** /// (`source_caps`, `sink_caps`, `receiver_to_sink`, -/// `propagates_taint`) — **OR**. Any implementer that introduces +/// `propagates_taint`), **OR**. Any implementer that introduces /// the cap is a valid runtime target, so the union conservatively /// covers every dispatch outcome. -/// * **`sanitizer_caps`** — **AND**. Only bits sanitized by *every* +/// * **`sanitizer_caps`**, **AND**. Only bits sanitized by *every* /// implementer can be considered cleared at the call site, since /// the dispatch could land on the implementer that doesn't /// sanitize. 
/// * **Per-parameter vectors** (`param_to_sink`, `propagating_params`, /// `param_container_to_return`, `param_to_container_store`, -/// `source_to_callback`) — **union**. An impl that contributes a +/// `source_to_callback`), **union**. An impl that contributes a /// propagation/sink at parameter N is a valid runtime path; missing /// impls do not subtract. -/// * **`param_to_sink_sites`** — concatenated per-parameter (dedup +/// * **`param_to_sink_sites`**, concatenated per-parameter (dedup /// on `SinkSite::PartialEq`). Each site is independently /// emittable; the dedup avoids reporting the same callee-internal /// sink twice. /// * **SSA-precision fields** (`return_type`, `return_abstract`, /// `receiver_to_return`, `abstract_transfer`, `param_return_paths`, -/// `points_to`) — **drop on disagreement**. These describe the +/// `points_to`), **drop on disagreement**. These describe the /// precise behavior of *one* function body; merging two /// incompatible bodies yields a meaningless composite. Identity /// is preserved when both sides agree exactly (string equality or diff --git a/src/taint/ssa_transfer/state.rs b/src/taint/ssa_transfer/state.rs index 92198346..a9fdd8c8 100644 --- a/src/taint/ssa_transfer/state.rs +++ b/src/taint/ssa_transfer/state.rs @@ -2,7 +2,7 @@ //! the original monolithic `ssa_transfer.rs`. //! //! Contains: -//! * [`SsaTaintState`] — the per-block lattice value with `values`, +//! * [`SsaTaintState`], the per-block lattice value with `values`, //! `validated_must`/`validated_may`, `predicates`, `heap`, `path_env`, //! `abstract_state`. //! * [`BindingKey`] / [`seed_lookup`] for cross-body taint seeding. @@ -25,7 +25,7 @@ use std::collections::HashMap; // NOTE: The per-SSA-value origin cap used to be a hardcoded // `MAX_ORIGINS: usize = 4`. 
It is now governed by the stable -// `analysis.engine.max_origins` option (default `32`) — see +// `analysis.engine.max_origins` option (default `32`), see // `crate::utils::analysis_options` and [`effective_max_origins`]. The // test-only override below still short-circuits the config read so // `engine_notes_tests.rs` can force a tiny cap to trigger truncation @@ -42,7 +42,7 @@ static WORKLIST_CAP_OVERRIDE: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); /// Records the MAX iteration count observed across every /// `run_ssa_taint_full` call since the most recent reset. Cheaper and -/// more useful for regression tests than the last-call value — a cap +/// more useful for regression tests than the last-call value, a cap /// hit anywhere in the scan is remembered. pub(super) static MAX_WORKLIST_ITERATIONS: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); @@ -90,7 +90,7 @@ pub fn reset_worklist_observability() { /// force `OriginsTruncated` emission on small fixtures. static MAX_ORIGINS_OVERRIDE: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); -/// Total number of origins dropped since the most recent reset — captured +/// Total number of origins dropped since the most recent reset, captured /// from `merge_origins` and the post-hoc saturation scan. Used by tests /// to detect truncation events that don't propagate to a finding (e.g. /// when the cap is so tight no taint flow survives to emit a sink event). @@ -136,7 +136,7 @@ pub fn reset_origins_observability() { thread_local! { /// Per-body engine-note collector. Cleared at the start of each /// `analyse_body_with_seed` invocation and drained after - /// `run_ssa_taint_full` returns — notes are then attached to every + /// `run_ssa_taint_full` returns, notes are then attached to every /// finding emitted from that body. 
Living as a thread-local avoids /// threading a `&RefCell` through the nearly-10-argument transfer /// struct; inline analysis recursion is intentionally allowed to @@ -148,7 +148,7 @@ thread_local! { /// was suppressed by an SSA-engine path-safety proof (PathFact /// `dotdot=No && absolute=No`). Populated by `is_path_safe_for_sink` /// and consumed by the state-analysis pass to suppress - /// `state-unauthed-access` on the same sink — when the taint engine + /// `state-unauthed-access` on the same sink, when the taint engine /// has already proved the user-controlled input cannot escape into a /// privileged location, the auth concern on that sink is reduced. /// Reset at start of `analyse_file`, drained before state analysis. @@ -156,7 +156,7 @@ thread_local! { RefCell::new(std::collections::HashSet::new()); /// File-level set of CFG sink spans where the SSA engine emitted an - /// `all_validated` event — every tainted input to the sink passed + /// `all_validated` event, every tainted input to the sink passed /// through a recognised validation/sanitisation predicate before /// reaching it. Distinct from `PATH_SAFE_SUPPRESSED_SPANS`, which /// is FILE_IO-scoped and feeds state analysis: this set is @@ -167,7 +167,7 @@ thread_local! { /// /// Without this signal the suppression gate has to fall back to /// "function emitted at least one taint-unsanitised-flow finding" - /// or "function contains a labelled Sanitizer node" — both of + /// or "function contains a labelled Sanitizer node", both of /// which miss validated/dominated/early-return safety where the /// engine cleared the flow without firing or hitting an explicit /// sanitiser. @@ -227,7 +227,7 @@ pub fn take_path_safe_suppressed_spans() -> std::collections::HashSet<(usize, us /// Record a sink CFG-node span where the SSA engine proved every /// tainted input was validated (`SsaTaintEvent::all_validated`). -/// Cap-agnostic — fires for any sink the engine evaluated and cleared. 
+/// Cap-agnostic, fires for any sink the engine evaluated and cleared. /// Consumed by `TaintSuppressionCtx::build` as positive evidence that /// taint analysis reached this line and proved safety, so AST-pattern /// findings on the same line can be suppressed without misclassifying @@ -263,7 +263,7 @@ pub fn take_all_validated_spans() -> std::collections::HashSet<(usize, usize)> { /// into the seed map always specify the owning body's id; readers look /// up by the scope they know they want (typically their own /// `parent_body_id`, with a fallback to `BodyId(0)` for entries that -/// the JS/TS two-level solve has re-keyed onto the top-level scope — +/// the JS/TS two-level solve has re-keyed onto the top-level scope , /// see [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]). #[derive(Debug, Clone, Hash, Eq, PartialEq)] pub struct BindingKey { @@ -284,7 +284,7 @@ impl BindingKey { /// Look up a binding in a seed map. /// /// Thin wrapper over [`HashMap::get`] retained for call-site readability -/// — every seed entry is now exactly scoped to a single `(name, +///, every seed entry is now exactly scoped to a single `(name, /// BodyId)`, so the lookup is O(1) with no fallback. Writers that want /// cross-scope reachability must explicitly re-key their entries (see /// [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]). @@ -299,7 +299,7 @@ pub fn seed_lookup<'a>( /// Compact key for a heap-field taint cell. /// -/// `(loc, field)` — `loc` is the abstract location of the *parent* +/// `(loc, field)`, `loc` is the abstract location of the *parent* /// (interned by the body's [`crate::pointer::LocInterner`]), `field` /// is the [`FieldId`] of the projected field. The pair survives lattice /// joins / leq comparisons by `Ord`-derived sort. @@ -309,16 +309,16 @@ pub struct FieldTaintKey { pub field: FieldId, } -/// Pointer-Phase 4 / W4: per-field-cell taint record. +/// per-field-cell taint record. 
/// /// Carries the union of writers' taint for the abstract field cell plus /// two validation channels: -/// * `validated_must` — set when *every* writer recorded a value that was +/// * `validated_must`, set when *every* writer recorded a value that was /// `validated_must` in its own SSA scope. Lattice join intersects -/// (`AND`) — matching the symbol-keyed [`SsaTaintState::validated_must`] +/// (`AND`), matching the symbol-keyed [`SsaTaintState::validated_must`] /// semantics for "validated on every path". -/// * `validated_may` — set when *any* writer recorded a `validated_may` -/// value. Lattice join unions (`OR`) — matching the symbol-keyed +/// * `validated_may`, set when *any* writer recorded a `validated_may` +/// value. Lattice join unions (`OR`), matching the symbol-keyed /// [`SsaTaintState::validated_may`] semantics for "validated on some /// path". /// @@ -332,7 +332,7 @@ pub struct FieldCell { } impl FieldCell { - /// Construct a cell with no validation bits — convenience for the + /// Construct a cell with no validation bits, convenience for the /// pre-W4 callers that don't propagate symbol-level validation. pub fn unvalidated(taint: VarTaint) -> Self { Self { @@ -365,17 +365,17 @@ pub struct SsaTaintState { /// interpretation is disabled (`analysis.engine.abstract_interpretation /// = false`). pub abstract_state: Option, - /// Pointer-Phase 3: per-heap-field taint cells, keyed by + /// per-heap-field taint cells, keyed by /// `(parent_loc, field)`. Sorted by [`FieldTaintKey`] for O(n) /// merge-join. Populated only when the body's /// [`crate::pointer::PointsToFacts`] is available /// (`NYX_POINTER_ANALYSIS=1`); empty otherwise so the lattice join /// is a strict no-op for pointer-disabled runs. Field reads /// (`SsaOp::FieldProj`) consult the cells; field writes record into - /// them. Cross-call propagation lands in Phase 5 via the + /// them. Cross-call propagation lands during lowering via the /// field-granularity `PointsToSummary`. 
/// - /// Cell shape (Phase 4 / W4): [`FieldCell`] carries `taint` plus + /// Cell shape: [`FieldCell`] carries `taint` plus /// `validated_must` / `validated_may` flags so validation flows /// through abstract field / element identity. pub field_taint: SmallVec<[(FieldTaintKey, FieldCell); 4]>, @@ -403,7 +403,7 @@ impl SsaTaintState { } } - /// Pointer-Phase 3: read the field cell at `key`. Returns `None` + /// Read the field cell at `key`. Returns `None` /// when no cell has been recorded (caller should treat as /// untainted). O(log n) on the sorted [`field_taint`] list. pub fn get_field(&self, key: FieldTaintKey) -> Option<&FieldCell> { @@ -413,13 +413,13 @@ impl SsaTaintState { .map(|idx| &self.field_taint[idx].1) } - /// Pointer-Phase 3 / W4: union `t` into the field cell at `key`, + /// Union `t` into the field cell at `key`, /// recording per-write `validated_must` / `validated_may` channels. /// /// Maintains sorted invariant. No-op when `t.caps` is empty (so the /// lattice bottom stays `[]`). When the cell already exists, the - /// validation channels merge with the lattice-join semantics — - /// `must` AND-intersects, `may` OR-unions — matching the symbol- + /// validation channels merge with the lattice-join semantics: + /// `must` AND-intersects, `may` OR-unions, matching the symbol- /// keyed [`SsaTaintState::validated_must`] / `validated_may` /// semantics so a write coming through a non-validated path tears /// down `must` while preserving `may` of any earlier validated path. @@ -563,15 +563,15 @@ impl Lattice for SsaTaintState { } } -/// Pointer-Phase 3 / W4: merge-join two sorted `field_taint` lists. +/// Merge-join two sorted `field_taint` lists.
/// Same shape as [`merge_join_ssa_vars`] but keyed on [`FieldTaintKey`]: -/// * `taint.caps` — OR-union -/// * `taint.origins` — merged with cap-respecting de-dup -/// * `taint.uses_summary` — OR-union -/// * `validated_must` — AND-intersect (matches the symbol-keyed +/// * `taint.caps`, OR-union +/// * `taint.origins`, merged with cap-respecting de-dup +/// * `taint.uses_summary`, OR-union +/// * `validated_must`, AND-intersect (matches the symbol-keyed /// `validated_must` lattice: a path that didn't validate this cell /// breaks the invariant) -/// * `validated_may` — OR-union (any path's validation contributes) +/// * `validated_may`, OR-union (any path's validation contributes) pub(super) fn merge_join_field_taint( a: &[(FieldTaintKey, FieldCell)], b: &[(FieldTaintKey, FieldCell)], @@ -581,7 +581,7 @@ pub(super) fn merge_join_field_taint( while i < a.len() && j < b.len() { match a[i].0.cmp(&b[j].0) { std::cmp::Ordering::Less => { - // Cell present only in `a` — counterpart in `b` is the + // Cell present only in `a`, counterpart in `b` is the // lattice bottom (no validation, no taint), so: // must = a.must AND false = false // may = a.may OR false = a.may @@ -637,11 +637,11 @@ pub(super) fn merge_join_field_taint( /// `a ≤ b` for sorted `field_taint` lists. Used by the convergence /// check in [`Lattice::leq`]. Per-cell criteria: /// -/// * `taint.caps` — `a ⊆ b` (sub-state on caps; matches per-SSA-value +/// * `taint.caps`, `a ⊆ b` (sub-state on caps; matches per-SSA-value /// `ssa_vars_leq`). -/// * `validated_must` — `a.must ⊇ b.must` (super-state on must; same +/// * `validated_must`, `a.must ⊇ b.must` (super-state on must; same /// shape as the symbol-keyed `validated_must` leq). -/// * `validated_may` — `a.may ⊆ b.may` (sub-state on may). +/// * `validated_may`, `a.may ⊆ b.may` (sub-state on may). /// /// When `b` lacks a key present in `a`, `b`'s side is the lattice /// bottom: no caps, no validation.
`a`'s caps must also be empty @@ -669,12 +669,12 @@ pub(super) fn field_taint_leq( if (ca.taint.caps - cb.taint.caps).bits() != 0 { return false; } - // Must: a ⊇ b — every must-validated key in b is must-validated + // Must: a ⊇ b, every must-validated key in b is must-validated // in a. Equivalently: !cb.must OR ca.must. if cb.validated_must && !ca.validated_must { return false; } - // May: a ⊆ b — every may-validated key in a is may-validated + // May: a ⊆ b, every may-validated key in a is may-validated // in b. Equivalently: !ca.may OR cb.may. if ca.validated_may && !cb.validated_may { return false; @@ -735,7 +735,7 @@ pub(super) fn merge_join_ssa_vars( /// /// Ordering is lexicographic over /// `(source_span_start, source_span_end, source_kind_tag, node_index)`. -/// `source_span` is the most stable component across bodies — cross-body +/// `source_span` is the most stable component across bodies, cross-body /// remapped origins carry the original byte span explicitly; intra-body /// origins default to `(0, 0)` and fall through to the secondary keys. /// @@ -760,7 +760,7 @@ fn origin_sort_key(o: &TaintOrigin) -> (usize, usize, u8, usize) { /// Bounded, deterministic insertion of an origin into a sorted origin /// set. Returns `true` when `new` was admitted (or de-duplicated against /// an existing entry), `false` when the cap forced a drop. On drop, -/// the origin with the *largest* sort key is evicted first — the caller +/// the origin with the *largest* sort key is evicted first, the caller /// sees a survivor set that depends only on the input multiset and /// [`effective_max_origins`], not on insertion order. /// @@ -774,7 +774,7 @@ pub(crate) fn push_origin_bounded( ) -> bool { // Identity check: same node counts as the same origin. 
We keep // node-only dedup to match [`ssa_vars_leq`], which compares origin - // sets by node membership — widening dedup here without tightening + // sets by node membership, widening dedup here without tightening // there would break the monotonicity invariant. if target.iter().any(|o| o.node == new.node) { return true; @@ -814,7 +814,7 @@ pub(crate) fn push_origin_bounded( target.insert(pos, new); true } else { - // `new` itself is the worst — drop it instead of the survivor. + // `new` itself is the worst, drop it instead of the survivor. false } } @@ -829,7 +829,7 @@ pub(super) fn merge_origins( a: &SmallVec<[TaintOrigin; 2]>, b: &SmallVec<[TaintOrigin; 2]>, ) -> SmallVec<[TaintOrigin; 2]> { - // Seed the result with `a` — but re-sort defensively in case the + // Seed the result with `a`, but re-sort defensively in case the // caller constructed `a` through non-bounded paths. Historically // every write goes through `push_origin_bounded` (or `merge_origins` // itself), so this resort is a no-op on the steady state but costs @@ -911,7 +911,7 @@ pub(super) fn merge_join_ssa_predicates( mod origin_cap_tests { //! Tests for the deterministic, config-driven origin cap. These //! cover the behavior at the `push_origin_bounded` / `merge_origins` - //! boundary — the end-to-end engine-note signal is exercised in + //! boundary, the end-to-end engine-note signal is exercised in //! `tests/engine_notes_tests.rs`. use super::*; @@ -1037,7 +1037,7 @@ mod origin_cap_tests { fn effective_cap_reads_runtime_config_when_override_zero() { // Override takes priority; override=0 falls through to config. // `current()` returns the default (32) when no runtime is - // installed — which is the state the rest of the test suite runs + // installed, which is the state the rest of the test suite runs // under. Guard that the fallback path reaches 32. 
let _g = TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner()); set_max_origins_override(0); @@ -1053,7 +1053,7 @@ mod origin_cap_tests { #[cfg(test)] mod field_taint_tests { - //! Pointer-Phase 3: tests for the heap-field taint cells on + //! Tests for the heap-field taint cells on //! [`SsaTaintState`]. Cover get/add round-trip, lattice join //! (cap union + origin merge), and `leq` convergence semantics. use super::*; @@ -1202,7 +1202,7 @@ mod field_taint_tests { assert!(cell.validated_must, "a.must AND b.must = true"); assert!(cell.validated_may); - // Now make `b`'s validated_must false — must should drop to + // Now make `b`'s validated_must false, must should drop to // false on the join, may stays at OR. let mut c = SsaTaintState::initial(); c.add_field(k, taint(Cap::ENV_VAR), false, true); @@ -1213,7 +1213,7 @@ mod field_taint_tests { } /// W4 audit: `merge_join_field_taint` OR-unions `validated_may` - /// — any path's may-validation contributes to the joined cell. + /// so any path's may-validation contributes to the joined cell. #[test] fn lattice_validated_may_unions_on_join() { let k = key(1, 7); @@ -1275,7 +1275,7 @@ mod field_taint_tests { a.leq(&b), "must super-state and equal caps: a ≤ b should hold" ); - // Reverse: b.must=false, a.must=true — for b ≤ a, we need + // Reverse: b.must=false, a.must=true, for b ≤ a, we need // b.must ⊇ a.must which is false ⊇ true = false. So b ≤ a // must fail. assert!(!b.leq(&a), "b lacks the must invariant a holds"); @@ -1289,7 +1289,7 @@ mod field_taint_tests { assert!(!a2.leq(&b2), "a.may=true is NOT ⊆ b.may=false"); } - /// Pointer-Phase 3 / A8 audit: the field_taint lattice is monotone + /// The field_taint lattice is monotone /// and converges under a deterministic enumeration of inputs. /// Caps grow (OR), `uses_summary` grows (OR), origins grow modulo /// the cap (merge_origins is bounded).
Joins must: @@ -1409,7 +1409,7 @@ mod field_taint_tests { /// `field_taint_leq` is the soundness gate for worklist /// convergence: once `next ≤ acc`, the worklist halts. Pin that - /// `leq` is consistent with `join` — i.e. `s.leq(s.join(t))` holds + /// `leq` is consistent with `join`, i.e. `s.leq(s.join(t))` holds /// for any `s, t`. Without this, the worklist could loop /// indefinitely on inputs whose join produces a state not /// dominated by both inputs. diff --git a/src/taint/ssa_transfer/summary_extract.rs b/src/taint/ssa_transfer/summary_extract.rs index ecc5611a..17e92c3b 100644 --- a/src/taint/ssa_transfer/summary_extract.rs +++ b/src/taint/ssa_transfer/summary_extract.rs @@ -1,11 +1,11 @@ //! SSA function-summary and container-flow extraction. //! //! Extracted from the monolithic `ssa_transfer.rs`. Contains: -//! * [`extract_ssa_func_summary`] — runs per-parameter taint probes and +//! * [`extract_ssa_func_summary`], runs per-parameter taint probes and //! synthesises an [`crate::summary::ssa_summary::SsaFuncSummary`] with //! source caps, return transforms, per-path transforms, and sink site //! attribution. -//! * [`extract_container_flow_summary`] — structural scan for +//! * [`extract_container_flow_summary`], structural scan for //! `param_container_to_return` + `param_to_container_store` pairs. //! * Private helpers for predicate-hash summarisation, abstract-transfer //! derivation, callback source detection, and return-type inference. @@ -123,15 +123,15 @@ pub fn extract_ssa_func_summary_full( .collect(); // Collect all param SSA values to exclude from return cap collection. - // Param values persist with their seeded taint throughout the function — + // Param values persist with their seeded taint throughout the function , // we only want caps on derived values (call results, assigns) at return. 
let all_param_values: std::collections::HashSet = param_info.iter().map(|(_, _, v)| *v).collect(); // Per-return-block observation captured alongside the aggregate return - // caps. Each entry records one return block's exit state — caps + // caps. Each entry records one return block's exit state, caps // contributed on that path, path-predicate hash, known_true/false bits, - // and the return SSA value's abstract fact — so the per-param loop can + // and the return SSA value's abstract fact, so the per-param loop can // emit one [`ReturnPathTransform`] per distinct predicate gate. struct ReturnBlockObs { /// Caps at the return SSA value (or joined live values for @@ -141,7 +141,7 @@ pub fn extract_ssa_func_summary_full( /// (passthrough fallback). param_caps: Cap, /// Deterministic hash of the predicate gate at this return. - /// `0` means "no predicate gate" — an unguarded return. + /// `0` means "no predicate gate", an unguarded return. predicate_hash: u64, /// `PredicateSummary::known_true` bits intersected across all /// tracked variables at this return. Encoded via @@ -268,7 +268,7 @@ pub fn extract_ssa_func_summary_full( } } } else { - // Return(None): implicit return — fall back to all live values. + // Return(None): implicit return, fall back to all live values. for (val, taint) in &exit.values { if all_param_values.contains(val) { block_param_caps |= taint.caps; @@ -348,7 +348,7 @@ pub fn extract_ssa_func_summary_full( // Per-return-path PathFact decomposition derived from the baseline // probe (no seeded taint). Abstract facts on the return rv are - // independent of taint seeding — they describe the function's + // independent of taint seeding, they describe the function's // intrinsic narrowing, so the baseline run captures them without // per-param noise. 
// @@ -388,7 +388,7 @@ pub fn extract_ssa_func_summary_full( let mut param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)> = Vec::new(); let mut param_to_sink_param = Vec::new(); // Per-param return-path decomposition. Populated only when the param - // has ≥2 distinct return-block predicate hashes — a single-return-path + // has ≥2 distinct return-block predicate hashes, a single-return-path // callee is already precise via `param_to_return`. let mut param_return_paths: Vec<( usize, @@ -417,7 +417,7 @@ pub fn extract_ssa_func_summary_full( // expressions (e.g. `file._source.uri`) as their own // [`SsaOp::Param`] ops with composite `var_name`s like // `"file._source.uri"`. These phantom Params are the values - // actually used as call arguments — not the formal-param SSA + // actually used as call arguments, not the formal-param SSA // value the seed targets. Without this, the per-param probe // misses cross-call sinks because the call's arg SSA value is // a phantom Param with no seed entry, so `transfer_inst::Param` @@ -447,7 +447,7 @@ pub fn extract_ssa_func_summary_full( let (return_caps, events, _, per_return_obs) = run_probe(seed); - // Subtract baseline source_caps — we only want param-contributed caps + // Subtract baseline source_caps, we only want param-contributed caps let param_return_caps = return_caps & !source_caps; if !param_return_caps.is_empty() { @@ -464,7 +464,7 @@ pub fn extract_ssa_func_summary_full( // observed return block, derive a `ReturnPathTransform` mirroring // the aggregate logic (prefer derived caps, fall back to param // caps, strip baseline source caps). Only emit when ≥2 distinct - // predicate hashes are present — a single-hash summary adds no + // predicate hashes are present, a single-hash summary adds no // signal over the aggregate `param_to_return`. 
if per_return_obs.len() >= 2 { let mut per_path: SmallVec<[crate::summary::ssa_summary::ReturnPathTransform; 2]> = @@ -477,7 +477,7 @@ pub fn extract_ssa_func_summary_full( }; let block_contributed = block_return_caps & !source_caps; let transform_kind = if block_contributed.is_empty() { - // No caps on this path — param does not reach return + // No caps on this path, param does not reach return // under this predicate. A `StripBits(all)` records // "all bits cleared" so downstream join preserves the // disparity with other paths. @@ -513,9 +513,31 @@ pub fn extract_ssa_func_summary_full( } } - // Collect sink caps + primary-location sites from events + per-arg-position detail + // Collect sink caps + primary-location sites from events + per-arg-position detail. + // + // Skip events flagged `all_validated`: every tainted SSA value + // that reached the sink was already proved validated by a + // dominating predicate (AllowlistCheck / TypeCheck / + // ValidationCall, including the indirect-validator branch + // narrowing for `validate*` / `is_valid*` callees). Those + // events would have been dropped by `ssa_events_to_findings` at + // the per-file finding step; carrying them into + // `param_to_sink` / `param_to_sink_param` re-publishes a sink + // attribution callers can no longer suppress, because the + // caller can't see the validator that lives inside the + // callee body. + // + // Strict-additive: `all_validated` is set only when every + // tainted operand at the sink has its `var_name` in + // `state.validated_may`, single-path single-validator helpers + // cleanly skip; mixed-tainted-with-some-unvalidated events + // still propagate. Closes the helper-summary precision gap + // surfaced by Novu CVE GHSA-4x48-cgf9-q33f. 
let mut param_sites: SmallVec<[SinkSite; 1]> = SmallVec::new(); for event in &events { + if event.all_validated { + continue; + } for pos in extract_sink_arg_positions(event, ssa) { param_to_sink_param.push((idx, pos, event.sink_caps)); } @@ -601,14 +623,14 @@ pub fn extract_ssa_func_summary_full( // Per-parameter abstract-domain transfers. // - // Derived structurally from the SSA body — no additional taint probes. + // Derived structurally from the SSA body, no additional taint probes. // Three-step inference per parameter: // 1. Identity: return SSA value at every return block traces back to // this parameter (possibly through assigns / phi merges all feeding // from the same param). // 2. Callee-intrinsic bound: baseline `return_abstract` carries a // concrete fact (bounded interval or known prefix) that holds - // regardless of caller input — record it once per parameter as + // regardless of caller input, record it once per parameter as // `Clamped` / `LiteralPrefix` so the caller sees the bound even // when it has no abstract info on its own argument. // 3. Top: default; the entry is omitted (empty transfer is meaningless). @@ -630,14 +652,14 @@ pub fn extract_ssa_func_summary_full( param_return_paths, return_path_facts, points_to, - // Pointer-Phase 5 extension — empty until the field-granularity + // Pointer-analysis extension, empty until the field-granularity // extractor is wired (`NYX_POINTER_ANALYSIS=1` only). Default // path stays bit-identical to today. field_points_to: crate::summary::points_to::FieldPointsToSummary::empty(), // Populated post-extraction in // `taint::lower_all_functions_from_bodies` once SSA optimisation // has computed `opt.type_facts`. Empty here means the - // extractor itself doesn't carry receiver-type info — the + // extractor itself doesn't carry receiver-type info, the // caller patches it in.
typed_call_receivers: Vec::new(), } @@ -699,14 +721,14 @@ pub(super) fn summarise_return_predicates(state: &SsaTaintState) -> (u64, u8, u8 /// /// `return_abstract` is the callee's intrinsic baseline (from the no-seed /// probe). When present, it describes a fact that holds for the return -/// regardless of parameter input — so it can be attached as a +/// regardless of parameter input, so it can be attached as a /// `Clamped` / `LiteralPrefix` transform to every parameter that flows to /// the return. /// /// Identity detection is structural: walk the return values back through /// [`SsaOp::Assign`] / [`SsaOp::Phi`] chains (bounded) and check whether /// every leaf resolves to the same [`SsaOp::Param`]. The trace is cheap -/// and can only produce `Identity` for passthrough callees — anything +/// and can only produce `Identity` for passthrough callees, anything /// more complex degrades to the baseline fact or `Top`. fn derive_abstract_transfer( ssa: &SsaBody, @@ -780,7 +802,7 @@ fn derive_abstract_transfer( } // Derive a baseline-invariant transform from `return_abstract`. This is - // the "callee intrinsic" fact that always holds — each parameter that + // the "callee intrinsic" fact that always holds, each parameter that // flows to the return gets it attached as the conservative transfer. let baseline_invariant: Option = return_abstract.map(|av| { let interval = match (av.interval.lo, av.interval.hi) { @@ -805,7 +827,7 @@ fn derive_abstract_transfer( } else if let Some(base) = baseline_invariant.as_ref() { // Baseline intrinsic bound applies to every parameter that could // reach the return. We conservatively attach it to all params - // — at apply time the caller meets it with the real return + // here; at apply time the caller meets it with the real return // abstract (also from this same summary), so double-counting // would collapse to the tighter of the two.
transfer = base.clone(); @@ -879,7 +901,7 @@ fn infer_summary_return_type( lang: Lang, ) -> Option { // Find blocks with Return terminators, then look at the last defined value - // in those blocks — if it's a Call with a known constructor, that's our type. + // in those blocks, if it's a Call with a known constructor, that's our type. for block in &ssa.blocks { if !matches!(block.terminator, Terminator::Return(_)) { continue; @@ -965,7 +987,7 @@ pub(crate) fn extract_container_flow_summary( // `trace_to_param` will happily return any `SsaOp::Param { index }`, but // scoped lowering synthesises `Param` ops for external captures (module // imports, free identifiers) at indices beyond the formal parameter count. - // Those must not enter the summary — the key's arity only covers formal + // Those must not enter the summary, the key's arity only covers formal // params, and an out-of-range index trips `ssa_summary_fits_arity`, forcing // the reconciliation probe to generate a synthetic disambiguator that no // caller will ever look up. @@ -1035,7 +1057,7 @@ pub(crate) fn extract_container_flow_summary( }; // Trace container to positional param (SelfParam → None, so - // when the container is the receiver we skip — the caller + // when the container is the receiver we skip, the caller // tracks that via `receiver_to_container_store` if needed). // Same arity filter as above: reject synthetic Param ops that // were injected for free captures. diff --git a/src/taint/ssa_transfer/tests.rs b/src/taint/ssa_transfer/tests.rs index 70e81f61..cd32ace2 100644 --- a/src/taint/ssa_transfer/tests.rs +++ b/src/taint/ssa_transfer/tests.rs @@ -221,7 +221,7 @@ mod cross_file_tests { mod inline_cache_epoch_tests { //! Hooks for cross-file SCC joint fixed-point iteration. //! - //! These do not exercise the full inline pipeline — they lock down the + //! These do not exercise the full inline pipeline, they lock down the //! semantic contract of [`inline_cache_clear_epoch`] and //! 
[`inline_cache_fingerprint`] so the SCC orchestrator can rely on: //! @@ -229,7 +229,7 @@ mod inline_cache_epoch_tests { //! * `fingerprint` is deterministic across equivalent caches (same //! keys → same bytes). Two caches with identical entries produce //! identical fingerprints regardless of insertion order. - //! * `fingerprint` changes when return caps change — the signal the + //! * `fingerprint` changes when return caps change, the signal the //! orchestrator will use to detect inline-cache convergence. use super::super::*; @@ -675,7 +675,7 @@ mod worklist_tests { #[test] fn dense_successors_no_duplicates() { - // Many successors, some repeated — old O(n) contains() would be slow here + // Many successors, some repeated, old O(n) contains() would be slow here let mut wl = VecDeque::new(); let mut in_wl = HashSet::new(); @@ -735,8 +735,8 @@ mod primary_sink_location_tests { //! [`SsaTaintEvent::primary_sink_site`] → //! [`crate::taint::Finding::primary_location`]. //! - //! The test is deliberately low-level — it wires up synthetic SSA and - //! drives the three emission stages directly — so any future refactor + //! The test is deliberately low-level, it wires up synthetic SSA and + //! drives the three emission stages directly, so any future refactor //! that drops the site on the floor between stages fails here rather //! than only at the corpus/benchmark layer. use super::super::*; @@ -841,7 +841,7 @@ mod primary_sink_location_tests { /// If this fails, something on the summary→event→finding path /// (`pick_primary_sink_sites`, `emit_ssa_taint_events`, or /// `ssa_events_to_findings`) has silently stopped forwarding - /// coordinates. Fixing that path — not this test — is the right + /// coordinates. Fixing that path, not this test, is the right /// response. 
#[test] fn ssa_summary_sinksite_surfaces_as_finding_primary_location() { @@ -863,7 +863,7 @@ mod primary_sink_location_tests { }; // Drive the three emission stages with the summary's own - // `param_to_sink` — that is what summary resolution feeds in the + // `param_to_sink`, that is what summary resolution feeds in the // real pipeline. let tainted: Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> = vec![( SsaValue(0), @@ -944,7 +944,7 @@ mod goto_succ_propagation_tests { #[test] fn goto_propagates_to_every_succ_on_three_way_collapse() { - // Build a block with Terminator::Goto(1) but succs = [1, 2, 3] — the + // Build a block with Terminator::Goto(1) but succs = [1, 2, 3], the // shape lowering emits for a 3-way fanout. let block = SsaBlock { id: BlockId(0), @@ -1001,7 +1001,7 @@ mod goto_succ_propagation_tests { pointer_facts: None, }; - // A non-bottom exit state — the test only cares that *every* succ + // A non-bottom exit state, the test only cares that *every* succ // receives a clone of it, so any distinguishable state works. let mut exit_state = SsaTaintState::initial(); exit_state.values.push(( @@ -1259,7 +1259,7 @@ mod goto_succ_propagation_tests { fn is_path_safe_for_sink_unknown_axis_returns_false() { use crate::abstract_interp::PathFact; - // Only dotdot is cleared — absolute stays Maybe → not path-safe. + // Only dotdot is cleared, absolute stays Maybe → not path-safe. 
let half_fact = PathFact::default().with_dotdot_cleared(); assert!(!half_fact.is_path_safe()); } @@ -1328,9 +1328,9 @@ mod goto_succ_propagation_tests { } } -// ── Phase 4.2: receiver_candidates_for_type_lookup walks FieldProj ────── +// ── receiver_candidates_for_type_lookup walks FieldProj ────── // -// After Phase 2 SSA decomposition, `c.client.send(req)` lowers to +// After SSA decomposition, `c.client.send(req)` lowers to // v_c = Param("c", 0) // v_client = FieldProj(v_c, "client") // v_call = Call("send", receiver: v_client, args: [v_req]) @@ -1430,7 +1430,7 @@ mod receiver_candidates_field_proj_tests { fn field_proj_receiver_walks_to_typed_root_in_go() { // Go is not Rust, so pre-Phase-4 the candidate walk would have // returned ONLY the immediate receiver (v2 = FieldProj). With - // Phase 4 we walk through FieldProj.receiver to recover v0 (the + // the walk through FieldProj.receiver we recover v0 (the // typed root `c`). let body = body_with_field_proj_chain(); let cands = @@ -1516,7 +1516,7 @@ mod receiver_candidates_field_proj_tests { } } -// ── Phase 6 hierarchy fan-out: ResolvedSummary union semantics ────────── +// ── Hierarchy fan-out: ResolvedSummary union semantics ────────── // // `merge_resolved_summaries_fanout` is invoked at virtual-dispatch call // sites where the receiver's static type has multiple concrete @@ -1553,7 +1553,7 @@ mod fanout_merge_tests { } } - /// B1 — caps that grow taint signal (source/sink/receiver_to_sink) + /// B1, caps that grow taint signal (source/sink/receiver_to_sink) /// are unioned. sanitizer_caps are intersected so only bits /// stripped by EVERY implementer count as cleared at the call site. #[test] @@ -1581,7 +1581,7 @@ mod fanout_merge_tests { ); } - /// B2 — propagates_taint is OR'd; propagating_params is the union + /// B2, propagates_taint is OR'd; propagating_params is the union /// (any implementer's propagator counts).
#[test] fn merge_propagation_unions() { @@ -1600,7 +1600,7 @@ mod fanout_merge_tests { assert_eq!(params, vec![0, 1, 2]); } - /// B3 — param_to_sink merges per-parameter caps (OR). An impl + /// B3, param_to_sink merges per-parameter caps (OR). An impl /// that adds a sink at param N composes with another impl that /// adds a different cap at the same N. #[test] @@ -1630,7 +1630,7 @@ mod fanout_merge_tests { ); } - /// B4 — param_to_sink_sites merges per-parameter site lists with + /// B4, param_to_sink_sites merges per-parameter site lists with /// PartialEq dedup. The same site appearing in both impls (e.g. /// inherited definition) must not be reported twice. #[test] @@ -1675,7 +1675,7 @@ mod fanout_merge_tests { assert!(sites.iter().any(|s| s == &unique_b)); } - /// B5 — SSA-precision fields are dropped on disagreement. Two + /// B5, SSA-precision fields are dropped on disagreement. Two /// summaries with different `return_type` collapse to None; /// agreement is preserved. #[test] @@ -1704,7 +1704,7 @@ mod fanout_merge_tests { ); } - /// B6 — abstract_transfer + param_return_paths drop on + /// B6, abstract_transfer + param_return_paths drop on /// disagreement (precise predicate-path data is not safely /// composable across distinct function bodies). #[test] @@ -1737,7 +1737,7 @@ mod fanout_merge_tests { ); } - /// B7 — empty + empty = empty (no panic on degenerate inputs). + /// B7, empty + empty = empty (no panic on degenerate inputs). 
#[test] fn merge_empties_is_identity() { let m = merge_resolved_summaries_fanout(empty(), empty()); @@ -1748,7 +1748,7 @@ mod fanout_merge_tests { } } -// ── Pointer-Phase 3 / W1: synthetic field-WRITE round-trip ────────────── +// ── Synthetic field-WRITE round-trip ────────────── // // SSA lowering populates `SsaBody.field_writes` with entries that lift a // synthetic base-update Assign (`obj.f = rhs`) into a structural field @@ -1918,8 +1918,8 @@ mod field_write_tests { crate::pointer::analyse_body(body, crate::cfg::BodyId(7)) } - /// Reuse `make_cfg`'s nodes — the body's instructions all reference - /// them — so `transfer_inst` can index `cfg[cfg_node]`. + /// Reuse `make_cfg`'s nodes (the body's instructions all reference + /// them) so `transfer_inst` can index `cfg[cfg_node]`. fn drive(body: &SsaBody, pf: &PointsToFacts) -> SsaTaintState { // We need a CFG that contains the bodies' cfg_nodes. let (cfg, _, _, _, _) = make_cfg(); @@ -1998,7 +1998,7 @@ mod field_write_tests { /// Pointer-disabled run (`pointer_facts: None`): no field cell is /// recorded, no taint flows through the `obj.cache` projection. The - /// strict-additive contract — pointer-disabled behaviour is the + /// strict-additive contract: pointer-disabled behaviour is the /// pre-W1 baseline. #[test] fn pointer_disabled_run_produces_no_field_taint() { @@ -2047,8 +2047,8 @@ mod field_write_tests { state.field_taint.is_empty(), "pointer-disabled run must not populate field_taint", ); - // FieldProj reads still produce the receiver's existing taint — - // none — so no entry for SsaValue(3) either. + // FieldProj reads still produce the receiver's existing taint + // (none), so no entry for SsaValue(3) either. assert!(state.get(SsaValue(3)).is_none()); let _ = cache_id; } @@ -2059,7 +2059,7 @@ mod field_write_tests { /// projected value's symbol-level `validated_must` from the cell.
/// /// This is the key invariant: validation flows *through* abstract - /// field identity — the read recovers what the write recorded. + /// field identity, the read recovers what the write recorded. #[test] fn write_then_read_preserves_validated_must() { let (body, cache_id) = make_body(); @@ -2208,7 +2208,7 @@ mod field_write_tests { }, }; let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0)); - // v0 is Const → empty pt — the hook should not insert anything. + // v0 is Const → empty pt, the hook should not insert anything. assert!( pf.pt(SsaValue(0)).is_empty(), "Const value should have empty pt set", @@ -2259,7 +2259,7 @@ mod field_write_tests { } } -// ── Pointer-Phase 4 / W2: container ELEM write/read round-trip ────────── +// ── Container ELEM write/read round-trip ────────── // // Container methods like `arr.push(v)` / `arr.shift()` flow per-element // taint through the `Field(_, ELEM)` cells on `SsaTaintState`. These @@ -2351,7 +2351,7 @@ mod container_elem_tests { state } - /// `arr.push(source()); arr.shift()` — the read picks the source's + /// `arr.push(source()); arr.shift()`: the read picks the source's /// caps up via the ELEM cell. #[test] fn container_write_then_read_round_trips_taint() { @@ -2456,7 +2456,7 @@ mod container_elem_tests { ); // Drive the transfer. `e := arr.shift()` goes through the - // existing Call arm — the W2 path is the *write* on `push`. + // existing Call arm, the W2 path is the *write* on `push`. // The element-read side already exists on `analyse_body`; the // taint engine doesn't yet read field cells through call-result // paths (Call args are walked by Call's own argument-taint @@ -2482,7 +2482,7 @@ mod container_elem_tests { } } - /// W4: `arr.push(validate(src)); arr.shift()` — the push records + /// W4: `arr.push(validate(src)); arr.shift()`, the push records /// `validated_must = true` on the ELEM cell because the pushed /// value's symbol carried `validated_must`.
The shift call result /// reads through the cell and seeds the result symbol's @@ -2761,7 +2761,7 @@ mod container_elem_tests { } } -// ── Pointer-Phase 5 / W3: cross-call field-points-to application ──────── +// ── Cross-call field-points-to application ──────── // // `apply_field_points_to_writes` is the resolver-side hook that turns // callee-summary `field_points_to.param_field_writes` into caller-side @@ -2783,7 +2783,7 @@ mod cross_call_field_tests { use smallvec::smallvec; use std::collections::HashMap; - /// W3 / W4: shared empty interner — these unit tests don't seed + /// W3 / W4: shared empty interner, these unit tests don't seed /// validation bits, so a fresh interner is sufficient for the /// `interner` parameter on `apply_field_points_to_writes`. fn empty_interner() -> SymbolInterner { @@ -2861,23 +2861,23 @@ mod cross_call_field_tests { state } - /// Callee summary with `param_field_writes[(0, ["cache"])]` — + /// Callee summary with `param_field_writes[(0, ["cache"])]`: /// "callee writes cache field on parameter 0 (obj)". - /// Caller passes `(obj, source)` to this callee — `arg 0 = obj`, + /// Caller passes `(obj, source)` to this callee, `arg 0 = obj`, /// but the W3 hook resolves the *value at arg position 0* as the /// receiver of the field write, populating its pt's cells. /// /// We model the caller as `callee(obj, source)` with arg 0 = obj /// (the receiver) and arg 1 = source (the value being written). /// The callee's signature is `fn store(obj, value) { obj.cache = value; }` - /// — so the field write on param 0 is keyed by `pt(obj)` and the + /// so the field write on param 0 is keyed by `pt(obj)` and the /// taint comes from arg 1's caps. Our helper conservatively unions - /// every arg's taint into the cell — which over-tints (for this + /// every arg's taint into the cell, which over-tints (for this /// shape, arg 0's pt member becomes the loc, with arg 0's own taint /// applied), but is sound.
/// /// To make the test precise, we model the simpler shape `fn store(obj) - /// { obj.cache = source(); }` — callee writes a literal source into + /// { obj.cache = source(); }`, callee writes a literal source into /// `obj.cache`, with no value parameter. Then the caller-side hook /// only sees param 0's taint (zero), so the cell is empty and the /// test fails. @@ -2886,7 +2886,7 @@ mod cross_call_field_tests { /// at the call site arg 0 carries source taint. The hook then /// records (pt(arg0_value), cache) ← arg0_value's taint. In a /// real callee this corresponds to "callee writes its parameter - /// value into a self.cache field internally" — but the spread we + /// value into a self.cache field internally", but the spread we /// validate is just substitute-and-mirror. #[test] fn cross_call_writes_into_param_field_cell() { @@ -2947,7 +2947,7 @@ mod cross_call_field_tests { fn cross_call_receiver_field_uses_max_sentinel() { let (body, cache_id, pf) = caller_body(); let mut state = SsaTaintState::initial(); - // Seed receiver with taint — SsaValue(0) is the param/receiver. + // Seed receiver with taint, SsaValue(0) is the param/receiver. state.set( SsaValue(0), VarTaint { @@ -3026,7 +3026,7 @@ mod cross_call_field_tests { ); } - /// Field names the caller never interned are skipped silently — + /// Field names the caller never interned are skipped silently; /// no FieldProj read in the caller could observe such a cell. #[test] fn cross_call_unknown_field_name_skipped() { @@ -3062,7 +3062,7 @@ mod cross_call_field_tests { ); } - /// Overflow summary is treated conservatively as no-op — the + /// Overflow summary is treated conservatively as no-op, the /// engine cannot soundly cell-flood, so it skips entirely.
#[test] fn cross_call_overflow_summary_is_noop() { @@ -3117,7 +3117,7 @@ mod cross_call_field_tests { // // `SsaTaintState.add_field` already routes through `merge_origins`, but // the FieldProj READ path used to walk the cell's origins inline, -// deduping by node only — meaning a cell with N>cap origins surfaced +// deduping by node only, meaning a cell with N>cap origins surfaced // all N to the projected SSA value. After A7, the read path uses // `push_origin_bounded`, ensuring the cap-driven survivor selection // applies on read too. @@ -3225,7 +3225,7 @@ mod field_taint_origin_cap_tests { let (body, cache_id, cfg, _n_proj) = build_body(); let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0)); - // Pre-populate the (Param, cache) cell with 4 origins — + // Pre-populate the (Param, cache) cell with 4 origins, // 2× the cap. The `add_field` path already truncates via // `merge_origins`, so we go through it 4 times to grow. let mut state = SsaTaintState::initial(); @@ -3326,14 +3326,14 @@ mod field_taint_origin_cap_tests { // the field_taint cells. // // Two scenarios: -// 1. `must_validated_flows_through_join` — both predecessor blocks +// 1. `must_validated_flows_through_join`: both predecessor blocks // write the cell with `validated_must = true`. After the join, the // cell at the read site retains `validated_must = true` (AND // intersection of two `true`s). -// 2. `early_exit_branch_drops_validated_must` — only one predecessor +// 2. `early_exit_branch_drops_validated_must`: only one predecessor // writes; the other reaches the read block via an empty branch. // After the join, the cell has `validated_must = false`, -// `validated_may = true` — W4's must/may intersection in action. +// `validated_may = true` (W4's must/may intersection in action).
#[cfg(test)] mod pointer_lattice_worklist_tests { use super::super::*; @@ -3425,7 +3425,7 @@ mod pointer_lattice_worklist_tests { succs: smallvec![BlockId(1), BlockId(2)], }; - // Block 1: synth `obj.cache = src` — field_writes[v2] = (v0, cache_id) + // Block 1: synth `obj.cache = src`, field_writes[v2] = (v0, cache_id) let block1 = SsaBlock { id: BlockId(1), phis: vec![], @@ -3441,7 +3441,7 @@ mod pointer_lattice_worklist_tests { succs: smallvec![BlockId(3)], }; - // Block 2: identical synth write — keeps both branches + // Block 2: identical synth write, keeps both branches // contributing the same cell so AND-intersection of must // preserves true on the join. let block2 = SsaBlock { @@ -3459,7 +3459,7 @@ mod pointer_lattice_worklist_tests { succs: smallvec![BlockId(3)], }; - // Block 3: read — FieldProj uses obj from a phi between B1 and B2. + // Block 3: read, FieldProj uses obj from a phi between B1 and B2. let block3 = SsaBlock { id: BlockId(3), phis: vec![SsaInst { @@ -3634,7 +3634,7 @@ mod pointer_lattice_worklist_tests { ); } - /// A2.b: early-exit branch — only B1 writes, B2 reaches B3 via + /// A2.b: early-exit branch, only B1 writes, B2 reaches B3 via /// an empty body. 
After the join, the cell exists (B1 wrote /// it), but `validated_must` is `false` (B2 didn't write, the /// orphan-side merge clears `must` per the W4 lattice rule); @@ -3642,7 +3642,7 @@ mod pointer_lattice_worklist_tests { /// /// To exercise the validation channels we synthesise the cell /// directly at the appropriate exit state, then run the - /// worklist's join via two `SsaTaintState::join()` calls — the + /// worklist's join via two `SsaTaintState::join()` calls, the /// body's worklist itself doesn't seed `validated_must` on the /// rhs of an Assign, so we model the "writer recorded must=true" /// scenario at the lattice level rather than driving it through diff --git a/src/taint/tests.rs b/src/taint/tests.rs index a3359435..f99ecb40 100644 --- a/src/taint/tests.rs +++ b/src/taint/tests.rs @@ -698,7 +698,7 @@ fn cross_file_sink_finding_carries_primary_location() { ); let finding = &findings[0]; // Note: `uses_summary == false` here because the source (env::var) is - // local — only the *sink* was summary-resolved. That's the case the + // local, only the *sink* was summary-resolved. That's the case the // `primary_location` / `uses_summary` independence comment on // [`super::Finding::primary_location`] documents. let loc = finding @@ -925,7 +925,7 @@ fn multi_file_sink_in_another_file() { } "#; - // File B: env::var → exec_cmd() — sink is cross-file. + // File B: env::var → exec_cmd(), sink is cross-file. let caller_src = br#" use std::env; fn main() { @@ -956,7 +956,7 @@ fn multi_file_sink_in_another_file() { fn multi_file_passthrough_preserves_taint() { use crate::summary::FuncSummary; - // identity() just returns its argument — it propagates taint but has no + // identity() just returns its argument, it propagates taint but has no // source/sanitizer/sink caps of its own. 
let mut global = GlobalSummaries::new(); let key = FuncKey { @@ -1071,7 +1071,7 @@ fn multi_file_chain_source_sanitize_sink_across_files() { fn sanitizer_strips_only_matching_bits() { // Source(ALL) → shell_escape → sink_html (HTML sink). // shell_escape strips SHELL_ESCAPE but not HTML_ESCAPE. - // sink_html is an HTML sink — HTML_ESCAPE bit is still set → 1 finding. + // sink_html is an HTML sink, HTML_ESCAPE bit is still set → 1 finding. let src = br#" use std::env; fn sink_html(s: &str) {} @@ -1142,7 +1142,7 @@ fn taint_through_variable_reassignment() { #[test] fn untainted_variable_at_sink_is_safe() { - // A string literal (not from a source) passed to Command — no finding. + // A string literal (not from a source) passed to Command, no finding. let src = br#" use std::process::Command; fn main() { @@ -1585,7 +1585,7 @@ fn cpp_source_to_sink() { ); } -/// Phase 2 (cpp-precision): `c_str()` is a const accessor on `std::string` +/// `c_str()` is a const accessor on `std::string` /// that returns a pointer to the same buffer. It must propagate taint from /// the receiver to the result so the downstream sink fires. #[test] @@ -1597,12 +1597,12 @@ fn cpp_c_str_propagates_taint() { let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None); assert!( !findings.is_empty(), - "C++: tainted s.c_str() into system() must fire (Phase 2 c_str passthrough)", + "C++: tainted s.c_str() into system() must fire", ); } -/// Phase 2: `std::move(x)` returns its argument unchanged in terms of -/// data flow — the rvalue cast is a representation move, not a sanitiser. +/// `std::move(x)` returns its argument unchanged in terms of +/// data flow, the rvalue cast is a representation move, not a sanitiser. /// Default propagation collects argument taint into the result. 
#[test] fn cpp_std_move_propagates_taint() { @@ -1617,7 +1617,7 @@ fn cpp_std_move_propagates_taint() { ); } -/// Phase 2: `static_cast(x)` is parsed as a call expression by +/// `static_cast(x)` is parsed as a call expression by /// tree-sitter-cpp; default propagation transports taint from the casted /// argument to the result. #[test] @@ -1633,7 +1633,7 @@ fn cpp_static_cast_propagates_taint() { ); } -/// Phase 5 (cpp-precision): a fluent builder chain whose host +/// a fluent builder chain whose host /// argument is tainted should fire on the terminal `.connect()` /// SSRF sink. The chained `.host(...)` / `.port(...)` calls return /// the receiver, and default Call-arg propagation puts the tainted @@ -1647,12 +1647,12 @@ fn cpp_builder_chain_user_host_fires() { let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None); assert!( !findings.is_empty(), - "C++: tainted host through fluent builder chain must reach terminal connect() (Phase 5)", + "C++: tainted host through fluent builder chain must reach terminal connect()", ); } -/// Phase 5: a fluent builder chain with a hardcoded host literal -/// must NOT fire on the terminal connect() sink — the chain carries +/// a fluent builder chain with a hardcoded host literal +/// must NOT fire on the terminal connect() sink, the chain carries /// no taint. #[test] fn cpp_builder_chain_const_host_silent() { @@ -1663,11 +1663,11 @@ fn cpp_builder_chain_const_host_silent() { let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None); assert!( findings.is_empty(), - "C++: builder chain with literal host must NOT fire (Phase 5 negative)", + "C++: builder chain with literal host must NOT fire (Negative)", ); } -/// Phase 4 (cpp-precision): inline member-function bodies inside a +/// inline member-function bodies inside a /// `class_specifier` must be extracted as separate functions and /// intra-file calls must resolve to their bodies. 
Pre-Phase-4, the /// `class_specifier` AST kind was unmapped in cpp KINDS, so the CFG @@ -1682,11 +1682,11 @@ fn cpp_inline_class_method_resolves() { let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None); assert!( !findings.is_empty(), - "C++: tainted arg through inline class method must reach system() (Phase 4)", + "C++: tainted arg through inline class method must reach system()", ); } -/// Phase 3 (cpp-precision): a tainted argument passed through an +/// a tainted argument passed through an /// identity-style lambda (`auto echo = [](const char* s) { return s; }`) /// must reach the downstream sink. This is handled by the same default /// Call-arg propagation as `std::move`/`static_cast`; pinning the @@ -1705,7 +1705,7 @@ fn cpp_identity_lambda_propagates_taint() { ); } -/// Phase 2: `std::vector::data()` is a Load-style container op that +/// `std::vector::data()` is a Load-style container op that /// returns a pointer to the underlying buffer; `system(v.data())` should /// fire when `v` is tainted. #[test] @@ -1801,7 +1801,7 @@ fn ruby_source_to_sink() { // ───────────────────────────────────────────────────────────────────────────── // // Cross-language resolution now requires explicit InteropEdge declarations. -// Without an edge, functions from different languages are never resolved — +// Without an edge, functions from different languages are never resolved , // this prevents false positives from name collisions across languages. /// Extract cross-file summaries from any language's source bytes. @@ -1984,7 +1984,7 @@ fn cross_lang_rust_sanitizer_in_js_via_interop() { None, ); // eval uses Cap::all(), so a SHELL_ESCAPE sanitizer alone does NOT - // neutralise taint — shell-escape is semantically wrong for code injection. + // neutralise taint, shell-escape is semantically wrong for code injection. // The finding should still be reported. 
assert!( !findings.is_empty(), @@ -2481,7 +2481,7 @@ fn cross_lang_summary_preserves_lang_metadata() { let global = merge_summaries(vec![py_summary, js_summary], None); - // They are now separate entries — not merged + // They are now separate entries, not merged let py_matches = global.lookup_same_lang(Lang::Python, "helper"); let js_matches = global.lookup_same_lang(Lang::JavaScript, "helper"); @@ -2609,7 +2609,7 @@ fn ambiguous_resolution_returns_none() { ); } - // Caller from c.rs calls helper() — ambiguous (two matches, neither is caller's namespace) + // Caller from c.rs calls helper(), ambiguous (two matches, neither is caller's namespace) let src = br#" use std::process::Command; fn main() { @@ -2855,7 +2855,7 @@ fn validate_and_early_return() { let summaries = &file_cfg.summaries; let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Validated findings are now suppressed — validate() guard means the + // Validated findings are now suppressed, validate() guard means the // sink is on the safe path, so no finding should be emitted. assert_eq!(findings.len(), 0, "validated finding should be suppressed"); } @@ -2888,7 +2888,7 @@ fn validate_in_if_else_path_validated() { let summaries = &file_cfg.summaries; let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Validated findings are now suppressed — sink is in the validated + // Validated findings are now suppressed, sink is in the validated // branch, so no finding should be emitted. assert_eq!(findings.len(), 0, "validated finding should be suppressed"); } @@ -2932,7 +2932,7 @@ fn contradictory_null_check_pruned() { // Inner branch is infeasible: if x.is_none() then x cannot also be is_none(). // After early return on is_none(), the fall-through path has polarity=false - // for NullCheck. The inner `if x.is_none()` True branch has polarity=true — + // for NullCheck. 
The inner `if x.is_none()` True branch has polarity=true , // contradiction. let src = br#" use std::env; use std::process::Command; @@ -3045,7 +3045,7 @@ fn path_state_budget_graceful() { let summaries = &file_cfg.summaries; let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Should still detect the flow — truncation shouldn't cause false negatives. + // Should still detect the flow, truncation shouldn't cause false negatives. assert_eq!( findings.len(), 1, @@ -3080,7 +3080,7 @@ fn unknown_predicate_not_pruned() { let summaries = &file_cfg.summaries; let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Comparison is not in the whitelist — the path should NOT be pruned. + // Comparison is not in the whitelist, the path should NOT be pruned. assert_eq!( findings.len(), 1, @@ -3096,7 +3096,7 @@ fn duplicate_null_guard_prunes_unreachable_sink() { // After `if y.is_none() { return; }`, the false arm proves // `y.is_none() == false` on the only surviving path. A second // `if y.is_none() { sink }` then adds `y.is_none() == true` on the - // body's True arm — a per-symbol PredicateSummary contradiction + // body's True arm, a per-symbol PredicateSummary contradiction // (known_true & known_false on bit NullCheck). The body is // structurally unreachable; the sink must not fire. // @@ -3573,7 +3573,7 @@ fn js_two_level_converges_no_mutation() { #[test] fn catch_param_to_sink_has_caught_exception_source_kind() { - // Catch param flows to a sink — the finding source_kind must be + // Catch param flows to a sink, the finding source_kind must be // CaughtException, not Unknown. 
let src = b" const { exec } = require('child_process'); @@ -3743,7 +3743,7 @@ fn assert_ssa_integration(src: &[u8]) { // High-level path (per-body analysis) let high_level = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Direct SSA path — use the first function body (fn main), not top-level + // Direct SSA path, use the first function body (fn main), not top-level let body = if file_cfg.bodies.len() > 1 { &file_cfg.bodies[1] } else { @@ -4654,7 +4654,7 @@ fn ssa_induction_var_no_taint() { #[test] fn ssa_loop_tainted_var_not_induction() { - // `x` is tainted and transformed in a loop — NOT an induction variable + // `x` is tainted and transformed in a loop, NOT an induction variable let src = br#" use std::{env, process::Command}; fn main() { @@ -4766,7 +4766,7 @@ fn ssa_phi_path_sensitive_both_branches_validated() { let summaries = &file_cfg.summaries; let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None); - // Validated findings are now suppressed — sink is in the validated + // Validated findings are now suppressed, sink is in the validated // branch, so no finding should be emitted. assert_eq!(findings.len(), 0, "validated finding should be suppressed"); } @@ -5116,7 +5116,7 @@ fn abstract_ssrf_prefix_linear_suppression() { /// Two predecessor blocks produce string concat values with different safe /// prefixes ("https://api.example.com/users/" and "https://api.example.com/admins/"). /// A phi merges them. The LCP of the prefixes is "https://api.example.com/" which -/// still has scheme://host/ — so SSRF suppression should fire. +/// still has scheme://host/, so SSRF suppression should fire. /// /// Before the phi replay fix, collect_block_events did NOT replay abstract phis, /// leaving the phi result's abstract value as Top (stale). 
The SSRF suppression @@ -5255,7 +5255,7 @@ fn phi_validated_must_requires_all_paths() { use tree_sitter::Language; // Path A validates x, path B does NOT validate x. - // The phi for x after the merge must NOT get validated_must — only + // The phi for x after the merge must NOT get validated_must, only // validated_may (since at least one path validated). The sink after // the merge must still fire because the must-analysis says "not // definitely validated on all paths". @@ -5324,7 +5324,7 @@ fn inline_return_constant_with_internal_source_produces_no_finding() { None, ); - // transform() returns a constant — no taint should leak to caller + // transform() returns a constant, no taint should leak to caller assert_eq!( findings.len(), 0, @@ -5386,7 +5386,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() { // Callee has an internal source (document.location) alongside a tainted // param. The explicit return value is the param. Without the C-1 fix, // extract_inline_return_taint would union ALL live tainted values' caps - // — the internal source's derived-caps would override the param-caps + //, the internal source's derived-caps would override the param-caps // (derived takes priority in the extraction logic). With the fix, only // the return value's taint is collected, so param taint is returned // correctly. @@ -5420,7 +5420,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() { None, ); - // The callee returns cmd (tainted param) — 1 finding expected. + // The callee returns cmd (tainted param), 1 finding expected. // The internal document.location() should NOT widen the return taint. assert_eq!( findings.len(), @@ -5435,7 +5435,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() { /// /// Two class methods share the leaf name `process` in the same file. 
If the /// summary map were keyed by bare name (or raw file-path namespace), the -/// second lowering would overwrite the first — both methods would end up +/// second lowering would overwrite the first, both methods would end up /// pointing at whichever summary was extracted last. /// /// With canonical `FuncKey` identity (`container` discriminates them) both @@ -5483,7 +5483,7 @@ class Worker { summaries.keys().collect::>(), ); - // Same invariant on the cached-bodies map — inline analysis depends on + // Same invariant on the cached-bodies map, inline analysis depends on // being able to fetch the correct body by full FuncKey. let mut body_containers: Vec = bodies .iter() @@ -5593,6 +5593,7 @@ fn make_finding_for_link_test( path_hash, finding_id: String::new(), alternative_finding_ids: smallvec::SmallVec::new(), + effective_sink_caps: crate::labels::Cap::empty(), } } @@ -5628,7 +5629,7 @@ fn finding_id_encodes_validation_and_path_hash() { ); // Differing path_hash produces a different ID even with the same - // (body, source, sink, validated) — the whole point of the path + // (body, source, sink, validated), the whole point of the path // component in the dedup key. let mut u2 = make_finding_for_link_test(1, 3, 7, 0xdead_beef_0000_0002, false); u2.finding_id = super::make_finding_id(&u2); @@ -5639,7 +5640,7 @@ fn finding_id_encodes_validation_and_path_hash() { } /// `link_alternative_paths` must cross-link findings that share -/// `(body_id, sink, source)` — so a validated flow and an unvalidated +/// `(body_id, sink, source)`, so a validated flow and an unvalidated /// flow on the same source/sink pair each list the other's ID. 
#[test] fn link_alternative_paths_cross_references_same_body_sink_source() { @@ -5668,18 +5669,18 @@ fn link_alternative_paths_cross_references_same_body_sink_source() { } /// Findings that differ on `(body_id, sink, source)` are independent -/// vulnerabilities — they must **not** end up cross-linked as +/// vulnerabilities, they must **not** end up cross-linked as /// alternatives, otherwise the "alternative path" framing becomes /// noise. #[test] fn link_alternative_paths_does_not_link_distinct_sink_source() { let mut findings = vec![ make_finding_for_link_test(1, 3, 7, 0x1111, false), - // Different sink — independent finding, not an alternative. + // Different sink, independent finding, not an alternative. make_finding_for_link_test(1, 3, 8, 0x1111, false), - // Different source — also independent. + // Different source, also independent. make_finding_for_link_test(1, 4, 7, 0x1111, false), - // Different body — also independent. + // Different body, also independent. make_finding_for_link_test(2, 3, 7, 0x1111, false), ]; for f in &mut findings { @@ -5697,7 +5698,7 @@ fn link_alternative_paths_does_not_link_distinct_sink_source() { /// When the same `(body, sink, source)` has three sibling findings /// (e.g. validated, unvalidated-path-A, unvalidated-path-B), each -/// finding must list the other two — the group is symmetric and +/// finding must list the other two, the group is symmetric and /// complete rather than a chain. 
#[test] fn link_alternative_paths_three_way_group() { @@ -5726,14 +5727,14 @@ fn link_alternative_paths_three_way_group() { } // ───────────────────────────────────────────────────────────────────────────── -// Typed call-graph devirtualisation — Phase 2 (typed_call_receivers) +// Typed call-graph devirtualisation (typed_call_receivers) // ───────────────────────────────────────────────────────────────────────────── -/// Phase 2: when a method call's receiver was constructed from a known +/// when a method call's receiver was constructed from a known /// constructor (`File::open` → `FileHandle`), the SSA-extraction /// pipeline must record `(call_ordinal, "FileHandle")` on the /// caller's [`crate::summary::ssa_summary::SsaFuncSummary::typed_call_receivers`] -/// so Phase 3 can devirtualise the cross-file edge. +/// so build_call_graph can devirtualise the cross-file edge. /// /// Uses Java because `FileInputStream` / `FileOutputStream` are part /// of the [`crate::ssa::type_facts::constructor_type`] table for Java @@ -5779,14 +5780,14 @@ class Reader { ); } -/// Phase 2 negative control: free-function calls (no receiver) must +/// Negative control: free-function calls (no receiver) must /// never appear in `typed_call_receivers`. Even when the callee is a /// known type-producing constructor, it sits in the body as a Call /// with `receiver = None` and is not a candidate for devirtualisation. #[test] fn typed_call_receivers_skips_free_function_calls() { // `new FileInputStream(...)` is a constructor invocation with no - // receiver — exactly the shape we want to ignore. + // receiver, exactly the shape we want to ignore. let src = br#" class Maker { void make() { @@ -5808,10 +5809,10 @@ class Maker { // make() has zero parameters and no fresh-allocation return, so the // generic insertion gate skips it. 
The phase-2 patch only force- - // inserts when `typed_call_receivers` is non-empty — which it + // inserts when `typed_call_receivers` is non-empty, which it // isn't here, since `new FileInputStream(...)` is a free-function- // shaped constructor call (no SSA receiver). So either the - // summary is absent, or — if some other side effect inserted it — + // summary is absent, or, if some other side effect inserted it , // its `typed_call_receivers` is empty. Both forms prove no // spurious typed entry was recorded. let typed = summaries @@ -5829,7 +5830,7 @@ class Maker { /// Regression: nested arrow functions inside `return new Promise((res,rej) /// => { ... })` must be lifted as separate bodies. Before the Kind::Return /// arm in cfg/mod.rs called `collect_nested_function_nodes`, only the -/// outer function (`downloadFromUri`) was extracted — the executor and +/// outer function (`downloadFromUri`) was extracted, the executor and /// its inner callbacks were silently swallowed, hiding the inner gated /// http.get sink from classification. Motivated by CVE-2025-64430. #[test] @@ -5972,7 +5973,7 @@ const handler = (req) => { /// The augment pass populates `downloadFromUri.summary.param_to_sink: /// [(0, SSRF)]` (single-hop closure-capture lift). For the handler's /// `helper(req.body)` call to fire, `helper.summary.param_to_sink` must -/// also contain `[(0, SSRF)]` — but that requires `helper`'s probe to +/// also contain `[(0, SSRF)]`, but that requires `helper`'s probe to /// see `downloadFromUri`'s augmented summary at resolution time. /// /// Because the probe currently runs with `ssa_summaries=None`, @@ -6065,11 +6066,198 @@ const handler = (req) => { /// `middle.summary.param_to_sink`, then handler's call site picks it up. 
/// /// Today the second-pass runs only once (no fixed-point), so depth-3+ -/// is expected to NOT fire — guards against accidental fixed-point +/// is expected to NOT fire, guards against accidental fixed-point /// regression that would mask an over-eager rewrite. Marked /// `#[ignore]` so it documents the depth limit without breaking CI. /// Motivated by CVE-2025-64430 corner case; remove the `#[ignore]` and /// any guarding `assert!` polarity if a fixed-point is added later. +/// Indirect-validator branch narrowing: when an if-condition is a +/// bare result variable whose reaching SSA def is a Call to a +/// callee classified by `classify_input_validator_callee` (e.g. +/// `validateUrlSsrf`, `verifyToken`, `isValidUrl`), the validator's +/// argument is treated as validated on the success branch. +/// +/// This pins the SSA-level +/// `apply_input_validator_branch_narrowing` regardless of whether +/// downstream consumers (sink-arg taint, cfg-unguarded-sink) honor +/// `validated_must`. Test asserts the symbol-keyed validation flag +/// is set on the analysis exit state. +/// +/// Direct-flow shape (no helper indirection); the helper-summary +/// case still has open architectural gaps (validated_must doesn't +/// propagate through `param_to_sink` summaries, same gap blocks +/// AllowlistCheck-in-helper, see CVE_DEFERRED.md GHSA-4x48-cgf9-q33f). +/// +/// Motivated by Novu CVE GHSA-4x48-cgf9-q33f +/// (`const ssrfError = await validateUrlSsrf(child.webhookUrl); if (ssrfError) throw …;`). 
+#[test] +fn indirect_validator_narrowing_marks_arg_validated() { + let src = br#" +async function handler(req) { + const target = req.query.url; + const ssrfError = await validateUrlSsrf(target); + if (ssrfError) { + throw new Error('blocked'); + } + await axios.get(target); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + // Direct-flow: validator narrowing should clear axios.get's taint event. + assert!( + findings.is_empty(), + "validator narrowing should suppress direct-flow SSRF; got {} finding(s)", + findings.len() + ); +} + +/// Regression: `extract_ssa_func_summary` must skip `all_validated` +/// events when populating `param_to_sink` / `param_to_sink_param`. +/// +/// Helper bodies whose validator-call branch narrowing fired produce +/// per-param probe events flagged `all_validated=true`. Without +/// summary-extract suppression, callers would still see the helper +/// in their summary's sink set and refire on `helper(taintedArg)` +/// even though the validator inside the helper proved the path +/// safe. The caller can't see the validator (it's behind the +/// summary), so the gap manifests as a precision miss only when +/// helper + caller are in the same file. +/// +/// Closes the helper-summary half of Novu CVE GHSA-4x48-cgf9-q33f. 
+#[test] +fn helper_with_validator_does_not_propagate_to_caller_via_summary() { + let src = br#" +async function getWebhookResponse(child) { + const ssrfError = await validateUrlSsrf(child.webhookUrl); + if (ssrfError) { + throw new Error('blocked'); + } + return await axios.post(child.webhookUrl, {}); +} + +async function handler(req) { + const child = req.body.filter; + const r = await getWebhookResponse(child); + return r; +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "helper-with-validator should not propagate sink via summary; got {} finding(s)", + findings.len() + ); +} + +/// Companion: same shape WITHOUT the validator inside the helper +/// must still fire so the precision gain is targeted. Asserts +/// `all_validated` skip doesn't accidentally suppress unsafe helpers. +#[test] +fn helper_without_validator_still_propagates_to_caller_via_summary() { + let src = br#" +async function getWebhookResponse(child) { + return await axios.post(child.webhookUrl, {}); +} + +async function handler(req) { + const child = req.body.filter; + const r = await getWebhookResponse(child); + return r; +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + !findings.is_empty(), + "helper-without-validator must still flag the cross-fn SSRF path", + ); +} + +/// Regression: `validate*`-named callees match +/// `InputValidatorPolarity::ErrorReturning`, bare `if (err) throw` +/// guards the success branch (false branch). 
`is_valid*`/`is_safe*` +/// callees match `InputValidatorPolarity::BooleanTrueIsValid`, bare +/// `if (!ok) throw` guards the success branch (true branch via +/// `condition_negated`). +#[test] +fn classify_input_validator_callee_polarity_buckets() { + use crate::ssa::type_facts::{InputValidatorPolarity, classify_input_validator_callee}; + + // ErrorReturning bucket + assert_eq!( + classify_input_validator_callee("validateUrlSsrf"), + Some(InputValidatorPolarity::ErrorReturning) + ); + assert_eq!( + classify_input_validator_callee("verifyToken"), + Some(InputValidatorPolarity::ErrorReturning) + ); + assert_eq!( + classify_input_validator_callee("validate_url"), + Some(InputValidatorPolarity::ErrorReturning) + ); + + // BooleanTrueIsValid bucket + assert_eq!( + classify_input_validator_callee("isValidUrl"), + Some(InputValidatorPolarity::BooleanTrueIsValid) + ); + assert_eq!( + classify_input_validator_callee("is_valid_email"), + Some(InputValidatorPolarity::BooleanTrueIsValid) + ); + assert_eq!( + classify_input_validator_callee("isSafe"), + Some(InputValidatorPolarity::BooleanTrueIsValid) + ); + + // Negative, names that look like validators but are auth-flavored + // (`checkPermissions`, `is_authorized`) are intentionally not + // matched here; they have separate semantics in the auth pipeline. + assert_eq!(classify_input_validator_callee("checkPermissions"), None); + assert_eq!(classify_input_validator_callee("is_authorized"), None); + assert_eq!(classify_input_validator_callee("randomThing"), None); + + // Path-prefix peeling: `obj.validateXxx` should classify the same + // as the bare callee. 
+ assert_eq!( + classify_input_validator_callee("validator.validateUrlSsrf"), + Some(InputValidatorPolarity::ErrorReturning) + ); +} + #[test] #[ignore] fn cve_2025_64430_three_hop_transitive_documents_depth_limit() { diff --git a/src/utils/analysis_options.rs b/src/utils/analysis_options.rs index 236b5278..ffffcb98 100644 --- a/src/utils/analysis_options.rs +++ b/src/utils/analysis_options.rs @@ -27,7 +27,7 @@ pub const DEFAULT_PARSE_TIMEOUT_MS: u64 = 10_000; /// value. Raised from the historical `4` to `32` so realistic codebases /// with wide joins (many param sources, deep helper chains) no longer /// silently drop origin attribution. Tunable via -/// [`AnalysisOptions::max_origins`] — see +/// [`AnalysisOptions::max_origins`], see /// `src/taint/ssa_transfer/state.rs::effective_max_origins`. pub const DEFAULT_MAX_ORIGINS: u32 = 32; @@ -38,11 +38,11 @@ pub const DEFAULT_MAX_ORIGINS: u32 = 32; pub const MIN_MAX_ORIGINS: u32 = 1; /// Default upper bound on the number of abstract heap objects tracked per -/// intra-procedural points-to set. Set to `32` — high enough that +/// intra-procedural points-to set. Set to `32`, high enough that /// realistic factory/builder/DI patterns (routine 10–30 allocation sites /// aliased into one variable) stay precise, low enough to keep /// `HeapState` join/clone cost bounded in the worklist. Tunable via -/// [`AnalysisOptions::max_pointsto`] — see +/// [`AnalysisOptions::max_pointsto`], see /// `src/ssa/heap.rs::effective_max_pointsto`. pub const DEFAULT_MAX_POINTSTO: u32 = 32; @@ -152,7 +152,7 @@ impl Default for AnalysisOptions { /// (notably `nyx serve`, which resolves the engine profile per scan /// request) can replace the installed options between scans via /// [`reinstall`]. 
Within a single scan run, engine toggles must not -/// change mid-flight — the caller is responsible for that invariant +/// change mid-flight, the caller is responsible for that invariant /// (`JobManager`'s single-scan guarantee provides it in the server). static RUNTIME: RwLock> = RwLock::new(None); @@ -174,7 +174,7 @@ pub fn install(opts: AnalysisOptions) -> bool { /// server's scan thread, which re-resolves the engine profile from each /// incoming request; `install`'s first-wins semantics would otherwise /// pin the first scan's choice for the lifetime of the server. Callers -/// must ensure no scan is concurrently reading `current()` — in practice +/// must ensure no scan is concurrently reading `current()`, in practice /// this means calling `reinstall` before the scan's rayon pool starts. pub fn reinstall(opts: AnalysisOptions) { *RUNTIME.write().expect("analysis options RwLock poisoned") = Some(opts); diff --git a/src/utils/config.rs b/src/utils/config.rs index 52537071..47876fc3 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -315,8 +315,8 @@ pub struct OutputConfig { /// When `true`, findings whose engine provenance notes include any /// `OverReport` (widening) or `Bail` (lowering/parse failure) /// direction are filtered out before output. `UnderReport` - /// findings — where the result set is a lower bound but each - /// emitted flow is still real — are kept. + /// findings, where the result set is a lower bound but each + /// emitted flow is still real, are kept. /// /// Surfaced via `--require-converged`; intended for strict CI /// gating where a finding from capped analysis is worse than no @@ -644,7 +644,7 @@ impl Default for RunsConfig { } } -/// A named scan profile — a partial overlay of scan-related settings. +/// A named scan profile, a partial overlay of scan-related settings. /// All fields are `Option`: `None` means "don't override". 
#[derive(Debug, Serialize, Deserialize, Clone, Default)] #[serde(default)] @@ -715,7 +715,7 @@ pub struct Config { pub server: ServerConfig, pub runs: RunsConfig, pub profiles: HashMap, - /// Detected frameworks for the current project — set by the scan pipeline, + /// Detected frameworks for the current project, set by the scan pipeline, /// not persisted to config files. #[serde(skip)] pub framework_ctx: Option, diff --git a/src/utils/ext.rs b/src/utils/ext.rs index a77c1c9d..c8f41d95 100644 --- a/src/utils/ext.rs +++ b/src/utils/ext.rs @@ -5,7 +5,7 @@ pub fn lowercase_ext(path: &std::path::Path) -> Option<&'static str> { // Real-world C++ codebases overwhelmingly use `.cc` / `.cxx` / // `.hpp` / `.hh` / `.h++` rather than the `.cpp` synthetic-fixture // extension. All map to the same tree-sitter-cpp grammar. `.h` - // is intentionally NOT mapped — it's also valid C and + // is intentionally NOT mapped, it's also valid C and // disambiguating without a build system is brittle. "cpp" | "c++" | "cc" | "cxx" | "hpp" | "hxx" | "hh" | "h++" => Some("cpp"), "java" => Some("java"), diff --git a/src/utils/project.rs b/src/utils/project.rs index 1373c0f0..e1d6819b 100644 --- a/src/utils/project.rs +++ b/src/utils/project.rs @@ -84,7 +84,7 @@ fn read_bounded(path: &Path) -> Option { /// /// Intentionally a coarse byte-level substring check against the quoted module /// specifier (e.g. `'fastify'`, `"github.com/labstack/echo/v4"`, -/// `'sinatra'`). Only the first 8 KiB of the file are inspected — imports / +/// `'sinatra'`). Only the first 8 KiB of the file are inspected, imports / /// requires live at the top. Returns an empty list for languages without a /// framework detection policy here. 
pub fn detect_in_file_frameworks(bytes: &[u8], lang_slug: &str) -> Vec { @@ -147,7 +147,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Node.js (package.json) ── if let Some(content) = read_bounded(&root.join("package.json")) { // Crude substring search in the "dependencies" block area. - // Good enough for detection — no JSON parsing overhead. + // Good enough for detection, no JSON parsing overhead. if content.contains("\"express\"") { fws.push(DetectedFramework::Express); } diff --git a/src/utils/query_cache.rs b/src/utils/query_cache.rs index 31012778..0d9d16a1 100644 --- a/src/utils/query_cache.rs +++ b/src/utils/query_cache.rs @@ -23,7 +23,7 @@ static CACHE: LazyLock>> = /// patterns for the language are cached normally. A language with an /// all-malformed pattern slice yields an empty cache entry. /// -/// Lock poisoning on the shared cache is recovered transparently — a +/// Lock poisoning on the shared cache is recovered transparently, a /// panic in another thread must not brick pattern loading process-wide. pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc> { // fast path @@ -31,7 +31,7 @@ pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc = patterns .into_iter() diff --git a/src/utils/snippet.rs b/src/utils/snippet.rs index 10b38a35..5a8e89e0 100644 --- a/src/utils/snippet.rs +++ b/src/utils/snippet.rs @@ -1,19 +1,47 @@ -//! Source-line snippet extraction for diagnostics. +//! UTF-8-safe truncation for diagnostic strings. //! -//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`] -//! (cross-file `SinkSite`) need to grab the source line containing a -//! given byte offset, trim it, and cap it at a fixed character budget. -//! The two callers used to carry private copies of this routine; the -//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which -//! panics whenever the cap lands inside a multi-byte UTF-8 character. -//! 
Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string -//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`. +//! Two related shapes live here: //! -//! This shared helper truncates at the nearest preceding char -//! boundary, so any UTF-8 input is safe. +//! 1. [`line_snippet`], extracts the trimmed source line containing +//! a byte offset, capped at ~120 bytes. Used by [`crate::ast`] +//! (per-finding evidence) and [`crate::summary`] (cross-file +//! `SinkSite`). +//! 2. [`truncate_at_char_boundary`], the underlying primitive: cap a +//! string at `max_bytes`, rounded down to the nearest UTF-8 char +//! boundary. +//! +//! Both arose from the same family of panics: real-world Ruby/JS/Go +//! test suites carry literal Cyrillic / CJK / emoji / Devanagari / +//! Gurmukhi inside string and regex constants. Naive +//! `&s[..MAX].to_string()` truncation panics whenever the cap lands +//! inside a multi-byte UTF-8 sequence, killing the rayon worker that +//! happens to lower that file. Earlier sessions fixed `line_snippet` +//! (mastodon / discourse / gitlabhq, Cyrillic in RSpec strings); the +//! gogs scan still tripped because the CFG condition-text path +//! (`src/cfg/conditions.rs`, `src/cfg/mod.rs`) carried a third copy +//! of the same byte-slice idiom. The Gurmukhi `'ਖ'` regex literal in +//! gogs's localised Gherkin keyword list lands byte 256 mid-character +//! and panics. Centralising the safe-truncation primitive prevents +//! the next bytes-vs-chars site from re-introducing the same bug. const MAX_SNIPPET_BYTES: usize = 120; +/// Truncate `s` to at most `max_bytes` bytes, rounding the cut point +/// down to the nearest UTF-8 character boundary so the returned slice +/// is always valid UTF-8. When `s.len() <= max_bytes` the slice is +/// returned unchanged. When `max_bytes == 0` an empty slice is +/// returned. Never panics on multi-byte input. 
+pub fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str { + if s.len() <= max_bytes { + return s; + } + let mut end = max_bytes; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + &s[..end] +} + /// Extract the trimmed source line containing `byte_offset`, capped /// at ~120 bytes (rounded down to the nearest UTF-8 char boundary). /// Returns `None` when the offset is out of range or the line is @@ -36,11 +64,10 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option { return None; } if trimmed.len() > MAX_SNIPPET_BYTES { - let mut end = MAX_SNIPPET_BYTES; - while end > 0 && !trimmed.is_char_boundary(end) { - end -= 1; - } - Some(format!("{}...", &trimmed[..end])) + Some(format!( + "{}...", + truncate_at_char_boundary(trimmed, MAX_SNIPPET_BYTES) + )) } else { Some(trimmed.to_string()) } @@ -48,7 +75,51 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option { #[cfg(test)] mod tests { - use super::line_snippet; + use super::{line_snippet, truncate_at_char_boundary}; + + #[test] + fn truncate_short_string_unchanged() { + assert_eq!(truncate_at_char_boundary("hello", 10), "hello"); + assert_eq!(truncate_at_char_boundary("", 10), ""); + } + + #[test] + fn truncate_zero_max_returns_empty() { + assert_eq!(truncate_at_char_boundary("hello", 0), ""); + assert_eq!(truncate_at_char_boundary("ਖਖਖ", 0), ""); + } + + #[test] + fn truncate_ascii_clean_at_byte_max() { + assert_eq!(truncate_at_char_boundary("hello world", 5), "hello"); + } + + #[test] + fn truncate_inside_multibyte_rounds_down() { + // 'ਖ' (Gurmukhi LETTER KHA, U+0A16) is 3 bytes in UTF-8. + // Build a string where byte 5 lands inside the 'ਖ'. + let s = "abcdਖef"; + // bytes: 0..4 = "abcd", 4..7 = 'ਖ', 7.. = "ef" + // Truncating at 5 must not panic; result is "abcd". 
+ assert_eq!(truncate_at_char_boundary(s, 5), "abcd"); + assert_eq!(truncate_at_char_boundary(s, 6), "abcd"); + assert_eq!(truncate_at_char_boundary(s, 7), "abcdਖ"); + } + + #[test] + fn truncate_devanagari_gherkin_regex_literal() { + // Reproduces the gogs panic shape: long regex string that + // contains Devanagari / Gurmukhi / CJK / Thai keywords with + // byte 256 landing mid-character. + let regex_body = "stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|".to_string(); + assert!(regex_body.len() > 256); + // Must not panic. + let truncated = truncate_at_char_boundary(&regex_body, 256); + // Must be valid UTF-8 (it's already a `&str`, but the cut point + // landing on a boundary is the actual property under test). + assert!(regex_body.is_char_boundary(truncated.len())); + assert!(truncated.len() <= 256); + } #[test] fn ascii_short_line_returned_verbatim() { diff --git a/src/walk.rs b/src/walk.rs index 336d862b..209692ad 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -136,7 +136,7 @@ pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver, JoinHan }; let is_file = metadata.file_type().is_file(); let under_limit = max_bytes == 0 || metadata.len() <= max_bytes; - // Always canonicalize and verify containment — a symlink + // Always canonicalize and verify containment, a symlink // in the tree can escape the root even when follow=false // if the walker resolves it at metadata time. let path_allowed = canonical_root.as_ref().is_none_or(|root| { @@ -306,7 +306,7 @@ fn walker_follow_symlinks_does_not_escape_root() { #[test] fn walker_no_follow_symlinks_still_rejects_outside_paths() { // Pre-existing symlink to an out-of-root file must be excluded even when - // follow_symlinks=false — the walker may surface the resolved path on + // follow_symlinks=false, the walker may surface the resolved path on // some platforms.
use std::os::unix::fs::symlink; diff --git a/tests/abstract_transfer_tests.rs b/tests/abstract_transfer_tests.rs index fe66547e..d8c0087d 100644 --- a/tests/abstract_transfer_tests.rs +++ b/tests/abstract_transfer_tests.rs @@ -187,7 +187,7 @@ fn interval_join_clamped_widens_range() { #[test] fn interval_join_identity_vs_clamped_is_top() { // Different flow shapes cannot be combined into a single bounded - // form — conservative fallback is Top. + // form, conservative fallback is Top. let a = IntervalTransfer::Identity; let b = IntervalTransfer::Clamped { lo: 0, hi: 10 }; assert_eq!(a.join(&b), IntervalTransfer::Top); @@ -296,7 +296,7 @@ fn transfer_apply_combines_subdomains() { // Interval identity forwards the caller-known bound. assert_eq!(out.interval.lo, Some(8080)); assert_eq!(out.interval.hi, Some(8080)); - // String literal-prefix overrides the caller-side input — the + // String literal-prefix overrides the caller-side input, the // callee's structural fact wins. assert_eq!(out.string.prefix.as_deref(), Some("https://safe.com/")); // Bit subdomain is always Top on cross-file transfer by design. diff --git a/tests/auth_analysis_tests.rs b/tests/auth_analysis_tests.rs index 14dd935a..b353474e 100644 --- a/tests/auth_analysis_tests.rs +++ b/tests/auth_analysis_tests.rs @@ -649,7 +649,7 @@ fn hashmap_local_noise_is_clean() { #[test] fn row_ownership_equality_is_clean() { // `if owner_id != user.id { return ... }` is a row-level - // ownership check — both the row-fetching call and any downstream + // ownership check, both the row-fetching call and any downstream // uses of the row's fields should be considered authorized. 
assert_absent( "row_ownership_equality.rs", @@ -670,7 +670,7 @@ fn row_ownership_no_early_exit_flags() { #[test] fn helper_scoped_params_is_clean() { // A library helper whose internal work is `result.insert(..)` - // on a locally-constructed HashSet is not a sink — the call is + // on a locally-constructed HashSet is not a sink, the call is // classified as non-sink because the receiver is the locally-bound // collection. assert_absent("helper_scoped_params.rs", "rs.auth.missing_ownership_check"); @@ -688,7 +688,7 @@ fn self_scoped_user_is_clean() { fn true_positive_missing_check_flags() { // Positive control: an authenticated handler that deletes a doc // and publishes against a group without any ownership/membership - // check — must still flag. + // check, must still flag. assert_has( "true_positive_missing_check.rs", "rs.auth.missing_ownership_check", @@ -763,7 +763,7 @@ fn db_connection_type_inferred_is_clean() { // inferred as a `DatabaseConnection` via SSA `constructor_type` // (through `peel_identity_suffix`). The handler logs the caller's // own id; no foreign scoped id reaches the sink, so the ownership - // gate has nothing to flag — the type-facts refinement must not + // gate has nothing to flag, the type-facts refinement must not // introduce a false positive here. assert_absent( "db_connection_type_inferred.rs", diff --git a/tests/benchmark/RESULTS.md b/tests/benchmark/RESULTS.md index bc57c70b..60a93c87 100644 --- a/tests/benchmark/RESULTS.md +++ b/tests/benchmark/RESULTS.md @@ -4,13 +4,13 @@ Current baseline (2026-04-29): | Metric | File-level | Rule-level | CI floor | |-----------|------------|------------|----------| -| Precision | 0.991 | 0.991 | 0.861 | -| Recall | 0.995 | 0.995 | 0.944 | -| F1 | 0.993 | 0.993 | 0.901 | +| Precision | 0.996 | 0.996 | 0.861 | +| Recall | 1.000 | 1.000 | 0.944 | +| F1 | 0.998 | 0.998 | 0.901 | -Corpus: 433 cases across 10 languages, 432 evaluated (1 disabled). 
Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. +Corpus: 451 cases across 10 languages, 449 evaluated (no disabled). Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. -The corpus is mostly synthetic 8-20 line fixtures, one vulnerability or one safe pattern per file. A smaller real-CVE replay set under `cve_corpus/` covers 18 published CVEs across all 10 languages. Both contribute to the headline numbers. +The corpus is mostly synthetic 8-20 line fixtures, one vulnerability or one safe pattern per file. A smaller real-CVE replay set under `cve_corpus/` covers 20 published CVEs across all 10 languages. Both contribute to the headline numbers. ## Real CVE coverage @@ -20,14 +20,19 @@ Real disclosed CVEs reduced to minimal reproducers, vulnerable + patched pair pe |----------------|------------|----------------------------|----------------------|-----------------|----------| | CVE-2023-48022 | Python | Ray | Apache-2.0 | CMDI | detected | | CVE-2017-18342 | Python | PyYAML | MIT | Deserialization | detected | +| CVE-2025-69662 | Python | geopandas | BSD-3-Clause | SQL Injection | detected | +| CVE-2026-33626 | Python | LMDeploy | Apache-2.0 | SSRF | detected | | CVE-2019-14939 | JavaScript | mongo-express | MIT | code_exec | detected | | CVE-2025-64430 | JavaScript | Parse Server | Apache-2.0 | SSRF | detected | | CVE-2023-26159 | TypeScript | follow-redirects | MIT | SSRF | detected | +| GHSA-4x48-cgf9-q33f | TypeScript | Novu | MIT | SSRF | detected | | CVE-2022-30323 | Go | hashicorp/go-getter | MPL-2.0 | CMDI | detected | | CVE-2023-3188 | Go | owncast | MIT | SSRF | detected | | CVE-2024-31450 | Go | owncast | MIT | path_traversal | detected | | CVE-2015-7501 | Java | Apache Commons Collections | Apache-2.0 | Deserialization | detected | | 
CVE-2017-12629 | Java | Apache Solr | Apache-2.0 | CMDI | detected | +| CVE-2022-1471 | Java | SnakeYAML | Apache-2.0 | Deserialization | detected | +| CVE-2022-42889 | Java | Apache Commons Text | Apache-2.0 | code_exec | detected | | CVE-2013-0156 | Ruby | Ruby on Rails | MIT | Deserialization | detected | | CVE-2020-8130 | Ruby | Rake | MIT | CMDI | detected | | CVE-2017-9841 | PHP | PHPUnit | BSD-3-Clause | code_exec | detected | @@ -60,6 +65,9 @@ Most recent first. Metrics are rule-level on the corpus size at that point. | Date | Change | Corpus | P | R | F1 | |------------|------------------------------------------------------------------------------|--------|-------|-------|-------| +| 2026-04-29 | Java SnakeYAML + Text4Shell patterns; CVE-2022-1471 and CVE-2022-42889 detected | 449 | 0.996 | 1.000 | 0.998 | +| 2026-04-29 | Indirect-validator branch narrowing (`const err = validate(x); if (err) throw …;`) + helper-summary all_validated propagation; Novu GHSA-4x48-cgf9-q33f detected | 445 | 0.991 | 1.000 | 0.995 | +| 2026-04-29 | Python f-string SQLi pattern + bindparams sanitizer + HttpClient SSRF rules; CVE-2025-69662 (geopandas) and CVE-2026-33626 (LMDeploy) detected | 439 | 0.991 | 1.000 | 0.995 | | 2026-04-29 | Phantom-Param-aware field suppression: CVE-2023-3188 detected, FP guards hold | 432 | 0.995 | 1.000 | 0.998 | | 2026-04-28 | Ruby bare `Kernel#open` CMDI sink, exact-match sigil on label matchers | 428 | 0.995 | 1.000 | 0.998 | | 2026-04-28 | Go SSRF/FILE_IO sink expansion (`http.DefaultClient.*`, `os.Remove`/`WriteFile`) plus Decode-writeback container op | 426 | 0.995 | 1.000 | 0.998 | diff --git a/tests/benchmark/corpus/go/safe/safe_test_helper_fatal.go b/tests/benchmark/corpus/go/safe/safe_test_helper_fatal.go new file mode 100644 index 00000000..c61583c0 --- /dev/null +++ b/tests/benchmark/corpus/go/safe/safe_test_helper_fatal.go @@ -0,0 +1,62 @@ +// go-safe-realrepo-006 — distilled from minio cmd/admin-handlers-users_test.go +// (and the 
identical pattern across xl-storage_test.go, erasure-healing_test.go, +// 49+34+12+11+9+7+7+5 findings on minio test files alone). +// +// `cfg-error-fallthrough` looks for `if err != nil { … }` whose body fails to +// terminate. Test code idiomatically writes +// +// if err != nil { c.Fatalf("...", err) } +// postSink(...) +// +// where `c.Fatalf` (a `*testing.T` method) calls `runtime.Goexit()` and the +// `postSink` line is unreachable on the error path. The rule classified +// this as fall-through because `Fatalf` looks like an ordinary call. Engine +// fix: `src/cfg_analysis/error_handling.rs::call_never_returns` recognises +// `Fatal*`, `Panic*`, `FailNow`, `os.Exit`, `runtime.Goexit`, `log.Fatal*`, +// `panic`, etc. as terminators inside `terminates_on_all_paths`. + +package safe + +import ( + "context" + "log" + "os" + "testing" +) + +type clientHelper struct { + bucket string +} + +func (c *clientHelper) MakeBucket(ctx context.Context, name string) error { return nil } +func (c *clientHelper) PutObject(ctx context.Context, name string) error { return nil } + +func setupBucket(t *testing.T, c *clientHelper, ctx context.Context) { + if err := c.MakeBucket(ctx, c.bucket); err != nil { + t.Fatalf("bucket creat error: %v", err) + } + if err := c.PutObject(ctx, "obj"); err != nil { + t.Fatal(err) + } +} + +func runWithExit(c *clientHelper, ctx context.Context) { + if err := c.MakeBucket(ctx, c.bucket); err != nil { + log.Fatalf("init failed: %v", err) + } + c.PutObject(ctx, "obj") +} + +func runWithOsExit(c *clientHelper, ctx context.Context) { + if err := c.MakeBucket(ctx, c.bucket); err != nil { + os.Exit(1) + } + c.PutObject(ctx, "obj") +} + +func runWithPanic(c *clientHelper, ctx context.Context) { + if err := c.MakeBucket(ctx, c.bucket); err != nil { + panic(err) + } + c.PutObject(ctx, "obj") +} diff --git a/tests/benchmark/corpus/javascript/safe/safe_localised_gherkin_regex.js b/tests/benchmark/corpus/javascript/safe/safe_localised_gherkin_regex.js new 
file mode 100644 index 00000000..8e355181 --- /dev/null +++ b/tests/benchmark/corpus/javascript/safe/safe_localised_gherkin_regex.js @@ -0,0 +1,26 @@ +// js-safe-realrepo-006 — distilled from gogs `public/plugins/codemirror-5.17.0/ +// mode/gherkin/gherkin.js` line 107. The CodeMirror Gherkin tokenizer ships +// localised feature-keyword aliases as one large regex inside a boolean +// sub-condition. The CFG builder textualises every sub-condition of a +// boolean chain and truncates that text to MAX_CONDITION_TEXT_LEN (256 +// bytes) for diagnostics; naive byte-slice truncation panicked when byte +// 256 landed inside a multi-byte UTF-8 character (here Gurmukhi `ਖ`, +// 3-byte UTF-8). Engine fix: +// `src/utils/snippet.rs::truncate_at_char_boundary`, applied at three CFG +// sites and two symex display sites. Invariant: scanning this file must +// terminate without panicking the rayon worker, regardless of where byte +// 256 lands inside the regex. + +function tokenLocalisedFeatureKeyword(stream, state) { + if ( + !state.inKeywordLine && + state.allowFeature && + stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ಮುಹಾಂದರಾ|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|ਖਾਸੀਅਤ|रूप लेख|وِیژگی|خاصية|תכונה|Функціонал|Функция|Функционалност|Функционал|Үзенчәлеклелек|Свойство|Особина|Мөмкинлек|Могућност|Λειτουργία|Δυνατότητα|Właściwość|Vlastnosť|Trajto|Tính năng|Savybė|Požiadavka|Požadavek|Potrzeba biznesowa|Özellik|Osobina|Ominaisuus|Omadus|Mogućnost|Mogucnost|Jellemző|Funzionalità|Funktionalitéit|Funktionalität|Funkcja|Funkcionalnost|Funkcionalitāte|Funkcia|Fungsi|Functionaliteit|Funcționalitate|Funcţionalitate|Functionalitate|Funcionalitat|Funcionalidade|Fonctionnalité|Fitur|Fīča|Feature|Eiginleiki|Egenskap|Egenskab|Característica|Caracteristica|Business Need|Aspekt|Arwedd|Ability):/) + ) { + state.inKeywordLine = true; + return "keyword"; + } + return null; +} + +module.exports = { tokenLocalisedFeatureKeyword }; diff --git 
a/tests/benchmark/corpus/python/auth/vuln_fastapi_route_no_dependencies.py b/tests/benchmark/corpus/python/auth/vuln_fastapi_route_no_dependencies.py new file mode 100644 index 00000000..61c41837 --- /dev/null +++ b/tests/benchmark/corpus/python/auth/vuln_fastapi_route_no_dependencies.py @@ -0,0 +1,19 @@ +""" +Vulnerable counterpart to safe_fastapi_route_dependencies_auth.py: same +shape but with NO `dependencies=[Depends(...)]` keyword arg on the route +decorator. The FastAPI ownership-check rule must still fire — the +recognizer must not blanket-suppress every FastAPI route, only those +with an actual dependency-injected auth check. +""" +from fastapi import FastAPI + +router = FastAPI() + + +@router.delete("/{connection_id}") +def delete_connection(connection_id: str, session): + """No auth — must still fire missing_ownership_check.""" + connection = session.scalar(select(Connection).filter_by(conn_id=connection_id)) + if connection is None: + raise HTTPException(404, "not found") + session.delete(connection) diff --git a/tests/benchmark/corpus/python/safe/safe_fastapi_route_dependencies_auth.py b/tests/benchmark/corpus/python/safe/safe_fastapi_route_dependencies_auth.py new file mode 100644 index 00000000..0dc50279 --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_fastapi_route_dependencies_auth.py @@ -0,0 +1,43 @@ +""" +Distilled from airflow `airflow-core/src/airflow/api_fastapi/core_api/routes/public/connections.py`: + @connections_router.delete( + "/{connection_id}", + dependencies=[Depends(requires_access_connection(method="DELETE"))], + ) + def delete_connection(connection_id: str, session: SessionDep): + connection = session.scalar(select(Connection).filter_by(conn_id=connection_id)) + ... + session.delete(connection) + +The route's `dependencies=[Depends(requires_access_*)]` declares the auth gate at +the FastAPI level. 
The ownership-check rule must recognise the dependency- +injected check and not flag the row-fetch / mutation as missing ownership. +""" +from fastapi import Depends, FastAPI + +router = FastAPI() + + +def requires_access_connection(method: str): + def check(): + ... + return check + + +@router.delete( + "/{connection_id}", + dependencies=[Depends(requires_access_connection(method="DELETE"))], +) +def delete_connection(connection_id: str, session): + connection = session.scalar(select(Connection).filter_by(conn_id=connection_id)) + if connection is None: + raise HTTPException(404, "not found") + session.delete(connection) + + +@router.get( + "/{connection_id}", + dependencies=[Depends(requires_access_connection(method="GET"))], +) +def get_connection(connection_id: str, session): + return session.scalar(select(Connection).filter_by(conn_id=connection_id)) diff --git a/tests/benchmark/corpus/python/safe/safe_fastapi_route_level_row_fetch.py b/tests/benchmark/corpus/python/safe/safe_fastapi_route_level_row_fetch.py new file mode 100644 index 00000000..a70757c3 --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_fastapi_route_level_row_fetch.py @@ -0,0 +1,79 @@ +""" +Distilled from airflow `airflow-core/src/airflow/api_fastapi/core_api/routes/public/dag_run.py`: + + @dag_run_router.post( + "", + dependencies=[Depends(requires_access_dag(method="POST", access_entity=DagAccessEntity.RUN))], + ) + def trigger_dag_run(dag_id, body, dag_bag, user, session, request): + dm = session.scalar(select(DagModel).where(DagModel.dag_id == dag_id)) + ... + dag = get_latest_version_of_dag(dag_bag, dag_id, session) + dag_run = dag.create_dagrun(run_id=params["run_id"], ...) 
+ +The route-level `dependencies=[Depends(requires_access_dag(method="POST", +access_entity=...))]` decorator authorizes the entire handler — the +handler body's `dag.create_dagrun(...)` call (where `dag` is a row +fetched using the auth-checked `dag_id`) must be covered too, even +though the call's subject is the bare row variable rather than the +original id. + +Before the route-level fix, `auth_check_covers_subject` walked +`check.subjects` (empty for decorator-level checks whose inner call +carries no per-arg ValueRef) and never matched. After the fix, +`is_route_level=true` short-circuits coverage to true for any +non-login-guard route-level check, suppressing both the row-fetch +ownership flag and the downstream method-call ownership flag. +""" + +from fastapi import Depends, FastAPI + +router = FastAPI() + + +def requires_access_dag(method: str, access_entity=None): + def check(): + ... + return check + + +def get_latest_version_of_dag(dag_bag, dag_id, session): + return dag_bag.get(dag_id) + + +@router.get( + "/{dag_id}/runs/{run_id}", + dependencies=[Depends(requires_access_dag(method="GET"))], +) +def get_dag_run(dag_id: str, run_id: str, session): + """ + Route-level guard authorizes the entire handler. The + `filter_by(dag_id=dag_id, run_id=run_id)` ORM call must NOT trip + `py.auth.missing_ownership_check` even though the per-arg subjects + are id-shaped — the route-level decorator covers them. + """ + dag_run = session.scalar( + select(DagRun).filter_by(dag_id=dag_id, run_id=run_id) + ) + if dag_run is None: + raise HTTPException(404, "not found") + return dag_run + + +@router.delete( + "/{dag_id}", + dependencies=[Depends(requires_access_dag(method="DELETE"))], +) +def delete_dag(dag_id: str, session): + """ + Same shape, DELETE method. The row fetch and row-variable + method call must also be fully covered by the route-level guard. 
+ `dag` is fetched using the auth-checked `dag_id`; without the + `is_route_level` short-circuit, the per-name walk would mismatch + `dag.` (subject is the row var) against the check's + empty subjects vec. + """ + dag = session.scalar(select(DagModel).where(DagModel.dag_id == dag_id)) + if dag is None: + raise HTTPException(404, "not found") + dag.cleanup_runs(session=session) diff --git a/tests/benchmark/corpus/python/safe/safe_pytest_sqlalchemy_session.py b/tests/benchmark/corpus/python/safe/safe_pytest_sqlalchemy_session.py new file mode 100644 index 00000000..08359db1 --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_pytest_sqlalchemy_session.py @@ -0,0 +1,33 @@ +""" +Distilled from airflow `tests/unit/models/test_backfill.py` and +`providers/google/tests/unit/google/cloud/hooks/test_dlp.py`: pytest test +methods that take a SQLAlchemy `session` fixture by name and call +`session.commit()` / `session.add(...)` / `session.scalar(...)`. + +Bare `session.` was previously classified as auth Session +context, which triggered `unit_has_user_input_evidence` even though the +test function takes no user input — the `session` fixture is the +SQLAlchemy ORM Session, not the auth/HTTP session. After the engine +classifier narrowing, only `session.` (`session.user`, +`session.user_id`, ...) is treated as auth context; SQLAlchemy verbs +do not contribute user-input evidence on their own. 
+""" + + +def test_reverse_and_depends_on_past_fails(dep_on_past, dag_maker, session): + with dag_maker() as dag: + pass + session.commit() + b = _create_backfill( + dag_id=dag.dag_id, + from_date="2021-01-01", + to_date="2021-01-05", + ) + if dep_on_past: + assert b is None + + +def test_create_deidentify_template_with_org_id(self, get_conn, mock_project_id): + get_conn.return_value.create_deidentify_template.return_value = {} + result = self.hook.create_deidentify_template(organization_id="ORG_ID") + assert result == {} diff --git a/tests/benchmark/corpus/rust/auth/db_connection_type_inferred.rs b/tests/benchmark/corpus/rust/auth/db_connection_type_inferred.rs index 8bd89a59..836c1353 100644 --- a/tests/benchmark/corpus/rust/auth/db_connection_type_inferred.rs +++ b/tests/benchmark/corpus/rust/auth/db_connection_type_inferred.rs @@ -2,7 +2,7 @@ // produces a `DatabaseConnection` via SSA `constructor_type` (through // `peel_identity_suffix`, which strips `.unwrap()` before matching). The // handler then calls `conn.execute(..)`, a callee name that appears in -// neither `mutation_indicator_names` nor `read_indicator_names` for Rust — +// neither `mutation_indicator_names` nor `read_indicator_names` for Rust , // name-based classification returns `None`, so the ownership gate // already cannot flag the call. The type-map refinement should *still* // leave the call unflagged (the type map produces `DbMutation`, but diff --git a/tests/benchmark/corpus/rust/auth/hashmap_local_noise.rs b/tests/benchmark/corpus/rust/auth/hashmap_local_noise.rs index e7fa5d2e..d45ad6ed 100644 --- a/tests/benchmark/corpus/rust/auth/hashmap_local_noise.rs +++ b/tests/benchmark/corpus/rust/auth/hashmap_local_noise.rs @@ -16,7 +16,7 @@ pub async fn handle_list_peer_docs(req: Req, ctx: Ctx) -> Result { let user = auth::require_auth(&req, &ctx).await?; let doc_ids: Vec = vec![1, 2, 3]; - // Pure in-memory bookkeeping — no authorization decision here. 
+ // Pure in-memory bookkeeping, no authorization decision here. let mut counts: HashMap = HashMap::new(); let mut seen: HashSet = HashSet::new(); for doc_id in &doc_ids { diff --git a/tests/benchmark/corpus/rust/auth/row_fetch_then_authorize.rs b/tests/benchmark/corpus/rust/auth/row_fetch_then_authorize.rs index f42733dd..57952768 100644 --- a/tests/benchmark/corpus/rust/auth/row_fetch_then_authorize.rs +++ b/tests/benchmark/corpus/rust/auth/row_fetch_then_authorize.rs @@ -2,7 +2,7 @@ // the row by id first to obtain the resource it needs to authorize, then // calls a named authorization function on the fetched row. This is the // canonical pattern in Lemmy's Actix handlers (and most row-level Rails / -// Django authz code) — the authorization check appears textually after the +// Django authz code), the authorization check appears textually after the // fetch but is the first thing the function does on the row. use std::result::Result; diff --git a/tests/benchmark/corpus/rust/auth/row_ownership_equality.rs b/tests/benchmark/corpus/rust/auth/row_ownership_equality.rs index 6eb68651..321d2cb2 100644 --- a/tests/benchmark/corpus/rust/auth/row_ownership_equality.rs +++ b/tests/benchmark/corpus/rust/auth/row_ownership_equality.rs @@ -41,7 +41,7 @@ pub async fn handle_delete_doc(req: Req, ctx: Ctx, doc_id: i64) -> Result Result` extractor whose `doc_id` // field is declared as `i64`. The DTO field-level taint analysis // proves the value reaching `db.exec` is numeric and exempts -// `dto.doc_id` from the auth subject classifier — the rule must NOT +// `dto.doc_id` from the auth subject classifier, the rule must NOT // fire because numeric DTO fields cannot bypass ownership. 
use axum::extract::Json; diff --git a/tests/benchmark/corpus/rust/auth/safe_local_collection_param_types.rs b/tests/benchmark/corpus/rust/auth/safe_local_collection_param_types.rs new file mode 100644 index 00000000..f5b8d8f8 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/safe_local_collection_param_types.rs @@ -0,0 +1,70 @@ +// Function-parameter type annotations naming an in-memory container +// (`RoaringBitmap`, `HashMap`, `HashSet`, ...) classify the +// receiver as `TypeKind::LocalCollection`, which the auth analyser +// maps to `SinkClass::InMemoryLocal` (always non-auth-relevant). +// Without this, the verb-name dispatch (`is_mutation: insert/remove`) +// classified `unsharded.insert(docid)` / +// `task_ids.insert(task_id)` as `DbMutation` and fired +// `missing_ownership_check` whenever the function had at least one +// id-shaped parameter to pass `unit_has_user_input_evidence`. +// +// Cluster surfaced from +// meilisearch/index-scheduler/src/scheduler/enterprise_edition/network.rs::balance_shards +// (`unsharded: RoaringBitmap` typed parameter) and same-pattern +// helpers across the index-scheduler. + +use std::collections::{BTreeSet, HashMap, HashSet}; + +struct RoaringBitmap; +impl RoaringBitmap { + fn new() -> Self { Self } + fn insert(&mut self, _x: u32) -> bool { true } + fn remove(&mut self, _x: u32) -> bool { true } + fn contains(&self, _x: u32) -> bool { true } +} + +// 1. Bare-typed RoaringBitmap parameter, function has id-like param +// `docid` so user-input-evidence fires; the receiver type proves +// the operation is in-memory bookkeeping. +fn balance_shards(mut unsharded: RoaringBitmap, docid: u32) { + unsharded.insert(docid); + unsharded.remove(docid); +} + +// 2. `&mut RoaringBitmap` reference, ref-stripping must reach the +// underlying type head. +fn process_docids(docids: &mut RoaringBitmap, docid: u32) { + docids.insert(docid); + docids.remove(docid); + let _ = docids.contains(docid); +} + +// 3. 
Lifetime-annotated reference: `&'a mut HashMap<...>`. +// Module-path prefix would also be dropped; head matches `HashMap`. +fn store_shard_docids<'a>( + new_shard_docids: &'a mut HashMap, + shard: String, + docid: u32, +) { + new_shard_docids.insert(shard, docid); +} + +// 4. Std-collection HashSet typed param. +fn add_user_id(ids: &mut HashSet, user_id: u64) { + ids.insert(user_id); + ids.remove(&user_id); +} + +// 5. Local var bound from constructor, already covered, but pinned +// here as a regression guard for the `RoaringBitmap::new()` +// constructor entry. +fn build_local_set(task_id: u32) -> RoaringBitmap { + let mut s = RoaringBitmap::new(); + s.insert(task_id); + s +} + +// 6. BTreeSet typed param. +fn collect_seen(seen: &mut BTreeSet, item_id: u32) { + seen.insert(item_id); +} diff --git a/tests/benchmark/corpus/rust/auth/safe_local_user_view_extractor.rs b/tests/benchmark/corpus/rust/auth/safe_local_user_view_extractor.rs index 73934b73..68288a85 100644 --- a/tests/benchmark/corpus/rust/auth/safe_local_user_view_extractor.rs +++ b/tests/benchmark/corpus/rust/auth/safe_local_user_view_extractor.rs @@ -1,7 +1,7 @@ // Real-repo motivation (lemmy `LocalUserView` extractor). // // Lemmy's authenticated-actor extractor type is named `LocalUserView` -// — every route handler signature is +//, every route handler signature is // `pub async fn handler(.., local_user_view: LocalUserView)`. The // previous exact-name list in `is_self_actor_type_text` // (`CurrentUser`, `SessionUser`, `AuthUser`, `AdminUser`, @@ -44,7 +44,7 @@ pub async fn write_self_note( pool: &mut Pool, local_user_view: LocalUserView, ) -> Result<(), ()> { - // Login predicate on the actor itself — subject is the actor. + // Login predicate on the actor itself, subject is the actor. // No additional ownership check needed because the subject is the // caller's own row. 
let _ = is_admin(&local_user_view); diff --git a/tests/benchmark/corpus/rust/auth/safe_param_type_segment_idents.rs b/tests/benchmark/corpus/rust/auth/safe_param_type_segment_idents.rs new file mode 100644 index 00000000..41f61f94 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/safe_param_type_segment_idents.rs @@ -0,0 +1,78 @@ +// Internal helper whose parameter list contains type-segment idents +// that lowercase-match the framework-request-name allow-list (`path`, +// `request`, `ctx`, `body`, `path`). Before the +// `collect_param_names` Rust-parameter arm, the recursive default arm +// pulled `std`, `path`, `Path` out of `dst: &std::path::Path` and +// pushed them into `unit.params`, `path` then matched the +// framework-name list and gated `unit_has_user_input_evidence` open, +// firing `missing_ownership_check` at every id-shaped operation in +// the body. +// +// Cluster surfaced from +// meilisearch/index-scheduler/src/scheduler/process_snapshot_creation.rs::remove_tasks +// (`unsafe fn remove_tasks(tasks: &[Task], dst: &std::path::Path, +// index_base_map_size: usize)`). None of the actual params (`tasks`, +// `dst`, `sz`) match the user-input-evidence heuristic, so the rule +// must NOT fire on the internal task-cleanup loop. 
+ +struct Task { + uid: u32, +} + +struct Database; + +impl Database { + fn delete(&self, _w: &mut u32, _u: &u32) -> Result<(), ()> { + Ok(()) + } +} + +struct TaskQueue { + all_tasks: Database, + canceled_by: Database, +} + +fn remove_tasks( + tasks: &[Task], + dst: &std::path::Path, + sz: usize, +) -> Result<(), ()> { + let _ = (dst, sz); + let mut wtxn = 0u32; + let task_queue = TaskQueue { + all_tasks: Database, + canceled_by: Database, + }; + let TaskQueue { + all_tasks, + canceled_by, + } = task_queue; + for task in tasks { + all_tasks.delete(&mut wtxn, &task.uid)?; + canceled_by.delete(&mut wtxn, &task.uid)?; + } + Ok(()) +} + +// Same shape with a typed wrapper whose tail segment lowercases to +// `path` (`PathBuf` → `pathbuf` does NOT match, but `Path` does). +// Confirms the Rust `parameter` arm in `collect_param_names` keeps +// `Path` out of `unit.params` even when wrapped in a generic. + +struct Wrapper(T); +struct PathHandle; +struct Item { + uid: u32, +} +struct Repo; +impl Repo { + fn delete(&self, _u: &u32) {} +} + +fn cleanup_internal(out: Wrapper, items: &[Item]) { + let _ = out; + let repo = Repo; + for item in items { + repo.delete(&item.uid); + } +} diff --git a/tests/benchmark/corpus/rust/auth/safe_row_fetch_multiline_let.rs b/tests/benchmark/corpus/rust/auth/safe_row_fetch_multiline_let.rs index be51e8b4..146d44be 100644 --- a/tests/benchmark/corpus/rust/auth/safe_row_fetch_multiline_let.rs +++ b/tests/benchmark/corpus/rust/auth/safe_row_fetch_multiline_let.rs @@ -4,7 +4,7 @@ // (the call body wraps onto the next line for readability). Before // the line-counting fix, `row_population_data` recorded the // `let_declaration`'s start row while `op.line` saw the inner call's -// start row — they differed by one and the row-fetch exemption +// start row, they differed by one and the row-fetch exemption // missed. Recording the **call**'s start line aligns the two and // the exemption fires for the multi-line shape too. 
@@ -52,7 +52,7 @@ pub async fn lock_comment( let comment_id = req.comment_id; let local_instance_id = local_user_view.person.instance_id; - // Multi-line let — the let_declaration starts on this line, but + // Multi-line let, the let_declaration starts on this line, but // the inner `CommentView::read(..)` call starts on the next line. // `op.line` for the read sink is the call's line, not the let's. let orig_comment = diff --git a/tests/benchmark/corpus/rust/auth/safe_row_population_reverse_walk.rs b/tests/benchmark/corpus/rust/auth/safe_row_population_reverse_walk.rs index 23bb2b5d..b749ad10 100644 --- a/tests/benchmark/corpus/rust/auth/safe_row_population_reverse_walk.rs +++ b/tests/benchmark/corpus/rust/auth/safe_row_population_reverse_walk.rs @@ -4,7 +4,7 @@ // `let community = Community::read(pool, req.community_id)` records // `community → [req.community_id]` in `row_population_data`. An auth // check `check_community_user_action(&user, &community, ..)` then -// authorises the row — and any **downstream** operation that re-uses +// authorises the row, and any **downstream** operation that re-uses // `req.community_id` (a later mutation by the same id, or a related // view fetched by the same id) is materially covered by that check. // @@ -71,7 +71,7 @@ pub async fn transfer_community( pool: &mut Pool, local_user_view: LocalUserView, ) -> Result<(), ()> { - // Row fetch — `community` is populated from `req.community_id`. + // Row fetch, `community` is populated from `req.community_id`. let community = Community::read(pool, req.community_id)?; // Authorisation check on the fetched row. Subject = `community` @@ -84,7 +84,7 @@ pub async fn transfer_community( // the row that was fetched with this id). 
CommunityActions::delete_mods_for_community(pool, req.community_id)?; - // Local alias of the same request field — `var_alias_chain` + // Local alias of the same request field, `var_alias_chain` // records `community_id → "req.community_id"` so the reverse-walk // also covers downstream sinks that pass the bare alias. Before // the alias-chain fix, the next read fired diff --git a/tests/benchmark/corpus/rust/auth/safe_typed_path_int_extractor.rs b/tests/benchmark/corpus/rust/auth/safe_typed_path_int_extractor.rs index adc7a9f2..6a652e59 100644 --- a/tests/benchmark/corpus/rust/auth/safe_typed_path_int_extractor.rs +++ b/tests/benchmark/corpus/rust/auth/safe_typed_path_int_extractor.rs @@ -1,7 +1,7 @@ // Phase 5 typed-extractor exclusion: an Axum-style `Path` // parameter is a framework-validated numeric extractor. The runtime // guarantees a numeric value, so even though `project_id` reaches a -// SQL helper, the rule must NOT fire — the value cannot carry an +// SQL helper, the rule must NOT fire, the value cannot carry an // injection payload nor bypass ownership. use axum::extract::Path; diff --git a/tests/benchmark/corpus/rust/auth/self_publish_email.rs b/tests/benchmark/corpus/rust/auth/self_publish_email.rs index d86b261d..0a62dc8c 100644 --- a/tests/benchmark/corpus/rust/auth/self_publish_email.rs +++ b/tests/benchmark/corpus/rust/auth/self_publish_email.rs @@ -36,7 +36,7 @@ mod serde_json { } // Real-repo shape from website/src/handlers/social.rs: -// `realtime::publish_to_user(&ctx.env, &user.email, ...)` — publish +// `realtime::publish_to_user(&ctx.env, &user.email, ...)`, publish // to the authed user's OWN channel keyed by their email. 
The // `email` / `username` / `handle` fields of a self-actor binding // reference the actor's own identity, just like `id` / `user_id`, diff --git a/tests/benchmark/corpus/rust/auth/self_scoped_user.rs b/tests/benchmark/corpus/rust/auth/self_scoped_user.rs index f27590e4..0ce808fa 100644 --- a/tests/benchmark/corpus/rust/auth/self_scoped_user.rs +++ b/tests/benchmark/corpus/rust/auth/self_scoped_user.rs @@ -5,7 +5,7 @@ mod auth { pub async fn require_auth(_r: &super::Req, _c: &super::Ctx) -> Result // The handler's `get_peer_ids(&db, user.id)` call below must not be // flagged. `user` is bound from `auth::require_auth(..)` so `user.id` -// is the caller's own id — the call is self-referential, not a foreign +// is the caller's own id, the call is self-referential, not a foreign // scoped id. The library-style helper below is a pass-through so its // body contains no DB sinks (the internal `user_id` → DB flow is a // separate pattern covered by helper-summary lifting). diff --git a/tests/benchmark/corpus/rust/auth/sql_join_acl.rs b/tests/benchmark/corpus/rust/auth/sql_join_acl.rs index 6da72bae..ef2a6108 100644 --- a/tests/benchmark/corpus/rust/auth/sql_join_acl.rs +++ b/tests/benchmark/corpus/rust/auth/sql_join_acl.rs @@ -2,7 +2,7 @@ // against an ACL table (`group_members`) with a WHERE clause that pins // the row to the current user (`gm.user_id = ?1` bound to `user.id`). // Every returned row is membership-gated by construction, so downstream -// uses of the row's columns (`group_id` here) are authorized — the +// uses of the row's columns (`group_id` here) are authorized, the // `realtime::publish_to_group` call MUST NOT be flagged as missing an // ownership check after B3. 
struct Ctx; diff --git a/tests/benchmark/corpus/rust/auth/transitive_helper.rs b/tests/benchmark/corpus/rust/auth/transitive_helper.rs index c1c2ae9d..17b60b5d 100644 --- a/tests/benchmark/corpus/rust/auth/transitive_helper.rs +++ b/tests/benchmark/corpus/rust/auth/transitive_helper.rs @@ -1,7 +1,7 @@ // target: authorization happens inside `validate_target`, which // internally calls `authz::require_membership` against the same // `group_id` the handler subsequently mutates. The current rule cannot -// see this transitively — B4 lifts per-function auth-check summaries +// see this transitively, B4 lifts per-function auth-check summaries // (which positional params are auth-checked) so the handler-level call // to `validate_target(&db, group_id, user.id)` is recognised as an // auth check covering `group_id`. Result: `db.exec(..)` MUST NOT flag @@ -45,7 +45,7 @@ pub async fn handle_create_comment( let user = auth::require_auth(&req, &ctx).await?; let db = Db; - // Authorization happens inside validate_target — helper-summary + // Authorization happens inside validate_target, helper-summary // lifting propagates the per-param auth check so this covers // `group_id`. validate_target(&db, group_id, user.id).await?; diff --git a/tests/benchmark/corpus/rust/auth/unsafe_dto_string_field_axum.rs b/tests/benchmark/corpus/rust/auth/unsafe_dto_string_field_axum.rs index 07231938..3961d7c5 100644 --- a/tests/benchmark/corpus/rust/auth/unsafe_dto_string_field_axum.rs +++ b/tests/benchmark/corpus/rust/auth/unsafe_dto_string_field_axum.rs @@ -1,7 +1,7 @@ // Phase 6 D06 (negative): same DTO shape as // `safe_dto_int_field_axum.rs` but the flow uses the `doc_id` field // whose declared type is `String`. Phase 6 must NOT exempt the -// member-access subject — String DTO fields can carry an injection +// member-access subject, String DTO fields can carry an injection // payload, so the auth rule must continue to fire. 
use axum::extract::Json; diff --git a/tests/benchmark/corpus/rust/auth/unsafe_handler_local_collection_does_not_blanket_suppress.rs b/tests/benchmark/corpus/rust/auth/unsafe_handler_local_collection_does_not_blanket_suppress.rs new file mode 100644 index 00000000..6a392bec --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/unsafe_handler_local_collection_does_not_blanket_suppress.rs @@ -0,0 +1,28 @@ +// Vulnerable counterpart to `safe_local_collection_param_types.rs` +// and `safe_param_type_segment_idents.rs`. Proves the LocalCollection +// receiver-type override and the Rust `parameter` arm in +// `collect_param_names` don't blanket-suppress real handlers that mix +// in-memory containers with persistent-store calls (`db.update`). +// Scoped identifier (`req.target_user_id`) flows into a real DB +// mutation with no preceding ownership check, must still fire. + +use std::collections::HashMap; + +struct DocumentRequest { + target_user_id: u64, + new_owner: u64, +} + +struct DbConnection; +impl DbConnection { + fn update_owner(&self, _doc_id: u64, _owner: u64) {} +} + +// `cache: &mut HashMap` is a local container, its +// mutations are non-auth-relevant. But `db.update_owner` is a +// real persistent-store write, classified as `DbMutation`, and the +// handler still has no auth check on `req.target_user_id`. 
+async fn change_owner(req: DocumentRequest, cache: &mut HashMap, db: DbConnection) { + cache.remove(&req.target_user_id); // local container op, OK + db.update_owner(req.target_user_id, req.new_owner); // <-- IDOR sink +} diff --git a/tests/benchmark/corpus/rust/auth/unsafe_row_fetch_no_authz.rs b/tests/benchmark/corpus/rust/auth/unsafe_row_fetch_no_authz.rs index f237fd33..93b0b394 100644 --- a/tests/benchmark/corpus/rust/auth/unsafe_row_fetch_no_authz.rs +++ b/tests/benchmark/corpus/rust/auth/unsafe_row_fetch_no_authz.rs @@ -1,4 +1,4 @@ -// Vulnerable counterpart to `row_fetch_then_authorize.rs` — the row is +// Vulnerable counterpart to `row_fetch_then_authorize.rs`, the row is // fetched by user-supplied id but no authorization function names it. // The row-fetch exemption must NOT fire here; the rule should still // flag the read as missing an ownership/membership check. diff --git a/tests/benchmark/corpus/rust/auth/unsafe_row_population_no_check.rs b/tests/benchmark/corpus/rust/auth/unsafe_row_population_no_check.rs index a8ccf877..ae9a2f0e 100644 --- a/tests/benchmark/corpus/rust/auth/unsafe_row_population_no_check.rs +++ b/tests/benchmark/corpus/rust/auth/unsafe_row_population_no_check.rs @@ -33,12 +33,12 @@ pub async fn transfer_community( req: TransferCommunity, pool: &mut Pool, ) -> Result<(), ()> { - // Row fetch — populates `community → [req.community_id]` — but + // Row fetch, populates `community → [req.community_id]`, but // no `check_*_action(&user, &community, ..)` follows. let _community = Community::read(pool, req.community_id)?; // Mutation by id with no preceding ownership/membership check. - // This is the genuine IDOR — must flag. + // This is the genuine IDOR, must flag. 
CommunityActions::delete_mods_for_community(pool, req.community_id)?; Ok(()) diff --git a/tests/benchmark/corpus/rust/sqli/sqli_metachar_gate_wrong_sink.rs b/tests/benchmark/corpus/rust/sqli/sqli_metachar_gate_wrong_sink.rs index ed9a5ead..0671f9cc 100644 --- a/tests/benchmark/corpus/rust/sqli/sqli_metachar_gate_wrong_sink.rs +++ b/tests/benchmark/corpus/rust/sqli/sqli_metachar_gate_wrong_sink.rs @@ -3,7 +3,7 @@ use rusqlite::Connection; fn main() { let user_id = env::var("USER_ID").unwrap(); - // Rejecting shell metacharacters does NOT make SQL injection safe — + // Rejecting shell metacharacters does NOT make SQL injection safe; // the metachar gate only covers shell-family sinks. if user_id.contains(";") || user_id.contains("|") { return; diff --git a/tests/benchmark/corpus/rust/traversal/traversal_no_sanitizer.rs b/tests/benchmark/corpus/rust/traversal/traversal_no_sanitizer.rs index 11d3aad1..1704ba62 100644 --- a/tests/benchmark/corpus/rust/traversal/traversal_no_sanitizer.rs +++ b/tests/benchmark/corpus/rust/traversal/traversal_no_sanitizer.rs @@ -1,6 +1,6 @@ // rs-path-006: Negative-case guard for PathFact. // -// No sanitiser and no narrowing — PathFact stays Top on every axis, so +// No sanitiser and no narrowing, PathFact stays Top on every axis, so // the FILE_IO sink MUST fire. This fixture guards against PathFact // over-suppression sneaking into `is_path_safe_for_sink`. use std::env; diff --git a/tests/benchmark/corpus/typescript/auth/safe_local_collection_receiver.ts b/tests/benchmark/corpus/typescript/auth/safe_local_collection_receiver.ts new file mode 100644 index 00000000..c04caf40 --- /dev/null +++ b/tests/benchmark/corpus/typescript/auth/safe_local_collection_receiver.ts @@ -0,0 +1,96 @@ +// Real-repo shape from excalidraw's element manipulation libraries +// (`packages/element/src/binding.ts`, `frame.ts`, `duplicate.ts`, +// `DebugCanvas.tsx`). 
In a pure data-manipulation function whose +// receiver is a JS built-in collection (`Map`, `Set`, `WeakMap`, +// `WeakSet`, `Array`) — either declared inline (`new Map()`), +// annotated directly (`m: Map`), or aliased via a same-file +// `type X = Map` — the call site is a container operation, +// not a data-layer read/mutation, and `js.auth.missing_ownership_check` +// must not flag. +// +// Closes the excalidraw FP cluster (66 → ~9 on +// `js.auth.missing_ownership_check`). The fix lives at the deepest +// representable layer: SSA `TypeFacts::constructor_type` recognises +// `new Map()` / `new Set()` constructors as +// `TypeKind::LocalCollection`; `cfg::params::ts_type_to_local_collection` +// extends `classify_param_type_ts` so explicitly-typed params resolve +// to `LocalCollection` independent of NestJS decorator presence; +// `cfg::dto::collect_type_alias_local_collections` populates a +// per-file `TYPE_ALIAS_LC` set so same-file `type X = Map<...>` +// aliases also resolve. The auth analyser already exempts +// `LocalCollection`-typed receivers via +// `auth_analysis::sink_class_for_type → InMemoryLocal`. + +type ElementsMap = Map; +type IdMap = Map; +type GroupSet = Set; +type ElementArray = readonly { id: string }[]; + +interface BindingFix { + elementId: string; +} + +// ── 1. Direct Map<...> annotation on a parameter ──────────────────── +function lookupBinding( + binding: BindingFix, + origIdToDuplicateId: Map, +): string | undefined { + return origIdToDuplicateId.get(binding.elementId); +} + +// ── 2. Same-file `type X = Map<...>` alias ───────────────────────── +function debugRender(elementsMap: ElementsMap, id: string) { + const bindable = elementsMap.get(id); + if (!bindable) return null; + return bindable; +} + +// ── 3. 
Set / WeakMap / WeakSet annotation ────────────────────────── +function trackVisited(visited: Set, key: string) { + if (!visited.has(key)) { + visited.add(key); + } + return visited.size; +} + +function rememberElement( + cache: WeakMap, + obj: object, + v: string, +) { + cache.set(obj, v); + return cache.get(obj); +} + +// ── 4. Array generics (`T[]`, `Array`, `ReadonlyArray`) ────── +function findItemArr(arr: { id: string }[], targetId: string) { + return arr.find((x) => x.id === targetId); +} + +function findItemReadonly(arr: ElementArray, targetId: string) { + return arr.find((x) => x.id === targetId); +} + +function findItemGeneric(arr: Array, v: string) { + return arr.find((x) => x === v); +} + +// ── 5. Local `new Map()` / `new Set()` constructors ──────────────── +function buildIndex(items: { id: string; v: string }[]) { + const idx = new Map(); + for (const it of items) { + idx.set(it.id, it.v); + } + return idx.get(items[0]?.id ?? ""); +} + +// ── 6. Type-alias chain (alias of alias) ─────────────────────────── +function aliasOfAlias(m: IdMap, k: string) { + return m.get(k); +} + +// ── 7. Set with `add` / `has` (mutation-side) ────────────────────── +function trackGroup(groups: GroupSet, g: string) { + groups.add(g); + return groups.has(g); +} diff --git a/tests/benchmark/corpus/typescript/auth/vuln_local_collection_does_not_blanket_suppress.ts b/tests/benchmark/corpus/typescript/auth/vuln_local_collection_does_not_blanket_suppress.ts new file mode 100644 index 00000000..5e1a16c6 --- /dev/null +++ b/tests/benchmark/corpus/typescript/auth/vuln_local_collection_does_not_blanket_suppress.ts @@ -0,0 +1,28 @@ +// Vulnerable counterpart to `safe_local_collection_receiver.ts`. +// +// Pinned to prove the LocalCollection-receiver fix does NOT +// blanket-suppress missing-ownership findings on real DB / API +// receivers that happen to share method names (`get`, `find`, `set`) +// with JS built-in collections. 
When the receiver type is a real +// `Prisma` / `Repository` / `db` chain — not a tracked Map / Set / +// Array — the auth analyser must still fire. + +interface PrismaClient { + user: { + findUnique(args: { where: { id: string } }): Promise<{ id: string } | null>; + update(args: { where: { id: string }; data: object }): Promise; + }; +} + +declare const prisma: PrismaClient; + +// User passes an attacker-controlled id. No prior auth check; receiver +// is a Prisma client (NOT a Map / Set / Array), so the missing-ownership +// rule must fire on `prisma.user.findUnique`. +export async function dangerousFetch(targetUserId: string) { + return prisma.user.findUnique({ where: { id: targetUserId } }); +} + +export async function dangerousMutate(targetUserId: string, data: object) { + return prisma.user.update({ where: { id: targetUserId }, data }); +} diff --git a/tests/benchmark/corpus/typescript/safe/safe_helper_with_validator.ts b/tests/benchmark/corpus/typescript/safe/safe_helper_with_validator.ts new file mode 100644 index 00000000..b3f943fa --- /dev/null +++ b/tests/benchmark/corpus/typescript/safe/safe_helper_with_validator.ts @@ -0,0 +1,33 @@ +// Helper-summary all_validated propagation (precision regression +// guard). The helper performs an indirect-validator check on +// `child.webhookUrl` and throws on failure; callers passing tainted +// `child` should NOT see the helper's `param_to_sink` summary refire +// because the validator inside the helper proved the path safe. +// +// Pinned by tests/lib::helper_with_validator_does_not_propagate_to_caller_via_summary. 
+ +import express, { Request, Response } from 'express'; +import axios from 'axios'; + +interface IWebhookFilterPart { + webhookUrl?: string; +} + +declare function validateUrlSsrf(url: string): Promise; + +async function getWebhookResponse(child: IWebhookFilterPart) { + const ssrfError = await validateUrlSsrf(child.webhookUrl); + if (ssrfError) { + throw new Error('blocked'); + } + return await axios.post(child.webhookUrl, {}); +} + +const app = express(); +app.use(express.json()); + +app.post('/run', async (req: Request, res: Response) => { + const child: IWebhookFilterPart = req.body.filter; + const r = await getWebhookResponse(child); + res.json({ r }); +}); diff --git a/tests/benchmark/corpus/typescript/safe/safe_indirect_validator.ts b/tests/benchmark/corpus/typescript/safe/safe_indirect_validator.ts new file mode 100644 index 00000000..a234520d --- /dev/null +++ b/tests/benchmark/corpus/typescript/safe/safe_indirect_validator.ts @@ -0,0 +1,23 @@ +// Indirect-validator branch narrowing (precision regression guard). +// Pattern: `const err = validateXxx(input); if (err) throw …;` — +// the validator's input is treated as validated on the success +// branch, so the downstream sink does not refire. +// +// Pinned by tests/lib::indirect_validator_narrowing_marks_arg_validated. 
+ +import express, { Request, Response } from 'express'; +import axios from 'axios'; + +declare function validateUrlSsrf(url: string): Promise; + +const app = express(); + +app.get('/proxy', async (req: Request, res: Response) => { + const target = req.query.url as string; + const ssrfError = await validateUrlSsrf(target); + if (ssrfError) { + throw new Error('blocked'); + } + const response = await axios.get(target); + res.send(response.data); +}); diff --git a/tests/benchmark/corpus/typescript/safe/safe_strapi_db_query_chain.ts b/tests/benchmark/corpus/typescript/safe/safe_strapi_db_query_chain.ts new file mode 100644 index 00000000..bed63dc3 --- /dev/null +++ b/tests/benchmark/corpus/typescript/safe/safe_strapi_db_query_chain.ts @@ -0,0 +1,37 @@ +// Strapi-style ORM accessor chain — `.db.query(MODEL_UID).(...)`. +// MODEL_UID is a literal model identifier, not raw SQL; the trailing +// findOne/findMany/create/update/delete/count are intrinsically parameterised +// by the ORM (per-call values arrive through field-keyed object literals +// that the driver escapes). Should NOT fire as a SQL-injection sink. 
+ +declare const strapi: any; + +async function getApiToken(whereParams: Record) { + const token = await strapi.db.query('admin::api-token').findOne({ + select: ['id', 'name'], + where: whereParams, + }); + return token; +} + +async function listTokens() { + return strapi.db.query('admin::api-token').findMany({ + where: { type: 'read-only' }, + }); +} + +async function createToken(data: unknown) { + return strapi.db.query('admin::api-token').create({ data }); +} + +async function updateToken(id: number, data: unknown) { + return strapi.db.query('admin::api-token').update({ where: { id }, data }); +} + +async function deleteToken(id: number) { + return strapi.db.query('admin::api-token').delete({ where: { id } }); +} + +async function countTokens() { + return strapi.db.query('admin::api-token').count(); +} diff --git a/tests/benchmark/corpus/typescript/sqli/sqli_db_query_concat.ts b/tests/benchmark/corpus/typescript/sqli/sqli_db_query_concat.ts new file mode 100644 index 00000000..e4d14f8d --- /dev/null +++ b/tests/benchmark/corpus/typescript/sqli/sqli_db_query_concat.ts @@ -0,0 +1,26 @@ +// Vulnerable counterpart — bare `connection.query(...)` and chained +// `db.query(...).then(...)` whose arg 0 is concatenated with attacker +// input. Both must still fire as SQL_QUERY sinks: the chain has no +// ORM-method outer call (`.then` is a Promise method, not an ORM +// accessor), and arg 0 is not a string literal in the second case. 
+ +import express, { Request, Response } from 'express'; + +declare const connection: any; +declare const db: any; + +const app = express(); + +app.get('/user', (req: Request, res: Response) => { + const name = req.query.name as string; + // bare SQL — real SQLi sink, no chain + connection.query(`SELECT * FROM users WHERE name = '${name}'`); +}); + +app.get('/by-id', async (req: Request, res: Response) => { + const id = req.query.id as string; + // chained `.then` is a Promise method, not an ORM accessor; arg 0 is + // also a binary_expression (not a string literal) so the ORM-shape + // recogniser refuses to suppress. + db.query("SELECT * FROM users WHERE id = " + id).then((rows: any) => res.json(rows[0])); +}); diff --git a/tests/benchmark/cve_corpus/java/CVE-2022-1471/patched.java b/tests/benchmark/cve_corpus/java/CVE-2022-1471/patched.java new file mode 100644 index 00000000..c48a4731 --- /dev/null +++ b/tests/benchmark/cve_corpus/java/CVE-2022-1471/patched.java @@ -0,0 +1,37 @@ +// Nyx CVE benchmark fixture (patched counterpart). +// +// CVE: CVE-2022-1471 +// Project: SnakeYAML (snakeyaml/snakeyaml) +// License: Apache-2.0 +// (https://github.com/snakeyaml/snakeyaml/blob/master/LICENSE.txt) +// Advisory: https://github.com/advisories/GHSA-mjmj-j48q-9wg2 +// +// Patched variant: the parser is constructed with `SafeConstructor`, +// which restricts the YAML tag handler set to primitives + standard +// collections. SnakeYAML 2.0 ships with `SafeConstructor` as the +// default; pre-2.0 consumers patched their own call sites to pass +// `SafeConstructor` explicitly (the form below). +// +// Patched-fix simplification: the upstream remediation also covers +// callers that need richer types via custom `Constructor` subclasses +// with declared safe types — those are out of scope for this fixture. 
+ +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.yaml.snakeyaml.LoaderOptions; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.SafeConstructor; + +public class YamlConfigServlet extends HttpServlet { + @Override + protected void doPost(HttpServletRequest req, HttpServletResponse res) throws Exception { + String body = req.getReader().readLine(); + // Patched: SafeConstructor forbids arbitrary class tags; + // any non-primitive `!!…` payload throws ConstructorException. + Yaml yaml = new Yaml(new SafeConstructor(new LoaderOptions())); + Object loaded = yaml.load(body); + res.setHeader("X-Yaml-Class", loaded.getClass().getName()); + res.setStatus(HttpServletResponse.SC_OK); + } +} diff --git a/tests/benchmark/cve_corpus/java/CVE-2022-1471/vulnerable.java b/tests/benchmark/cve_corpus/java/CVE-2022-1471/vulnerable.java new file mode 100644 index 00000000..e548d73e --- /dev/null +++ b/tests/benchmark/cve_corpus/java/CVE-2022-1471/vulnerable.java @@ -0,0 +1,43 @@ +// Nyx CVE benchmark fixture. +// +// CVE: CVE-2022-1471 +// Project: SnakeYAML (snakeyaml/snakeyaml; consumed via any app +// that constructs `new Yaml()` and calls `.load()` on +// attacker-controlled bytes) +// License: Apache-2.0 +// (https://github.com/snakeyaml/snakeyaml/blob/master/LICENSE.txt) +// Advisory: https://github.com/advisories/GHSA-mjmj-j48q-9wg2 +// https://nvd.nist.gov/vuln/detail/CVE-2022-1471 +// Vulnerable: SnakeYAML <= 1.33; the default `Constructor` accepts +// arbitrary tags (`!!javax.script.ScriptEngineManager`, +// `!!java.net.URLClassLoader`, etc.) and instantiates any +// class via reflection, reaching RCE on consumers that +// feed network input straight into Yaml.load(). 
+// +// Verbatim load-bearing lines: the unsafe `new Yaml()` construction +// and the `yaml.load(body)` call mirror the call-site shape called +// out in the advisory's "vulnerable code" example. The patched fix +// (next file) shows the SnakeYAML 2.0 fix pattern of explicitly +// passing `new SafeConstructor(new LoaderOptions())`. +// +// Trims: imports trimmed to just SnakeYAML and Servlet API; no +// helper / logging code. + +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.yaml.snakeyaml.Yaml; + +public class YamlConfigServlet extends HttpServlet { + @Override + protected void doPost(HttpServletRequest req, HttpServletResponse res) throws Exception { + String body = req.getReader().readLine(); + // Vulnerable: default Constructor allows arbitrary class + // instantiation via YAML tag handlers — `body` may contain + // `!!javax.script.ScriptEngineManager` and friends. + Yaml yaml = new Yaml(); + Object loaded = yaml.load(body); + res.setHeader("X-Yaml-Class", loaded.getClass().getName()); + res.setStatus(HttpServletResponse.SC_OK); + } +} diff --git a/tests/benchmark/cve_corpus/java/CVE-2022-42889/patched.java b/tests/benchmark/cve_corpus/java/CVE-2022-42889/patched.java new file mode 100644 index 00000000..9f19adbe --- /dev/null +++ b/tests/benchmark/cve_corpus/java/CVE-2022-42889/patched.java @@ -0,0 +1,33 @@ +// Nyx CVE benchmark fixture (patched counterpart). +// +// CVE: CVE-2022-42889 ("Text4Shell") +// Project: Apache Commons Text (apache/commons-text) +// License: Apache-2.0 +// (https://github.com/apache/commons-text/blob/master/LICENSE.txt) +// Advisory: https://github.com/advisories/GHSA-599f-7c49-w659 +// +// Patched variant: the substitutor is built with `new StringSubstitutor()` +// (no factory) so the lookup map is empty — `${anything}` becomes a +// literal pass-through. 
This is the recommended app-side mitigation +// for callers that cannot upgrade past 1.9, and it is also the +// behaviour of the 1.10.0 default `createDefault()` factory which +// drops the `script:` / `dns:` / `url:` interpolation lookups. + +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.apache.commons.text.StringSubstitutor; + +public class TemplateRenderServlet extends HttpServlet { + @Override + protected void doGet(HttpServletRequest req, HttpServletResponse res) throws Exception { + String input = req.getParameter("template"); + // Patched: no interpolator constructed; the substitutor has + // no lookups registered, so `${…}` is left as a literal in + // the rendered output. No script/dns/url evaluation. + StringSubstitutor substitutor = new StringSubstitutor(); + String rendered = substitutor.replace(input); + res.setHeader("X-Rendered-Length", String.valueOf(rendered.length())); + res.setStatus(HttpServletResponse.SC_OK); + } +} diff --git a/tests/benchmark/cve_corpus/java/CVE-2022-42889/vulnerable.java b/tests/benchmark/cve_corpus/java/CVE-2022-42889/vulnerable.java new file mode 100644 index 00000000..54cd69be --- /dev/null +++ b/tests/benchmark/cve_corpus/java/CVE-2022-42889/vulnerable.java @@ -0,0 +1,45 @@ +// Nyx CVE benchmark fixture. +// +// CVE: CVE-2022-42889 (a.k.a. "Text4Shell") +// Project: Apache Commons Text (apache/commons-text); consumed via +// any app that calls `StringSubstitutor.createInterpolator()` +// on attacker-controlled input. +// License: Apache-2.0 +// (https://github.com/apache/commons-text/blob/master/LICENSE.txt) +// Advisory: https://github.com/advisories/GHSA-599f-7c49-w659 +// https://nvd.nist.gov/vuln/detail/CVE-2022-42889 +// Vulnerable: commons-text 1.5 .. 1.9. 
`createInterpolator()` +// enables the `script:`, `dns:`, and `url:` lookups by +// default, so a substitution like `${script:javascript:…}` +// evaluates JavaScript via the JSR-223 ScriptEngineManager +// — full RCE on any consumer that feeds untrusted input +// through `.replace()`. +// +// Verbatim load-bearing lines: the `StringSubstitutor.createInterpolator()` +// factory call and the `interpolator.replace(input)` sink mirror the +// minimal triggering pattern published in the OSS-Security advisory +// (https://www.openwall.com/lists/oss-security/2022/10/13/4) and the +// vendor mitigation guidance for 1.10.0. +// +// Trims: imports limited to commons-text + servlet; no surrounding +// templating boilerplate. + +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.apache.commons.text.StringSubstitutor; + +public class TemplateRenderServlet extends HttpServlet { + @Override + protected void doGet(HttpServletRequest req, HttpServletResponse res) throws Exception { + String input = req.getParameter("template"); + // Vulnerable: createInterpolator() enables script:/dns:/url: + // lookups by default; .replace() evaluates them against + // `input` — `${script:js:…}` → arbitrary JavaScript via the + // JDK ScriptEngineManager. + StringSubstitutor interpolator = StringSubstitutor.createInterpolator(); + String rendered = interpolator.replace(input); + res.setHeader("X-Rendered-Length", String.valueOf(rendered.length())); + res.setStatus(HttpServletResponse.SC_OK); + } +} diff --git a/tests/benchmark/cve_corpus/python/CVE-2025-69662/patched.py b/tests/benchmark/cve_corpus/python/CVE-2025-69662/patched.py new file mode 100644 index 00000000..7c640d1b --- /dev/null +++ b/tests/benchmark/cve_corpus/python/CVE-2025-69662/patched.py @@ -0,0 +1,47 @@ +# Nyx CVE benchmark fixture. 
+# +# CVE: CVE-2025-69662 +# Project: geopandas (geopandas/geopandas) +# License: BSD-3-Clause (https://github.com/geopandas/geopandas/blob/main/LICENSE.txt) +# Advisory: https://github.com/advisories/GHSA-6497-prx7-gpmq +# Patched: 6aa8ef14ffdee4ba1044349ab948e1a1fbfaf419 geopandas/io/sql.py:432-438 +# +# Fix: replace the f-string-built Find_SRID probe with a +# bound-parameter SQLAlchemy text() statement; SQLAlchemy passes the +# values via the driver's parameter binding, so attacker-supplied +# identifiers can no longer break out of the literal context. +# +# Trims: +# - Same scaffolding trim as vulnerable.py — `.fetchone()[0]` (post- +# sink result extraction) removed. +# - Patched-fix simplification: the upstream fix nests +# `text(...).bindparams(...)` directly inside `connection.execute(...)`. +# The fixture lifts the bound-parameter clause into a local `stmt` +# so the `.bindparams` call is a top-level CFG node — without this +# reshape, cfg-unguarded-sink fires on the surrounding execute +# because the inlined sanitizer-in-arg shape is not yet recognised +# by the dominator-based guard check. The verbatim bytes of the +# `text(...).bindparams(...)` clause are preserved. + +from flask import Flask, request +from sqlalchemy import create_engine, text + +app = Flask(__name__) +engine = create_engine("postgresql://localhost/geo") + + +@app.post("/upload-layer") +def upload_layer(): + body = request.get_json(force=True) or {} + geom_name = body.get("geom_name", "geom") + name = body.get("table", "data") + schema_name = body.get("schema", "public") + with engine.begin() as connection: + # Verbatim bytes from sql.py:433-437 — bound-parameter probe. 
+ stmt = text( + "SELECT Find_SRID(:schema_name, :name, :geom_name);" + ).bindparams( + schema_name=schema_name, name=name, geom_name=geom_name + ) + connection.execute(stmt) + return {"ok": True} diff --git a/tests/benchmark/cve_corpus/python/CVE-2025-69662/vulnerable.py b/tests/benchmark/cve_corpus/python/CVE-2025-69662/vulnerable.py new file mode 100644 index 00000000..9b0f8d57 --- /dev/null +++ b/tests/benchmark/cve_corpus/python/CVE-2025-69662/vulnerable.py @@ -0,0 +1,46 @@ +# Nyx CVE benchmark fixture. +# +# CVE: CVE-2025-69662 +# Project: geopandas (geopandas/geopandas) +# License: BSD-3-Clause (https://github.com/geopandas/geopandas/blob/main/LICENSE.txt) +# Advisory: https://github.com/advisories/GHSA-6497-prx7-gpmq +# Vulnerable: c301579e0ac4034c19bece63c08bf628613700b4 geopandas/io/sql.py:432-435 +# +# geopandas.GeoDataFrame.to_postgis() concatenated the GeoDataFrame's +# geometry column name (and the schema/table names) into a Find_SRID +# probe via f-string. A user uploading a GeoDataFrame whose geometry +# column was named with embedded SQL (e.g. "geom'); DROP TABLE...--") +# achieved arbitrary SQL execution against the target Postgres database. +# +# Trims: +# - Surrounding to_postgis() body (CRS lookup, EWKB conversion, dtype +# dict construction at L399-422) that scaffolds the vulnerable +# Find_SRID probe. +# - Trailing `.fetchone()[0]` on the connection.execute(...) result — +# downstream of the sink (result extraction), not on the flow path. +# +# Only the source statement (geom_name from request input), the +# f-string SQL builder, and the connection.execute(text(...)) sink are +# preserved verbatim from sql.py:432-435. + +from flask import Flask, request +from sqlalchemy import create_engine, text + +app = Flask(__name__) +engine = create_engine("postgresql://localhost/geo") + + +@app.post("/upload-layer") +def upload_layer(): + body = request.get_json(force=True) or {} + # geom_name is supplied by the API caller — no validation upstream. 
+ geom_name = body.get("geom_name", "geom") + name = body.get("table", "data") + schema_name = body.get("schema", "public") + with engine.begin() as connection: + # Verbatim from sql.py:432-435 — Find_SRID probe with + # f-string-interpolated identifiers. + connection.execute( + text(f"SELECT Find_SRID('{schema_name}', '{name}', '{geom_name}');") + ) + return {"ok": True} diff --git a/tests/benchmark/cve_corpus/python/CVE-2026-33626/patched.py b/tests/benchmark/cve_corpus/python/CVE-2026-33626/patched.py new file mode 100644 index 00000000..409f6533 --- /dev/null +++ b/tests/benchmark/cve_corpus/python/CVE-2026-33626/patched.py @@ -0,0 +1,79 @@ +# Nyx CVE benchmark fixture. +# +# CVE: CVE-2026-33626 +# Project: LMDeploy (InternLM/lmdeploy) +# License: Apache-2.0 (https://github.com/InternLM/lmdeploy/blob/main/LICENSE) +# Advisory: https://github.com/advisories/GHSA-25c5-rg58-mhxh +# Patched: 71d64a339edb901e9005358e0633fbbab367d626 lmdeploy/vl/media/connection.py:24-69 +# +# Fix: introduce `_is_safe_url(url)` which resolves the hostname via +# `socket.getaddrinfo`, walks every returned IP, and rejects any that +# aren't `is_global` (covers loopback, RFC1918 private, link-local, +# multicast, reserved, unspecified). The vulnerable scheme-only check +# is replaced by this allowlist gate before the fetch. +# +# Trims: same scaffolding trim as vulnerable.py — MediaIO generic +# plumbing replaced with a Flask handler; fetch_timeout env-var +# resolution collapsed to a literal. The `_is_safe_url` body, the +# replacement gate at L55-58, and the `client.get(..., +# allow_redirects=True)` fetch are preserved verbatim from the fix +# commit. 
+ +import ipaddress +import socket +from urllib.parse import urlparse + +import requests +from flask import Flask, request + +app = Flask(__name__) +headers = {"User-Agent": "Mozilla/5.0"} + + +def _is_safe_url(url: str) -> tuple[bool, str]: + """Check if the URL is safe to fetch (not internal/private).""" + try: + parsed = urlparse(url) + if parsed.scheme not in ('http', 'https'): + return False, f'Unsupported scheme: {parsed.scheme}' + + hostname = parsed.hostname + if not hostname: + return False, 'Could not parse hostname from URL' + + # check all IPs (IPv4 + IPv6) using getaddrinfo + try: + infos = socket.getaddrinfo(hostname, None) + except socket.gaierror: + return False, 'Hostname resolution failed' + + for info in infos: + ip = ipaddress.ip_address(info[4][0]) + # block any IP that is not globally routable + if not ip.is_global: + return False, f'Blocked non-global IP detected: {ip}' + + return True, 'URL is safe' + except Exception as e: + return False, f'URL validation failed: {str(e)}' + + +@app.post("/load-image") +def load_image(): + body = request.get_json(force=True) or {} + url = body.get("url", "") + url_spec = urlparse(url) + # Verbatim from connection.py:55-58 — replaces the scheme-only + # check with a private-IP-blocking allowlist. + is_safe, reason = _is_safe_url(url_spec.geturl()) + if not is_safe: + raise ValueError(f'URL is blocked for security reasons: {reason}') + + fetch_timeout = 10 + client = requests.Session() + client.max_redirects = 3 + response = client.get( + url_spec.geturl(), headers=headers, timeout=fetch_timeout, allow_redirects=True + ) + response.raise_for_status() + return {"size": len(response.content)} diff --git a/tests/benchmark/cve_corpus/python/CVE-2026-33626/vulnerable.py b/tests/benchmark/cve_corpus/python/CVE-2026-33626/vulnerable.py new file mode 100644 index 00000000..1b1ade8b --- /dev/null +++ b/tests/benchmark/cve_corpus/python/CVE-2026-33626/vulnerable.py @@ -0,0 +1,51 @@ +# Nyx CVE benchmark fixture. 
+# +# CVE: CVE-2026-33626 +# Project: LMDeploy (InternLM/lmdeploy) +# License: Apache-2.0 (https://github.com/InternLM/lmdeploy/blob/main/LICENSE) +# Advisory: https://github.com/advisories/GHSA-25c5-rg58-mhxh +# Vulnerable: 819a80836e991ca3f427b0e85faca159083d3d40 lmdeploy/vl/media/connection.py:23-37 +# +# LMDeploy's vision-language image loader accepted user-supplied +# image URLs from the chat-completion request and fetched them via +# `requests.Session().get(url)` after only a scheme check. Attackers +# embedded URLs pointing at internal network services or cloud +# metadata endpoints (e.g. http://169.254.169.254/...) and exfiltrated +# the response back through the model output. +# +# Trims: +# - Surrounding _load_data_url / file-URL branches that don't reach +# the HTTP sink (lines 41+). +# - The scheme-only allowlist check at L24-25 of upstream. The +# CVE is host-based SSRF (private IP / cloud-metadata host); the +# scheme check was the insufficient validation the fix replaces. +# Removing it keeps the load-bearing source → sink flow intact. +# - The fetch_timeout env-var resolution (L28-31) — collapsed to a +# literal so the fixture is self-contained. +# - MediaIO[_M] generic plumbing — replaced with a Flask handler so +# the source is a concrete request flow. +# +# The verbatim load-bearing lines are the `client = requests.Session()` +# constructor and the `client.get(url_spec.geturl(), headers=headers, +# timeout=fetch_timeout)` fetch site at lines 33-34 of upstream. + +from urllib.parse import urlparse + +import requests +from flask import Flask, request + +app = Flask(__name__) +headers = {"User-Agent": "Mozilla/5.0"} + + +@app.post("/load-image") +def load_image(): + body = request.get_json(force=True) or {} + url = body.get("url", "") + url_spec = urlparse(url) + fetch_timeout = 10 + # Verbatim from connection.py:33-34 — Session().get(url). 
+ client = requests.Session() + response = client.get(url_spec.geturl(), headers=headers, timeout=fetch_timeout) + response.raise_for_status() + return {"size": len(response.content)} diff --git a/tests/benchmark/cve_corpus/rust/CVE-2018-20997/patched.rs b/tests/benchmark/cve_corpus/rust/CVE-2018-20997/patched.rs index 794660d4..6724ec06 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2018-20997/patched.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2018-20997/patched.rs @@ -2,7 +2,7 @@ // // CVE: CVE-2018-20997 // Advisory: https://rustsec.org/advisories/RUSTSEC-2018-0003 -// Project: tar-rs — zip-slip fix +// Project: tar-rs, zip-slip fix // License: MIT OR Apache-2.0 // // Patched variant: the extractor rejects any entry path that contains diff --git a/tests/benchmark/cve_corpus/rust/CVE-2018-20997/vulnerable.rs b/tests/benchmark/cve_corpus/rust/CVE-2018-20997/vulnerable.rs index 9b10f45d..80d4adde 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2018-20997/vulnerable.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2018-20997/vulnerable.rs @@ -2,18 +2,18 @@ // // CVE: CVE-2018-20997 // Advisory: https://rustsec.org/advisories/RUSTSEC-2018-0003 -// Project: tar-rs (alexcrichton/tar-rs) — "zip slip" on tar extraction +// Project: tar-rs (alexcrichton/tar-rs), "zip slip" on tar extraction // License: MIT OR Apache-2.0 (https://github.com/alexcrichton/tar-rs/blob/main/LICENSE-MIT) // // tar-rs <= 0.4.15 trusted tar entry paths verbatim when unpacking. // A crafted archive with an entry named `../../etc/shadow` would cause // `Archive::unpack` to write outside the destination directory, giving // malicious tarballs arbitrary file write. Every consumer that -// streamed user-supplied archives — package managers, OCI tooling, -// container image importers — inherited the traversal. +// streamed user-supplied archives, package managers, OCI tooling, +// container image importers, inherited the traversal. 
// -// This fixture is a minimal reproducer of the unsafe sink pattern — -// attacker-controlled archive entry path -> fs::File::create(path) — not +// This fixture is a minimal reproducer of the unsafe sink pattern , +// attacker-controlled archive entry path -> fs::File::create(path), not // an excerpt of tar-rs internals. The entry path is modelled as an env // var so the single-file benchmark harness sees the flow; in a real // extractor the same shape fires for `archive.entries()?.map(|e| diff --git a/tests/benchmark/cve_corpus/rust/CVE-2022-36113/patched.rs b/tests/benchmark/cve_corpus/rust/CVE-2022-36113/patched.rs index 272609cc..d548f06f 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2022-36113/patched.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2022-36113/patched.rs @@ -2,7 +2,7 @@ // // CVE: CVE-2022-36113 // Advisory: https://blog.rust-lang.org/2022/09/14/cargo-cves.html -// Project: cargo — `.cargo-ok` symlink follow fix +// Project: cargo, `.cargo-ok` symlink follow fix // License: MIT OR Apache-2.0 // // Patched variant: the crate name is passed through diff --git a/tests/benchmark/cve_corpus/rust/CVE-2022-36113/vulnerable.rs b/tests/benchmark/cve_corpus/rust/CVE-2022-36113/vulnerable.rs index a88d35a4..b6c65e6e 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2022-36113/vulnerable.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2022-36113/vulnerable.rs @@ -3,7 +3,7 @@ // CVE: CVE-2022-36113 // Advisory: https://blog.rust-lang.org/2022/09/14/cargo-cves.html // https://rustsec.org/advisories/RUSTSEC-2022-0064 -// Project: cargo (rust-lang/cargo) — "Arbitrary file corruption through +// Project: cargo (rust-lang/cargo), "Arbitrary file corruption through // crate extraction" (`.cargo-ok` symlink following) // License: MIT OR Apache-2.0 (https://github.com/rust-lang/cargo/blob/master/LICENSE-MIT) // @@ -15,9 +15,9 @@ // switched the marker open to `OpenOptions::create_new(true)` so a // pre-existing symlink aborts the extraction. 
// -// This fixture is a minimal reproducer of the unsafe sink pattern — +// This fixture is a minimal reproducer of the unsafe sink pattern , // attacker-controlled crate name plumbed into the marker path -> -// fs::File::create(marker) through a symlink — not an excerpt of cargo +// fs::File::create(marker) through a symlink, not an excerpt of cargo // internals. use std::env; use std::fs::File; diff --git a/tests/benchmark/cve_corpus/rust/CVE-2024-24576/patched.rs b/tests/benchmark/cve_corpus/rust/CVE-2024-24576/patched.rs index c79c25d0..8d191197 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2024-24576/patched.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2024-24576/patched.rs @@ -2,7 +2,7 @@ // // CVE: CVE-2024-24576 // Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0003 -// Project: Rust standard library — "BatBadBut" +// Project: Rust standard library, "BatBadBut" // License: MIT OR Apache-2.0 // // Patched variant: the caller filters the argument through a cmd.exe- diff --git a/tests/benchmark/cve_corpus/rust/CVE-2024-24576/vulnerable.rs b/tests/benchmark/cve_corpus/rust/CVE-2024-24576/vulnerable.rs index 32698660..be1f751f 100644 --- a/tests/benchmark/cve_corpus/rust/CVE-2024-24576/vulnerable.rs +++ b/tests/benchmark/cve_corpus/rust/CVE-2024-24576/vulnerable.rs @@ -3,7 +3,7 @@ // CVE: CVE-2024-24576 // Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0003 // Blog: https://blog.rust-lang.org/2024/04/09/cve-2024-24576.html -// Project: Rust standard library (std::process::Command) — "BatBadBut" +// Project: Rust standard library (std::process::Command), "BatBadBut" // License: MIT OR Apache-2.0 (https://github.com/rust-lang/rust/blob/master/COPYRIGHT) // // Rust < 1.77.2 on Windows built the argv for .bat/.cmd invocations by @@ -14,8 +14,8 @@ // line, and every consumer of `std::process::Command::new("...bat")` // on Windows inherited the RCE. 
// -// This fixture is a minimal reproducer of the unsafe sink pattern — -// caller-supplied input -> Command::new("update.bat").arg(name) — not +// This fixture is a minimal reproducer of the unsafe sink pattern , +// caller-supplied input -> Command::new("update.bat").arg(name), not // an excerpt of rustc / libstd internals. The source is modelled as // `env::var` so the single-file benchmark harness sees the flow; in a // real deployment the same shape fires for an Axum/Actix/Rocket handler diff --git a/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/patched.ts b/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/patched.ts new file mode 100644 index 00000000..e91af55d --- /dev/null +++ b/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/patched.ts @@ -0,0 +1,62 @@ +// Nyx CVE benchmark fixture (patched counterpart). +// +// CVE: GHSA-4x48-cgf9-q33f (no CVE id assigned) +// Project: Novu (novuhq/novu) +// License: MIT (libs/application-generic — see LICENSE-MIT) +// Advisory: https://github.com/novuhq/novu/security/advisories/GHSA-4x48-cgf9-q33f +// Patched: 87d965eb88340ac7cd262dd52c8015acd092dc68 +// libs/application-generic/src/usecases/conditions-filter/conditions-filter.usecase.ts:241-289 +// +// The fix performs the existing call-site SSRF check `validateUrlSsrf` +// (already used by the HTTP-Request workflow step) before the webhook +// POST. The branch validates protocol/host and rejects when the URL +// hits localhost/private/cloud-metadata addresses; only on success +// does control reach axios.post. +// +// Patched-fix simplification: validateUrlSsrf is sourced from +// '../../utils/ssrf-url-validation.ts' upstream — inlined here as a +// no-op signature so the fixture parses without the larger novu +// monorepo. The branch shape (early throw on truthy ssrfError) is +// verbatim from the patch. 
+ +import express, { Request, Response } from 'express'; +import axios from 'axios'; + +interface IWebhookFilterPart { + webhookUrl?: string; +} + +declare function validateUrlSsrf(url: string): Promise; + +async function getWebhookResponse( + child: IWebhookFilterPart, +): Promise | undefined> { + if (!child.webhookUrl) return undefined; + + const payload = {}; + const config: { headers: Record } = { headers: {} }; + + const ssrfError = await validateUrlSsrf(child.webhookUrl); + + if (ssrfError) { + throw new Error( + JSON.stringify({ + message: ssrfError, + data: 'Webhook URL blocked by SSRF protection.', + }) + ); + } + + return await axios.post(child.webhookUrl, payload, config).then((response) => { + return response.data as Record; + }); +} + +const app = express(); +app.use(express.json()); + +app.post('/conditions-filter/run', async (req: Request, res: Response) => { + const child: IWebhookFilterPart = req.body.filter; + const result = await getWebhookResponse(child); + res.json({ result }); +}); diff --git a/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/vulnerable.ts b/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/vulnerable.ts new file mode 100644 index 00000000..d3fc5d66 --- /dev/null +++ b/tests/benchmark/cve_corpus/typescript/GHSA-4x48-cgf9-q33f/vulnerable.ts @@ -0,0 +1,53 @@ +// Nyx CVE benchmark fixture. +// +// CVE: GHSA-4x48-cgf9-q33f (no CVE id assigned) +// Project: Novu (novuhq/novu) +// License: MIT (libs/application-generic — see LICENSE-MIT) +// Advisory: https://github.com/novuhq/novu/security/advisories/GHSA-4x48-cgf9-q33f +// Vulnerable: 87d965eb88340ac7cd262dd52c8015acd092dc68^ +// libs/application-generic/src/usecases/conditions-filter/conditions-filter.usecase.ts:241-272 +// +// `getWebhookResponse` POSTs to a user-configured webhook URL using raw +// `axios.post(child.webhookUrl, ...)` with no SSRF validation. 
The +// `child` filter part is sourced from a workflow filter config the +// caller controls, so the URL flows attacker-influenced into axios. +// +// Trims: +// - HMAC config branch (verbatim kept; not on the flow path but +// trivial scaffolding to keep the call shape). +// - buildHmac, buildPayload, processFilter dispatcher, environment +// repository lookups, decryptApiKey usage. Verbatim load-bearing +// lines are the IWebhookFilterPart param shape and the +// axios.post(child.webhookUrl, payload, config) call. + +import express, { Request, Response } from 'express'; +import axios from 'axios'; + +interface IWebhookFilterPart { + webhookUrl?: string; +} + +async function getWebhookResponse( + child: IWebhookFilterPart, +): Promise | undefined> { + if (!child.webhookUrl) return undefined; + + const payload = {}; + + const config: { headers: Record } = { + headers: {}, + }; + + return await axios.post(child.webhookUrl, payload, config).then((response) => { + return response.data as Record; + }); +} + +const app = express(); +app.use(express.json()); + +app.post('/conditions-filter/run', async (req: Request, res: Response) => { + const child: IWebhookFilterPart = req.body.filter; + const result = await getWebhookResponse(child); + res.json({ result }); +}); diff --git a/tests/benchmark/ground_truth.json b/tests/benchmark/ground_truth.json index 4a9796e0..a6483664 100644 --- a/tests/benchmark/ground_truth.json +++ b/tests/benchmark/ground_truth.json @@ -3,7 +3,7 @@ "metadata": { "description": "Nyx benchmark ground truth", "created": "2026-03-20", - "corpus_size": 433 + "corpus_size": 458 }, "cases": [ { @@ -8394,6 +8394,35 @@ "disabled": false, "notes": "Prisma $queryRawUnsafe \u2014 TS-specific ORM sink" }, + { + "case_id": "ts-sqli-003", + "file": "typescript/sqli/sqli_db_query_concat.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "sqli", + "cwe": "CWE-89", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": 
"rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [ + "cfg-unguarded-sink" + ], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "sqli", + "real-repo-precision-2026-04-29", + "regression-guard" + ], + "disabled": false, + "notes": "Vulnerable counterpart for ts-safe-017 \u2014 bare `connection.query(`SELECT...`)` and chained `db.query(SQL).then(...)` (Promise method, not ORM accessor) must still fire as SQL_QUERY sinks even after the ORM-chain recogniser landed." + }, { "case_id": "ts-cmdi-001", "file": "typescript/cmdi/cmdi_exec_template.ts", @@ -9560,6 +9589,72 @@ "disabled": false, "notes": "CVE-2023-26159 patched counterpart: URL allowlist check guards axios.get; regression guard that Nyx does not refire on the fix" }, + { + "case_id": "cve-ts-ghsa-4x48-cgf9-q33f-vulnerable", + "file": "cve_corpus/typescript/GHSA-4x48-cgf9-q33f/vulnerable.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "ssrf", + "cwe": "CWE-918", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 51, + 51 + ] + ], + "expected_source_lines": [ + [ + 50, + 50 + ] + ], + "tags": [ + "cve", + "novu", + "ssrf", + "vulnerable" + ], + "disabled": false, + "notes": "GHSA-4x48-cgf9-q33f: Novu conditions-filter webhook bypassed validateUrlSsrf; raw axios.post(child.webhookUrl) is the cross-function SSRF sink. MIT-licensed libs/application-generic package." 
+ }, + { + "case_id": "cve-ts-ghsa-4x48-cgf9-q33f-patched", + "file": "cve_corpus/typescript/GHSA-4x48-cgf9-q33f/patched.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "novu", + "patched", + "negative" + ], + "disabled": false, + "notes": "GHSA-4x48-cgf9-q33f patched: validateUrlSsrf(child.webhookUrl) followed by `if (ssrfError) throw` guards the axios.post call; regression guard for the indirect-validator branch narrowing + summary all_validated propagation." + }, { "case_id": "cve-py-2017-18342-vulnerable", "file": "cve_corpus/python/CVE-2017-18342/vulnerable.py", @@ -9629,6 +9724,144 @@ "disabled": false, "notes": "CVE-2017-18342 patched counterpart: yaml.safe_load replaces yaml.load; regression guard that Nyx does not refire on the fix" }, + { + "case_id": "cve-py-2025-69662-vulnerable", + "file": "cve_corpus/python/CVE-2025-69662/vulnerable.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "sql_injection", + "cwe": "CWE-89", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "py.sqli.text_format" + ], + "allowed_alternative_rule_ids": [ + "taint-unsanitised-flow" + ], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 43, + 44 + ] + ], + "expected_source_lines": [ + [ + 35, + 35 + ] + ], + "tags": [ + "cve", + "geopandas", + "sql_injection", + "flask", + "sqlalchemy" + ], + "disabled": false, + "notes": "CVE-2025-69662: geopandas to_postgis() interpolated GeoDataFrame's geometry column name 
into Find_SRID probe via f-string; SQL injection on user-uploaded layer. BSD-3-Clause" + }, + { + "case_id": "cve-py-2025-69662-patched", + "file": "cve_corpus/python/CVE-2025-69662/patched.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.sqli.text_format", + "py.sqli.execute_format", + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "geopandas", + "patched", + "negative" + ], + "disabled": false, + "notes": "CVE-2025-69662 patched counterpart: text(...).bindparams() replaces f-string interpolation; regression guard that Nyx does not refire on the fix" + }, + { + "case_id": "cve-py-2026-33626-vulnerable", + "file": "cve_corpus/python/CVE-2026-33626/vulnerable.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "ssrf", + "cwe": "CWE-918", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 49, + 49 + ] + ], + "expected_source_lines": [ + [ + 43, + 43 + ] + ], + "tags": [ + "cve", + "lmdeploy", + "ssrf", + "flask", + "requests" + ], + "disabled": false, + "notes": "CVE-2026-33626: LMDeploy vision-language image loader fetched user-supplied URLs via requests.Session().get without private-IP guard; SSRF / cloud-metadata exfil. 
Apache-2.0" + }, + { + "case_id": "cve-py-2026-33626-patched", + "file": "cve_corpus/python/CVE-2026-33626/patched.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "lmdeploy", + "patched", + "negative" + ], + "disabled": false, + "notes": "CVE-2026-33626 patched counterpart: _is_safe_url private-IP allowlist gate replaces scheme-only check; regression guard that Nyx does not refire on the fix" + }, { "case_id": "cve-php-2017-9841-vulnerable", "file": "cve_corpus/php/CVE-2017-9841/vulnerable.php", @@ -10694,6 +10927,147 @@ "disabled": false, "notes": "CVE-2017-12629 patched counterpart: transformer name allowlist + in-process secure TransformerFactory removes the Runtime.exec path; regression guard that Nyx does not refire on the fix" }, + { + "case_id": "cve-java-2022-1471-vulnerable", + "file": "cve_corpus/java/CVE-2022-1471/vulnerable.java", + "language": "java", + "is_vulnerable": true, + "vuln_class": "deserialization", + "cwe": "CWE-502", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "java.deser.snakeyaml_unsafe_constructor" + ], + "allowed_alternative_rule_ids": [ + "taint-unsanitised-flow" + ], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 38, + 38 + ] + ], + "expected_source_lines": [ + [ + 34, + 34 + ] + ], + "tags": [ + "cve", + "snakeyaml", + "deserialization", + "servlet" + ], + "disabled": false, + "notes": "CVE-2022-1471: SnakeYAML <2.0 default Constructor accepts arbitrary class tags 
(`!!javax.script.ScriptEngineManager`, `!!java.net.URLClassLoader`, ...) reaching RCE on apps that load attacker-controlled YAML. Apache-2.0" + }, + { + "case_id": "cve-java-2022-1471-patched", + "file": "cve_corpus/java/CVE-2022-1471/patched.java", + "language": "java", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "java.deser.snakeyaml_unsafe_constructor", + "java.deser.readobject", + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "snakeyaml", + "patched", + "negative" + ], + "disabled": false, + "notes": "CVE-2022-1471 patched counterpart: explicit SafeConstructor argument restricts the YAML tag handler set to primitives + standard collections; regression guard that Nyx does not refire on the safe form" + }, + { + "case_id": "cve-java-2022-42889-vulnerable", + "file": "cve_corpus/java/CVE-2022-42889/vulnerable.java", + "language": "java", + "is_vulnerable": true, + "vuln_class": "code_exec", + "cwe": "CWE-94", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "java.code_exec.text4shell_interpolator" + ], + "allowed_alternative_rule_ids": [ + "taint-unsanitised-flow" + ], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 40, + 40 + ] + ], + "expected_source_lines": [ + [ + 35, + 35 + ] + ], + "tags": [ + "cve", + "commons-text", + "text4shell", + "code-exec", + "servlet" + ], + "disabled": false, + "notes": "CVE-2022-42889 (Text4Shell): Apache Commons Text 1.5..1.9 StringSubstitutor.createInterpolator() enables script:/dns:/url: lookups; ${script:js:...} reaches the JSR-223 
ScriptEngineManager. Apache-2.0" + }, + { + "case_id": "cve-java-2022-42889-patched", + "file": "cve_corpus/java/CVE-2022-42889/patched.java", + "language": "java", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "java.code_exec.text4shell_interpolator", + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "commons-text", + "text4shell", + "patched", + "negative" + ], + "disabled": false, + "notes": "CVE-2022-42889 patched counterpart: substitutor built directly with `new StringSubstitutor()` so the lookup map is empty; ${...} pass-through. No script/dns/url evaluation." + }, { "case_id": "rs-auth-001", "file": "rust/auth/actix_scoped_write_missing.rs", @@ -12233,6 +12607,89 @@ "disabled": false, "notes": "TS cross-function bool validator; deferred \u2014 same reason as js-safe-016." 
}, + { + "case_id": "ts-safe-017", + "file": "typescript/safe/safe_strapi_db_query_chain.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-89", + "provenance": "real-repo-distilled", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "cfg-unguarded-sink", + "taint-unsanitised-flow" + ], + "expected_severity": "NONE", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "real-repo-precision-2026-04-29", + "strapi", + "orm-chain" + ], + "disabled": false, + "notes": "Strapi-style ORM accessor: `.db.query(MODEL_UID).(...)`; the `db.query` call's literal model UID + the ORM-method outer chain (findOne/findMany/create/update/delete/count) prove the chain is parameterised. Synthesised same-node Sanitizer(SQL_QUERY) suppresses cfg-unguarded-sink and taint-unsanitised-flow." + }, + { + "case_id": "ts-safe-018", + "file": "typescript/safe/safe_indirect_validator.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow", + "cfg-unguarded-sink" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "indirect-validator", + "ssrf", + "negative" + ], + "disabled": false, + "notes": "Indirect-validator branch narrowing — `const err = validateUrlSsrf(target); if (err) throw …;` should suppress the downstream axios.get sink. Pinned by tests/lib::indirect_validator_narrowing_marks_arg_validated." 
+ }, + { + "case_id": "ts-safe-019", + "file": "typescript/safe/safe_helper_with_validator.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "helper-summary", + "ssrf", + "negative" + ], + "disabled": false, + "notes": "Helper-summary all_validated propagation — when a helper's body validates the param via `validateXxx`, the per-param probe's all_validated event should be skipped during summary extraction so callers don't refire the cross-fn SSRF. Pinned by tests/lib::helper_with_validator_does_not_propagate_to_caller_via_summary." + }, { "case_id": "py-auth-decorator-001", "file": "python/safe/safe_login_required_decorator.py", @@ -12897,6 +13354,31 @@ "disabled": false, "notes": "Happy-path `if (!data.error && Array.isArray(...))` and body-mentioning-err do not fire `cfg-error-fallthrough` (website/public/app/core/app.js)" }, + { + "case_id": "js-safe-realrepo-006", + "file": "javascript/safe/safe_localised_gherkin_regex.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "panic-guard", + "negative", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Panic guard: CodeMirror Gherkin tokenizer ships a long localised regex inside a boolean sub-condition. 
Naive byte-slice truncation in CFG condition-text (`t[..MAX_CONDITION_TEXT_LEN]`) panicked when byte 256 landed inside a multi-byte UTF-8 character (Gurmukhi `ਖ`). Engine fix: src/utils/snippet.rs::truncate_at_char_boundary applied at three CFG sites + two symex display sites (gogs public/plugins/codemirror-5.17.0/mode/gherkin/gherkin.js:107)." + }, { "case_id": "go-safe-realrepo-001", "file": "go/safe/safe_error_log_only_function.go", @@ -13126,6 +13608,33 @@ "disabled": false, "notes": "`func (c *Cache) ...` with `c.foo()` / `c.Fs.Create(...)` intra-struct dispatches \u2014 Go method receivers must seed `non_sink_vars` so the verb-name fallback doesn't fire on bare-receiver internal calls. Closes the hugo cache/filecache.go cluster (~48 hits)." }, + { + "case_id": "go-safe-realrepo-006", + "file": "go/safe/safe_test_helper_fatal.go", + "language": "go", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "cfg-error-fallthrough" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cfg", + "negative", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "`if err != nil { c.Fatalf(...) }` / `os.Exit` / `log.Fatalf` / `panic(err)` are documented terminators (Goexit-class). cfg-error-fallthrough must walk through them as terminating paths. Closes the minio test-file cluster (49+34+12+11+9+7+7 hits across xl-storage_test.go, erasure-healing_test.go, format-erasure_test.go, \u2026). Engine fix: src/cfg_analysis/error_handling.rs::call_never_returns." 
+ }, { "case_id": "go-auth-realrepo-001", "file": "go/auth/vuln_repo_findbyid_no_auth.go", @@ -13429,6 +13938,59 @@ "disabled": false, "notes": "Regression guard: same TRPC handler shape as ts-auth-realrepo-004 but the SQL parameter is `input.targetUserId` (request body field), not `ctx.user.id`. The TRPC ctx self-actor exemption must apply ONLY to ctx.user. subjects, never to other paths in the same param." }, + { + "case_id": "ts-auth-realrepo-006", + "file": "typescript/auth/safe_local_collection_receiver.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "js.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "negative", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Excalidraw `Map` / `Set` / `WeakMap` / `WeakSet` / `Array` / `T[]` / `readonly T[]` receivers — direct annotation, same-file `type X = Map<...>` aliasing, and inline `new Map()` constructor. SSA `constructor_type` JS/TS arm + `cfg::params::ts_type_to_local_collection` + `cfg::dto::collect_type_alias_local_collections` route every shape through `TypeKind::LocalCollection` → `SinkClass::InMemoryLocal`, suppressing missing-ownership." 
+ }, + { + "case_id": "ts-auth-realrepo-007", + "file": "typescript/auth/vuln_local_collection_does_not_blanket_suppress.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-639", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "js.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Vulnerable counterpart to ts-auth-realrepo-006: `prisma.user.findUnique` / `prisma.user.update` with attacker-supplied id and no preceding auth check. Receiver is NOT a tracked Map / Set / Array, so the LocalCollection fix must NOT suppress this — proves the type-aware suppression doesn't blanket-cover real DB clients that share method names (`get`, `find`, `update`) with JS containers." + }, { "case_id": "rs-auth-realrepo-009", "file": "rust/auth/safe_local_user_view_extractor.rs", @@ -13484,6 +14046,89 @@ "disabled": false, "notes": "Negative counterpart for the LocalUserView recogniser: handler takes the typed extractor but mutates a row by `req.target_user_id` (foreign id) without any ownership check \u2014 must still flag. Guards against an over-broad recogniser that would treat any handler with a self-actor extractor as authorised by default." 
}, + { + "case_id": "rs-auth-realrepo-011", + "file": "rust/auth/safe_param_type_segment_idents.rs", + "language": "rust", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "negative", + "real-repo-precision-2026-04-29", + "noise-budget-zero" + ], + "disabled": false, + "notes": "Rust `parameter` arm in `collect_param_names` keeps type-segment idents (`std`, `path`, `Path`) out of `unit.params` so `dst: &std::path::Path` doesn't gate `unit_has_user_input_evidence` open via the framework-name allow-list (`path`). Surfaced from meilisearch/index-scheduler/scheduler/process_snapshot_creation.rs::remove_tasks where `dst: &std::path::Path` made every `db.delete(task.uid)` fire missing-ownership-check." 
+ }, + { + "case_id": "rs-auth-realrepo-012", + "file": "rust/auth/safe_local_collection_param_types.rs", + "language": "rust", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "negative", + "real-repo-precision-2026-04-29", + "noise-budget-zero" + ], + "disabled": false, + "notes": "Rust function-parameter type annotations naming an in-memory container (`RoaringBitmap`, `HashMap`, `HashSet`, `BTreeSet`) classify the receiver as `TypeKind::LocalCollection` \u2192 `SinkClass::InMemoryLocal`, suppressing the verb-name dispatch's DbMutation classification. Surfaced from meilisearch/index-scheduler/scheduler/enterprise_edition/network.rs::balance_shards (`unsharded: RoaringBitmap`). Mirrors the JS/TS `ts_type_to_local_collection` fix from 2026-04-29." 
+ }, + { + "case_id": "rs-auth-realrepo-013", + "file": "rust/auth/unsafe_handler_local_collection_does_not_blanket_suppress.rs", + "language": "rust", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-285", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "High", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "positive", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Negative-counterpart guard for the LocalCollection / parameter-name fixes: handler takes a HashMap typed param (in-memory bookkeeping) but ALSO calls `db.update_owner(req.target_user_id, ...)` (real DbMutation). The cache mutation must not blanket-suppress the persistent-store mutation \u2014 the rule must still fire on `db.update_owner`." + }, { "case_id": "ruby-safe-ar-query-shapes-001", "file": "ruby/safe/safe_active_record_query_shapes.rb", @@ -13715,6 +14360,120 @@ ], "disabled": false, "notes": "Concatenated SQL passed to em.createQuery(...) \u2014 receiver-chain walk sees binary_expression at arg 0, refuses to synthesise sanitizer, structural sink fires. Regression guard for the JPA parameterised-execute fix." 
+ }, + { + "case_id": "py-auth-realrepo-005", + "file": "python/safe/safe_fastapi_route_dependencies_auth.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "fastapi", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Distilled from airflow api_fastapi/core_api/routes/public/connections.py: FastAPI route decorator carries `dependencies=[Depends(requires_access_connection(method=\"DELETE\"))]`; the Flask extractor's new `dependencies=` kwarg walker plus inject_middleware_auth subject synthesis recognises the auth gate." + }, + { + "case_id": "py-auth-realrepo-007", + "file": "python/safe/safe_fastapi_route_level_row_fetch.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "fastapi", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Distilled from airflow api_fastapi/core_api/routes/public/dag_run.py: FastAPI route decorator carries `dependencies=[Depends(requires_access_dag(method=\"GET\"))]`; the route-level guard must cover not only direct path-param subjects (filter_by(dag_id=dag_id)) but also row-variable receivers (`dag.cleanup_runs(...)` after `dag = session.scalar(select(DagModel)...)`). 
Pinned by the `is_route_level` short-circuit in `auth_check_covers_subject` plus the kind-aware `function_params_route_handler` that includes id-like Python typed params (`dag_id: str`) in `unit.params`." + }, + { + "case_id": "py-auth-realrepo-006", + "file": "python/safe/safe_pytest_sqlalchemy_session.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.auth.missing_ownership_check", + "py.auth.token_override_without_validation" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "pytest", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Distilled from airflow tests/unit/models/test_backfill.py: pytest test methods with SQLAlchemy `session` fixture and `session.commit()` calls. Bare `session.` no longer counts as auth Session evidence; only `session.` (user/user_id/...) does." + }, + { + "case_id": "py-auth-realrepo-007", + "file": "python/auth/vuln_fastapi_route_no_dependencies.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-862", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 15, + 15 + ] + ], + "expected_source_lines": [], + "tags": [ + "auth", + "fastapi", + "real-repo-precision-2026-04-29" + ], + "disabled": false, + "notes": "Vulnerable counterpart to py-auth-realrepo-005: same FastAPI route shape but no `dependencies=[Depends(...)]` keyword arg. 
Regression guard: the dependency-injection recogniser must not blanket-suppress every FastAPI route." } ] } diff --git a/tests/benchmark/results/latest.json b/tests/benchmark/results/latest.json index 8a9c2166..25a5477a 100644 --- a/tests/benchmark/results/latest.json +++ b/tests/benchmark/results/latest.json @@ -1,6 +1,6 @@ { "benchmark_version": "1.0", - "timestamp": "2026-04-29T05:42:03Z", + "timestamp": "2026-04-29T21:50:34Z", "scanner_version": "0.5.0", "scanner_config": { "analysis_mode": "Full", @@ -9,9 +9,9 @@ "state_analysis_enabled": true, "worker_threads": 1 }, - "ground_truth_hash": "sha256:3e034f1fc5c7bb7838f1fb2c63de5ca5a36aacfdf5d66cf25f30bff99f25f1cf", - "corpus_size": 433, - "cases_run": 432, + "ground_truth_hash": "sha256:5b391d654f88673e5a200af875d513cf83812af747739395e8315768b8983ce3", + "corpus_size": 458, + "cases_run": 457, "cases_skipped": 1, "outcomes": [ { @@ -1306,6 +1306,74 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "cve-java-2022-1471-patched", + "file": "cve_corpus/java/CVE-2022-1471/patched.java", + "language": "java", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-java-2022-1471-vulnerable", + "file": "cve_corpus/java/CVE-2022-1471/vulnerable.java", + "language": "java", + "vuln_class": "deserialization", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "java.deser.snakeyaml_unsafe_constructor" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "java.deser.snakeyaml_unsafe_constructor" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-java-2022-42889-patched", + "file": 
"cve_corpus/java/CVE-2022-42889/patched.java", + "language": "java", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-java-2022-42889-vulnerable", + "file": "cve_corpus/java/CVE-2022-42889/vulnerable.java", + "language": "java", + "vuln_class": "code_exec", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "java.code_exec.text4shell_interpolator" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "java.code_exec.text4shell_interpolator" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "cve-js-2019-14939-patched", "file": "cve_corpus/javascript/CVE-2019-14939/patched.js", @@ -1520,6 +1588,76 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "cve-py-2025-69662-patched", + "file": "cve_corpus/python/CVE-2025-69662/patched.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-py-2025-69662-vulnerable", + "file": "cve_corpus/python/CVE-2025-69662/vulnerable.py", + "language": "python", + "vuln_class": "sql_injection", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 35:12)", + "py.sqli.text_format" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 35:12)", + "py.sqli.text_format" + ], 
+ "security_finding_count": 2, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-py-2026-33626-patched", + "file": "cve_corpus/python/CVE-2026-33626/patched.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-py-2026-33626-vulnerable", + "file": "cve_corpus/python/CVE-2026-33626/vulnerable.py", + "language": "python", + "vuln_class": "ssrf", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 43:12)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 43:12)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "cve-rb-2013-0156-patched", "file": "cve_corpus/ruby/CVE-2013-0156/patched.rb", @@ -1737,6 +1875,40 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "cve-ts-ghsa-4x48-cgf9-q33f-patched", + "file": "cve_corpus/typescript/GHSA-4x48-cgf9-q33f/patched.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-ts-ghsa-4x48-cgf9-q33f-vulnerable", + "file": "cve_corpus/typescript/GHSA-4x48-cgf9-q33f/vulnerable.ts", + "language": "typescript", + "vuln_class": "ssrf", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 50:5)" + ], + 
"unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 50:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "go-auth-realrepo-001", "file": "go/auth/vuln_repo_findbyid_no_auth.go", @@ -2371,6 +2543,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "go-safe-realrepo-006", + "file": "go/safe/safe_test_helper_fatal.go", + "language": "go", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "go-sqli-001", "file": "go/sqli/sqli_concat.go", @@ -3590,6 +3777,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "js-safe-realrepo-006", + "file": "javascript/safe/safe_localised_gherkin_regex.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "js-sqli-001", "file": "javascript/sqli/sqli_concat.js", @@ -4497,6 +4699,70 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "py-auth-realrepo-005", + "file": "python/safe/safe_fastapi_route_dependencies_auth.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "py-auth-realrepo-006", + "file": "python/safe/safe_pytest_sqlalchemy_session.py", + 
"language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "py-auth-realrepo-007", + "file": "python/safe/safe_fastapi_route_level_row_fetch.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "py-auth-realrepo-007", + "file": "python/auth/vuln_fastapi_route_no_dependencies.py", + "language": "python", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "py.auth.missing_ownership_check" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "py-cmdi-001", "file": "python/cmdi/cmdi_direct.py", @@ -5630,6 +5896,55 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "rs-auth-realrepo-011", + "file": "rust/auth/safe_param_type_segment_idents.rs", + "language": "rust", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "rs-auth-realrepo-012", + "file": "rust/auth/safe_local_collection_param_types.rs", + "language": "rust", + "vuln_class": "safe", + "is_vulnerable": false, + 
"outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "rs-auth-realrepo-013", + "file": "rust/auth/unsafe_handler_local_collection_does_not_blanket_suppress.rs", + "language": "rust", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "rs.auth.missing_ownership_check" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "rs-auth-typed-extractors-001", "file": "rust/auth/safe_typed_path_int_extractor.rs", @@ -7043,6 +7358,42 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "ts-auth-realrepo-006", + "file": "typescript/auth/safe_local_collection_receiver.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "ts-auth-realrepo-007", + "file": "typescript/auth/vuln_local_collection_does_not_blanket_suppress.ts", + "language": "typescript", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "js.auth.missing_ownership_check", + "js.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "js.auth.missing_ownership_check", + "js.auth.missing_ownership_check" + ], + "security_finding_count": 2, + "non_security_finding_count": 0 + }, { "case_id": "ts-cmdi-001", 
"file": "typescript/cmdi/cmdi_exec_template.ts", @@ -7493,6 +7844,53 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "ts-safe-017", + "file": "typescript/safe/safe_strapi_db_query_chain.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "ts.quality.any_annotation" + ], + "security_finding_count": 0, + "non_security_finding_count": 1 + }, + { + "case_id": "ts-safe-018", + "file": "typescript/safe/safe_indirect_validator.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "ts-safe-019", + "file": "typescript/safe/safe_helper_with_validator.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "ts-secrets-001", "file": "typescript/secrets/fallback_secret.ts", @@ -7552,6 +7950,30 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "ts-sqli-003", + "file": "typescript/sqli/sqli_db_query_concat.ts", + "language": "typescript", + "vuln_class": "sqli", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "taint-unsanitised-flow (source 15:5)", + "taint-unsanitised-flow (source 21:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + 
"ts.quality.any_annotation", + "ts.quality.any_annotation", + "taint-unsanitised-flow (source 15:5)", + "taint-unsanitised-flow (source 21:5)", + "ts.quality.any_annotation" + ], + "security_finding_count": 2, + "non_security_finding_count": 3 + }, { "case_id": "ts-ssrf-001", "file": "typescript/ssrf/ssrf_axios_user_url.ts", @@ -7771,22 +8193,22 @@ } ], "aggregate_file_level": { - "tp": 216, + "tp": 225, "fp": 1, "fn_": 0, - "tn": 215, - "precision": 0.9953917050691244, + "tn": 231, + "precision": 0.995575221238938, "recall": 1.0, - "f1": 0.997690531177829 + "f1": 0.9977827050997783 }, "aggregate_rule_level": { - "tp": 216, + "tp": 225, "fp": 1, "fn_": 0, - "tn": 215, - "precision": 0.9953917050691244, + "tn": 231, + "precision": 0.995575221238938, "recall": 1.0, - "f1": 0.997690531177829 + "f1": 0.9977827050997783 }, "by_language": { "c": { @@ -7811,16 +8233,16 @@ "tp": 25, "fp": 1, "fn_": 0, - "tn": 27, + "tn": 28, "precision": 0.9615384615384616, "recall": 1.0, "f1": 0.9803921568627451 }, "java": { - "tp": 17, + "tp": 19, "fp": 0, "fn_": 0, - "tn": 18, + "tn": 20, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -7829,7 +8251,7 @@ "tp": 19, "fp": 0, "fn_": 0, - "tn": 23, + "tn": 24, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -7844,10 +8266,10 @@ "f1": 1.0 }, "python": { - "tp": 23, + "tp": 26, "fp": 0, "fn_": 0, - "tn": 23, + "tn": 28, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -7862,19 +8284,19 @@ "f1": 1.0 }, "rust": { - "tp": 33, + "tp": 34, "fp": 0, "fn_": 0, - "tn": 37, + "tn": 39, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "typescript": { - "tp": 29, + "tp": 32, "fp": 0, "fn_": 0, - "tn": 18, + "tn": 23, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -7882,7 +8304,7 @@ }, "by_vuln_class": { "auth": { - "tp": 13, + "tp": 16, "fp": 0, "fn_": 0, "tn": 0, @@ -7909,7 +8331,7 @@ "f1": 1.0 }, "code_exec": { - "tp": 2, + "tp": 3, "fp": 0, "fn_": 0, "tn": 0, @@ -7945,7 +8367,7 @@ "f1": 1.0 }, "deserialization": { - "tp": 4, + "tp": 5, "fp": 0, "fn_": 0, "tn": 
0, @@ -8002,7 +8424,7 @@ "tp": 0, "fp": 1, "fn_": 0, - "tn": 215, + "tn": 231, "precision": 0.0, "recall": 1.0, "f1": 0.0 @@ -8016,8 +8438,17 @@ "recall": 1.0, "f1": 1.0 }, + "sql_injection": { + "tp": 1, + "fp": 0, + "fn_": 0, + "tn": 0, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0 + }, "sqli": { - "tp": 29, + "tp": 30, "fp": 0, "fn_": 0, "tn": 0, @@ -8026,7 +8457,7 @@ "f1": 1.0 }, "ssrf": { - "tp": 26, + "tp": 28, "fp": 0, "fn_": 0, "tn": 0, @@ -8046,31 +8477,31 @@ }, "by_confidence": { ">=High": { - "tp": 90, - "fp": 90, - "fn_": 126, - "tn": 126, - "precision": 0.5, - "recall": 0.4166666666666667, - "f1": 0.45454545454545453 + "tp": 79, + "fp": 104, + "fn_": 146, + "tn": 128, + "precision": 0.43169398907103823, + "recall": 0.3511111111111111, + "f1": 0.3872549019607843 }, ">=Low": { - "tp": 94, - "fp": 102, - "fn_": 122, - "tn": 114, - "precision": 0.47959183673469385, - "recall": 0.4351851851851852, - "f1": 0.4563106796116505 + "tp": 81, + "fp": 116, + "fn_": 144, + "tn": 116, + "precision": 0.41116751269035534, + "recall": 0.36, + "f1": 0.3838862559241706 }, ">=Medium": { - "tp": 94, - "fp": 102, - "fn_": 122, - "tn": 114, - "precision": 0.47959183673469385, - "recall": 0.4351851851851852, - "f1": 0.4563106796116505 + "tp": 81, + "fp": 116, + "fn_": 144, + "tn": 116, + "precision": 0.41116751269035534, + "recall": 0.36, + "f1": 0.3838862559241706 } } } \ No newline at end of file diff --git a/tests/benchmark_test.rs b/tests/benchmark_test.rs index d9792e31..5560d3a1 100644 --- a/tests/benchmark_test.rs +++ b/tests/benchmark_test.rs @@ -191,7 +191,7 @@ struct BenchmarkResults { // ── Scanning ───────────────────────────────────────────────────────── fn scan_corpus_file(corpus_root: &Path, relative_path: &str) -> Vec { - // `cve_corpus/*` cases live in a sibling of `corpus/` — see + // `cve_corpus/*` cases live in a sibling of `corpus/`, see // `tests/benchmark/cve_corpus/`. 
let source = if relative_path.starts_with("cve_corpus/") { corpus_root @@ -679,7 +679,7 @@ fn benchmark_evaluation() { // on this corpus, so 5pp is generous enough to absorb honest // FP↔TN trades while still catching a real regression in a // vulnerability class. When you land a durable, measurable - // improvement, tighten these floors — do not relax them to paper + // improvement, tighten these floors, do not relax them to paper // over a regression. let rule = &results.aggregate_rule_level; assert!( @@ -790,7 +790,7 @@ fn score_rule_level_with_diags( fn sha256_hex(data: &[u8]) -> String { use std::io::Write; - // Simple SHA-256 via command — avoids adding a crypto dependency. + // Simple SHA-256 via command, avoids adding a crypto dependency. let mut child = std::process::Command::new("shasum") .args(["-a", "256"]) .stdin(std::process::Stdio::piped()) diff --git a/tests/cli_validation_tests.rs b/tests/cli_validation_tests.rs index 9c742bcf..39e4f492 100644 --- a/tests/cli_validation_tests.rs +++ b/tests/cli_validation_tests.rs @@ -3,7 +3,7 @@ //! Nyx's surface is a `clap` parser plus a handful of downstream validators //! (`SeverityFilter::parse`, `Severity::from_str`, `Confidence::from_str`, //! `apply_profile`). These tests lock in the user-visible contract that -//! bad input exits non-zero with a message that names the offending flag — +//! bad input exits non-zero with a message that names the offending flag , //! a scanner that silently accepts a typo'd severity and returns zero //! findings is a footgun in CI. //! @@ -268,7 +268,7 @@ fn scan_quiet_suppresses_preview_banner() { .stderr(predicate::str::contains("Preview for C/C++").not()); } -/// JSON output format must not print the Preview banner either — machine- +/// JSON output format must not print the Preview banner either, machine- /// readable output has to stay clean on both stdout and stderr. 
#[test] fn scan_json_format_suppresses_preview_banner() { diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 35cf2cb6..74b0322e 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -179,7 +179,7 @@ pub fn validate_expectations(diags: &[Diag], fixture_dir: &Path) { } } - // Noise budget (optional — omitted on tight safe-code fixtures) + // Noise budget (optional, omitted on tight safe-code fixtures) if let Some(budget) = &exp.noise_budget { assert_max_findings(diags, budget.max_total_findings, budget.max_high_findings); } diff --git a/tests/concurrent_scan_tests.rs b/tests/concurrent_scan_tests.rs index ee85434d..2d52e52f 100644 --- a/tests/concurrent_scan_tests.rs +++ b/tests/concurrent_scan_tests.rs @@ -3,8 +3,8 @@ //! Production defaults run the scanner with `worker_threads > 1`, and callers //! embedding `nyx_scanner` (the forthcoming `serve` UI, CI wrappers, scripted //! harnesses) may invoke `scan_no_index` from multiple threads in the same -//! process. Shared engine state — label tables, framework-detection caches, -//! tree-sitter thread-local parsers, rayon globals, `once_cell` statics — +//! process. Shared engine state, label tables, framework-detection caches, +//! tree-sitter thread-local parsers, rayon globals, `once_cell` statics , //! must tolerate two simultaneous walks without races, panics, or diverging //! outputs. //! @@ -86,7 +86,7 @@ fn build_tree(root: &Path) { } /// Canonicalize a diag list for equality comparison. Finding output ordering -/// depends on rayon scheduling — the individual fields must be identical but +/// depends on rayon scheduling, the individual fields must be identical but /// the sequence is not. We sort by a stable composite key and stringify /// (Diag itself doesn't derive Ord). 
fn canonical_fingerprint(diags: &[Diag]) -> Vec { @@ -104,7 +104,7 @@ fn two_concurrent_scans_produce_identical_findings() { let root = tmp.path().to_path_buf(); build_tree(&root); - // Capture an initial single-threaded run so we have a reference point — + // Capture an initial single-threaded run so we have a reference point , // if the concurrent run produced a subset we want to know whether that // matches a known-good baseline or diverges from it. let baseline = scan_no_index(&root, &test_cfg()).expect("baseline scan must succeed"); @@ -138,7 +138,7 @@ fn two_concurrent_scans_produce_identical_findings() { ); } -/// Four concurrent scans over the same tree — larger blast radius for +/// Four concurrent scans over the same tree, larger blast radius for /// serialization bugs in shared caches. Runs on a small tree to keep /// CI time reasonable. #[test] diff --git a/tests/cross_file_abstract_tests.rs b/tests/cross_file_abstract_tests.rs index 5fcfe0b7..44c0626d 100644 --- a/tests/cross_file_abstract_tests.rs +++ b/tests/cross_file_abstract_tests.rs @@ -4,16 +4,16 @@ //! Three fixtures cover the documented transfer forms currently tractable //! against the JS/Python abstract-suppression pipelines: //! -//! * `cross_file_abstract_port_range` (Python) — Identity transfer on an +//! * `cross_file_abstract_port_range` (Python), Identity transfer on an //! integer-typed passthrough. The caller's literal `8080` crosses the //! file boundary and SHELL_ESCAPE suppression fires on the bounded int. -//! * `cross_file_abstract_bounded_index` (Python) — Clamped transfer +//! * `cross_file_abstract_bounded_index` (Python), Clamped transfer //! derived from a baseline-invariant fact. The callee returns a //! literal `42`; the per-parameter transfer attaches it as //! `Clamped { 42, 42 }` and the caller sees a bounded integer //! without the return-abstract channel alone carrying the fact //! through summary resolution ambiguity. -//! 
* `cross_file_abstract_url_prefix_lock` (JS) — String-prefix transfer +//! * `cross_file_abstract_url_prefix_lock` (JS), String-prefix transfer //! across an Identity wrapper. The caller writes //! `url = asIs('https://internal/...' + userPath)` and passes `url` to //! `axios.get`. The CFG node's `string_prefix` is consumed by the @@ -21,7 +21,7 @@ //! prefix locks the host and SSRF suppression fires. //! //! Each fixture's `expectations.json` treats the cross-file SHELL/SSRF -//! sink as *forbidden* on the main file — if cross-file abstract +//! sink as *forbidden* on the main file, if cross-file abstract //! propagation regresses, the sink fires and the forbidden-finding //! assertion trips. diff --git a/tests/cross_file_alias_tests.rs b/tests/cross_file_alias_tests.rs index 5379d945..3abe57b9 100644 --- a/tests/cross_file_alias_tests.rs +++ b/tests/cross_file_alias_tests.rs @@ -6,7 +6,7 @@ //! Three fixtures cover distinct structural shapes of the summary //! channel: //! -//! * `cross_file_alias_mutating_helper` (Java) — a void-returning +//! * `cross_file_alias_mutating_helper` (Java), a void-returning //! helper that stores its second argument into a field of its first //! argument. Without the points-to channel the cross-file summary //! loses every taint edge (void return, no container-op in @@ -14,20 +14,20 @@ //! edge and the caller observes the field write through the argument //! alias, producing a Runtime.exec finding. //! -//! * `cross_file_alias_returned_alias` (JS) — a passthrough helper +//! * `cross_file_alias_returned_alias` (JS), a passthrough helper //! whose return aliases its first parameter. `param_to_return` with //! `Identity` already covered the taint cap; the points-to channel //! adds the heap-identity alias `Param(0) → Return` so the caller //! threads the points-to set through the call. The existing -//! shell-exec sink must still fire — a regression guard on the +//! shell-exec sink must still fire, a regression guard on the //! 
return-alias channel. //! -//! * `cross_file_alias_bounded_graph` (Python) — a helper with a 20- +//! * `cross_file_alias_bounded_graph` (Python), a helper with a 20- //! edge alias graph that intentionally overflows `MAX_ALIAS_EDGES`. //! The assertion is that the scan *terminates* under the bounded //! analysis and falls back to the conservative //! `PointsToSummary::overflow` behaviour, not a specific finding -//! count — overflow is an operational guarantee, not a precision one. +//! count, overflow is an operational guarantee, not a precision one. mod common; @@ -76,7 +76,7 @@ fn cross_file_container_factory() { } /// Receiver-chain regression: tainted receiver flows through -/// `tainted.trim().toLowerCase()` — both zero-arg — and into +/// `tainted.trim().toLowerCase()`, both zero-arg, and into /// `Runtime.exec`. Pins the existing receiver-fallback behaviour so /// heap-aliasing changes do not regress it. #[test] diff --git a/tests/cross_file_body_loading_tests.rs b/tests/cross_file_body_loading_tests.rs index 2b4c32d9..03e4b5d2 100644 --- a/tests/cross_file_body_loading_tests.rs +++ b/tests/cross_file_body_loading_tests.rs @@ -4,7 +4,7 @@ //! The body-loading path is pure plumbing: the taint engine carries a //! `cross_file_bodies` field on `SsaTaintTransfer` that the cross-file //! inline path consumes. This test guards the *availability* -//! invariant — if pass 1 stops populating `bodies_by_key`, the inline +//! invariant, if pass 1 stops populating `bodies_by_key`, the inline //! path would silently fall back to summary resolution even when //! cross-file bodies could have given context-sensitive precision. //! 
@@ -19,7 +19,7 @@ use nyx_scanner::symbol::Lang; use nyx_scanner::utils::config::{AnalysisMode, Config}; use std::path::Path; -/// Test-local config mirror of `tests/common/mod.rs::test_config` — +/// Test-local config mirror of `tests/common/mod.rs::test_config` , /// kept inline so this file does not need to pull in the shared module /// (which `cargo test --test cross_file_body_loading_tests` would /// require extra wiring for). @@ -39,7 +39,7 @@ fn test_config() -> Config { /// Replay the pass-1 body-collection logic from `scan_filesystem` on a /// handful of files and return the resulting `GlobalSummaries`. /// -/// This mirrors the fold-body of `scan_filesystem`'s pass-1 rayon loop — +/// This mirrors the fold-body of `scan_filesystem`'s pass-1 rayon loop , /// the production code uses the same `analyse_file_fused` entry point /// and the same `insert` / `insert_ssa` / `insert_body` trio. Keeping /// the test close to that shape catches drift between the fused pipeline @@ -72,7 +72,7 @@ fn cross_file_body_loading_smoke_python_two_files() { let root = tmp.path(); // `a.py` defines a helper that takes one parameter, does a trivial - // string op, and returns. The body is intentionally small — we only + // string op, and returns. The body is intentionally small, we only // care that *any* eligible body is produced, not that it has // interesting taint content. let a_py = root.join("a.py"); @@ -133,7 +133,7 @@ fn cross_file_body_loading_smoke_python_two_files() { body.param_count ); - // Quick sanity on the SSA shape — an eligible body must have at + // Quick sanity on the SSA shape, an eligible body must have at // least one block. Zero blocks would mean we stored an empty stub, // which would let the inline path silently do nothing on every // inline attempt. 
@@ -146,7 +146,7 @@ fn cross_file_body_loading_smoke_python_two_files() { #[test] fn cross_file_body_loading_empty_without_callees() { // A single file with no inter-procedural flow is still expected to - // produce a body for its one function — that's what body loading + // produce a body for its one function, that's what body loading // enables. The *empty* case this test guards is "bodies_by_key // returns None when no bodies are loaded," which keeps the // threaded-through `Option` explicit for inline consumers. diff --git a/tests/cross_file_context_off_tests.rs b/tests/cross_file_context_off_tests.rs index d2ba049e..7c7e3223 100644 --- a/tests/cross_file_context_off_tests.rs +++ b/tests/cross_file_context_off_tests.rs @@ -8,8 +8,8 @@ //! findings. //! //! This binary is split from `cross_file_context_tests.rs` because -//! Cargo compiles each `tests/*.rs` file into its own test binary — -//! separate processes — so the `NYX_CONTEXT_SENSITIVE` env flip here +//! Cargo compiles each `tests/*.rs` file into its own test binary , +//! separate processes, so the `NYX_CONTEXT_SENSITIVE` env flip here //! does not race against the default-on tests running in parallel. //! //! The switch is read by `AnalysisOptions::current()` via the legacy diff --git a/tests/cross_file_context_tests.rs b/tests/cross_file_context_tests.rs index 1554a4f8..d4a4993d 100644 --- a/tests/cross_file_context_tests.rs +++ b/tests/cross_file_context_tests.rs @@ -6,24 +6,24 @@ //! The four fixtures under `tests/fixtures/cross_file_context_*` cover //! the documented precision wins and guardrails: //! -//! * `cross_file_context_two_call_sites` (Python) — two calls to the same +//! * `cross_file_context_two_call_sites` (Python), two calls to the same //! cross-file helper, one tainted and one with a constant literal. //! Asserts the tainted call still produces a finding. -//! * `cross_file_context_callback` (JS) — cross-file helper invokes a +//! 
* `cross_file_context_callback` (JS), cross-file helper invokes a //! caller-side function passed as a callback. Inline re-analysis of //! the helper must resolve the callback binding and surface the //! flow through `child_process.exec`. -//! * `cross_file_context_sanitizer` (JS) — cross-file sanitizer applied +//! * `cross_file_context_sanitizer` (JS), cross-file sanitizer applied //! before an HTML sink. Regression guard: cross-file inline must not //! introduce a taint finding when the sanitiser is recognised. -//! * `cross_file_context_deep_chain` (Python) — A -> B -> C chain with +//! * `cross_file_context_deep_chain` (Python), A -> B -> C chain with //! the sink in C. k=1 means B->C resolves via summary; the end-to-end //! finding must still surface so callers cannot lose recall on deep //! chains. //! //! The `bodies_by_key_populated_for_cross_file_fixtures` test is a //! direct `GlobalSummaries`-level assertion that pass 1 loaded cross-file -//! SSA bodies for each fixture — i.e. the cross-file inline path has +//! SSA bodies for each fixture, i.e. the cross-file inline path has //! something to consult. If this assertion flips to zero, cross-file //! inline would silently fall back to summary resolution and every //! expectations.json check above would be driven by the less precise @@ -65,7 +65,7 @@ fn test_config() -> Config { /// Walk a fixture directory and replay the pass-1 body collection that /// `scan_filesystem` does, returning the merged `GlobalSummaries`. /// -/// This is used purely for the availability assertion — the actual +/// This is used purely for the availability assertion, the actual /// scans under test go through the regular `scan_no_index` entry point. fn pass1_bodies(root: &Path) -> GlobalSummaries { let cfg = test_config(); @@ -132,7 +132,7 @@ fn cross_file_context_sanitizer() { } /// Three-file deep chain (A -> B -> C) with the sink in C. 
The -/// end-to-end flow must still surface — k=1 depth cap on inline does +/// end-to-end flow must still surface, k=1 depth cap on inline does /// not drop recall because B -> C resolves via summary. #[test] fn cross_file_context_deep_chain() { diff --git a/tests/cross_file_phi_tests.rs b/tests/cross_file_phi_tests.rs index ab59411e..0792fcea 100644 --- a/tests/cross_file_phi_tests.rs +++ b/tests/cross_file_phi_tests.rs @@ -4,18 +4,18 @@ //! Three fixtures cover distinct structural shapes of the per-return-path //! transform: //! -//! * `cross_file_phi_validated_branch` (Python) — a callee whose two +//! * `cross_file_phi_validated_branch` (Python), a callee whose two //! return branches are both `Identity` on the value, differing only in //! the predicate gate. The required SQLi finding confirms the //! summary-application path does not regress on the common "union is //! precise enough" case. -//! * `cross_file_phi_partial_sanitiser` (JS) — the callee has two +//! * `cross_file_phi_partial_sanitiser` (JS), the callee has two //! returns with *different* transforms (Identity vs //! StripBits(HTML_ESCAPE)). The caller invokes the unsanitised branch, -//! so the XSS sink must still fire — a regression guard against a +//! so the XSS sink must still fire, a regression guard against a //! per-path application that over-eagerly attributes sanitation across //! all branches. -//! * `cross_file_phi_both_branches_safe` (Go) — both return paths run +//! * `cross_file_phi_both_branches_safe` (Go), both return paths run //! the same sanitising validator. The SQL sink is on the forbidden //! list: if the per-path decomposition regresses to "either branch //! could be raw" the caller would pick up a false positive. diff --git a/tests/db_corruption_tests.rs b/tests/db_corruption_tests.rs index c21c9f11..ad085570 100644 --- a/tests/db_corruption_tests.rs +++ b/tests/db_corruption_tests.rs @@ -6,10 +6,10 @@ //! 
clear error instead of panicking, hanging, or producing nonsense //! findings. These tests exercise both classes of corruption: //! -//! 1. Truncation to zero bytes — SQLite treats a zero-length file as a +//! 1. Truncation to zero bytes, SQLite treats a zero-length file as a //! fresh empty DB. We expect the indexer to bootstrap the schema and //! carry on. -//! 2. Arbitrary garbage in the header — SQLite rejects this with +//! 2. Arbitrary garbage in the header, SQLite rejects this with //! `SQLITE_NOTADB` during pragma/schema execution. We expect the //! indexer to return a structured error, not a panic. //! @@ -122,7 +122,7 @@ fn zero_truncated_db_rebuilds_on_init() { let pool = Indexer::init(&db_path) .expect("Indexer::init should bootstrap a schema into an empty file"); - // After init, the DB is empty of prior state — an indexed scan should + // After init, the DB is empty of prior state, an indexed scan should // still run end-to-end but will effectively be acting like a cold // rebuild. We don't re-call build_index here because the plan is to // confirm the raw init path is resilient. @@ -143,14 +143,14 @@ fn zero_truncated_db_rebuilds_on_init() { } /// Clobber the SQLite magic header with garbage bytes. This is the -/// "actual corruption" case — SQLite rejects it with `SQLITE_NOTADB` the +/// "actual corruption" case, SQLite rejects it with `SQLITE_NOTADB` the /// first time pragma or SQL is executed, which surfaces as /// `NyxError::Sql(_)` from `Indexer::init`. #[test] fn garbage_header_db_returns_structured_error() { let (_project_name, db_path, _project, _db_dir) = build_indexed_project(); - // Write 100 bytes of `0xFF` — guaranteed not to match SQLite's header + // Write 100 bytes of `0xFF`, guaranteed not to match SQLite's header // magic "SQLite format 3\0". 
clobber_header(&db_path, 0xFF, 100); @@ -186,7 +186,7 @@ fn garbage_header_db_returns_structured_error() { // NOTE: A mid-file corruption test (garbage at bytes 100..200, preserving // SQLite magic) was attempted and is deliberately omitted. That shape // triggers a slow corruption-detection path in SQLite where `Indexer::init` -// takes 150–200 seconds before returning — unsuitable for CI wall-clock +// takes 150–200 seconds before returning, unsuitable for CI wall-clock // budgets. The two tests above already cover the "corrupt-on-arrival" // cases that users actually hit (crash-truncated file, deliberate clobber). // A follow-up should either short-circuit `PRAGMA integrity_check` up diff --git a/tests/dedup_alternative_paths_tests.rs b/tests/dedup_alternative_paths_tests.rs index 41dee306..63501b78 100644 --- a/tests/dedup_alternative_paths_tests.rs +++ b/tests/dedup_alternative_paths_tests.rs @@ -2,7 +2,7 @@ //! [`nyx_scanner::taint::analyse_file`] must preserve distinct flows //! that share a source but differ on validation status or intermediate //! variables. Historically the dedup collapsed all `(body_id, sink, -//! source)` siblings, preferring the validated one — so an unguarded +//! source)` siblings, preferring the validated one, so an unguarded //! exploit on a sibling branch was silently dropped in favour of a //! neighbouring guarded flow. //! @@ -35,7 +35,7 @@ fn dedup_preserves_validated_and_unvalidated_flows() { validate_expectations(&diags, &dir); // Load-bearing assertion: the two flows live on distinct sink - // lines (6 and 8 in the source — actual lines depend on the + // lines (6 and 8 in the source, actual lines depend on the // fixture file format, so we only assert distinct sinks). 
let taint: Vec<&nyx_scanner::commands::scan::Diag> = diags .iter() @@ -58,7 +58,7 @@ fn dedup_preserves_validated_and_unvalidated_flows() { .collect::>(), ); - // The two findings must live on different source lines — if the + // The two findings must live on different source lines, if the // engine collapses them into one, the test will fail here even // when the count assertion above coincidentally passes (e.g. if // a future change started emitting one validated and one @@ -73,7 +73,7 @@ fn dedup_preserves_validated_and_unvalidated_flows() { // Every taint finding must carry a stable `finding_id` that // downstream formatters can reference. This is the plumbing that - // feeds alternative-path cross-linking — verify it is non-empty + // feeds alternative-path cross-linking, verify it is non-empty // for every taint finding so regressions in `analyse_file`'s // post-dedup `make_finding_id` pass surface here. for d in &taint { @@ -87,7 +87,7 @@ fn dedup_preserves_validated_and_unvalidated_flows() { ); } - // At least one validated/unvalidated split must be present — the + // At least one validated/unvalidated split must be present, the // whole point of the fixture is that a guarded branch and an // unguarded branch reach `exec(input)` and both must report. // We do not require an exact split since future sanitization @@ -103,7 +103,7 @@ fn dedup_preserves_validated_and_unvalidated_flows() { is not behind any allowlist. Found only validated findings.", ); // `validated` may legitimately be empty if the engine does not yet - // recognise `isWhitelisted` as a predicate — the fixture is still + // recognise `isWhitelisted` as a predicate, the fixture is still // load-bearing because the `min_count: 2` in expectations.json // asserts both findings surface regardless of which is classified // as validated. 
Drop the assertion to avoid gating the regression diff --git a/tests/determinism_threads_tests.rs b/tests/determinism_threads_tests.rs index d48a666c..f3cbe89c 100644 --- a/tests/determinism_threads_tests.rs +++ b/tests/determinism_threads_tests.rs @@ -2,9 +2,9 @@ //! //! The scanner's two-pass pipeline runs rayon `par_iter` over files in //! both pass-1 (summary extraction) and pass-2 (rule evaluation), and -//! merges summaries via `try_reduce`. A latent ordering bug — a +//! merges summaries via `try_reduce`. A latent ordering bug, a //! shared mutable state hit unprotected from multiple threads, or a -//! `HashMap` iteration order leaking into a finding identity — can +//! `HashMap` iteration order leaking into a finding identity, can //! surface as a diagnostic that appears with 4 workers but not with 1. //! //! This test runs the same fixture under worker-thread counts of 1, @@ -14,7 +14,7 @@ //! assertion fires only on real output divergence. //! //! If this test ever flakes, prefer investigating the engine over -//! weakening the normaliser — engine-level determinism across thread +//! weakening the normaliser, engine-level determinism across thread //! counts is load-bearing for reproducible CI runs. mod common; diff --git a/tests/engine_notes_rank_tests.rs b/tests/engine_notes_rank_tests.rs index 11b8d712..22af571e 100644 --- a/tests/engine_notes_rank_tests.rs +++ b/tests/engine_notes_rank_tests.rs @@ -26,7 +26,7 @@ use nyx_scanner::rank::{compute_attack_rank, rank_diags}; // ── Diag factories ───────────────────────────────────────────────────── /// A converged taint finding that the points-based scorer will score -/// as `Confidence::High`. Used as the "clean" baseline — any delta +/// as `Confidence::High`. Used as the "clean" baseline, any delta /// against this must come from attached engine notes. 
fn high_confidence_taint_diag(path: &str, line: u32) -> Diag { Diag { @@ -204,7 +204,7 @@ fn rank_diags_sorts_converged_above_capped_at_same_severity() { #[test] fn rank_diags_preserves_severity_tier_under_bail() { // High + Bail must still outrank Medium + clean at the same - // evidence-strength baseline — this is the tier-boundary invariant + // evidence-strength baseline, this is the tier-boundary invariant // that the -8 completeness magnitude is calibrated for. let mut high_bailed = high_confidence_taint_diag("a.rs", 1); attach_notes( @@ -421,7 +421,7 @@ fn sarif_omits_loss_direction_for_informational_only() { fn every_engine_note_direction_is_documented() { // Enumerate every EngineNote variant and assert its direction. // The intent is that a contributor adding a new variant will cause - // this test to fail to compile (no match arm) — a structural guard + // this test to fail to compile (no match arm), a structural guard // against silent misclassification. fn check(note: EngineNote, expected: LossDirection) { assert_eq!( diff --git a/tests/engine_notes_tests.rs b/tests/engine_notes_tests.rs index 6d05b11c..c98bd449 100644 --- a/tests/engine_notes_tests.rs +++ b/tests/engine_notes_tests.rs @@ -2,7 +2,7 @@ //! test forces a specific cap-site to fire on a tiny fixture by //! overriding the engine's safety cap, then asserts either that the //! corresponding observability counter moved *or* that the note -//! propagated to a produced finding — whichever is the more stable +//! propagated to a produced finding, whichever is the more stable //! signal for that cap. mod common; @@ -19,7 +19,7 @@ use std::path::Path; use std::sync::Mutex; /// Process-wide atomics for cap overrides mean tests that fiddle with -/// them must run serially — cargo test defaults to parallel. +/// them must run serially, cargo test defaults to parallel. 
static CAP_GUARD: Mutex<()> = Mutex::new(()); fn fixture(name: &str) -> std::path::PathBuf { @@ -32,7 +32,7 @@ fn fixture(name: &str) -> std::path::PathBuf { fn worklist_cap_trips_observability_counter() { let _guard = CAP_GUARD.lock().unwrap_or_else(|e| e.into_inner()); // Force a very tight worklist budget so every body with > 0 blocks - // trips the cap. The observability counter is the stable signal — + // trips the cap. The observability counter is the stable signal , // note attribution to a specific finding may be lost on bodies that // capped *before* emitting their sink event. reset_worklist_observability(); @@ -59,7 +59,7 @@ fn origins_cap_trips_observability_on_multi_source_fixture() { // Set origins to 1 and scan a fixture with multiple top-level // sources flowing into the same sink. Any non-trivial taint flow // will produce at least one tainted value whose origin list hit the - // cap — detected by the post-hoc saturation scan at the end of + // cap, detected by the post-hoc saturation scan at the end of // `run_ssa_taint_internal`. reset_origins_observability(); set_max_origins_override(1); diff --git a/tests/fetch_data_exfil_integration_tests.rs b/tests/fetch_data_exfil_integration_tests.rs new file mode 100644 index 00000000..0a213d8a --- /dev/null +++ b/tests/fetch_data_exfil_integration_tests.rs @@ -0,0 +1,125 @@ +//! Integration tests for the `Cap::DATA_EXFIL` detector class. +//! +//! Validates per-cap attribution at multi-gate call sites: a single `fetch` +//! call carries both an SSRF gate (URL flow) and a DATA_EXFIL gate (body / +//! headers / json flow), and a tainted body must not surface as SSRF and +//! vice versa. Also sanity-checks the SARIF output so the new finding +//! class produces a distinct rule id. 
+ +mod common; + +use common::scan_fixture_dir; +use nyx_scanner::commands::scan::Diag; +use nyx_scanner::utils::config::AnalysisMode; +use std::path::PathBuf; + +fn js_fixture_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("js") +} + +fn diags_for(file: &str) -> Vec { + let dir = js_fixture_dir(); + let all = scan_fixture_dir(&dir, AnalysisMode::Full); + all.into_iter().filter(|d| d.path.ends_with(file)).collect() +} + +#[test] +fn fetch_body_data_exfil_emits_data_exfil_not_ssrf() { + let diags = diags_for("fetch_body_data_exfil.js"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + exfil >= 1, + "expected at least one taint-data-exfiltration finding, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::>(), + ); + assert_eq!( + plain_taint, + 0, + "fixed-URL fetch with tainted body must NOT emit SSRF \ + (taint-unsanitised-flow), got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::>(), + ); +} + +#[test] +fn fetch_ssrf_url_tainted_emits_ssrf_not_data_exfil() { + let diags = diags_for("fetch_ssrf_url_tainted.js"); + let ssrf = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert!( + ssrf >= 1, + "expected at least one taint-unsanitised-flow (SSRF) finding, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::>(), + ); + assert_eq!( + exfil, + 0, + "tainted-URL fetch must NOT emit DATA_EXFIL, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::>(), + ); +} + +#[test] +fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() { + use nyx_scanner::output::build_sarif; + + let dir = js_fixture_dir(); + let diags = 
scan_fixture_dir(&dir, AnalysisMode::Full); + let sarif = build_sarif(&diags, &dir); + + let rules = sarif["runs"][0]["tool"]["driver"]["rules"] + .as_array() + .expect("SARIF rules array"); + let rule_ids: Vec<&str> = rules.iter().filter_map(|r| r["id"].as_str()).collect(); + + assert!( + rule_ids.contains(&"taint-data-exfiltration"), + "SARIF rules must contain taint-data-exfiltration, got: {rule_ids:?}" + ); + assert!( + rule_ids.contains(&"taint-unsanitised-flow"), + "SARIF rules must contain taint-unsanitised-flow, got: {rule_ids:?}" + ); + + // Each finding should reference exactly one rule, and the cap-specific + // class must not be folded back into the generic taint bucket. + let results = sarif["runs"][0]["results"] + .as_array() + .expect("SARIF results array"); + let exfil_results = results + .iter() + .filter(|r| r["ruleId"].as_str() == Some("taint-data-exfiltration")) + .count(); + let ssrf_results = results + .iter() + .filter(|r| r["ruleId"].as_str() == Some("taint-unsanitised-flow")) + .count(); + assert!( + exfil_results >= 1, + "expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {exfil_results}", + ); + assert!( + ssrf_results >= 1, + "expected >= 1 SARIF result with ruleId taint-unsanitised-flow, got {ssrf_results}", + ); +} diff --git a/tests/fixtures/async_rust/main.rs b/tests/fixtures/async_rust/main.rs index 31fbf29b..b3b5eae1 100644 --- a/tests/fixtures/async_rust/main.rs +++ b/tests/fixtures/async_rust/main.rs @@ -1,7 +1,7 @@ // Regression fixture: Rust async flow through `tokio::process::Command`. // // Per docs/language-maturity.md, Rust's Tokio process variants are not -// yet covered — the Tokio async process APIs are a known gap. The +// yet covered, the Tokio async process APIs are a known gap. 
The // fixture is checked in so that when Rust async-process coverage lands, // the engine begins producing the intended finding and the // `forbidden_findings` assertion forces whoever adds the coverage to diff --git a/tests/fixtures/auth_analysis/cross_file_helper_authz.rs b/tests/fixtures/auth_analysis/cross_file_helper_authz.rs index 6ed45ac6..e01d8e45 100644 --- a/tests/fixtures/auth_analysis/cross_file_helper_authz.rs +++ b/tests/fixtures/auth_analysis/cross_file_helper_authz.rs @@ -1,7 +1,7 @@ // Target: authorization happens inside `require_owner`, which // delegates to `require_group_member` (a configured authorization // check name). The handler in `cross_file_helper_handler.rs` -// delegates ownership validation to this helper — cross-file helper +// delegates ownership validation to this helper, cross-file helper // lifting should recognise the call as an auth check covering the // supplied `row`. struct Db; diff --git a/tests/fixtures/auth_analysis/db_connection_type_inferred.rs b/tests/fixtures/auth_analysis/db_connection_type_inferred.rs index 8bd89a59..836c1353 100644 --- a/tests/fixtures/auth_analysis/db_connection_type_inferred.rs +++ b/tests/fixtures/auth_analysis/db_connection_type_inferred.rs @@ -2,7 +2,7 @@ // produces a `DatabaseConnection` via SSA `constructor_type` (through // `peel_identity_suffix`, which strips `.unwrap()` before matching). The // handler then calls `conn.execute(..)`, a callee name that appears in -// neither `mutation_indicator_names` nor `read_indicator_names` for Rust — +// neither `mutation_indicator_names` nor `read_indicator_names` for Rust , // name-based classification returns `None`, so the ownership gate // already cannot flag the call. 
The type-map refinement should *still* // leave the call unflagged (the type map produces `DbMutation`, but diff --git a/tests/fixtures/auth_analysis/hashmap_local_noise.rs b/tests/fixtures/auth_analysis/hashmap_local_noise.rs index e7fa5d2e..d45ad6ed 100644 --- a/tests/fixtures/auth_analysis/hashmap_local_noise.rs +++ b/tests/fixtures/auth_analysis/hashmap_local_noise.rs @@ -16,7 +16,7 @@ pub async fn handle_list_peer_docs(req: Req, ctx: Ctx) -> Result { let user = auth::require_auth(&req, &ctx).await?; let doc_ids: Vec = vec![1, 2, 3]; - // Pure in-memory bookkeeping — no authorization decision here. + // Pure in-memory bookkeeping, no authorization decision here. let mut counts: HashMap = HashMap::new(); let mut seen: HashSet = HashSet::new(); for doc_id in &doc_ids { diff --git a/tests/fixtures/auth_analysis/helper_no_auth_lift.rs b/tests/fixtures/auth_analysis/helper_no_auth_lift.rs index 36698184..f1939d3b 100644 --- a/tests/fixtures/auth_analysis/helper_no_auth_lift.rs +++ b/tests/fixtures/auth_analysis/helper_no_auth_lift.rs @@ -1,5 +1,5 @@ // B4 regression guard: `format_target` does NOT auth-check -// `group_id` — it just constructs a string from it. The helper-lift +// `group_id`, it just constructs a string from it. The helper-lift // pass must not synthesise a covering AuthCheck on the handler's call // site, so the subsequent `db.exec("INSERT INTO comments …", &[group_id])` // MUST still flag. @@ -19,7 +19,7 @@ mod auth { } fn format_target(group_id: i64, suffix: &str) -> String { - // No auth check here — pure formatting. + // No auth check here, pure formatting. 
format!("group:{}{}", group_id, suffix) } diff --git a/tests/fixtures/auth_analysis/row_ownership_equality.rs b/tests/fixtures/auth_analysis/row_ownership_equality.rs index 6eb68651..321d2cb2 100644 --- a/tests/fixtures/auth_analysis/row_ownership_equality.rs +++ b/tests/fixtures/auth_analysis/row_ownership_equality.rs @@ -41,7 +41,7 @@ pub async fn handle_delete_doc(req: Req, ctx: Ctx, doc_id: i64) -> Result Result Result // The handler's `get_peer_ids(&db, user.id)` call below must not be // flagged. `user` is bound from `auth::require_auth(..)` so `user.id` -// is the caller's own id — the call is self-referential, not a foreign +// is the caller's own id, the call is self-referential, not a foreign // scoped id. The library-style helper below is a pass-through so its // body contains no DB sinks (the internal `user_id` → DB flow is a // separate pattern covered by helper-summary lifting). diff --git a/tests/fixtures/auth_analysis/sql_join_acl.rs b/tests/fixtures/auth_analysis/sql_join_acl.rs index 6da72bae..ef2a6108 100644 --- a/tests/fixtures/auth_analysis/sql_join_acl.rs +++ b/tests/fixtures/auth_analysis/sql_join_acl.rs @@ -2,7 +2,7 @@ // against an ACL table (`group_members`) with a WHERE clause that pins // the row to the current user (`gm.user_id = ?1` bound to `user.id`). // Every returned row is membership-gated by construction, so downstream -// uses of the row's columns (`group_id` here) are authorized — the +// uses of the row's columns (`group_id` here) are authorized, the // `realtime::publish_to_group` call MUST NOT be flagged as missing an // ownership check after B3. 
struct Ctx; diff --git a/tests/fixtures/auth_analysis/sql_no_acl_join_flags.rs b/tests/fixtures/auth_analysis/sql_no_acl_join_flags.rs index 892eb645..08347251 100644 --- a/tests/fixtures/auth_analysis/sql_no_acl_join_flags.rs +++ b/tests/fixtures/auth_analysis/sql_no_acl_join_flags.rs @@ -1,7 +1,7 @@ // B3 regression guard: the SELECT JOINs through `audit_log` (NOT in // the configured ACL list) and the WHERE clause pins on // `al.user_id = ?1`. The audit-log row's user is the audit subject, -// not the doc owner — so this query does NOT prove caller ownership +// not the doc owner, so this query does NOT prove caller ownership // of the returned `doc_id`. The downstream realtime publish MUST // still flag for a missing ownership check after B3. struct Ctx; diff --git a/tests/fixtures/auth_analysis/transitive_helper.rs b/tests/fixtures/auth_analysis/transitive_helper.rs index c1c2ae9d..17b60b5d 100644 --- a/tests/fixtures/auth_analysis/transitive_helper.rs +++ b/tests/fixtures/auth_analysis/transitive_helper.rs @@ -1,7 +1,7 @@ // target: authorization happens inside `validate_target`, which // internally calls `authz::require_membership` against the same // `group_id` the handler subsequently mutates. The current rule cannot -// see this transitively — B4 lifts per-function auth-check summaries +// see this transitively, B4 lifts per-function auth-check summaries // (which positional params are auth-checked) so the handler-level call // to `validate_target(&db, group_id, user.id)` is recognised as an // auth check covering `group_id`. Result: `db.exec(..)` MUST NOT flag @@ -45,7 +45,7 @@ pub async fn handle_create_comment( let user = auth::require_auth(&req, &ctx).await?; let db = Db; - // Authorization happens inside validate_target — helper-summary + // Authorization happens inside validate_target, helper-summary // lifting propagates the per-param auth check so this covers // `group_id`. 
validate_target(&db, group_id, user.id).await?; diff --git a/tests/fixtures/fp_guards/auth_local_collection_receiver/App.ts b/tests/fixtures/fp_guards/auth_local_collection_receiver/App.ts new file mode 100644 index 00000000..111f38cc --- /dev/null +++ b/tests/fixtures/fp_guards/auth_local_collection_receiver/App.ts @@ -0,0 +1,31 @@ +// FP guard for `js.auth.missing_ownership_check` — JS built-in +// container receivers must not be classified as data-layer sinks. +// See `tests/benchmark/corpus/typescript/auth/safe_local_collection_receiver.ts` +// for the full real-repo distillation. + +type ElementsMap = Map; + +function fromAlias(elementsMap: ElementsMap, id: string) { + return elementsMap.get(id); +} + +function fromDirectGeneric(m: Map, k: string) { + return m.get(k); +} + +function fromArrayShorthand(arr: { id: string }[], targetId: string) { + return arr.find((x) => x.id === targetId); +} + +function fromLocalConstructor() { + const cache = new Map(); + cache.set("a", "x"); + return cache.get("a"); +} + +function fromSet(visited: Set, k: string) { + if (!visited.has(k)) { + visited.add(k); + } + return visited.size; +} diff --git a/tests/fixtures/fp_guards/auth_local_collection_receiver/expectations.json b/tests/fixtures/fp_guards/auth_local_collection_receiver/expectations.json new file mode 100644 index 00000000..344bb06d --- /dev/null +++ b/tests/fixtures/fp_guards/auth_local_collection_receiver/expectations.json @@ -0,0 +1,16 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "js.auth.missing_ownership_check" } + ], + "noise_budget": { + "max_total_findings": 1, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/expectations.json b/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/expectations.json new file mode 
100644 index 00000000..3735360a --- /dev/null +++ b/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/expectations.json @@ -0,0 +1,16 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "rs.auth.missing_ownership_check" } + ], + "noise_budget": { + "max_total_findings": 2, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/snapshot.rs b/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/snapshot.rs new file mode 100644 index 00000000..e249f635 --- /dev/null +++ b/tests/fixtures/fp_guards/auth_rust_param_typed_local_collection/snapshot.rs @@ -0,0 +1,93 @@ +// Real-repo precision guard mirroring meilisearch's index-scheduler +// shape: +// crates/index-scheduler/src/scheduler/process_snapshot_creation.rs::remove_tasks +// (`unsafe fn remove_tasks(tasks: &[Task], dst: &std::path::Path, +// index_base_map_size: usize)` plus per-loop bitmap mutations on +// destructured heed `Database` handles), plus the LocalCollection +// receiver-type cluster +// (`crates/index-scheduler/src/scheduler/enterprise_edition/network.rs::balance_shards`, +// `unsharded: RoaringBitmap`). +// +// Both engine fixes must hold: the Rust `parameter` arm in +// `collect_param_names` (only descends into `pattern`, never `type`) +// and the Rust LocalCollection type-text classifier +// (`rust_type_to_local_collection`). Without either, this file would +// produce missing-ownership-check findings on internal helpers / +// in-memory bitmap mutations. 
+ +use std::collections::{BTreeSet, HashMap, HashSet}; + +struct RoaringBitmap; +impl RoaringBitmap { + fn new() -> Self { Self } + fn insert(&mut self, _x: u32) -> bool { true } + fn remove(&mut self, _x: u32) -> bool { true } + fn contains(&self, _x: u32) -> bool { true } +} + +struct Task { uid: u32 } + +struct Database; +impl Database { + fn delete(&self, _w: &mut u32, _u: &u32) -> Result<(), ()> { Ok(()) } +} + +struct TaskQueue { + all_tasks: Database, + canceled_by: Database, +} + +// Rust `parameter` arm: type-segment idents (`std`, `path`, `Path`) +// must NOT pollute `unit.params` and gate user-input-evidence open. +unsafe fn remove_tasks( + tasks: &[Task], + dst: &std::path::Path, + sz: usize, +) -> Result<(), ()> { + let _ = (dst, sz); + let mut wtxn = 0u32; + let task_queue = TaskQueue { + all_tasks: Database, + canceled_by: Database, + }; + let TaskQueue { all_tasks, canceled_by } = task_queue; + for task in tasks { + all_tasks.delete(&mut wtxn, &task.uid)?; + canceled_by.delete(&mut wtxn, &task.uid)?; + } + Ok(()) +} + +// LocalCollection typed param: `unsharded: RoaringBitmap` resolves to +// `TypeKind::LocalCollection`, so `unsharded.insert(docid)` / +// `unsharded.remove(docid)` classify as `SinkClass::InMemoryLocal` +// (non-auth-relevant). +fn balance_shards(mut unsharded: RoaringBitmap, docid: u32) { + unsharded.insert(docid); + unsharded.remove(docid); +} + +// `&'a mut HashMap<...>` reference + lifetime: ref-stripping must +// reach the type head. 
+fn store_shard_docids<'a>( + new_shard_docids: &'a mut HashMap, + shard: String, + docid: u32, +) { + new_shard_docids.insert(shard, docid); +} + +fn add_user_id(ids: &mut HashSet, user_id: u64) { + ids.insert(user_id); + ids.remove(&user_id); +} + +fn collect_seen(seen: &mut BTreeSet, item_id: u32) { + seen.insert(item_id); +} + +fn build_local_set(task_id: u32) -> RoaringBitmap { + let mut s = RoaringBitmap::new(); + s.insert(task_id); + s +} diff --git a/tests/fixtures/fp_guards/cfg_utf8_long_condition/App.js b/tests/fixtures/fp_guards/cfg_utf8_long_condition/App.js new file mode 100644 index 00000000..2d81961f --- /dev/null +++ b/tests/fixtures/fp_guards/cfg_utf8_long_condition/App.js @@ -0,0 +1,41 @@ +// FP guard / panic guard — CFG condition-text truncation must be UTF-8 safe. +// +// Reproduces the gogs scan crash where a CodeMirror Gherkin tokenizer ships a +// long localised regex inside a boolean sub-condition (`stream.match(/.../) && +// other`). When `push_condition_node` textualises the sub-expression, the +// regex literal exceeds MAX_CONDITION_TEXT_LEN (256 bytes); naive byte-slice +// truncation panicked when byte 256 landed inside a multi-byte UTF-8 +// character (here Gurmukhi `ਖ`, three bytes). Engine fix in +// `src/utils/snippet.rs::truncate_at_char_boundary`, applied at three CFG +// sites + two symex display sites. +// +// Invariant: scanning this file must terminate without panicking the rayon +// worker, regardless of where byte 256 lands. 
+ +function tokenLocalisedFeatureKeyword(stream, state) { + if ( + !state.inKeywordLine && + state.allowFeature && + stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ಮುಹಾಂದರಾ|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|ਖਾਸੀਅਤ|रूप लेख|وِیژگی|خاصية|תכונה|Функціонал|Функция|Функционалност|Функционал|Үзенчәлеклелек|Свойство|Особина|Мөмкинлек|Могућност|Λειτουργία|Δυνατότητα|Właściwość|Vlastnosť|Trajto|Tính năng|Savybė|Požiadavka|Požadavek|Potrzeba biznesowa|Özellik|Osobina|Ominaisuus|Omadus|Mogućnost|Mogucnost|Jellemző|Funzionalità|Funktionalitéit|Funktionalität|Funkcja|Funkcionalnost|Funkcionalitāte|Funkcia|Fungsi|Functionaliteit|Funcționalitate|Funcţionalitate|Functionalitate|Funcionalitat|Funcionalidade|Fonctionnalité|Fitur|Fīča|Feature|Eiginleiki|Egenskap|Egenskab|Característica|Caracteristica|Business Need|Aspekt|Arwedd|Ability):/) + ) { + state.inKeywordLine = true; + return "keyword"; + } + return null; +} + +// Sanity: also exercise the let-match-guard truncation site +// (`emit_rust_match_guard_if`) by way of a JS analogue with a CFG-relevant +// boolean chain that wraps localised text into the second branch. The CFG +// builder still has to textualise both arms. 
+function classify(s) { + if ( + s.length > 0 && + s.indexOf("ਨਕਸ਼ ਨੁਹਾਰ ਖਾਸੀਅਤ रूप लेख وِیژگی خاصية תכונה Функціонал Функция Функционалност Функционал Үзенчәлеклелек Свойство Особина Мөмкинлек Могућност Λειτουργία Δυνατότητα") >= 0 + ) { + return "localised"; + } + return "ascii"; +} + +module.exports = { tokenLocalisedFeatureKeyword, classify }; diff --git a/tests/fixtures/fp_guards/cfg_utf8_long_condition/expectations.json b/tests/fixtures/fp_guards/cfg_utf8_long_condition/expectations.json new file mode 100644 index 00000000..64561035 --- /dev/null +++ b/tests/fixtures/fp_guards/cfg_utf8_long_condition/expectations.json @@ -0,0 +1,14 @@ +{ + "required_findings": [], + "forbidden_findings": [], + "noise_budget": { + "max_total_findings": 0, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/App.py b/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/App.py new file mode 100644 index 00000000..d74bb818 --- /dev/null +++ b/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/App.py @@ -0,0 +1,51 @@ +""" +FP guard for FastAPI / Flask route-level dependency-injection auth. + +The `dependencies=[Depends(requires_access_dag(...))]` decorator +authorises the entire handler — every value the handler receives, +every row it fetches, and every operation downstream. The +`is_route_level` flag on the injected AuthCheck tells +`auth_check_covers_subject` to short-circuit `true`, suppressing +`py.auth.missing_ownership_check` on the body's ORM calls (`filter_by`, +`scalar`, …) and on row-variable receivers (`dag.cleanup_runs(...)`). + +A bare route with no `dependencies=` keyword is a real ownership- +check FP — the engine must still flag it. The vulnerable +counterpart lives in +`tests/benchmark/corpus/python/auth/vuln_fastapi_route_no_dependencies.py`. 
+""" +from fastapi import Depends, FastAPI + +router = FastAPI() + + +def requires_access_dag(method: str, access_entity=None): + def check(): + ... + return check + + +@router.get( + "/{dag_id}/runs/{run_id}", + dependencies=[Depends(requires_access_dag(method="GET"))], +) +def get_dag_run(dag_id: str, run_id: str, session): + """Path params + ORM call covered by route-level guard.""" + dag_run = session.scalar( + select(DagRun).filter_by(dag_id=dag_id, run_id=run_id) + ) + if dag_run is None: + raise HTTPException(404, "not found") + return dag_run + + +@router.delete( + "/{dag_id}", + dependencies=[Depends(requires_access_dag(method="DELETE"))], +) +def delete_dag(dag_id: str, session): + """Row fetch + row-variable method call covered by route-level guard.""" + dag = session.scalar(select(DagModel).where(DagModel.dag_id == dag_id)) + if dag is None: + raise HTTPException(404, "not found") + dag.cleanup_runs(session=session) diff --git a/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/expectations.json b/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/expectations.json new file mode 100644 index 00000000..faac8738 --- /dev/null +++ b/tests/fixtures/fp_guards/framework_fastapi_route_level_auth/expectations.json @@ -0,0 +1,16 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "py.auth.missing_ownership_check" } + ], + "noise_budget": { + "max_total_findings": 2, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1500, + "max_ms_index_cold": 2000, + "max_ms_index_warm": 800, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/fp_guards/framework_strapi_db_query_chain/App.ts b/tests/fixtures/fp_guards/framework_strapi_db_query_chain/App.ts new file mode 100644 index 00000000..7f105b9b --- /dev/null +++ b/tests/fixtures/fp_guards/framework_strapi_db_query_chain/App.ts @@ -0,0 +1,40 @@ +// Strapi-style ORM accessor: `.db.query(MODEL_UID).(...)`. 
+// MODEL_UID is a literal model identifier (not raw SQL); the trailing +// findOne/findMany/create/update/delete/count are intrinsically +// parameterised — the actual SQL is generated by the ORM and per-call +// values arrive through field-keyed object literals the driver escapes. +// +// FP-guard: cfg-unguarded-sink and taint-unsanitised-flow must NOT +// fire on this shape. + +declare const strapi: any; + +async function getApiToken(whereParams: Record) { + return strapi.db.query('admin::api-token').findOne({ + select: ['id', 'name', 'lastUsedAt'], + populate: ['permissions'], + where: whereParams, + }); +} + +async function listTokens() { + return strapi.db.query('admin::api-token').findMany({ + where: { type: 'read-only' }, + }); +} + +async function createToken(data: unknown) { + return strapi.db.query('admin::api-token').create({ data }); +} + +async function updateToken(id: number, data: unknown) { + return strapi.db.query('admin::api-token').update({ where: { id }, data }); +} + +async function deleteToken(id: number) { + return strapi.db.query('admin::api-token').delete({ where: { id } }); +} + +async function countTokens() { + return strapi.db.query('admin::api-token').count(); +} diff --git a/tests/fixtures/fp_guards/framework_strapi_db_query_chain/expectations.json b/tests/fixtures/fp_guards/framework_strapi_db_query_chain/expectations.json new file mode 100644 index 00000000..76103a18 --- /dev/null +++ b/tests/fixtures/fp_guards/framework_strapi_db_query_chain/expectations.json @@ -0,0 +1,17 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "cfg-unguarded-sink" }, + { "id_prefix": "taint-unsanitised-flow" } + ], + "noise_budget": { + "max_total_findings": 3, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/js/fetch_body_data_exfil.js 
b/tests/fixtures/js/fetch_body_data_exfil.js new file mode 100644 index 00000000..8d2792b6 --- /dev/null +++ b/tests/fixtures/js/fetch_body_data_exfil.js @@ -0,0 +1,13 @@ +// DATA_EXFIL fixture: a fixed destination URL and an attacker-influenced +// body. SSRF must NOT fire (destination is hardcoded) but `Cap::DATA_EXFIL` +// must fire on the body field — request-bound bytes are leaving the process +// via the outbound request payload. +// +// Driven by `fetch_data_exfil_integration_tests.rs`. +function leakBody(req) { + var payload = req.body.message; + fetch('/endpoint', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/fixtures/js/fetch_ssrf_url_tainted.js b/tests/fixtures/js/fetch_ssrf_url_tainted.js new file mode 100644 index 00000000..ed72ca8d --- /dev/null +++ b/tests/fixtures/js/fetch_ssrf_url_tainted.js @@ -0,0 +1,10 @@ +// SSRF regression fixture: attacker-controlled destination URL. SSRF must +// fire on the URL flow (arg 0) and `Cap::DATA_EXFIL` must NOT fire — the two +// classes share the callee but cap attribution is per-position so a tainted +// URL never surfaces as data exfiltration. +// +// Driven by `fetch_data_exfil_integration_tests.rs`. +function proxy(req) { + var target = req.query.target; + fetch(target); +} diff --git a/tests/fixtures/mixed_project/config.rs b/tests/fixtures/mixed_project/config.rs index 66aa2efa..3cb85483 100644 --- a/tests/fixtures/mixed_project/config.rs +++ b/tests/fixtures/mixed_project/config.rs @@ -2,7 +2,7 @@ use std::env; use std::fs; use std::process::Command; -/// Infrastructure provisioning tool — Rust core. +/// Infrastructure provisioning tool, Rust core. /// Reads infrastructure config from environment and executes provisioning commands. struct InfraConfig { @@ -56,7 +56,7 @@ fn apply_terraform() { .unwrap(); } -/// Destroys infrastructure — reads target from env. +/// Destroys infrastructure, reads target from env. 
/// VULN: env var flows into Command fn destroy_cluster() { let cluster = env::var("DESTROY_TARGET").unwrap(); diff --git a/tests/fixtures/patterns/java/negative.java b/tests/fixtures/patterns/java/negative.java index 21b8328c..adfe6112 100644 --- a/tests/fixtures/patterns/java/negative.java +++ b/tests/fixtures/patterns/java/negative.java @@ -1,5 +1,9 @@ import java.sql.*; import java.security.SecureRandom; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.LoaderOptions; +import org.yaml.snakeyaml.constructor.SafeConstructor; +import org.apache.commons.text.StringSubstitutor; class Negative { // Safe: parameterized query @@ -19,4 +23,17 @@ class Negative { void safeLiteralQuery(Statement stmt) throws Exception { stmt.executeQuery("SELECT COUNT(*) FROM users"); } + + // Safe: SnakeYAML 2.0 / explicit SafeConstructor — CVE-2022-1471 fix shape. + void safeSnakeyamlSafeConstructor(String body) { + LoaderOptions opts = new LoaderOptions(); + Yaml yaml = new Yaml(new SafeConstructor(opts)); + Object data = yaml.load(body); + } + + // Safe: empty StringSubstitutor — no interpolator factory — CVE-2022-42889 fix shape. + String safeStringSubstitutorPassthrough(String input) { + StringSubstitutor s = new StringSubstitutor(); + return s.replace(input); + } } diff --git a/tests/fixtures/patterns/java/positive.java b/tests/fixtures/patterns/java/positive.java index 03c83f9e..4fe3a991 100644 --- a/tests/fixtures/patterns/java/positive.java +++ b/tests/fixtures/patterns/java/positive.java @@ -1,6 +1,8 @@ import java.io.*; import java.util.Random; import java.security.MessageDigest; +import org.yaml.snakeyaml.Yaml; +import org.apache.commons.text.StringSubstitutor; class Positive { // java.deser.readobject @@ -45,4 +47,16 @@ class Positive { void triggerGetWriterPrint(javax.servlet.http.HttpServletResponse resp) throws Exception { resp.getWriter().println("" + "data" + ""); } + + // java.deser.snakeyaml_unsafe_constructor — CVE-2022-1471 regression guard. 
+ void triggerSnakeyamlUnsafeConstructor() throws Exception { + Yaml yaml = new Yaml(); + Object data = yaml.load("payload"); + } + + // java.code_exec.text4shell_interpolator — CVE-2022-42889 regression guard. + String triggerText4ShellInterpolator(String input) { + StringSubstitutor s = StringSubstitutor.createInterpolator(); + return s.replace(input); + } } diff --git a/tests/fixtures/patterns/python/positive.py b/tests/fixtures/patterns/python/positive.py index 8063a7eb..ba9d8380 100644 --- a/tests/fixtures/patterns/python/positive.py +++ b/tests/fixtures/patterns/python/positive.py @@ -42,6 +42,14 @@ def trigger_yaml(data): def trigger_sql_concat(cursor, user): cursor.execute("SELECT * FROM users WHERE name = '" + user + "'") +# py.sqli.execute_format (f-string variant) +def trigger_sql_fstring(cursor, user): + cursor.execute(f"SELECT * FROM users WHERE name = '{user}'") + +# py.sqli.text_format +def trigger_sqlalchemy_text_fstring(connection, user): + connection.execute(text(f"SELECT * FROM users WHERE name = '{user}'")) + # py.crypto.md5 def trigger_md5(data): hashlib.md5(data) diff --git a/tests/fixtures/phase_c_auth_taint/handler_sanitized.rs b/tests/fixtures/phase_c_auth_taint/handler_sanitized.rs index f0de965e..3422b042 100644 --- a/tests/fixtures/phase_c_auth_taint/handler_sanitized.rs +++ b/tests/fixtures/phase_c_auth_taint/handler_sanitized.rs @@ -23,7 +23,7 @@ mod auth { // Negative control: the handler validates ownership via // `authz::require_group_member(...)?` before the realtime publish. Phase C -// should NOT emit `rs.auth.missing_ownership_check.taint` here — the +// should NOT emit `rs.auth.missing_ownership_check.taint` here, the // sanitizer clears `UNAUTHORIZED_ID` from the argument SSA values. 
pub async fn handle_publish_checked(Path(group_id): Path) -> Result<&'static str, ()> { let user = auth::current_user(); diff --git a/tests/fixtures/real_world/javascript/taint/fetch_object_url_tainted_fires.expect.json b/tests/fixtures/real_world/javascript/taint/fetch_object_url_tainted_fires.expect.json index 03f08131..e4c53ac0 100644 --- a/tests/fixtures/real_world/javascript/taint/fetch_object_url_tainted_fires.expect.json +++ b/tests/fixtures/real_world/javascript/taint/fetch_object_url_tainted_fires.expect.json @@ -1,6 +1,6 @@ { - "description": "fetch({url: taintedUrl, body: fixed}) — destination-aware object-literal case. url is tainted, must fire.", - "tags": ["taint", "ssrf", "fetch", "destination-aware", "object-config"], + "description": "fetch({url: taintedUrl, body: fixed}) — destination-aware object-literal case. url is tainted (SSRF), body is fixed. SSRF must fire and the cross-boundary data-exfiltration class (Cap::DATA_EXFIL) must NOT fire — the two classes share the callee but cap attribution is per-position.", + "tags": ["taint", "ssrf", "fetch", "destination-aware", "object-config", "cap-attribution"], "modes": ["full"], "expected": [ { @@ -10,6 +10,12 @@ "line_range": [6, 14], "evidence_contains": [], "notes": "req.query.target → fetch({url: target, ...}) — tainted destination field under object-literal shape." + }, + { + "rule_id": "taint-data-exfiltration", + "must_not_match": true, + "line_range": [6, 14], + "notes": "body is a fixed literal '{}' — DATA_EXFIL must NOT fire on this site (regression guard for per-cap attribution)." 
} ] } diff --git a/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json b/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json index 69ad24f3..0c42f768 100644 --- a/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json +++ b/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json @@ -1,6 +1,6 @@ { - "description": "fetch() request body carries attacker-controlled content but the destination URL is fixed. Under the destination-aware SSRF gate, only taint reaching the URL (arg 0 / object `url` field) activates — body taint must be silenced.", - "tags": ["taint", "ssrf", "fetch", "destination-aware", "regression-fp"], + "description": "fetch() with a fixed destination URL and an attacker-controlled body. SSRF must NOT fire (destination is not attacker-influenced) and the cross-boundary data-exfiltration class (Cap::DATA_EXFIL) MUST fire on the body field.", + "tags": ["taint", "data-exfil", "fetch", "destination-aware", "cap-attribution"], "modes": ["full"], "expected": [ { @@ -8,6 +8,12 @@ "must_not_match": true, "line_range": [7, 14], "notes": "fetch('/api/telemetry', {body: payload}) — arg 0 is a fixed string, body taint must not fire as SSRF." + }, + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [7, 14], + "notes": "Body field carries req.body.message → must fire DATA_EXFIL (sensitive data leaving the process via outbound request payload)." } ] } diff --git a/tests/fixtures/real_world/rust/taint/unsafe_replace_chain_no_credit.rs b/tests/fixtures/real_world/rust/taint/unsafe_replace_chain_no_credit.rs index a3b613ba..72400fb5 100644 --- a/tests/fixtures/real_world/rust/taint/unsafe_replace_chain_no_credit.rs +++ b/tests/fixtures/real_world/rust/taint/unsafe_replace_chain_no_credit.rs @@ -2,7 +2,7 @@ use std::env; use std::fs; // Wrapper whose replace chain strips only unrelated characters. 
The scanner -// must NOT treat this as a path-traversal sanitizer — the taint path should +// must NOT treat this as a path-traversal sanitizer, the taint path should // still be flagged. fn rewrite(s: &str) -> String { s.replace("foo", "bar").replace("baz", "qux") diff --git a/tests/fixtures/rust_module_path_resolution/src/auth/session.rs b/tests/fixtures/rust_module_path_resolution/src/auth/session.rs index 28d566ab..4302d092 100644 --- a/tests/fixtures/rust_module_path_resolution/src/auth/session.rs +++ b/tests/fixtures/rust_module_path_resolution/src/auth/session.rs @@ -1,5 +1,5 @@ // Session-module validate: accidentally shells out with its param. -// Same name + arity as auth::token::validate — ambiguous without a use map. +// Same name + arity as auth::token::validate, ambiguous without a use map. // If cross-file resolution incorrectly targets this function from main.rs, // the param taint from env::var will flow into Command::arg → taint finding. pub fn validate(input: &str) -> String { diff --git a/tests/fixtures/rust_module_path_resolution/src/auth/token.rs b/tests/fixtures/rust_module_path_resolution/src/auth/token.rs index 0008e0ea..99939656 100644 --- a/tests/fixtures/rust_module_path_resolution/src/auth/token.rs +++ b/tests/fixtures/rust_module_path_resolution/src/auth/token.rs @@ -1,5 +1,5 @@ // Token-module validate: strips shell metacharacters and returns a safe value. -// No sink in the body — purely a pass-through sanitizer. +// No sink in the body, purely a pass-through sanitizer. 
pub fn validate(input: &str) -> String { input.replace(['&', ';', '|', '$', '`', '\\', '"', '\''], "") } diff --git a/tests/fixtures/rust_module_path_resolution/src/main.rs b/tests/fixtures/rust_module_path_resolution/src/main.rs index 98f4dd54..98e487bd 100644 --- a/tests/fixtures/rust_module_path_resolution/src/main.rs +++ b/tests/fixtures/rust_module_path_resolution/src/main.rs @@ -1,7 +1,7 @@ use crate::auth::token::validate; // `validate(&cmd)` must resolve unambiguously to `auth::token::validate` -// (a pass-through sanitizer) — NOT `auth::session::validate` (which sinks +// (a pass-through sanitizer), NOT `auth::session::validate` (which sinks // its arg into std::process::Command). A correct use-map driven resolver // produces zero cross-file taint findings on this file. fn main() { diff --git a/tests/fixtures/state/auth_decorator_rust_non_auth.rs b/tests/fixtures/state/auth_decorator_rust_non_auth.rs index 4b9d95f2..4d47dde1 100644 --- a/tests/fixtures/state/auth_decorator_rust_non_auth.rs +++ b/tests/fixtures/state/auth_decorator_rust_non_auth.rs @@ -1,6 +1,6 @@ use std::process::Command; -// #[inline] is NOT an auth attribute — finding should fire. +// #[inline] is NOT an auth attribute, finding should fire. 
#[inline] fn handle_request(req: &str) { Command::new("sh").arg("-c").arg("ls /tmp").status().unwrap(); diff --git a/tests/fixtures/state/rust_box_owned.rs b/tests/fixtures/state/rust_box_owned.rs index bd48d242..06b174a8 100644 --- a/tests/fixtures/state/rust_box_owned.rs +++ b/tests/fixtures/state/rust_box_owned.rs @@ -1,5 +1,5 @@ fn boxed() { let b = Box::new(42); println!("{}", b); - // b dropped — no leak + // b dropped, no leak } diff --git a/tests/fixtures/state/rust_raii_file_no_leak.rs b/tests/fixtures/state/rust_raii_file_no_leak.rs index 4070a045..1a704ff3 100644 --- a/tests/fixtures/state/rust_raii_file_no_leak.rs +++ b/tests/fixtures/state/rust_raii_file_no_leak.rs @@ -5,5 +5,5 @@ fn read_file() { let mut f = File::open("/tmp/test").unwrap(); let mut buf = String::new(); f.read_to_string(&mut buf).unwrap(); - // f dropped by RAII — no leak + // f dropped by RAII, no leak } diff --git a/tests/fixtures/symex/match_suppresses_safe_arm.rs b/tests/fixtures/symex/match_suppresses_safe_arm.rs index 2fb36b99..f03c26d9 100644 --- a/tests/fixtures/symex/match_suppresses_safe_arm.rs +++ b/tests/fixtures/symex/match_suppresses_safe_arm.rs @@ -19,7 +19,7 @@ enum Cap { pub fn dispatch(cap: Cap) { let user_cmd = env::var("USER_CMD").unwrap_or_default(); match cap { - // Raw arm — tainted user_cmd flows directly into the shell. + // Raw arm, tainted user_cmd flows directly into the shell. Cap::Raw => { Command::new("sh") .arg("-c") @@ -27,7 +27,7 @@ pub fn dispatch(cap: Cap) { .output() .unwrap(); } - // Safe arm — allowlist-guarded execution. + // Safe arm, allowlist-guarded execution. Cap::Safe => { let allowed = ["ls", "date"]; if allowed.contains(&user_cmd.as_str()) { diff --git a/tests/gauss_seidel_tests.rs b/tests/gauss_seidel_tests.rs index d0e7321b..82d750dc 100644 --- a/tests/gauss_seidel_tests.rs +++ b/tests/gauss_seidel_tests.rs @@ -4,7 +4,7 @@ //! Default mode is Jacobi (order-independent, reproducible). //! 
Gauss-Seidel is opt-in via `NYX_JS_GAUSS_SEIDEL=1` (or the //! test-only override). The two variants must produce **equal -//! findings** on every fixture — this is the core correctness +//! findings** on every fixture; this is the core correctness //! invariant for shipping G-S behind a flag. //! //! If this test ever fails, Gauss-Seidel has a precision leak and @@ -33,7 +33,7 @@ static GS_TEST_GUARD: Mutex<()> = Mutex::new(()); /// Sort findings into a deterministic order that ignores /// non-semantic fields so we can compare Jacobi vs. Gauss-Seidel -/// runs. Comparing raw `Diag` equality would be too strict — +/// runs. Comparing raw `Diag` equality would be too strict: /// evidence ordering, span-derived IDs, and rank scores can differ /// harmlessly between variants. We assert on the tuple /// `(path, line, col, id, severity, suppressed)` which is the @@ -68,8 +68,8 @@ fn finding_identities( /// Phase-C correctness invariant: Jacobi and Gauss-Seidel produce /// **equal findings** on the deep-chain fixture. /// -/// Gauss-Seidel may converge in fewer iterations — that is the whole -/// point of the optimisation — but the set of findings and their +/// Gauss-Seidel may converge in fewer iterations (that is the whole +/// point of the optimisation), but the set of findings and their /// primary locations must be identical. A divergence here would /// mean G-S is cutting off a real flow or introducing a spurious /// one; ship-blocking either way. diff --git a/tests/health_score_calibration.rs b/tests/health_score_calibration.rs index 50f3ca9d..10388267 100644 --- a/tests/health_score_calibration.rs +++ b/tests/health_score_calibration.rs @@ -6,7 +6,7 @@ //! fails fast if the change silently re-grades the boundary cases. //! //! Bands are deliberately wide (±5 points around the calibration -//! number) so honest curve-shape adjustments don't trip the test — +//! number) so honest curve-shape adjustments don't trip the test; //! 
it's a "did weights silently change everyone's grade?" guard, not //! an exact-output snapshot. //! diff --git a/tests/hierarchy_pipeline_tests.rs b/tests/hierarchy_pipeline_tests.rs index 38d2dfe5..2dba3113 100644 --- a/tests/hierarchy_pipeline_tests.rs +++ b/tests/hierarchy_pipeline_tests.rs @@ -40,7 +40,7 @@ struct File<'a> { } /// Run pass-1 extraction + merge over a synthetic file set, then -/// install the hierarchy index — mirroring exactly what production +/// install the hierarchy index, mirroring exactly what production /// scan paths do before pass 2 runs. fn build_gs(files: &[File<'_>]) -> GlobalSummaries { let cfg = test_config(AnalysisMode::Taint); @@ -63,7 +63,7 @@ fn build_gs(files: &[File<'_>]) -> GlobalSummaries { } // ───────────────────────────────────────────────────────────────────────── -// C1 — Java interface fan-out +// C1, Java interface fan-out // ───────────────────────────────────────────────────────────────────────── /// Pass-1 must extract the `class FileLogger implements ILogger` @@ -148,7 +148,7 @@ public class FileLogger implements ILogger { } // ───────────────────────────────────────────────────────────────────────── -// C2 — Rust trait fan-out +// C2, Rust trait fan-out // ───────────────────────────────────────────────────────────────────────── /// Pass-1 must extract `impl Logger for SafeLogger` and @@ -222,7 +222,7 @@ impl Logger for EvalLogger { arity: Some(2), }); // `arity = 2` because the trait method takes `(&self, &str)`. - // Some Rust pipelines record the receiver in arity, others don't — + // Some Rust pipelines record the receiver in arity, others don't , // accept either as long as both impls fan out. 
let widened_any_arity = if widened.is_empty() { gs.resolve_callee_widened(&CalleeQuery { @@ -252,7 +252,7 @@ impl Logger for EvalLogger { } // ───────────────────────────────────────────────────────────────────────── -// C3 — TypeScript class extends fan-out +// C3, TypeScript class extends fan-out // ───────────────────────────────────────────────────────────────────────── /// Pass-1 must extract `class Sub extends Super` and @@ -310,7 +310,7 @@ export class SubB extends Base { } // ───────────────────────────────────────────────────────────────────────── -// C4 — Python class hierarchy +// C4, Python class hierarchy // ───────────────────────────────────────────────────────────────────────── /// Pass-1 must extract `class Concrete(Base)` edges. The @@ -350,7 +350,7 @@ class Concrete(Base): } // ───────────────────────────────────────────────────────────────────────── -// C5 — Languages without an extractor are silently empty +// C5, Languages without an extractor are silently empty // ───────────────────────────────────────────────────────────────────────── /// Go's structural / implicit interface satisfaction is intractable @@ -358,7 +358,7 @@ class Concrete(Base): /// omitted** from the extractor. This test pins the contract: a Go /// program with what looks like inheritance produces an empty /// hierarchy index, and `resolve_callee_widened` collapses to today's -/// single-result behaviour — no fan-out, no regression. +/// single-result behaviour, no fan-out, no regression. #[test] fn go_program_produces_empty_hierarchy() { // Go interface + struct that satisfies it implicitly. No `extends` @@ -403,7 +403,7 @@ func (c *ConsoleLogger) Log(s string) { arity: Some(1), }); // Either empty (Logger has no Log method body in summaries) or - // single result — must NEVER fan out. + // single result, must NEVER fan out. 
assert!( widened.len() <= 1, "Go must produce ≤ 1 result with no hierarchy fan-out, got {widened:?}" @@ -411,7 +411,7 @@ func (c *ConsoleLogger) Log(s string) { } // ───────────────────────────────────────────────────────────────────────── -// C6 — Hierarchy install is idempotent +// C6, Hierarchy install is idempotent // ───────────────────────────────────────────────────────────────────────── /// Calling `install_hierarchy` twice produces the same view. This diff --git a/tests/hostile_input_tests.rs b/tests/hostile_input_tests.rs index a02f498d..427d38c4 100644 --- a/tests/hostile_input_tests.rs +++ b/tests/hostile_input_tests.rs @@ -4,7 +4,7 @@ //! potentially adversarial: arbitrarily large, pathologically nested, //! binary-ish, or deliberately crafted to wedge tree-sitter or the CFG //! builder. These tests exercise the user-facing size cap -//! (`scanner.max_file_size_mb`, default 16 MiB — enforced at the walker), +//! (`scanner.max_file_size_mb`, default 16 MiB, enforced at the walker), //! the per-file parse timeout (`analysis.engine.parse_timeout_ms`, default //! 10 s), and //! verify that the scanner survives several representative stress inputs @@ -81,7 +81,7 @@ where /// The walker's `max_file_size_mb` filter must drop oversize files before /// the pipeline ever opens them. This is the sole file-size gate: once a /// file is past the walker, the analysis pipeline does not re-check its -/// size — `max_file_size_mb = null` means truly unlimited parsing. The +/// size, `max_file_size_mb = null` means truly unlimited parsing. The /// pattern here (explicit `Some(1)`) is the interface every downstream /// caller can use to tighten the default further. 
#[test] @@ -96,7 +96,7 @@ fn walker_max_file_size_drops_oversize_files_before_scan() { std::fs::write(root.join("big.js"), big).unwrap(); let mut cfg = hostile_cfg(); - cfg.scanner.max_file_size_mb = Some(1); // 1 MiB — drops big.js, keeps small.js + cfg.scanner.max_file_size_mb = Some(1); // 1 MiB, drops big.js, keeps small.js let diags = scan_no_index(root, &cfg).expect("scan should succeed even with oversize files present"); @@ -109,7 +109,7 @@ fn walker_max_file_size_drops_oversize_files_before_scan() { /// Release-hardening regression: the default `ScannerConfig` must carry a /// finite ceiling so a fresh install never tries to parse a multi-gigabyte /// file from an untrusted repo. This test does not hard-code the exact -/// value — the property is that the default is *not* unlimited. +/// value, the property is that the default is *not* unlimited. #[test] fn default_config_has_finite_max_file_size() { let cfg = Config::default(); @@ -161,7 +161,7 @@ fn default_config_drops_file_above_cap() { } /// Operators who explicitly set `max_file_size_mb = null` must actually get -/// unlimited scanning — no silent hard cap overrides their decision. This +/// unlimited scanning, no silent hard cap overrides their decision. This /// locks in the contract: "unlimited means unlimited, trust the operator." /// The test uses a deliberately unsafe-looking JS source and asserts that /// the finding surfaces only in the unlimited run. @@ -182,7 +182,7 @@ fn explicit_unlimited_lifts_size_cap() { let mut cfg = hostile_cfg(); - // 1 MiB cap — must drop big.js entirely. + // 1 MiB cap, must drop big.js entirely. 
cfg.scanner.max_file_size_mb = Some(1); let tight = scan_no_index(root, &cfg).expect("tight-cap scan must succeed"); assert!( @@ -190,7 +190,7 @@ fn explicit_unlimited_lifts_size_cap() { "sanity: tight cap must have dropped big.js: {tight:?}", ); - // Explicit unlimited — the same file must now be visible to the + // Explicit unlimited, the same file must now be visible to the // scanner. Any pipeline exception would surface as a non-success. cfg.scanner.max_file_size_mb = None; let unlimited = with_time_budget(Duration::from_secs(20), "unlimited scan", || { @@ -257,7 +257,7 @@ fn empty_file_is_noop() { /// without blowing up. Minified bundles routinely hit this shape. We /// model it as ~10 000 independent short statements on one line (roughly /// what you see after bundler output) rather than one 500k-deep -/// right-associative expression — the latter is a separate stress case +/// right-associative expression, the latter is a separate stress case /// dominated by recursive descent and not representative of real input. /// /// Generous debug-build budget (20 s) because the full analysis pipeline @@ -315,7 +315,7 @@ fn deeply_nested_parens_do_not_stack_overflow() { /// builder. Each `if` frame in `build_sub` is ~10 KiB on debug builds, so /// 100 levels fits comfortably inside the production 8 MiB stack with room /// for the rest of the analysis pipeline above it. The goal is not to -/// probe the absolute limit — it is to lock in that a realistic generated- +/// probe the absolute limit, it is to lock in that a realistic generated- /// code depth does not crash the scanner. 
#[test] fn deeply_nested_if_statements_do_not_stack_overflow() { @@ -435,7 +435,7 @@ fn scan_of_mixed_hostile_directory_is_bounded() { } // ─────────────────────────────────────────────────────────────────────────── -// Symlink loops — infinite-loop resistance +// Symlink loops, infinite-loop resistance // ─────────────────────────────────────────────────────────────────────────── /// A self-referencing symlink (`a/self -> ../a`) is a classic hostile-input @@ -473,7 +473,7 @@ fn symlink_loop_does_not_hang_with_follow() { } /// Same fixture with `follow_symlinks = false` must also terminate in -/// bounded time — the symlink is not followed, so the loop never expands, +/// bounded time, the symlink is not followed, so the loop never expands, /// but we pin the contract so flipping the default cannot introduce a hang /// regression. #[cfg(unix)] diff --git a/tests/incremental_index_tests.rs b/tests/incremental_index_tests.rs index 06ef7fc8..d4f7bbc6 100644 --- a/tests/incremental_index_tests.rs +++ b/tests/incremental_index_tests.rs @@ -4,7 +4,7 @@ //! `FuncKey.disambig`. An earlier implementation keyed that field on //! the function node's `start_byte`, so inserting a line *above* an //! unchanged anonymous function shifted its identity and invalidated -//! persisted callback bindings and SSA summaries that referenced it — +//! persisted callback bindings and SSA summaries that referenced it , //! producing different diagnostics for semantically identical code. //! //! The disambig is now a depth-first preorder index over the file's @@ -52,7 +52,7 @@ struct ShiftedKey { } /// Normalize a rule id so the embedded `(source N:M)` suffix on taint -/// findings — which names the *source* line — is shifted by `line_delta` +/// findings, which names the *source* line, is shifted by `line_delta` /// instead of compared literally. 
fn normalize_rule_id(id: &str, line_delta: i64) -> String { let Some(open) = id.find("(source ") else { @@ -121,7 +121,7 @@ fn cold_build_index(project: &str, root: &Path, db_path: &Path, mode: AnalysisMo } // ───────────────────────────────────────────────────────────────────────────── -// Test 1 — local edit above a nested anonymous function. +// Test 1, local edit above a nested anonymous function. // ───────────────────────────────────────────────────────────────────────────── const LOCAL_FIXTURE_BEFORE: &str = "\ @@ -178,7 +178,7 @@ fn anon_fn_finding_stable_across_blank_line_prepend() { } // ───────────────────────────────────────────────────────────────────────────── -// Test 2 — cross-file callback resolution after comment-line insert. +// Test 2, cross-file callback resolution after comment-line insert. // ───────────────────────────────────────────────────────────────────────────── const CROSS_FILE_A_BEFORE: &str = "\ @@ -216,7 +216,7 @@ fn cross_file_anon_callback_stable_across_comment_insert() { // The cross-file flow should resolve: b.js → a.js (exported anon // function) → child_process.exec. Count taint findings in either - // file as the structural invariant — rank/line may legitimately + // file as the structural invariant, rank/line may legitimately // differ between scans, but the *presence* of a taint finding on // the sink side must not regress. fn taint_count_in(diags: &[Diag], rel: &str) -> usize { diff --git a/tests/indexed_parity_tests.rs b/tests/indexed_parity_tests.rs index 6e5d47b1..4b87a282 100644 --- a/tests/indexed_parity_tests.rs +++ b/tests/indexed_parity_tests.rs @@ -15,7 +15,7 @@ //! //! Path-dependent fields (absolute path, rank_score derived from ordering, //! evidence snippets that cite absolute paths) are excluded from the -//! fingerprint because they are not expected to diverge in meaning — only +//! fingerprint because they are not expected to diverge in meaning, only //! in representation. //! //! 
If an engine change is justified in making indexed and non-indexed diverge, @@ -49,10 +49,10 @@ use std::sync::Arc; /// if those match, confidence matches. /// /// **Does** include: -/// - `(line, col)` — where the finding is reported. -/// - `severity` — the analyst-visible triage axis. -/// - `rule_id` — which detector fired. -/// - `path_validated` — semantic axis used by triage UIs. +/// - `(line, col)`, where the finding is reported. +/// - `severity`, the analyst-visible triage axis. +/// - `rule_id`, which detector fired. +/// - `path_validated`, semantic axis used by triage UIs. /// /// If any of these differ between paths, the engine has genuinely produced /// different *findings*, not just different metadata. @@ -113,7 +113,7 @@ fn scan_indexed_cold(fixture_root: &Path, mode: AnalysisMode) -> (Vec, Pat // Keep tempdir alive by returning the db_path; actually return ownership of td. // We leak by forgetting the tempdir since the caller only needs the diags. // (Leaving tempdir scope drops it; we want it cleaned up, so we *don't* forget.) - // The tempdir drops here and removes the file — diags are already owned. + // The tempdir drops here and removes the file, diags are already owned. std::mem::drop(td); (diags, db_path) } @@ -146,7 +146,7 @@ fn format_fingerprint_set_diff( label_b: &str, b: &[Fingerprint], ) -> String { - // Count multiplicity of each fingerprint — divergence can be a changed + // Count multiplicity of each fingerprint, divergence can be a changed // *count* even when both sides contain the same key. 
let mut count_a: BTreeMap<&Fingerprint, usize> = BTreeMap::new(); let mut count_b: BTreeMap<&Fingerprint, usize> = BTreeMap::new(); @@ -279,7 +279,7 @@ fn run_parity_warm(fixture_name: &str, mode: AnalysisMode) { } // ───────────────────────────────────────────────────────────────────────────── -// Fixtures under parity contract — Full mode +// Fixtures under parity contract, Full mode // ───────────────────────────────────────────────────────────────────────────── // // Representative mix covering all 10 supported languages plus cross-file @@ -423,7 +423,7 @@ fn parity_full_route_registration_noise() { } // ───────────────────────────────────────────────────────────────────────────── -// Non-Full analysis modes — the Taint-mode filter divergence lives here +// Non-Full analysis modes, the Taint-mode filter divergence lives here // ───────────────────────────────────────────────────────────────────────────── // // Taint mode is the narrowest CFG-capable mode. Historically the indexed @@ -462,7 +462,7 @@ fn parity_ast_mode_patterns() { /// `auth.*`). These are produced by `run_cfg_analyses` under *any* CFG- /// capable mode, including Taint-only. A historical filter in the indexed /// path dropped everything that wasn't `taint*`/`cfg-*` from Taint-mode -/// output, silently swallowing state findings — this test pins that fix. +/// output, silently swallowing state findings, this test pins that fix. #[test] fn parity_taint_state_fixture() { run_parity("state", AnalysisMode::Taint); @@ -479,7 +479,7 @@ fn parity_ast_state_fixture() { } // ───────────────────────────────────────────────────────────────────────────── -// Warm-scan parity — detects caching bugs in the indexed path +// Warm-scan parity, detects caching bugs in the indexed path // ───────────────────────────────────────────────────────────────────────────── #[test] @@ -540,5 +540,5 @@ fn parity_full_sweep_all_fixtures() { // // None. 
Release-critical modes (Full, Taint, Cfg, Ast) must match bit-for-bit // on the finding fingerprint. If you think you need to add an exception, -// the test above should be the primary gate — don't loosen parity without +// the test above should be the primary gate, don't loosen parity without // writing a test that demonstrates *why* the divergence is acceptable. diff --git a/tests/inline_cache_origin_tests.rs b/tests/inline_cache_origin_tests.rs index 3bc15161..8c1c2b36 100644 --- a/tests/inline_cache_origin_tests.rs +++ b/tests/inline_cache_origin_tests.rs @@ -9,7 +9,7 @@ //! mis-attribute its source. //! //! A failure of this test implies a `taint-unsanitised-flow` finding is -//! naming the wrong source file/line — a credibility-killer for users +//! naming the wrong source file/line, a credibility-killer for users //! who then dismiss the tool as producing false positives. mod common; @@ -89,7 +89,7 @@ fn two_call_sites_get_distinct_source_attributions() { // 16: const sourceA = process.env.USER_INPUT; (call site 1 source) // 21: const sourceB = process.env.OTHER_INPUT; (call site 2 source) // - // The critical assertion is inequality — a naive cache would report + // The critical assertion is inequality, a naive cache would report // the FIRST-cached caller's source line on both findings (baking in // `VarTaint.origins` from whichever call fired first during // traversal). We also pin the exact expected lines so a silent @@ -117,7 +117,7 @@ fn two_call_sites_get_distinct_source_attributions() { fn inline_cache_reused_note_fires_on_second_call() { // Observability: the `InlineCacheReused` engine note is recorded // on cache-hit apply. At least one of the two call sites must - // carry it — whichever call loses the miss/hit race. + // carry it, whichever call loses the miss/hit race. 
// // The note is informational only: `EngineNote::InlineCacheReused` // returns `false` from `lowers_confidence()`, so its presence never diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index cb236017..82dad1c1 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -272,7 +272,7 @@ fn predicate_multi_arg_validator_wrong() { /// `setAttribute(attr, val)` with a dynamic first arg returns the /// ALL_ARGS_PAYLOAD sentinel, so sink scanning expands to every positional -/// arg — a tainted attribute name is itself a vulnerability path. Expects +/// arg, a tainted attribute name is itself a vulnerability path. Expects /// at least two findings (one per call where either arg is tainted). #[test] fn gated_sink_dynamic_activation() { @@ -316,7 +316,7 @@ fn cross_file_symex_js() { // --- True positives --------------------------------------------------------- /// Go: HTTP handler in handler.go passes r.FormValue("cmd") to runCommand() -/// defined in executor.go, which calls exec.Command — shell execution sink. +/// defined in executor.go, which calls exec.Command, shell execution sink. #[test] fn cross_file_go_handler_exec() { let dir = fixture_path("cross_file_go_handler_exec"); @@ -335,7 +335,7 @@ fn cross_file_java_sqli() { } /// TypeScript: router.ts reads req.query.url and forwards it to -/// fetchRemote() in httpClient.ts, which passes it to fetch() — SSRF. +/// fetchRemote() in httpClient.ts, which passes it to fetch(), SSRF. #[test] fn cross_file_ts_ssrf() { let dir = fixture_path("cross_file_ts_ssrf"); @@ -374,7 +374,7 @@ fn cross_file_js_sqli() { validate_expectations(&diags, &dir); } -/// Python: 3-file chain — os.environ in input_reader.py → passthrough in +/// Python: 3-file chain, os.environ in input_reader.py → passthrough in /// transform.py → subprocess.call in executor.py. Taint must survive two /// inter-file hops with no sanitisation. 
#[test] @@ -384,7 +384,7 @@ fn cross_file_py_nested_chain() { validate_expectations(&diags, &dir); } -/// Python: object attribute carries taint across files — JobRequest.cmd is +/// Python: object attribute carries taint across files, JobRequest.cmd is /// populated from os.environ in models.py; handler.py reads req.cmd and /// passes it to subprocess.call. #[test] @@ -397,7 +397,7 @@ fn cross_file_py_object_field() { // --- True negatives --------------------------------------------------------- /// Python: shlex.quote (SHELL_ESCAPE sanitiser) is defined in shell_utils.py -/// and called from handler.py before subprocess.call — no finding expected. +/// and called from handler.py before subprocess.call, no finding expected. #[test] fn cross_file_py_shlex_sanitizer() { let dir = fixture_path("cross_file_py_shlex_sanitizer"); @@ -406,7 +406,7 @@ fn cross_file_py_shlex_sanitizer() { } /// JavaScript: xss() HTML sanitiser defined in security.js is applied before -/// document.write in app.js — no taint-unsanitised-flow expected. +/// document.write in app.js, no taint-unsanitised-flow expected. #[test] fn cross_file_js_html_sanitized() { let dir = fixture_path("cross_file_js_html_sanitized"); @@ -415,7 +415,7 @@ fn cross_file_js_html_sanitized() { } /// Python: constants.py returns a hardcoded string literal; runner.py uses it -/// in subprocess.call — no taint source exists, so no finding expected. +/// in subprocess.call, no taint source exists, so no finding expected. #[test] fn cross_file_py_const_passthrough() { let dir = fixture_path("cross_file_py_const_passthrough"); @@ -424,7 +424,7 @@ fn cross_file_py_const_passthrough() { } /// Go: validation.go converts r.FormValue("id") with strconv.Atoi (Cap::all -/// sanitiser) before handler.go calls db.QueryRow — no SQL taint expected. +/// sanitiser) before handler.go calls db.QueryRow, no SQL taint expected. 
#[test] fn cross_file_go_int_validated() { let dir = fixture_path("cross_file_go_int_validated"); @@ -434,10 +434,10 @@ fn cross_file_go_int_validated() { // --- Near-miss cases -------------------------------------------------------- -/// Python near miss — TRUE POSITIVE: +/// Python near miss, TRUE POSITIVE: /// html_guard.py applies html.escape (HTML_ESCAPE cap) before a SQL /// concatenation in app.py. The HTML sanitiser does not cover SQL_QUERY -/// capability, so the flow is still vulnerable — Nyx should detect it. +/// capability, so the flow is still vulnerable, Nyx should detect it. /// Tests that the engine does not over-sanitise with the wrong cap type. #[test] fn cross_file_near_miss_wrong_sanitizer() { @@ -446,7 +446,7 @@ fn cross_file_near_miss_wrong_sanitizer() { validate_expectations(&diags, &dir); } -/// JavaScript near miss — TRUE NEGATIVE: +/// JavaScript near miss, TRUE NEGATIVE: /// session.js stores user input in `lastUser` but getDefaultQuery() returns /// the constant `defaultQuery`. app.js passes the result to pool.query(). /// A coarse analysis might falsely flag this; a precise one should not. @@ -458,12 +458,12 @@ fn cross_file_near_miss_field_isolation() { validate_expectations(&diags, &dir); } -/// Same-file identity collision — ADVERSARIAL. +/// Same-file identity collision, ADVERSARIAL. /// `runTask` is defined as a free function (shell-exec sink) AND as a /// method on multiple classes in the same file with conflicting /// security behaviours. A bare `runTask(tainted)` top-level call MUST /// resolve to the free function (its summary carries a SHELL_ESCAPE -/// sink) — the pre-fix resolver returned Ambiguous for this call and +/// sink), the pre-fix resolver returned Ambiguous for this call and /// silently dropped the finding. Regression guard for the bare-call /// free-function preference (resolve_callee step 5.5). 
#[test] @@ -476,7 +476,7 @@ fn same_name_collisions_js() { // ── New sink coverage fixtures ──────────────────────────────────────────── /// JS: execAsync wraps child_process.exec; user input flows through the -/// wrapper to the inner exec call — SHELL_ESCAPE finding expected. +/// wrapper to the inner exec call, SHELL_ESCAPE finding expected. #[test] fn exec_async_wrapper() { let dir = fixture_path("exec_async_wrapper"); @@ -484,7 +484,7 @@ fn exec_async_wrapper() { validate_expectations(&diags, &dir); } -/// JS: res.download(path.join(root, req.query.path)) — path traversal +/// JS: res.download(path.join(root, req.query.path)), path traversal /// via Express res.download FILE_IO sink. #[test] fn path_traversal_download() { @@ -493,7 +493,7 @@ fn path_traversal_download() { validate_expectations(&diags, &dir); } -/// JS: md5(password) and crypto.createHash("sha1") — weak hash patterns. +/// JS: md5(password) and crypto.createHash("sha1"), weak hash patterns. #[test] fn weak_hash_password() { let dir = fixture_path("weak_hash_password"); @@ -698,7 +698,7 @@ fn cross_file_info_leak() { validate_expectations(&diags, &dir); } -/// Python `subprocess.run(cmd, shell=True)` where `cmd` is user-controlled — +/// Python `subprocess.run(cmd, shell=True)` where `cmd` is user-controlled , /// the multi-kwarg SHELL_ESCAPE gate activates. Validates end-to-end wiring /// of `CallMeta.kwargs` through `classify_gated_sink`'s `dangerous_kwargs` /// path (presence-aware shell=True → dangerous). @@ -709,7 +709,7 @@ fn python_subprocess_shell_true_tainted() { validate_expectations(&diags, &dir); } -/// Python `subprocess.run([cmd], shell=False)` — shell kwarg present but not +/// Python `subprocess.run([cmd], shell=False)`, shell kwarg present but not /// dangerous. The gate must not fire and no taint flow should be reported. 
#[test] fn python_subprocess_shell_false_safe() { @@ -718,7 +718,7 @@ fn python_subprocess_shell_false_safe() { validate_expectations(&diags, &dir); } -/// Python `subprocess.run([cmd])` — no shell kwarg (default shell=False). +/// Python `subprocess.run([cmd])`, no shell kwarg (default shell=False). /// The gate must not fire and no taint flow should be reported. #[test] fn python_subprocess_shell_default_safe() { @@ -736,7 +736,7 @@ fn python_subprocess_shell_default_safe() { // into five categories so a single regression cannot silently erase a // whole category's coverage. -/// FP guard — sanitizer edge case: hand-rolled HTML escape covers +/// FP guard, sanitizer edge case: hand-rolled HTML escape covers /// document.write sink. #[test] fn fp_guard_sanitizer_html_escape_js() { @@ -745,7 +745,7 @@ fn fp_guard_sanitizer_html_escape_js() { validate_expectations(&diags, &dir); } -/// FP guard — sanitizer edge case: shlex.quote with shell metacharacters. +/// FP guard, sanitizer edge case: shlex.quote with shell metacharacters. #[test] fn fp_guard_sanitizer_shlex_quote_py() { let dir = fixture_path("fp_guards/sanitizer_shlex_quote_py"); @@ -753,7 +753,7 @@ fn fp_guard_sanitizer_shlex_quote_py() { validate_expectations(&diags, &dir); } -/// FP guard — sanitizer edge case: encodeURIComponent on a URL argument. +/// FP guard, sanitizer edge case: encodeURIComponent on a URL argument. #[test] fn fp_guard_sanitizer_url_encode_js() { let dir = fixture_path("fp_guards/sanitizer_url_encode_js"); @@ -761,7 +761,7 @@ fn fp_guard_sanitizer_url_encode_js() { validate_expectations(&diags, &dir); } -/// FP guard — sanitizer edge case: multi-step chain (`.strip()` then +/// FP guard, sanitizer edge case: multi-step chain (`.strip()` then /// `shlex.quote`) preserves the final SHELL_ESCAPE cap. 
#[test] fn fp_guard_sanitizer_multi_step_py() { @@ -770,7 +770,7 @@ fn fp_guard_sanitizer_multi_step_py() { validate_expectations(&diags, &dir); } -/// FP guard — type-driven suppression: `int()` parse of env port +/// FP guard, type-driven suppression: `int()` parse of env port /// before `socket.bind`. #[test] fn fp_guard_types_int_port_py() { @@ -779,7 +779,7 @@ fn fp_guard_types_int_port_py() { validate_expectations(&diags, &dir); } -/// FP guard — type-driven suppression: `int()` parse guarantees SQL +/// FP guard, type-driven suppression: `int()` parse guarantees SQL /// concat is decimal-only. #[test] fn fp_guard_types_int_id_sql_py() { @@ -788,7 +788,7 @@ fn fp_guard_types_int_id_sql_py() { validate_expectations(&diags, &dir); } -/// FP guard — type-driven suppression: Go `strconv.Atoi` covers +/// FP guard, type-driven suppression: Go `strconv.Atoi` covers /// Cap::all on the resulting int. #[test] fn fp_guard_types_parse_int_go() { @@ -797,7 +797,7 @@ fn fp_guard_types_parse_int_go() { validate_expectations(&diags, &dir); } -/// FP guard — type-driven suppression: bool comparison never reaches +/// FP guard, type-driven suppression: bool comparison never reaches /// a string-context sink. #[test] fn fp_guard_types_bool_flag_py() { @@ -806,7 +806,7 @@ fn fp_guard_types_bool_flag_py() { validate_expectations(&diags, &dir); } -/// FP guard — struct-field isolation: JS object `safeField` used at +/// FP guard, struct-field isolation: JS object `safeField` used at /// sink, tainted `unsafeField` unused. #[test] fn fp_guard_fields_object_isolation_js() { @@ -815,7 +815,7 @@ fn fp_guard_fields_object_isolation_js() { validate_expectations(&diags, &dir); } -/// FP guard — struct-field isolation: Python class attributes — only +/// FP guard, struct-field isolation: Python class attributes, only /// the hardcoded attribute flows to the sink. 
#[test] fn fp_guard_fields_class_attr_py() { @@ -824,7 +824,7 @@ fn fp_guard_fields_class_attr_py() { validate_expectations(&diags, &dir); } -/// FP guard — struct-field isolation: Python dict keys — only the +/// FP guard, struct-field isolation: Python dict keys, only the /// constant key flows to the sink. #[test] fn fp_guard_fields_dict_key_py() { @@ -833,7 +833,7 @@ fn fp_guard_fields_dict_key_py() { validate_expectations(&diags, &dir); } -/// FP guard — struct-field isolation: nested JS objects — sibling path +/// FP guard, struct-field isolation: nested JS objects, sibling path /// isolation at `cfg.auth.*`. #[test] fn fp_guard_fields_nested_object_js() { @@ -842,7 +842,7 @@ fn fp_guard_fields_nested_object_js() { validate_expectations(&diags, &dir); } -/// FP guard — cross-call-site specialization: same callee, two callers +/// FP guard, cross-call-site specialization: same callee, two callers /// (one tainted, one constant). Required finding only from the /// tainted caller. #[test] @@ -852,7 +852,7 @@ fn fp_guard_call_site_specialization_py() { validate_expectations(&diags, &dir); } -/// FP guard — cross-call-site specialization: JS helper called with a +/// FP guard, cross-call-site specialization: JS helper called with a /// literal SQL string must not inherit taint. #[test] fn fp_guard_call_site_specialization_js() { @@ -861,7 +861,7 @@ fn fp_guard_call_site_specialization_js() { validate_expectations(&diags, &dir); } -/// FP guard — cross-call-site specialization: helper called with a +/// FP guard, cross-call-site specialization: helper called with a /// shlex.quote-sanitised value, inline analysis sees SHELL_ESCAPE cap. #[test] fn fp_guard_call_site_sanitized_caller_py() { @@ -870,8 +870,8 @@ fn fp_guard_call_site_sanitized_caller_py() { validate_expectations(&diags, &dir); } -/// FP guard — cross-call-site specialization: polymorphic caller -/// (int branch and constant branch) — neither carries a payload. 
+/// FP guard, cross-call-site specialization: polymorphic caller +/// (int branch and constant branch), neither carries a payload. #[test] fn fp_guard_call_site_polymorphic_py() { let dir = fixture_path("fp_guards/call_site_polymorphic_py"); @@ -879,7 +879,7 @@ fn fp_guard_call_site_polymorphic_py() { validate_expectations(&diags, &dir); } -/// FP guard — framework-safe pattern: Rails `sanitize` before render. +/// FP guard, framework-safe pattern: Rails `sanitize` before render. #[test] fn fp_guard_framework_rails_sanitize() { let dir = fixture_path("fp_guards/framework_rails_sanitize"); @@ -887,7 +887,7 @@ fn fp_guard_framework_rails_sanitize() { validate_expectations(&diags, &dir); } -/// FP guard — framework-safe pattern: Flask + MarkupSafe `escape`. +/// FP guard, framework-safe pattern: Flask + MarkupSafe `escape`. #[test] fn fp_guard_framework_flask_escape() { let dir = fixture_path("fp_guards/framework_flask_escape"); @@ -895,7 +895,7 @@ fn fp_guard_framework_flask_escape() { validate_expectations(&diags, &dir); } -/// FP guard — framework-safe pattern: Express `res.json` with a +/// FP guard, framework-safe pattern: Express `res.json` with a /// constant payload is not an XSS sink. #[test] fn fp_guard_framework_express_res_json() { @@ -904,7 +904,21 @@ fn fp_guard_framework_express_res_json() { validate_expectations(&diags, &dir); } -/// FP guard — framework-safe pattern: JDBC PreparedStatement.setString +/// FP guard, FastAPI `dependencies=[Depends(requires_access_*)]` +/// route-level guard short-circuits `auth_check_covers_subject` so +/// the handler body's path-param ORM calls and row-variable method +/// calls do not trip `py.auth.missing_ownership_check`. Pinned by +/// the `is_route_level` flag on `AuthCheck` plus the kind-aware +/// `function_params_route_handler` that includes id-like Python +/// typed params (`dag_id: str`) in `unit.params`. 
+#[test] +fn fp_guard_framework_fastapi_route_level_auth() { + let dir = fixture_path("fp_guards/framework_fastapi_route_level_auth"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + +/// FP guard, framework-safe pattern: JDBC PreparedStatement.setString /// covers SQL_QUERY on the bound parameter. #[test] fn fp_guard_framework_prepared_stmt_java() { @@ -913,7 +927,7 @@ fn fp_guard_framework_prepared_stmt_java() { validate_expectations(&diags, &dir); } -/// FP guard — JPA parameterised execute chain +/// FP guard, JPA parameterised execute chain /// (`em.createQuery(LITERAL).setParameter(...).executeUpdate()`). /// Pinned from a 150-finding cluster in keycloak's /// `JpaEventStoreProvider.java`. The engine walks the receiver chain @@ -928,7 +942,24 @@ fn fp_guard_framework_jpa_parameterised_execute() { validate_expectations(&diags, &dir); } -/// FP guard — composer / PSR-4 autoloader closure includes a parameter. +/// FP guard, Strapi-style ORM accessor chain +/// (`.db.query(MODEL_UID).(...)`). Pinned from a +/// ~98-finding `cfg-unguarded-sink` + 40-finding `taint-unsanitised-flow` +/// cluster across strapi services (api-token, transfer/token, user, +/// release, …). When the chain shape `*.query(LITERAL).` , +/// `findOne|findMany|findFirst|findUnique|find|create|createMany|update| +/// updateMany|upsert|delete|deleteMany|count|aggregate|distinct|save` , +/// is detected, a same-node `Sanitizer(SQL_QUERY)` is synthesised that +/// reflexively dominates the sink. Bare `connection.query(...)` and +/// chained `.then` (Promise method) are not affected. +#[test] +fn fp_guard_framework_strapi_db_query_chain() { + let dir = fixture_path("fp_guards/framework_strapi_db_query_chain"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + +/// FP guard, composer / PSR-4 autoloader closure includes a parameter. 
/// Pinned from a 32-finding cluster in nextcloud's vendored /// `composer/composer/ClassLoader.php` plus three further methods /// (Router::requireRouteFile, Installer::includeAppScript, @@ -943,7 +974,7 @@ fn fp_guard_php_include_param_passthrough() { validate_expectations(&diags, &dir); } -/// FP guard — `unserialize($x, ['allowed_classes' => …])` PHP 7+ +/// FP guard, `unserialize($x, ['allowed_classes' => …])` PHP 7+ /// structural mitigation against object injection. Pinned from /// nextcloud's profiler / DAV custom-properties / queue-bus call sites /// where `allowed_classes` is set to `false`, an array literal, or a @@ -955,7 +986,28 @@ fn fp_guard_php_unserialize_allowed_classes() { validate_expectations(&diags, &dir); } -/// FP guard — C/C++ buffer-overflow pattern rules +/// FP guard, JS / TS local-collection receivers. Pinned from the +/// excalidraw element-manipulation cluster (66 → ~9 on +/// `js.auth.missing_ownership_check` over the repo). The fix lives at +/// the deepest representable layer: SSA `TypeFacts::constructor_type` +/// recognises `new Map()` / `new Set()` / `new WeakMap()` / +/// `new WeakSet()` / `new Array()` as `TypeKind::LocalCollection`; +/// `cfg::params::ts_type_to_local_collection` extends +/// `classify_param_type_ts` so explicitly-typed params resolve to +/// `LocalCollection` independent of NestJS decorator presence; +/// `cfg::dto::collect_type_alias_local_collections` populates a +/// per-file `TYPE_ALIAS_LC` set so same-file `type X = Map<...>` +/// aliases also resolve. The auth analyser already exempts +/// `LocalCollection`-typed receivers via +/// `auth_analysis::sink_class_for_type → InMemoryLocal`. 
+#[test] +fn fp_guard_auth_local_collection_receiver() { + let dir = fixture_path("fp_guards/auth_local_collection_receiver"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + +/// FP guard, C/C++ buffer-overflow pattern rules /// (`c.memory.strcpy`, `strcat`, `sprintf`) over-fire when the source / /// format-string argument is a literal whose contributed length is /// statically bounded. Pinned from a 938-finding cluster across postgres @@ -969,3 +1021,71 @@ fn fp_guard_c_buffer_literal_src() { let diags = scan_fixture_dir(&dir, AnalysisMode::Full); validate_expectations(&diags, &dir); } + +/// FP guard, `rs.auth.missing_ownership_check` over-fires on Rust +/// helpers when (a) a parameter's TYPE annotation contains an +/// identifier whose lower-case form matches the framework-request-name +/// allow-list (`path`, `req`, `request`, `ctx`, `body`, …), e.g. +/// `dst: &std::path::Path` contributes the `Path` ident, or (b) a +/// receiver typed as an in-memory container (`RoaringBitmap`, +/// `HashMap`, `HashSet`) is treated as a `DbMutation` because +/// the verb-name dispatch (`is_mutation: insert/remove`) doesn't see +/// the type. Both clusters surfaced from meilisearch's +/// `index-scheduler` crate +/// (`scheduler/process_snapshot_creation.rs::remove_tasks` for (a), +/// `scheduler/enterprise_edition/network.rs::balance_shards` for (b)). +/// +/// Engine fixes: +/// * `src/auth_analysis/extract/common.rs::collect_param_names` , +/// added a Rust `parameter` arm that descends only into the +/// `pattern` field, never the `type` field. Type-segment idents +/// no longer pollute `unit.params` and the +/// `unit_has_user_input_evidence` gate stays closed on internal +/// helpers whose true params carry no user-input shape. 
+/// * `src/cfg/params.rs::rust_type_to_local_collection` (new) + +/// `classify_param_type_rust` rewire, Rust function-parameter +/// type annotations naming a known local-collection type +/// (`Vec`/`HashMap`/`HashSet`/`BTreeMap`/`BTreeSet`/`VecDeque`/ +/// `BinaryHeap`/`LinkedList`/`IndexMap`/`IndexSet`/`SmallVec`/ +/// `DashMap`/`DashSet`/`FxHashMap`/`FxHashSet`/`RoaringBitmap`/ +/// `RoaringTreemap`, plus `[T; N]` / `[T]` array-and-slice +/// shorthand) classify the receiver as `TypeKind::LocalCollection`, +/// which `auth_analysis::sink_class_for_type` maps to +/// `SinkClass::InMemoryLocal` (non-auth-relevant). +/// * `src/ssa/type_facts.rs::is_rust_local_collection_constructor` , +/// `RoaringBitmap` / `RoaringTreemap` added to the constructor-type +/// table so `let s = RoaringBitmap::new(); s.insert(...)` also +/// classifies correctly. +/// +/// Persistent-store types like heed `Database<...>` / `sled::Db` / +/// `Mutex>` deliberately stay `None` so real IDOR +/// detection on persistent-store calls is preserved (covered by the +/// `unsafe_handler_local_collection_does_not_blanket_suppress.rs` +/// vulnerable counterpart). +#[test] +fn fp_guard_auth_rust_param_typed_local_collection() { + let dir = fixture_path("fp_guards/auth_rust_param_typed_local_collection"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + +/// Panic guard, CFG condition-text truncation (and symex display +/// truncation) must round byte cuts down to the nearest UTF-8 char +/// boundary. Reproduces the gogs scan crash where +/// `public/plugins/codemirror-5.17.0/mode/gherkin/gherkin.js` ships a +/// long localised regex (Gurmukhi `ਖ`, Devanagari, CJK, Cyrillic…) inside +/// a boolean sub-condition; byte 256 landed inside `'ਖ'` (3-byte UTF-8) +/// and `t[..MAX_CONDITION_TEXT_LEN].to_string()` panicked the rayon +/// worker. 
Engine fix: +/// `src/utils/snippet.rs::truncate_at_char_boundary`, applied at three +/// CFG sites (`src/cfg/conditions.rs::push_condition_node`, +/// `emit_rust_match_guard_if`, `src/cfg/mod.rs::extract_condition`) and +/// two symex display sites (`src/symex/value.rs::Display`). Invariant: +/// scanning this file must terminate without panicking, regardless of +/// where byte 256 lands inside the regex literal. +#[test] +fn fp_guard_cfg_utf8_long_condition() { + let dir = fixture_path("fp_guards/cfg_utf8_long_condition"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} diff --git a/tests/js_ts_pass2_convergence_tests.rs b/tests/js_ts_pass2_convergence_tests.rs index 2a47fb03..2e2b76ad 100644 --- a/tests/js_ts_pass2_convergence_tests.rs +++ b/tests/js_ts_pass2_convergence_tests.rs @@ -4,7 +4,7 @@ //! body's exit state (filtered to top-level keys) back into the shared //! seed and re-runs non-toplevel bodies with the enlarged seed. The //! hardcoded cap of `3` that used to live in `analyse_file` silently -//! truncated any file whose convergence required 4+ rounds — this +//! truncated any file whose convergence required 4+ rounds, this //! phase lifts the cap to [`JS_TS_PASS2_SAFETY_CAP`] (64), adds an //! observability counter, and tags cap-hit findings with //! [`EngineNote::InFileFixpointCapped`]. @@ -30,7 +30,7 @@ fn fixture_path(name: &str) -> std::path::PathBuf { /// Serialize any test that mutates the pass-2 cap override or reads /// `last_js_ts_pass2_iterations()`. The override is a process-wide -/// `AtomicUsize` and `cargo test` runs tests in parallel by default — +/// `AtomicUsize` and `cargo test` runs tests in parallel by default , /// without this guard, one test's override leaks into another's scan. 
static PASS2_TEST_GUARD: Mutex<()> = Mutex::new(()); @@ -73,7 +73,7 @@ fn js_ts_pass2_deep_chain_emits_transitive_finding() { /// Override plumbing: verify that `set_js_ts_pass2_cap_override` binds /// the effective cap and that restoring the default clears cleanly. /// -/// We use a cap of 1 (meaning `rounds == 0` — the pass-2 loop does not +/// We use a cap of 1 (meaning `rounds == 0`, the pass-2 loop does not /// enter). This is the sharpest possible override and exercises the /// "cap bound to minimum" code path. The counter must then fall back /// to the pass-1-only value of `1`. @@ -82,7 +82,7 @@ fn js_ts_pass2_cap_override_binds_effective_cap() { let _guard = PASS2_TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner()); let dir = fixture_path("js_ts_pass2_deep_chain"); - // First scan with the cap forced to 1 — the pass-2 loop does not + // First scan with the cap forced to 1, the pass-2 loop does not // enter at all (`max_iterations.saturating_sub(1) == 0`). The // counter must report exactly `1` (the sentinel for "pass-1 // containment ran, no pass-2 iterations"). @@ -120,7 +120,7 @@ fn js_ts_pass2_cap_override_binds_effective_cap() { /// identify potentially-imprecise results. The deep-chain fixture's /// pass-2 seed actually grows between rounds (`seed_handler` publishes /// `globalG1` to other bodies), so forcing the cap to `2` binds the -/// loop at a single round — the seed grew, no convergence was +/// loop at a single round, the seed grew, no convergence was /// detected, and the note path fires. #[test] fn js_ts_pass2_cap_hit_emits_engine_note() { @@ -130,7 +130,7 @@ fn js_ts_pass2_cap_hit_emits_engine_note() { // cap=2 → max_iterations=2, rounds=1. Round 0 combines // `seed_handler`'s exit (which includes `globalG1`) into the - // seed — the seed grows from empty to 1 entry, so the + // seed, the seed grows from empty to 1 entry, so the // convergence-equality branch does not fire. 
Loop exits with // `converged_early = false`, note emission triggers. set_js_ts_pass2_cap_override(2); diff --git a/tests/malformed_config_tests.rs b/tests/malformed_config_tests.rs index a6e5db0a..d4eab335 100644 --- a/tests/malformed_config_tests.rs +++ b/tests/malformed_config_tests.rs @@ -39,7 +39,7 @@ fn load_with_local(dir: &Path, contents: &str) -> Result { fn syntactically_invalid_toml_returns_parse_error() { let tmp = tempfile::tempdir().unwrap(); - // `foo = [[` is an unterminated array-of-tables header — pure syntax + // `foo = [[` is an unterminated array-of-tables header, pure syntax // error at the lexer level. let result = load_with_local(tmp.path(), "foo = [[\n"); @@ -62,7 +62,7 @@ fn syntactically_invalid_toml_returns_parse_error() { fn type_mismatch_in_known_field_returns_error() { let tmp = tempfile::tempdir().unwrap(); - // `performance.worker_threads` is typed `Option` — a bare string + // `performance.worker_threads` is typed `Option`, a bare string // is unambiguously wrong and must be rejected. let contents = "\ [performance]\n\ @@ -74,7 +74,7 @@ worker_threads = \"auto\"\n\ Err(NyxError::Toml(e)) => { let msg = e.to_string(); // Deserialisation errors should name either the field or the - // expected type — be lenient on exact wording. + // expected type, be lenient on exact wording. assert!( msg.contains("worker_threads") || msg.to_lowercase().contains("integer") @@ -91,7 +91,7 @@ worker_threads = \"auto\"\n\ /// A semantically-invalid config (e.g. `server.port = 0`) must be caught by /// `Config::validate`, surfacing as a `ConfigValidation` error that lists /// the offending section and field. This is a second layer of defence past -/// deserialisation — types parse fine, but values are out of range. +/// deserialisation, types parse fine, but values are out of range. #[test] fn out_of_range_value_fails_validation() { let tmp = tempfile::tempdir().unwrap(); @@ -120,7 +120,7 @@ port = 0\n\ /// (e.g. 
switching to strict mode) is explicit rather than surprising. /// /// If strict-mode is later desired, this test should be flipped to assert -/// the error path — but in either case the behaviour is explicit. +/// the error path, but in either case the behaviour is explicit. #[test] fn unknown_top_level_section_is_tolerated_today() { let tmp = tempfile::tempdir().unwrap(); @@ -157,7 +157,7 @@ bogus_unknown_field = 42\n\ )); } -/// Empty `nyx.local` (zero-byte file) must load cleanly — the merge overlays +/// Empty `nyx.local` (zero-byte file) must load cleanly, the merge overlays /// nothing onto defaults. #[test] fn empty_user_config_uses_defaults() { diff --git a/tests/panic_recovery_tests.rs b/tests/panic_recovery_tests.rs index 7cdd94e7..b35d2680 100644 --- a/tests/panic_recovery_tests.rs +++ b/tests/panic_recovery_tests.rs @@ -18,7 +18,7 @@ use std::panic::AssertUnwindSafe; use std::path::Path; use std::sync::Mutex; -/// Env-var writes are process-global — integration tests run multiple +/// Env-var writes are process-global, integration tests run multiple /// `#[test]` functions in one binary, and rayon dispatches the analyser on /// background threads that read the env table concurrently. Serialize the /// set/clear dance so a test that expects "no injection" never races a test @@ -73,7 +73,7 @@ where } /// With injection armed and a file whose path contains the marker, the scan -/// MUST fail in a way the caller can observe — either a propagated panic or +/// MUST fail in a way the caller can observe, either a propagated panic or /// a returned error. Silently succeeding would mean findings from poisoned /// analysis were emitted as legitimate output. 
We also verify the clean /// file on disk is a plausible target (the injection only fires for the @@ -83,7 +83,7 @@ fn scan_surfaces_injected_panic_from_worker() { let tmp = tempfile::tempdir().unwrap(); let root = tmp.path(); - // Clean file — if the injection hook incorrectly fired on every path we + // Clean file, if the injection hook incorrectly fired on every path we // would see this one panic too. std::fs::write( root.join("normal.js"), @@ -92,7 +92,7 @@ fn scan_surfaces_injected_panic_from_worker() { ) .unwrap(); - // File whose path contains the marker — must trigger the injected panic. + // File whose path contains the marker, must trigger the injected panic. let poisoned = format!("{PANIC_MARKER}.js"); std::fs::write( root.join(&poisoned), @@ -111,16 +111,16 @@ fn scan_surfaces_injected_panic_from_worker() { // Current behaviour (pre-`enable_panic_recovery`): the scan panics // out of rayon. If a future change adds panic containment, the scan - // would instead return Ok with a warning — that counts as surfacing + // would instead return Ok with a warning, that counts as surfacing // the failure and is also acceptable here. The thing we refuse to // accept silently is a successful scan that claims the poisoned file // was analysed without incident. match outcome { Err(_panic) => { - // Panic propagated — expected today. + // Panic propagated, expected today. } Ok(Err(_nyx_err)) => { - // Graceful error — acceptable if recovery ever lands. + // Graceful error, acceptable if recovery ever lands. 
} Ok(Ok(_diags)) => { // If the scan completes successfully, the poisoned file was @@ -151,7 +151,7 @@ fn clean_scan_without_injection_does_not_panic() { std::fs::write(root.join(format!("{PANIC_MARKER}.js")), b"var safe = 1;\n").unwrap(); // Ensure the marker is not armed for this test even if a prior test - // leaked state (belt-and-suspenders — `with_panic_injection` already + // leaked state (belt-and-suspenders, `with_panic_injection` already // cleans up, but concurrent test binaries share a process env). let guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); unsafe { @@ -161,7 +161,7 @@ fn clean_scan_without_injection_does_not_panic() { .expect("clean scan with injection disarmed must succeed"); drop(guard); - // The JS file has cp.exec(cmd) on a tainted arg — at minimum one + // The JS file has cp.exec(cmd) on a tainted arg, at minimum one // finding should surface, proving the scan actually analysed files // rather than silently short-circuiting. assert!( @@ -203,7 +203,7 @@ fn recovery_mode_skips_poisoned_file_and_continues() { let tmp = tempfile::tempdir().unwrap(); let root = tmp.path(); - // Clean file with a tainted cp.exec — we expect at least one finding. + // Clean file with a tainted cp.exec, we expect at least one finding. std::fs::write( root.join("normal.js"), b"const cp = require('child_process');\n\ @@ -238,7 +238,7 @@ fn recovery_mode_skips_poisoned_file_and_continues() { ), }; - // The clean file must still surface its finding — proof the rayon + // The clean file must still surface its finding, proof the rayon // pipeline kept running after the poisoned worker panicked. 
assert!( diags diff --git a/tests/parse_timeout_test.rs b/tests/parse_timeout_test.rs index fa8dd6c9..6435e30e 100644 --- a/tests/parse_timeout_test.rs +++ b/tests/parse_timeout_test.rs @@ -45,7 +45,7 @@ fn build_bulk_source(bytes_target: usize) -> String { #[test] fn parse_timeout_config_short_circuits_parse() { - // ~1 MiB of valid JS — plenty of real parser work to observe the + // ~1 MiB of valid JS, plenty of real parser work to observe the // timeout. Still well under MAX_PARSE_BYTES. let source = build_bulk_source(1_000_000); @@ -68,7 +68,7 @@ fn parse_timeout_config_short_circuits_parse() { // A timed-out parse surfaces a synthetic informational diag // carrying an `EngineNote::ParseTimeout` so downstream tooling can // tell "we found nothing" from "we stopped looking". Any other - // finding would imply the parser actually produced a tree — i.e. + // finding would imply the parser actually produced a tree, i.e. // the timeout did not short-circuit. assert!( diags.iter().all(|d| d.id == "engine.parse_timeout"), diff --git a/tests/pattern_tests.rs b/tests/pattern_tests.rs index 28bddb63..f320d965 100644 --- a/tests/pattern_tests.rs +++ b/tests/pattern_tests.rs @@ -204,7 +204,7 @@ fn tier_a_patterns_have_no_heuristic_in_description() { } } - // Warn but don't fail — descriptions are informational + // Warn but don't fail, descriptions are informational if !violations.is_empty() { eprintln!( "WARNING: Tier A patterns with heuristic-like descriptions:\n {}", @@ -277,6 +277,9 @@ fn positive_java() { "java.reflection.method_invoke", "java.sqli.execute_concat", "java.crypto.insecure_random", + // CVE-2022-1471 SnakeYAML / CVE-2022-42889 Text4Shell. + "java.deser.snakeyaml_unsafe_constructor", + "java.code_exec.text4shell_interpolator", ], ); } @@ -293,6 +296,11 @@ fn positive_python() { "py.cmdi.os_popen", "py.deser.pickle_loads", "py.deser.yaml_load", + // CVE-2025-69662 / CVE-2025-24793 motivated f-string SQLi. 
+ // py.sqli.execute_format must fire on the f-string shape and + // py.sqli.text_format must fire on the SQLAlchemy text() shape. + "py.sqli.execute_format", + "py.sqli.text_format", ], ); } diff --git a/tests/perf_breakdown.rs b/tests/perf_breakdown.rs index 1fccad62..c067f767 100644 --- a/tests/perf_breakdown.rs +++ b/tests/perf_breakdown.rs @@ -2,7 +2,7 @@ //! //! Run with: cargo test --test perf_breakdown --release -- --nocapture stage_breakdown //! -//! Not a regression test — prints µs/file for each pipeline stage so we can +//! Not a regression test, prints µs/file for each pipeline stage so we can //! locate hot stages without a sampling profiler. use nyx_scanner::ast; @@ -25,7 +25,7 @@ fn pct(samples: &mut [u128], p: f64) -> u128 { /// Mirrors the production `scan_filesystem` pass-1 + pass-2 shape: both /// passes call `analyse_file_fused` (pass 1 with `global=None`, pass 2 with -/// `global=Some`). This is the path the perf fix targets — the bench +/// `global=Some`). This is the path the perf fix targets, the bench /// `full_scan` benchmark instead uses `extract_summaries_from_file` + /// `run_rules_on_file`, which doesn't exercise the /// `lower_all_functions_from_bodies` redundancy fixed below. @@ -136,7 +136,7 @@ fn fused_walltime() { /// Production-equivalent fused stage breakdown: mirrors the post-round-1 /// `analyse_file_fused` pipeline (shared lowering, no double-lower). -/// Use this — `stage_breakdown` over-counts because its helper double-lowers. +/// Use this, `stage_breakdown` over-counts because its helper double-lowers. 
#[test] fn fused_stage_breakdown() { use nyx_scanner::ast::{analyse_file_fused, perf_stage_breakdown_fused}; diff --git a/tests/perf_tests.rs b/tests/perf_tests.rs index 767f0638..a7822768 100644 --- a/tests/perf_tests.rs +++ b/tests/perf_tests.rs @@ -91,7 +91,7 @@ fn bench_indexed(fixture_dir: &Path, iterations: usize) -> (u64, u64) { let _ = scan_with_index_parallel("bench", Arc::clone(&pool), &cfg, false, fixture_dir); cold_durations.push(start.elapsed().as_millis() as u64); - // Warm: second scan on same index — files unchanged + // Warm: second scan on same index, files unchanged let start = Instant::now(); let _ = scan_with_index_parallel("bench", Arc::clone(&pool), &cfg, false, fixture_dir); warm_durations.push(start.elapsed().as_millis() as u64); @@ -126,7 +126,7 @@ fn run_fixture_bench(name: &str) { // Shared GitHub Actions runners have unpredictable CPU contention; // give "lenient" fixtures 2x headroom so a slow-but-passing scanner // does not flake the build. "strict" fixtures still keep a small - // cushion — regressions at that level are real. + // cushion, regressions at that level are real. let multiplier = if perf.ci_mode == "lenient" { 2.0 } else { 1.25 }; let max_no_index = (perf.max_ms_no_index as f64 * multiplier) as u64; let max_cold = (perf.max_ms_index_cold as f64 * multiplier) as u64; diff --git a/tests/phase12_switch_tests.rs b/tests/phase12_switch_tests.rs index 5864bc65..26a8b201 100644 --- a/tests/phase12_switch_tests.rs +++ b/tests/phase12_switch_tests.rs @@ -1,11 +1,11 @@ //! Switch-lowering fixture coverage. //! -//! * `large_switch_go` — Go switch with 6+ mutually exclusive cases +//! * `large_switch_go`, Go switch with 6+ mutually exclusive cases //! dispatching to distinct sinks. Exercises multi-case taint flow; //! succeeds regardless of whether SSA lowering emits //! `Terminator::Switch` or the legacy cascade of `Branch` headers. //! -//! * `switch_fall_through_c` — C switch with explicit fall-through, +//! 
* `switch_fall_through_c`, C switch with explicit fall-through, //! regression-guarding the cascade-preserving lowering for languages //! whose switch semantics allow cases to be non-exclusive. diff --git a/tests/phase8_fragility_tests.rs b/tests/phase8_fragility_tests.rs index 1804d095..30a2b76a 100644 --- a/tests/phase8_fragility_tests.rs +++ b/tests/phase8_fragility_tests.rs @@ -9,15 +9,15 @@ //! here and update the expectations. //! //! Fixture layout: -//! * closure capture — +//! * closure capture , //! - `closure_capture_py` (required) //! - `closure_capture_js` (required) //! - `closure_capture_ts` (required) -//! * async/await — +//! * async/await , //! - `async_python` (required) -//! - `async_rust` (required — Tokio process coverage) +//! - `async_rust` (required, Tokio process coverage) //! - `async_promise_chain_js` (known gap) -//! * container-element taint — +//! * container-element taint , //! - `container_taint_py` (required) //! - `container_taint_js` (required) //! @@ -48,7 +48,7 @@ fn closure_capture_py() { validate_expectations(&diags, &dir); } -/// Closure-capture taint detection is now supported on the JS path — +/// Closure-capture taint detection is now supported on the JS path , /// arrow function captures of outer-scope tainted vars flow to the /// inner sink. See README.md for the intended flow. #[test] @@ -58,7 +58,7 @@ fn closure_capture_js() { validate_expectations(&diags, &dir); } -/// Closure-capture taint detection on the TS path — parity with the JS +/// Closure-capture taint detection on the TS path, parity with the JS /// sibling. Separately regression-guarded so the TypeScript grammar /// path does not silently diverge. #[test] @@ -84,7 +84,7 @@ fn async_rust() { validate_expectations(&diags, &dir); } -/// See README.md — taint across chained `.then` callbacks is not +/// See README.md, taint across chained `.then` callbacks is not /// modelled today. 
The `forbidden_findings` entry pins current /// behaviour; a future promise-resolution improvement must flip the /// expectation. diff --git a/tests/phase_c_auth_taint_tests.rs b/tests/phase_c_auth_taint_tests.rs index 13201e9a..b5e73411 100644 --- a/tests/phase_c_auth_taint_tests.rs +++ b/tests/phase_c_auth_taint_tests.rs @@ -1,4 +1,4 @@ -//! Phase C — auth-as-taint integration tests. +//! Phase C, auth-as-taint integration tests. //! //! Verifies the end-to-end flow of `Cap::UNAUTHORIZED_ID` folded into the //! SSA/taint engine: @@ -46,7 +46,7 @@ fn diags_for(diags: &[Diag], filename: &str) -> Vec { #[test] fn phase_c_flag_off_emits_no_auth_taint_finding() { - // Baseline: flag default (off) — no `rs.auth.missing_ownership_check.taint` + // Baseline: flag default (off), no `rs.auth.missing_ownership_check.taint` // diag should appear. This guards against the Phase C rules leaking when // the flag is not flipped. let cfg = common::test_config(AnalysisMode::Full); diff --git a/tests/pointer_disabled_bit_identity.rs b/tests/pointer_disabled_bit_identity.rs index 29267677..d548b440 100644 --- a/tests/pointer_disabled_bit_identity.rs +++ b/tests/pointer_disabled_bit_identity.rs @@ -8,7 +8,7 @@ //! superset that DROPS no genuine findings. //! //! Both modes are exercised in the same test process via a serial -//! mutex around env-var manipulation — cargo runs tests in parallel +//! mutex around env-var manipulation, cargo runs tests in parallel //! and an unprotected env-var write would leak between threads. //! //! A4 baseline snapshot: when the env variable @@ -43,12 +43,12 @@ fn fixture_path(name: &str) -> PathBuf { /// Fixture mix curated for the strict-additive guard. Picks shapes /// the pointer module actively touches: /// -/// * `container_taint_js` — JS container ops (push/shift/pop) flow +/// * `container_taint_js`, JS container ops (push/shift/pop) flow /// through the W2 / W4 ELEM cells when pointer is on. 
-/// * `container_taint_py` — Python container shapes mirror the JS path +/// * `container_taint_py`, Python container shapes mirror the JS path /// for non-method `__getitem__` / `__setitem__` (W5; deferred but /// the existing method-shape ops are still exercised). -/// * `cross_file_py_object_field` — field-flow shapes that exercise +/// * `cross_file_py_object_field`, field-flow shapes that exercise /// the W1 / W3 cross-call resolver with field-name keys. /// /// Picked deliberately small: every additional fixture multiplies the @@ -114,7 +114,7 @@ where /// it must not change the structural identity of any existing /// finding. The current curated fixtures exercise shapes the /// pointer module touches but where existing engine analyses already -/// produce all the findings — so the equality check is the right +/// produce all the findings, so the equality check is the right /// shape today. When pointer-on starts adding NEW findings to these /// fixtures, the test should be updated to assert /// `enabled.is_superset(disabled)`. @@ -178,7 +178,7 @@ fn pointer_disabled_finding_set_matches_baseline() { let snapshot_text = match std::fs::read_to_string(&snapshot_path) { Ok(s) => s, Err(_) => { - // First run / missing snapshot — write it and skip the + // First run / missing snapshot, write it and skip the // diff check. Subsequent runs will assert against this // captured value. if let Some(parent) = snapshot_path.parent() { diff --git a/tests/real_world_tests.rs b/tests/real_world_tests.rs index 001e3f6e..ad132aeb 100644 --- a/tests/real_world_tests.rs +++ b/tests/real_world_tests.rs @@ -5,10 +5,10 @@ //! //! # Environment Variables //! -//! - `NYX_TEST_LANG=python` — run only fixtures for one language -//! - `NYX_TEST_FIXTURE=cmdi_subprocess` — run only fixtures whose name contains this string -//! - `NYX_TEST_VERBOSE=1` — print full diff details for every fixture -//! - `NYX_TEST_CATEGORY=taint` — run only one category (taint/cfg/state/mixed) +//! 
- `NYX_TEST_LANG=python` , run only fixtures for one language +//! - `NYX_TEST_FIXTURE=cmdi_subprocess`, run only fixtures whose name contains this string +//! - `NYX_TEST_VERBOSE=1` , print full diff details for every fixture +//! - `NYX_TEST_CATEGORY=taint` , run only one category (taint/cfg/state/mixed) //! //! # Known-failure handling //! @@ -47,7 +47,7 @@ struct RealWorldExpectations { /// /// Use this to lock in precision for fixtures whose expected set is /// exhaustive for a given rule family. Typical value: - /// `["taint-unsanitised-flow"]` — any extra taint flow is a + /// `["taint-unsanitised-flow"]`, any extra taint flow is a /// precision regression. AST-pattern families (`*.code_exec.*`, /// `*.quality.*`) are intentionally excluded by default since they /// fire syntactically and bystander triggers aren't precision drift. @@ -72,7 +72,7 @@ struct ExpectedFinding { #[serde(default = "default_must_match")] must_match: bool, /// If true, presence of a matching finding is a hard failure (regression guard). - /// Overrides `must_match`. Useful for locking in FP suppressions — sanitizer + /// Overrides `must_match`. Useful for locking in FP suppressions, sanitizer /// wrappers, gated sinks, field-aware absence, Layer-B suppressions, etc. #[serde(default)] must_not_match: bool, @@ -87,7 +87,7 @@ struct ExpectedFinding { notes: String, /// Optional per-expectation mode filter. When absent, the expectation /// applies in every mode listed at the fixture level. When present, - /// only the listed modes evaluate this expectation — useful when a + /// only the listed modes evaluate this expectation, useful when a /// finding is mode-specific (e.g. a taint flow only resolves in `full` /// mode while the fixture also runs in `ast` mode for AST-pattern /// coverage). @@ -96,7 +96,7 @@ struct ExpectedFinding { /// Upper bound on matching diags. 
When set, the count of diags that /// match this expectation's filters (rule_id / severity / line_range / /// evidence_contains) must not exceed this value. Composes with - /// `must_match: true` — a `must_match: true, max_count: 1` expectation + /// `must_match: true`, a `must_match: true, max_count: 1` expectation /// means "exactly one matching finding must exist". Mutually exclusive /// with `must_not_match: true`; the combination is rejected at parse /// time. @@ -257,7 +257,7 @@ struct MatchResult { count_violations: Vec<(ExpectedFinding, usize)>, unexpected: Vec, /// Subset of `unexpected` whose rule-id matched a `strict_unexpected` - /// prefix for this fixture — these cause hard failure. + /// prefix for this fixture, these cause hard failure. strict_unexpected: Vec, matched: usize, } diff --git a/tests/scc_convergence_tests.rs b/tests/scc_convergence_tests.rs index 1e3ff390..2d250728 100644 --- a/tests/scc_convergence_tests.rs +++ b/tests/scc_convergence_tests.rs @@ -1,13 +1,13 @@ //! Regression tests for SCC fixed-point convergence in pass 2. //! -//! Pass 2 uses Jacobi iteration — each file in a mutually-recursive SCC +//! Pass 2 uses Jacobi iteration, each file in a mutually-recursive SCC //! is re-analysed against the *pre-iteration* `GlobalSummaries` snapshot, //! and updates are only visible on the next iteration. In a cross-file //! SCC with `k` functions arranged in a chain, facts introduced at one //! end of the chain need up to `k` iterations to propagate back to the //! other end. //! -//! Before this test was written, the hard cap was 3 — so any SCC with +//! Before this test was written, the hard cap was 3, so any SCC with //! 4+ cross-file functions silently lost precision. These fixtures //! exercise a 4-cycle and assert both that the transitive finding is //! 
reported and that the engine actually needed more than 3 iterations @@ -35,7 +35,7 @@ fn fixture_path(name: &str) -> std::path::PathBuf { /// Serialize any test that mutates the global SCC fix-point cap override /// or reads `last_scc_max_iterations()`. The override is a process-wide -/// `AtomicUsize` and `cargo test` runs tests in parallel by default — +/// `AtomicUsize` and `cargo test` runs tests in parallel by default , /// without this guard, one test's override leaks into another's scan and /// both the iteration count and the findings tag shift non-deterministically. static SCC_TEST_GUARD: Mutex<()> = Mutex::new(()); @@ -44,7 +44,7 @@ static SCC_TEST_GUARD: Mutex<()> = Mutex::new(()); /// across four separate files, with the only sink in `step_d`. The /// `param_to_sink` fact has to travel back through three cross-file /// summary-update iterations before `step_a`'s summary reflects the -/// transitive flow — without that, the caller in `server.py` never +/// transitive flow, without that, the caller in `server.py` never /// sees the XSS/CMDI. /// /// With the old `MAX_SCC_FIXPOINT_ITERS = 3` this test's required @@ -61,7 +61,7 @@ fn scc_deep_cycle_requires_multi_iter_convergence() { validate_expectations(&diags, &dir); // Observability assertion: prove the SCC actually exercised more - // than three iterations — otherwise this fixture would pass even + // than three iterations, otherwise this fixture would pass even // under the old bound and give false confidence. // // The exact bound is tight: a 4-cycle needs at least 4 iterations @@ -80,7 +80,7 @@ fn scc_deep_cycle_requires_multi_iter_convergence() { ); } -/// Existing 3-file Python SCC — lighter smoke test, verifies the +/// Existing 3-file Python SCC, lighter smoke test, verifies the /// iteration count stays in a sensible range. If this starts requiring /// many iterations something regressed in summary extraction. 
#[test] @@ -96,7 +96,7 @@ fn scc_small_cycle_converges_quickly() { // recursion edges here, summary refinement should still converge in // a small multiple of the chain depth. Current behaviour is iters=0 // because the call graph topo-order resolves these files without - // needing an SCC fix-point loop at all — allow that too so this + // needing an SCC fix-point loop at all, allow that too so this // test does not become load-bearing on SCC detection. assert!( iters <= 4, @@ -124,7 +124,7 @@ fn scc_cap_hit_still_emits_tagged_low_confidence_findings() { // Force the SCC fix-point loop to bail after 3 iterations. The // 4-cycle fixture needs >=4 iterations to fully propagate taint, so // the 3rd iteration's diags do contain the transitive taint finding - // but convergence has not been detected — this is the exact cap-hit + // but convergence has not been detected, this is the exact cap-hit // scenario users would see in production on a larger SCC. set_scc_fixpoint_cap_override(3); let diags = scan_fixture_dir(&dir, AnalysisMode::Full); @@ -143,7 +143,7 @@ fn scc_cap_hit_still_emits_tagged_low_confidence_findings() { "expected cap-override (3) to bind the fix-point loop; got {iters} iterations" ); - // (a) Taint findings must still be emitted — truncation is not + // (a) Taint findings must still be emitted, truncation is not // silent drop. let taint: Vec<_> = diags .iter() @@ -161,7 +161,7 @@ fn scc_cap_hit_still_emits_tagged_low_confidence_findings() { // (b) At least one finding from the unconverged SCC batch carries // the tag. Tagging is scoped to diags produced by the SCC fix-point - // loop itself — findings from non-recursive batches or orphan files + // loop itself, findings from non-recursive batches or orphan files // that happen to flow through SCC-internal summaries are // intentionally not re-tagged (they came from a batch that did // converge, modulo the referenced summary). 
@@ -210,8 +210,8 @@ fn scc_cap_hit_still_emits_tagged_low_confidence_findings() { /// Phase-E3 / Phase-B: verify that the worklist reduces per-iteration /// work without changing the final output. We do this by running the -/// 16-cycle fixture twice — once through the normal pass-2 path, -/// which uses the worklist — and asserting (a) findings match and +/// 16-cycle fixture twice, once through the normal pass-2 path, +/// which uses the worklist, and asserting (a) findings match and /// (b) iteration count stays within the same bound as the 8-cycle. /// /// This test is load-bearing for Phase-B correctness: if the worklist @@ -354,7 +354,7 @@ fn scc_cap_hit_records_classified_reason() { .collect::>() ); - // The reason must be *something* other than Unknown — that's the + // The reason must be *something* other than Unknown, that's the // whole point of Phase-D classification. Any structured variant // proves the trajectory pipeline fired end-to-end. for (d, reason) in &tagged { diff --git a/tests/scc_cross_file_tests.rs b/tests/scc_cross_file_tests.rs index c8f62d9c..b49c4fd8 100644 --- a/tests/scc_cross_file_tests.rs +++ b/tests/scc_cross_file_tests.rs @@ -9,13 +9,13 @@ //! //! The assertions below lock down: //! -//! * Cross-file SCCs converge — the required finding surfaces at the +//! * Cross-file SCCs converge, the required finding surfaces at the //! caller. //! * Iteration counts stay in a modest, pinned range (proves the cycle //! actually exercised the SCC fix-point loop rather than resolving //! via topological order). //! * Sanitised cross-file cycles do not produce a finding at the caller -//! — the joint convergence carries the sanitizer fact back across the +//! , the joint convergence carries the sanitizer fact back across the //! cycle. mod common; @@ -52,7 +52,7 @@ fn two_file_mutual_recursion_reaches_transitive_sink() { validate_expectations(&diags, &dir); // The 2-cycle should converge in very few iterations. 
Allow 0 - // (no SCC loop needed — topo order already handled it) through 5 + // (no SCC loop needed, topo order already handled it) through 5 // (some monotone refinement churn). A higher number indicates the // fix-point loop is churning near the cap. let iters = last_scc_max_iterations(); @@ -65,7 +65,7 @@ fn two_file_mutual_recursion_reaches_transitive_sink() { /// Three-way cross-file cycle: `node_a::forward_a → node_b::forward_b → /// node_c::forward_c → node_a::forward_a`. All three files sit in the /// same SCC. With `SCC_FIXPOINT_SAFETY_CAP = 64` the cycle converges -/// easily, but the iteration count must stay bounded — this test pins +/// easily, but the iteration count must stay bounded, this test pins /// the convergence envelope. #[test] fn three_file_cross_file_cycle_converges_within_bound() { @@ -101,7 +101,7 @@ fn recursive_with_sanitiser_suppresses_finding_at_caller() { let dir = fixture_path("cross_file_scc_recursive_with_sanitiser"); let diags = scan_fixture_dir(&dir, AnalysisMode::Full); - // `expectations.json` forbids py.cmdi in driver.py — joint + // `expectations.json` forbids py.cmdi in driver.py, joint // convergence must carry the sanitizer across the cycle. validate_expectations(&diags, &dir); diff --git a/tests/ssa_equivalence_tests.rs b/tests/ssa_equivalence_tests.rs index 8b26a087..b7888475 100644 --- a/tests/ssa_equivalence_tests.rs +++ b/tests/ssa_equivalence_tests.rs @@ -7,33 +7,33 @@ //! multi-tier correctness signal. Each `#[test]` fn below verifies a //! distinct property: //! -//! * `ssa_structural_invariants_corpus` — every body in every real-world +//! * `ssa_structural_invariants_corpus`, every body in every real-world //! fixture lowers to well-formed SSA. Enforced via //! [`nyx_scanner::ssa::invariants::check_structural_invariants`]: //! single-assignment, pred/succ symmetry, terminator/succs agreement, //! phi arity and operand sources, value-def coverage, and reachability. //! -//! 
* `ssa_lowering_is_deterministic` — lowering the same CFG twice produces +//! * `ssa_lowering_is_deterministic`, lowering the same CFG twice produces //! structurally identical SSA (equal fingerprint). Catches any incoming //! non-determinism introduced by hashing or iteration order. //! -//! * `ssa_optimize_is_idempotent` — `optimize_ssa` reaches a fixpoint on +//! * `ssa_optimize_is_idempotent`, `optimize_ssa` reaches a fixpoint on //! the first run: re-running it must prune zero branches, eliminate //! zero copies, and remove zero dead defs, and must not change the body //! fingerprint. Catches optimiser bugs where a second pass would find //! new work (indicating the first pass failed to converge). //! -//! * `summary_extraction_is_deterministic` — extracting summaries from the +//! * `summary_extraction_is_deterministic`, extracting summaries from the //! same bytes twice yields the same `(FuncSummary, SsaFuncSummary)` //! sets, compared via stable JSON serialisation. Catches any //! non-determinism in summary construction or cross-file key ordering. //! -//! * `scan_is_stable_across_runs` — a full two-pass scan produces the same +//! * `scan_is_stable_across_runs`, a full two-pass scan produces the same //! diag list when invoked twice on the same input. Runs on a curated //! per-language fixture subset to keep wall time bounded; the other //! tiers already cover full-corpus behaviour. //! -//! * `ssa_corpus_does_not_panic` — the original smoke check, kept to lock +//! * `ssa_corpus_does_not_panic`, the original smoke check, kept to lock //! in termination on the full fixture matrix. //! //! 
Run with: `cargo test --test ssa_equivalence_tests` @@ -292,7 +292,7 @@ fn ssa_lowering_is_deterministic() { /// Stronger determinism check than Tier 2: for every body in the corpus /// that carries ≥ 2 phis (where phi ordering is the most likely culprit /// for hasher-driven non-determinism), lower the CFG ten times in a row -/// and assert every fingerprint matches the first — bit-for-bit, with no +/// and assert every fingerprint matches the first, bit-for-bit, with no /// sort tolerance. Runs are interleaved across fixtures so that /// process-wide hasher state between lowerings is as adversarial as we /// can make it without `PYTHONHASHSEED`-style seeding. @@ -378,7 +378,7 @@ fn ssa_optimize_is_idempotent() { continue; }; - // First optimisation pass — may do real work. + // First optimisation pass, may do real work. let _ = optimize_ssa(&mut ssa, &body.graph, Some(lang)); let fp_after_first = body_fingerprint(&ssa); @@ -472,7 +472,7 @@ fn summary_extraction_is_deterministic() { // SSA summaries: compare after sorting by key (order from the extractor // is expected-deterministic, but if two runs diverge only in order the - // test should still pass — what matters is the set identity). + // test should still pass, what matters is the set identity). let mut ssa_a_sorted = ssa_a; let mut ssa_b_sorted = ssa_b; ssa_a_sorted.sort_by(|a, b| format!("{:?}", a.0).cmp(&format!("{:?}", b.0))); @@ -541,7 +541,7 @@ fn scan_is_stable_across_runs() { for &name in SCAN_STABILITY_SUBSET { let Some(fixture) = by_name.get(name).copied() else { - // Not a hard failure — curated names may drift as the corpus + // Not a hard failure, curated names may drift as the corpus // evolves. Log but continue so this tier stays useful. 
if verbose() { eprintln!("scan_is_stable_across_runs: missing fixture {name}"); @@ -576,7 +576,7 @@ fn scan_is_stable_across_runs() { // ── Tier 6: SSA lowering coverage sanity ───────────────────────────────── /// Guards against a silent regression that would make `lower_to_ssa` -/// return empty / trivially-satisfying bodies — which would make every +/// return empty / trivially-satisfying bodies, which would make every /// invariant check pass vacuously. Enforces that the corpus produces /// non-trivial SSA: many blocks, many instructions, at least one phi /// somewhere, at least one loop (back edge), and at least one call. @@ -631,7 +631,7 @@ fn ssa_lowering_produces_non_trivial_bodies() { } } - // Thresholds are generous — they only catch gross regressions (e.g. a + // Thresholds are generous, they only catch gross regressions (e.g. a // lowering bug that silently produces single-block bodies with no body // instructions). Update if the corpus intentionally shrinks. assert!(bodies > 200, "expected >200 bodies, got {bodies}"); @@ -712,7 +712,7 @@ fn build_and_lower_all(path: &Path, cfg: &Config) -> usize { // Construct a synthetic SsaBody where a block carries `SsaOp::CatchParam` // but is neither reachable from entry via normal flow nor listed as a // target of any exception edge. The invariant must report the -// orphan — this is the CFG-construction-bug signal the invariant is +// orphan, this is the CFG-construction-bug signal the invariant is // designed to surface. // // The test stays on the pure-function `check_catch_block_reachability` @@ -732,8 +732,8 @@ fn orphan_catch_block_triggers_reachability_invariant() { let dummy_cfg = NodeIndex::new(0); - // Block 0: entry — does not reach block 1 via succs. - // Block 1: orphan — carries CatchParam, not listed in exception_edges. + // Block 0: entry, does not reach block 1 via succs. + // Block 1: orphan, carries CatchParam, not listed in exception_edges. 
let body = SsaBody { blocks: vec![ SsaBlock { @@ -766,7 +766,7 @@ fn orphan_catch_block_triggers_reachability_invariant() { block: BlockId(1), }], cfg_node_map: Default::default(), - exception_edges: vec![], // intentionally empty — the orphan condition, + exception_edges: vec![], // intentionally empty, the orphan condition, field_interner: nyx_scanner::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), }; diff --git a/tests/state_tests.rs b/tests/state_tests.rs index 58a898e1..0596f454 100644 --- a/tests/state_tests.rs +++ b/tests/state_tests.rs @@ -444,7 +444,7 @@ fn js_fs_use_after_close() { fn java_twr_no_false_leak() { // Java try-with-resources guarantees AutoCloseable.close() is called. // The managed_resource flag on the acquire node suppresses false leaks. - // The fixture also contains unsafeManual() which genuinely leaks — + // The fixture also contains unsafeManual() which genuinely leaks , // only verify that the TWR function's acquire (line 5) doesn't leak. let findings = state_diags_for("java_try_with_resources.java"); let twr_leaks: Vec<_> = findings @@ -459,7 +459,7 @@ fn java_twr_no_false_leak() { .map(|d| (&d.id, d.line)) .collect::>() ); - // unsafeManual (lines 10-13) is a genuine leak — verify it's detected + // unsafeManual (lines 10-13) is a genuine leak, verify it's detected assert!( findings .iter() @@ -493,7 +493,7 @@ fn java_db_connection_leak() { } // ═══════════════════════════════════════════════════════════════════════ -// (8c) Go resource lifecycle — defer +// (8c) Go resource lifecycle, defer // ═══════════════════════════════════════════════════════════════════════ #[test] @@ -505,18 +505,18 @@ fn go_defer_close_no_findings() { #[test] fn go_defer_missing_leak() { - // No close at all — should produce resource leak. + // No close at all, should produce resource leak. 
assert_has_prefix("go_defer_missing.go", "state-resource-leak"); } #[test] fn go_no_defer_manual_close_clean() { - // Manual close at end of function — no leak. + // Manual close at end of function, no leak. assert_no_state_findings("go_no_defer_manual_close.go"); } // ═══════════════════════════════════════════════════════════════════════ -// (9) Auth — unauthed access detection +// (9) Auth, unauthed access detection // ═══════════════════════════════════════════════════════════════════════ #[test] @@ -538,7 +538,7 @@ fn auth_not_a_handler_no_finding() { #[test] fn auth_negated_condition_does_not_elevate() { - // if (!is_authenticated) { exec(...) } — negated condition. + // if (!is_authenticated) { exec(...) }, negated condition. // True branch is the unauthenticated path; auth must NOT be elevated. assert_has("auth_negated_condition.js", "state-unauthed-access"); } @@ -569,7 +569,7 @@ fn auth_substring_in_condition_no_false_elevate() { #[test] fn rust_raii_file_no_leak() { - // File::open uses RAII drop — managed_resource suppresses leak. + // File::open uses RAII drop, managed_resource suppresses leak. assert_no_state_findings("rust_raii_file_no_leak.rs"); } @@ -581,19 +581,19 @@ fn rust_box_owned_no_leak() { #[test] fn rust_explicit_drop_no_leak() { - // drop(f) is an explicit release — no leak. + // drop(f) is an explicit release, no leak. assert_no_state_findings("rust_explicit_drop.rs"); } #[test] fn rust_unsafe_alloc_clean() { - // alloc + dealloc — properly paired, no findings. + // alloc + dealloc, properly paired, no findings. assert_no_state_findings("rust_unsafe_alloc_clean.rs"); } #[test] fn rust_unsafe_alloc_leak() { - // alloc without dealloc — NOT RAII-managed, leak expected. + // alloc without dealloc, NOT RAII-managed, leak expected. 
assert_has_prefix("rust_unsafe_alloc_leak.rs", "state-resource-leak"); } @@ -621,13 +621,13 @@ fn cpp_smart_ptr_no_leak() { #[test] fn cpp_smart_ptr_scope_exit() { - // make_unique with return — RAII cleanup at scope exit. + // make_unique with return, RAII cleanup at scope exit. assert_no_state_findings("cpp_smart_ptr_scope_exit.cpp"); } #[test] fn cpp_unique_ptr_from_raw() { - // unique_ptr(new int(42)) — the constructor wraps a raw new. + // unique_ptr(new int(42)), the constructor wraps a raw new. // The unique_ptr constructor is not a tracked acquire, so no leak // from the outer call. The inner `new` might or might not be visible // depending on callee extraction depth. At minimum: no false alarm. @@ -648,7 +648,7 @@ fn cpp_unique_ptr_from_raw() { #[test] fn cpp_alias_before_delete() { - // p = new; q = p; delete q — tests ownership transfer semantics. + // p = new; q = p; delete q, tests ownership transfer semantics. // The assignment transfer moves lifecycle from p to q. // After delete q, the resource is closed. // At exit: q = CLOSED, p = MOVED → no leak. @@ -820,13 +820,13 @@ fn resource_as_function_arg_still_leaks() { #[test] fn resource_returned_from_factory() { - // Factory function: fopen result is returned to caller — not a leak. + // Factory function: fopen result is returned to caller, not a leak. assert_no_state_findings("resource_returned.c"); } #[test] fn returned_on_one_path_leaked_on_another() { - // Resource returned on one branch, leaked on another — still a finding. + // Resource returned on one branch, leaked on another, still a finding. assert_has( "returned_on_one_path_leaked_on_another.c", "state-resource-leak-possible", @@ -835,13 +835,13 @@ fn returned_on_one_path_leaked_on_another() { #[test] fn returned_on_all_success_paths() { - // Resource returned on all exit paths — no finding. + // Resource returned on all exit paths, no finding. 
assert_no_state_findings("returned_on_all_success_paths.c"); } #[test] fn return_null_after_open_without_close() { - // Opens resource then returns NULL — definite leak. + // Opens resource then returns NULL, definite leak. assert_has( "return_null_after_open_without_close.c", "state-resource-leak", @@ -850,13 +850,13 @@ fn return_null_after_open_without_close() { #[test] fn factory_leak_not_returned() { - // Opens resource, returns integer — resource leaked. + // Opens resource, returns integer, resource leaked. assert_has_prefix("factory_leak_not_returned.c", "state-resource-leak"); } #[test] fn loop_reopen_clean() { - // Each loop iteration opens and closes the file — clean at exit. + // Each loop iteration opens and closes the file, clean at exit. assert_no_state_findings("loop_reopen.c"); } @@ -935,13 +935,13 @@ fn ruby_tempfile_leak() { #[test] fn auth_false_positive_token() { - // generateToken() is NOT an auth check — finding should fire + // generateToken() is NOT an auth check, finding should fire assert_has("auth_false_positive_token.js", "state-unauthed-access"); } #[test] fn auth_decode_token_not_auth() { - // decodeToken() parses but does not enforce auth — finding should fire + // decodeToken() parses but does not enforce auth, finding should fire assert_has("auth_decode_token_not_auth.js", "state-unauthed-access"); } @@ -965,7 +965,7 @@ fn auth_require_role_protected() { #[test] fn auth_decorator_python_login_required() { - // @login_required seeds AuthLevel::Authed at function entry — the + // @login_required seeds AuthLevel::Authed at function entry, the // privileged sink inside should not trip state-unauthed-access. 
assert_absent( "auth_decorator_python_login_required.py", @@ -990,21 +990,21 @@ fn auth_decorator_python_admin_suppresses() { #[test] fn auth_decorator_python_non_auth_still_fires() { - // @app.route / @functools.lru_cache are NOT auth markers — the finding + // @app.route / @functools.lru_cache are NOT auth markers, the finding // must still fire on the unprotected handler. assert_has("auth_decorator_python_non_auth.py", "state-unauthed-access"); } #[test] fn auth_decorator_js_use_guards() { - // @UseGuards(AuthGuard) — AuthGuard in the argument list is the real + // @UseGuards(AuthGuard), AuthGuard in the argument list is the real // auth marker, matched by JS_AUTH matchers. assert_absent("auth_decorator_js_use_guards.ts", "state-unauthed-access"); } #[test] fn auth_decorator_js_non_auth_still_fires() { - // @Injectable / @Get are routing/DI decorators — not auth markers. + // @Injectable / @Get are routing/DI decorators, not auth markers. assert_has("auth_decorator_js_non_auth.ts", "state-unauthed-access"); } @@ -1039,7 +1039,7 @@ fn auth_decorator_ruby_no_filter_still_fires() { #[test] fn auth_decorator_ruby_before_action_only_excludes() { - // `before_action :auth, only: [:create]` — method `index` is NOT in the + // `before_action :auth, only: [:create]`, method `index` is NOT in the // only-list, so the filter does not apply and the sink fires. assert_has( "auth_decorator_ruby_before_action_only.rb", @@ -1049,7 +1049,7 @@ fn auth_decorator_ruby_before_action_only_excludes() { #[test] fn auth_decorator_ruby_before_action_only_includes() { - // Same filter — method `create` IS in the only-list, so the filter + // Same filter, method `create` IS in the only-list, so the filter // applies and the sink does not fire. 
assert_absent( "auth_decorator_ruby_before_action_only_match.rb", @@ -1059,7 +1059,7 @@ fn auth_decorator_ruby_before_action_only_includes() { #[test] fn auth_decorator_ruby_before_action_except_fires() { - // `before_action :auth, except: [:index]` — method `index` IS in the + // `before_action :auth, except: [:index]`, method `index` IS in the // except-list, so the filter is skipped and the sink fires. assert_has( "auth_decorator_ruby_before_action_except.rb", @@ -1069,7 +1069,7 @@ fn auth_decorator_ruby_before_action_except_fires() { #[test] fn auth_decorator_ruby_before_action_except_absent() { - // Same filter — method `create` is NOT in the except-list, so the filter + // Same filter, method `create` is NOT in the except-list, so the filter // applies and the sink does not fire. assert_absent( "auth_decorator_ruby_before_action_except_other.rb", diff --git a/tests/symex_switch_tests.rs b/tests/symex_switch_tests.rs index 71f62a2c..15617645 100644 --- a/tests/symex_switch_tests.rs +++ b/tests/symex_switch_tests.rs @@ -52,7 +52,7 @@ fn count_relevant(diags: &[Diag]) -> usize { /// hard regression. The exact finding count is left loose because /// per-case suppression precision depends on whether the constraint /// solver can refine the scrutinee (integer literals do, enum paths -/// do not — see `match_suppresses_safe_arm.rs`). +/// do not, see `match_suppresses_safe_arm.rs`). fn assert_at_least_one_finding(diags: &[Diag], label: &str) { let n = count_relevant(diags); assert!( @@ -74,7 +74,7 @@ fn symex_match_suppresses_safe_arm() { // tests/fixtures/real_world/rust/cfg/match_arms.rs which also only // emits quality findings, not taint). The acceptance for this // fixture is therefore: (1) the scan runs to completion without a - // panic — covered by the call to `scan_isolated` returning — and + // panic, covered by the call to `scan_isolated` returning, and // (2) at least one finding lands on the Raw arm body (lines // 22-29). 
The Safe arm at lines 31-36 must not regress beyond the // existing baseline. diff --git a/tests/symex_transform_tests.rs b/tests/symex_transform_tests.rs index a2f174a4..bc574671 100644 --- a/tests/symex_transform_tests.rs +++ b/tests/symex_transform_tests.rs @@ -1,4 +1,4 @@ -//! Symex encoding/decoding transform classification — Java / Go / Ruby. +//! Symex encoding/decoding transform classification, Java / Go / Ruby. //! //! Each fixture sets up a tainted source flowing through a known //! escape/encode helper into a sink whose vulnerability class is *not* @@ -11,7 +11,7 @@ //! The acceptance check is per-language: at least one taint diagnostic //! lands, and at least one such diagnostic carries an //! `evidence.symbolic.witness` string mentioning the transform's -//! display name (`urlEncode`, `htmlEscape`, etc.) — proving the new +//! display name (`urlEncode`, `htmlEscape`, etc.), proving the new //! Java/Go/Ruby classifiers in `src/symex/strings.rs` are wired through //! to witness generation. @@ -40,9 +40,9 @@ fn scan_isolated(fixture: &Path) -> Vec { /// Find a taint finding whose symex witness contains *any* of the given /// token alternatives. Either the transform display name (e.g. -/// `urlEncode`) appears verbatim — produced by the +/// `urlEncode`) appears verbatim, produced by the /// `detect_transform_mismatch` annotation when the symex value tree still -/// carries a tainted symbol — or the witness has been concrete-folded +/// carries a tainted symbol, or the witness has been concrete-folded /// through `encode_concrete_for_witness`, in which case the encoded /// artifact (e.g. a percent-escape) appears in place of the original /// characters. 
Both prove the new transform classifier is wired through @@ -99,7 +99,7 @@ fn assert_renderable_witness(diags: &[Diag], lang: &str, tokens: &[&str]) { // value tree carries a tainted symbol with the wrong-class encode // - a percent-escape appears when `evaluate_concrete` folded // `Encode(UrlEncode, …)` through `encode_concrete_for_witness` -// The raw callee name is intentionally NOT accepted — it would appear +// The raw callee name is intentionally NOT accepted, it would appear // even in the Display fallback when the classifier fails, making the // assertion meaningless. diff --git a/tests/topo_pass2_refinement_tests.rs b/tests/topo_pass2_refinement_tests.rs index 10506e05..872a6064 100644 --- a/tests/topo_pass2_refinement_tests.rs +++ b/tests/topo_pass2_refinement_tests.rs @@ -5,7 +5,7 @@ //! `scc_file_batches_with_metadata`). Before this wiring landed, the //! non-recursive batch path called `run_rules_on_file`, which discards //! refined SSA / body / auth artifacts. Caller-most batches (run -//! later in topo order) saw only pass-1 summaries — the refined cross- +//! later in topo order) saw only pass-1 summaries, the refined cross- //! file context produced by callee batches in pass 2 was lost. //! //! These tests pin the new contract: @@ -17,7 +17,7 @@ //! 3. The opt-out env var `NYX_TOPO_REFINE=0` restores the legacy //! `run_rules_on_file` path with no behavioural regression on //! required findings. -//! 4. The fixture's expectations.json is met under both modes — +//! 4. The fixture's expectations.json is met under both modes , //! proving that refinement is a precision-positive optimisation //! and not a soundness change. 
@@ -76,7 +76,7 @@ impl Drop for EnvScope { } // ───────────────────────────────────────────────────────────────────── -// D1 — Refinement is enabled by default and is observable +// D1, Refinement is enabled by default and is observable // ───────────────────────────────────────────────────────────────────── /// On a 2-file linear-chain fixture (caller → callee, no recursion), @@ -115,13 +115,13 @@ fn nonrecursive_batches_persist_refinements_by_default() { } // ───────────────────────────────────────────────────────────────────── -// D2 — Opt-out via NYX_TOPO_REFINE=0 restores legacy behaviour +// D2, Opt-out via NYX_TOPO_REFINE=0 restores legacy behaviour // ───────────────────────────────────────────────────────────────────── /// With `NYX_TOPO_REFINE=0`, the legacy non-recursive branch runs: /// `run_rules_on_file` is called and refined artifacts are NOT /// persisted, so the observability counter stays at zero. The fixture's -/// required findings must STILL be detected — confirming that the +/// required findings must STILL be detected, confirming that the /// refinement is precision-positive but not soundness-load-bearing. #[test] fn nonrecursive_batches_legacy_path_when_disabled() { @@ -142,7 +142,7 @@ fn nonrecursive_batches_legacy_path_when_disabled() { } // ───────────────────────────────────────────────────────────────────── -// D3 — Refinement does not regress findings vs the legacy path +// D3, Refinement does not regress findings vs the legacy path // ───────────────────────────────────────────────────────────────────── /// Run the same fixture twice (refine on / off) and assert the set of @@ -150,7 +150,7 @@ fn nonrecursive_batches_legacy_path_when_disabled() { /// the refine-on set is a *superset* of the legacy set; in practice /// the fixtures exercised here are small enough that the two should be /// equal. This test guards against the regression where refinement -/// silently *loses* findings — e.g. 
a refined summary masking a real +/// silently *loses* findings, e.g. a refined summary masking a real /// finding via accidental sanitiser inference. #[test] fn refinement_does_not_lose_required_findings_vs_legacy() { @@ -184,7 +184,7 @@ fn refinement_does_not_lose_required_findings_vs_legacy() { } // ───────────────────────────────────────────────────────────────────── -// D4 — Counter resets between scans +// D4, Counter resets between scans // ───────────────────────────────────────────────────────────────────── /// `last_topo_nonrecursive_refinements()` is reset to zero at the @@ -204,7 +204,7 @@ fn refinements_counter_resets_per_scan() { assert!(first > 0, "first scan must record refinements, got {first}"); // Second scan on the same fixture. Counter must reset to first - // scan's value (or close to it — the fixture is deterministic so + // scan's value (or close to it, the fixture is deterministic so // it should match), NOT accumulate to ~2 × first. let _ = scan_fixture_dir(&dir, AnalysisMode::Full); let second = last_topo_nonrecursive_refinements(); diff --git a/tests/typed_callgraph_audit.rs b/tests/typed_callgraph_audit.rs index af2a6213..34a604ae 100644 --- a/tests/typed_callgraph_audit.rs +++ b/tests/typed_callgraph_audit.rs @@ -5,7 +5,7 @@ //! The lower-level Phase 1 / 2 / 3 unit tests under //! `src/callgraph.rs::tests` and `src/taint/tests.rs` already prove the //! per-module API behaviour. These tests pin the *integration* -//! invariants — that the pipeline as a whole still produces the right +//! invariants, that the pipeline as a whole still produces the right //! `typed_call_receivers` entries on real source code, that the call //! graph picks the receiver-typed candidate at edge-insertion time, //! 
and that today's behaviour is preserved on every negative / @@ -39,7 +39,7 @@ struct File<'a> { /// /// The caller is responsible for picking absolute paths whose `Path` /// representation matches the namespace it expects on the resulting -/// [`FuncKey`]s — `extract_all_summaries_from_bytes` writes the raw +/// [`FuncKey`]s, `extract_all_summaries_from_bytes` writes the raw /// `path` into `FuncSummary::file_path` which then flows through to /// `FuncKey::namespace` after `merge_summaries`. fn pipeline_global_summaries(files: &[File<'_>]) -> GlobalSummaries { @@ -77,7 +77,7 @@ fn find_ssa<'a>( } // ───────────────────────────────────────────────────────────────────── -// A.2.1 — End-to-end pipeline test +// A.2.1, End-to-end pipeline test // ───────────────────────────────────────────────────────────────────── /// Pipeline test: Java caller invokes a method on a constructor-typed @@ -87,7 +87,7 @@ fn find_ssa<'a>( /// the typed receiver to `FileHandle::close`, not the same-name /// `Cache::close` overload). /// -/// **Audit gap A.2.1.G1 — closed 2026-04-26.** Previously, the SSA +/// **Audit gap A.2.1.G1, closed 2026-04-26.** Previously, the SSA /// summary extractor leaked synthetic external-capture `Param` ops /// into the summary's parameter-index references, so its FuncKey /// disambig got synthesised away from the matching FuncSummary @@ -203,7 +203,7 @@ class Cache { } // ───────────────────────────────────────────────────────────────────── -// A.2.4 — SQLite round-trip + rescan-cache parity +// A.2.4, SQLite round-trip + rescan-cache parity // ───────────────────────────────────────────────────────────────────── /// SQLite round-trip test for `typed_call_receivers`: an SSA summary @@ -333,7 +333,7 @@ class P { ); } -/// P-2: Java `HttpClient.newHttpClient(); c.send(...)` — typed receiver +/// P-2: Java `HttpClient.newHttpClient(); c.send(...)`, typed receiver /// `HttpClient`. This is the canonical Phase-10 type-inference shape. 
#[test] fn audit_p2_java_http_client_typed_receiver() { @@ -362,7 +362,7 @@ class P { ); } -/// P-3: Python `c = sqlite3.connect(...); c.execute(...)` — typed +/// P-3: Python `c = sqlite3.connect(...); c.execute(...)`, typed /// receiver `DatabaseConnection`. #[test] fn audit_p3_python_sqlite_connection_typed_receiver() { @@ -395,7 +395,7 @@ def use(): // A.3 negatives // ───────────────────────────────────────────────────────────────────── -/// N-1: a free-function call (no receiver — `new FileInputStream(...)` +/// N-1: a free-function call (no receiver, `new FileInputStream(...)` /// with no method-call follow-up) must not surface in /// `typed_call_receivers`. Even if the constructor produces a known /// type, the SSA Call carries `receiver: None` and the devirtualisation @@ -422,7 +422,7 @@ class P { ); } -/// N-3: Receiver type known but no matching container method — +/// N-3: Receiver type known but no matching container method, /// devirtualisation must NOT silently drop the edge. Today's /// name-only resolution still fires and finds the target. This is /// the receiver-misclassification fall-through invariant from @@ -444,7 +444,7 @@ fn audit_n3_zero_match_falls_through_to_today() { }; // Single `process` on `Worker`. Caller's typed_call_receivers - // says "Other" — there is no such container, so the typed lookup + // says "Other", there is no such container, so the typed lookup // misses and we fall through to today's name-only resolution. let worker = make("process", "Worker", "src/worker.rs", 1); let caller = FuncSummary { @@ -521,7 +521,7 @@ class P { /// R-3: Without a typed receiver entry, an ambiguous unqualified call /// must remain ambiguous (no edge added). Pin: devirtualisation is -/// strictly additive — it never resolves edges that today's pipeline +/// strictly additive, it never resolves edges that today's pipeline /// considers ambiguous unless real type info is present.
#[test] fn audit_r3_ambiguous_without_typed_receiver_stays_ambiguous() { @@ -537,7 +537,7 @@ fn audit_r3_ambiguous_without_typed_receiver_stays_ambiguous() { let send_http = make("send", "src/http.rs"); let send_mail = make("send", "src/mail.rs"); - // Caller in a third file calls bare `send` — genuinely ambiguous. + // Caller in a third file calls bare `send`, genuinely ambiguous. let caller = FuncSummary { name: "go".into(), file_path: "src/main.rs".into(), @@ -571,7 +571,7 @@ fn audit_r3_ambiguous_without_typed_receiver_stays_ambiguous() { /// R-4: Arity overloads on the same container. When the typed /// receiver picks a container that hosts two arity-overloaded /// methods, the per-call-site `arity` filter must still pick the -/// right one — devirtualisation does not bypass arity narrowing. +/// right one, devirtualisation does not bypass arity narrowing. #[test] fn audit_r4_arity_filter_still_applies_after_devirt() { use nyx_scanner::summary::{CalleeSite, FuncSummary, merge_summaries}; diff --git a/tests/typed_extractors_audit.rs b/tests/typed_extractors_audit.rs index 2f758be7..93c4f75b 100644 --- a/tests/typed_extractors_audit.rs +++ b/tests/typed_extractors_audit.rs @@ -2,19 +2,19 @@ //! //! These tests directly drive the `cfg::params` matchers via the //! tree-sitter parser without spinning up the full scan pipeline. The -//! goal is to pin the matcher invariants — what qualifies as a typed -//! extractor, what does not — independent of which framework rules are +//! goal is to pin the matcher invariants, what qualifies as a typed +//! extractor, what does not, independent of which framework rules are //! loaded at scan time. //! //! Three audit dimensions are covered: -//! * **A1** — end-to-end wiring: classifier returns the expected +//! * **A1**, end-to-end wiring: classifier returns the expected //! `TypeKind` for each framework's canonical typed-extractor shape //! (Spring `@PathVariable`, NestJS `@Param`, Axum `Path`, //! 
FastAPI `Annotated[..., Path()]`). -//! * **A2** — Hard-Rule-3 negatives: bare primitives and +//! * **A2**, Hard-Rule-3 negatives: bare primitives and //! non-framework annotations / decorators / wrappers must NOT //! classify. -//! * **A5** — parser-driven matcher tests: every assertion is +//! * **A5**, parser-driven matcher tests: every assertion is //! produced from a real parsed AST so a future tree-sitter grammar //! bump can't silently break the matcher without flipping a test. @@ -36,7 +36,7 @@ fn parse(lang: &str, src: &str) -> tree_sitter::Tree { } /// Find the first function-like node in the tree whose `kind()` matches -/// `func_kind`. Returns `None` when none exists — parser fragility +/// `func_kind`. Returns `None` when none exists, parser fragility /// guard so failures surface as a panic in the test instead of a /// silent skip. fn first_node_of_kind<'a>( @@ -69,7 +69,7 @@ fn extract(lang: &str, src: &str, func_kinds: &[&str]) -> Vec<(String, Option` whose `UpdateDoc` @@ -290,7 +290,7 @@ fn rust_json_dto_returns_none_when_struct_missing_from_file() { assert_eq!(params[0].1, None); } -/// Phase 6 — DtoFields exposes a stable accessor surface for the +/// Phase 6, DtoFields exposes a stable accessor surface for the /// downstream auth analysis and type-fact engine. Pin the contract so /// future changes don't break that consumer. #[test] @@ -301,7 +301,7 @@ fn dto_fields_struct_api_is_stable() { assert_eq!(dto.class_name, "CreateUser"); assert_eq!(dto.get("age"), Some(&TypeKind::Int)); assert_eq!(dto.get("missing"), None); - // BTreeMap iteration order is sorted by key — stable + // BTreeMap iteration order is sorted by key, stable // serialisation invariant. 
let keys: Vec<_> = dto.fields.keys().cloned().collect(); assert_eq!(keys, vec!["age".to_string(), "email".to_string()]); @@ -314,7 +314,7 @@ fn dto_fields_struct_api_is_stable() { /// Audit A4: when two functions in the same file have parameters with /// the same name but different types (Spring `@PathVariable Long id` /// in one method, `@RequestParam String id` in another), the -/// per-body matcher must classify each correctly — the merger +/// per-body matcher must classify each correctly, the merger /// (`collect_file_var_types`) drops the entry when they conflict but /// the per-body classification stays right. This pins the matcher's /// per-body grain. @@ -379,7 +379,7 @@ fn java_path_variable_does_not_lift_annotation_into_param_names() { } "#; let params = extract("java", src, &["method_declaration"]); - // The collected param name is exactly "userId" — `PathVariable` + // The collected param name is exactly "userId", `PathVariable` // (the annotation token) must not become a param entry, otherwise // `apply_typed_bounded_params` would try to look it up. assert!(params.iter().all(|(name, _)| name != "PathVariable"));