Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

1
.gitignore vendored
View file

@ -5,6 +5,7 @@
/.idea
/frontend/node_modules
/src/server/assets/dist
/marketing
/.nyx
/logs
/book

View file

@ -33,6 +33,13 @@ pkg-url = "{ repo }/releases/download/v{ version }/nyx-{ target }{ archive-suffi
pkg-fmt = "zip"
bin-dir = "target/{ target }/release/{ bin }{ binary-ext }"
# docs.rs builds the `serve` feature (default) so the server module renders.
# `smt` is left off — bundled Z3 takes too long on docs.rs builders, and
# `smt-system-z3` needs a system library that isn't available there.
[package.metadata.docs.rs]
features = ["serve"]
rustdoc-args = ["--cfg", "docsrs"]
[features]
default = ["serve"]
serve = ["dep:axum", "dep:tokio", "dep:tokio-stream", "dep:tower-http"]

View file

@ -152,6 +152,12 @@ The corpus also holds a small set of vulnerable/patched pairs extracted from pub
Fixtures live under [`tests/benchmark/cve_corpus/`](tests/benchmark/cve_corpus/) with upstream attribution headers.
<!--
### Real-world findings
- **Nextcloud server**, [PR #59979](https://github.com/nextcloud/server/pull/59979), merged. The runtime decoder for this column already restricted `allowed_classes`, but the repair routine called `unserialize()` without it, so magic methods on referenced classes could still run. Fix matches the runtime path.
-->
---
## How it works

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 MiB

After

Width:  |  Height:  |  Size: 15 MiB

Before After
Before After

295
build.rs
View file

@ -1,7 +1,9 @@
use std::path::Path;
use std::path::{Path, PathBuf};
use std::process::Command;
fn main() {
render_docs_for_rustdoc();
// Only relevant when the serve feature is active
if std::env::var("CARGO_FEATURE_SERVE").is_err() {
return;
@ -14,11 +16,11 @@ fn main() {
println!("cargo:rerun-if-changed=src/server/assets/dist/index.html");
if index_html.exists() {
// Dist already built nothing to do
// Dist already built, nothing to do
return;
}
// Dist missing try to build frontend
// Dist missing, try to build frontend
let frontend_dir = Path::new("frontend");
if !frontend_dir.join("package.json").exists() {
emit_placeholder_and_warn(dist_dir);
@ -56,6 +58,293 @@ fn main() {
}
}
// ---------------------------------------------------------------------------
// Rustdoc / docs.rs: render docs/*.md into $OUT_DIR with relative .md links
// rewritten to absolute github.com/elicpeter/nyx URLs so they resolve when the
// markdown is embedded in rustdoc via #![doc = include_str!(...)].
//
// Source of truth stays in docs/. Files that don't exist (published-crate
// builds where docs/ wasn't packaged) fall back to a one-line stub so rustdoc
// still compiles.
// ---------------------------------------------------------------------------
/// Absolute GitHub URL of the `docs/` tree. Rewritten relative `.md` links and
/// the fallback stub for missing doc files are both built on top of it.
const GH_DOCS_BASE: &str = "https://github.com/elicpeter/nyx/blob/master/docs";
/// One markdown source file under `docs/` paired with the filename it is
/// rendered to inside `$OUT_DIR`.
struct DocSpec {
/// Path under docs/, e.g. "how-it-works.md" or "detectors/taint.md".
src: &'static str,
/// Output filename in $OUT_DIR.
out: &'static str,
}
/// Every doc page rendered by `render_docs_for_rustdoc`: each entry maps a
/// source path under `docs/` to the output filename written into `$OUT_DIR`.
const DOC_SPECS: &[DocSpec] = &[
DocSpec {
src: "how-it-works.md",
out: "lib_intro.md",
},
DocSpec {
src: "detectors/taint.md",
out: "taint.md",
},
DocSpec {
src: "detectors/cfg.md",
out: "cfg_analysis.md",
},
DocSpec {
src: "detectors/state.md",
out: "state.md",
},
DocSpec {
src: "detectors/patterns.md",
out: "patterns.md",
},
DocSpec {
src: "auth.md",
out: "auth_analysis.md",
},
];
/// Renders every entry of `DOC_SPECS` into `$OUT_DIR`.
///
/// Does nothing when `OUT_DIR` is not set. For each spec it registers the
/// source file with cargo's change tracking, then writes either the
/// link-rewritten markdown or, when the source cannot be read, a one-line stub
/// pointing at the file on GitHub. A failed write is reported as a cargo
/// warning rather than aborting the build.
fn render_docs_for_rustdoc() {
    let out_dir = match std::env::var("OUT_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => return,
    };

    for spec in DOC_SPECS {
        let source = Path::new("docs").join(spec.src);
        println!("cargo:rerun-if-changed=docs/{}", spec.src);
        let target = out_dir.join(spec.out);

        // Unreadable source (e.g. docs/ not packaged): fall back to a stub link.
        let rendered = std::fs::read_to_string(&source)
            .map(|raw| rewrite_doc_links(&raw, spec.src))
            .unwrap_or_else(|_| {
                format!(
                    "See [`{base}/{src}`]({base}/{src}).\n",
                    base = GH_DOCS_BASE,
                    src = spec.src,
                )
            });

        if let Err(err) = std::fs::write(&target, rendered) {
            println!(
                "cargo:warning=failed to write rendered doc {}: {}",
                target.display(),
                err
            );
        }
    }
}
/// Prepares a markdown document for inclusion in rustdoc.
///
/// Three transformations are applied, line by line:
///
/// * Outside code fences, relative `.md` links (inline `](path.md)` /
///   `](path.md#anchor)` and reference definitions `[id]: path.md`) are
///   rewritten to absolute github.com URLs. Absolute URLs, pure `#anchor`
///   links and non-`.md` targets are left untouched.
/// * An opening fence with no info string becomes a `text` fence so rustdoc
///   does not treat its contents as Rust.
/// * An opening `rust` fence that does not already opt out of doctest
///   execution becomes `rust,ignore`.
///
/// `source_rel` is the document's path relative to `docs/`; its parent
/// directory is the base against which relative links are resolved. Lines
/// inside a fence are copied through verbatim.
fn rewrite_doc_links(content: &str, source_rel: &str) -> String {
    let source_dir = match Path::new(source_rel).parent() {
        Some(dir) => dir.to_string_lossy().into_owned(),
        None => String::new(),
    };
    let mut rendered = String::with_capacity(content.len() + 256);
    let mut inside_fence = false;

    for raw_line in content.split_inclusive('\n') {
        // Separate the line text from its (optional) trailing newline.
        let (text, newline) = match raw_line.strip_suffix('\n') {
            Some(stripped) => (stripped, "\n"),
            None => (raw_line, ""),
        };
        let unindented = text.trim_start();

        // Ordinary line: verbatim inside a fence, link-rewritten outside.
        if !unindented.starts_with("```") {
            if inside_fence {
                rendered.push_str(raw_line);
            } else {
                rewrite_links_in_line(text, &source_dir, &mut rendered);
                rendered.push_str(newline);
            }
            continue;
        }

        // Fence delimiter: every delimiter toggles the fence state.
        let closing = inside_fence;
        inside_fence = !inside_fence;
        if closing {
            rendered.push_str(raw_line);
            continue;
        }

        // Opening fence: decide whether the info string must be replaced.
        let info = unindented.trim_start_matches('`').trim();
        let replacement = if info.is_empty() {
            Some("```text")
        } else if is_rust_fence_needing_ignore(info) {
            Some("```rust,ignore")
        } else {
            None
        };

        match replacement {
            Some(fence) => {
                // Keep the original leading indentation in front of the new fence.
                let indent = &text[..text.len() - unindented.len()];
                rendered.push_str(indent);
                rendered.push_str(fence);
                rendered.push_str(newline);
            }
            None => rendered.push_str(raw_line),
        }
    }

    rendered
}
/// Copies `line` into `out`, passing every link target through
/// `maybe_rewrite_url`.
///
/// Two link shapes are recognised:
/// * inline links, `](URL)`: the URL runs up to the next `)` (or end of line);
/// * reference definitions, `]: URL`: the URL runs up to the next space (or
///   end of line).
///
/// Everything else is copied through one character at a time. `line` must not
/// include the trailing newline; the caller re-appends it.
fn rewrite_links_in_line(line: &str, source_dir: &str, out: &mut String) {
    // `rest` is the not-yet-copied suffix of the line; it always starts on a
    // character boundary because we only ever advance by whole characters or
    // past ASCII delimiters.
    let mut rest = line;
    while let Some(ch) = rest.chars().next() {
        if let Some(after) = rest.strip_prefix("](") {
            let url_len = after.find(')').unwrap_or(after.len());
            let (url, tail) = after.split_at(url_len);
            out.push_str("](");
            out.push_str(&maybe_rewrite_url(url, source_dir));
            // The closing `)` (if any) is copied by the next iteration.
            rest = tail;
        } else if let Some(after) = rest.strip_prefix("]: ") {
            let url_len = after.find(' ').unwrap_or(after.len());
            let (url, tail) = after.split_at(url_len);
            out.push_str("]: ");
            out.push_str(&maybe_rewrite_url(url, source_dir));
            rest = tail;
        } else {
            out.push(ch);
            rest = &rest[ch.len_utf8()..];
        }
    }
}
/// Decides whether a fence info string should be rewritten to `rust,ignore`.
///
/// Returns `true` when the first comma-separated tag is `rust` (ASCII
/// case-insensitive) and none of the remaining tags is `ignore`, `no_run`,
/// `compile_fail` or `should_panic`. The snippets in docs/ are illustrative
/// rather than standalone-compilable, so anything not already opted out gets
/// marked as ignored.
fn is_rust_fence_needing_ignore(lang: &str) -> bool {
    const OPT_OUT_TAGS: [&str; 4] = ["ignore", "no_run", "compile_fail", "should_panic"];

    let mut tags = lang.split(',').map(str::trim);
    match tags.next() {
        Some(head) if head.eq_ignore_ascii_case("rust") => !tags.any(|tag| {
            OPT_OUT_TAGS
                .iter()
                .any(|opt_out| tag.eq_ignore_ascii_case(opt_out))
        }),
        _ => false,
    }
}
/// Length in bytes of the UTF-8 sequence introduced by `lead`.
///
/// ASCII bytes and stray continuation bytes (everything below `0xC0`) count
/// as a single byte so that a scanner always makes forward progress.
fn utf8_seq_len(lead: u8) -> usize {
    match lead {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
/// Rewrites a relative markdown link target to an absolute GitHub URL.
///
/// The URL is returned unchanged when it is empty, already has a scheme
/// (`https://`, `mailto:`, ...), is a pure `#anchor`, or does not point at a
/// `.md` file. Otherwise the path part is resolved against `source_dir` (the
/// linking document's directory relative to `docs/`), normalised, and
/// appended to `GH_DOCS_BASE`; any `#anchor` suffix is preserved.
fn maybe_rewrite_url(url: &str, source_dir: &str) -> String {
    // Nothing to resolve: empty, absolute, or same-page anchor.
    if url.is_empty() || has_scheme(url) || url.starts_with('#') {
        return url.to_owned();
    }

    // Peel the optional `#anchor` off the path; `anchor` keeps its leading `#`.
    let anchor_at = url.find('#').unwrap_or(url.len());
    let (path, anchor) = url.split_at(anchor_at);

    // Only markdown targets are rewritten.
    if !path.ends_with(".md") {
        return url.to_owned();
    }

    let joined = match source_dir {
        "" => path.to_owned(),
        dir => format!("{}/{}", dir, path),
    };
    format!("{}/{}{}", GH_DOCS_BASE, normalise_path(&joined), anchor)
}
/// Reports whether `url` begins with a URI scheme.
///
/// Follows RFC 3986: `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":"`.
/// The text before the first `:` must start with an ASCII letter and contain
/// only the permitted characters; a URL without any `:` has no scheme.
fn has_scheme(url: &str) -> bool {
    let Some((candidate, _)) = url.split_once(':') else {
        return false;
    };
    let mut chars = candidate.chars();
    matches!(chars.next(), Some(first) if first.is_ascii_alphabetic())
        && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}
/// Lexically normalises a `/`-separated relative path.
///
/// * Empty segments (`a//b`) and `.` segments are dropped.
/// * A `..` segment cancels the preceding ordinary segment.
/// * A `..` segment with nothing left to cancel is **kept** rather than
///   discarded, so a path that climbs above its starting directory (e.g.
///   `../README.md`) still points at the right place once appended to a base
///   URL. Dropping it would silently retarget the link into the base
///   directory.
///
/// Returns the remaining segments joined with `/` (no leading or trailing
/// slash); an empty or all-`.` input yields an empty string.
fn normalise_path(path: &str) -> String {
    let mut stack: Vec<&str> = Vec::new();
    for seg in path.split('/') {
        match seg {
            "" | "." => {}
            ".." => match stack.last() {
                // An ordinary segment is available: step back out of it.
                Some(&prev) if prev != ".." => {
                    stack.pop();
                }
                // Already at (or above) the starting directory: preserve the
                // climb instead of losing it.
                _ => stack.push(".."),
            },
            other => stack.push(other),
        }
    }
    stack.join("/")
}
fn emit_placeholder_and_warn(dist_dir: &Path) {
// Create minimal placeholder files so compilation succeeds
std::fs::create_dir_all(dist_dir).ok();

View file

@ -9,6 +9,16 @@ Nyx ships four independent detector families. They run together in `--mode full`
| [State model](detectors/state.md) | `state-*` | Per-function state lattice | Use-after-close, double-close, leaks, unauthenticated access |
| [AST patterns](detectors/patterns.md) | `<lang>.<cat>.<name>` | Tree-sitter structural match | Banned APIs, weak crypto, dangerous constructs |
The taint family is split into cap-specific rule classes when a sink callee carries multiple vulnerability classes:
| Rule id | Cap | Surface |
|---|---|---|
| `taint-unsanitised-flow` | every cap except `data_exfil` and `unauthorized_id` | Default taint flow class |
| `taint-data-exfiltration` | `data_exfil` | Sensitive data flowing into the payload of an outbound network request (body / headers / json on `fetch`, body on `XMLHttpRequest.send`). Distinct from SSRF: the destination is fixed but attacker-influenced bytes leave the process. |
| `rs.auth.missing_ownership_check.taint` | `unauthorized_id` | Rust auth subsystem fold-in; see [auth.md](auth.md). |
A single call site can fire several of these at once when it carries multiple gates — `fetch(taintedUrl, {body: tainted})` produces both an SSRF finding (URL flow) and a `taint-data-exfiltration` finding (body flow), each with its own cap mask rather than a conflated union.
For Rust auth-specific rules (`rs.auth.*`), see [auth.md](auth.md).
## How they combine

View file

@ -134,7 +134,8 @@ Sources, sanitizers, and sinks are linked by named capabilities. A sanitizer onl
| `fmt_string` | | | `printf(var)` |
| `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` with concatenation |
| `deserialize` | | | `pickle.loads`, `yaml.load`, `Marshal.load` |
| `ssrf` | | URL-prefix locks | `requests.get`, `fetch`, `HttpClient.send` |
| `ssrf` | | URL-prefix locks | `requests.get`, `fetch` URL arg, outbound HTTP destination |
| `data_exfil` | | | `fetch` body / headers / json, `XMLHttpRequest.send` body |
| `code_exec` | | | `eval`, `exec`, `Function` |
| `crypto` | | | weak-algorithm constructors |
| `unauthorized_id` | request-bound scoped IDs (Rust auth analysis) | ownership check | row-level write |

View file

@ -112,12 +112,14 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
| `go.crypto.md5` | Low | A | Medium |
| `go.crypto.sha1` | Low | A | Medium |
### Java: 8 patterns
### Java: 10 patterns
| Rule ID | Severity | Tier | Confidence |
|---|---|---|---|
| `java.cmdi.runtime_exec` | High | A | High |
| `java.code_exec.text4shell_interpolator` | High | A | High |
| `java.deser.readobject` | High | A | High |
| `java.deser.snakeyaml_unsafe_constructor` | High | A | High |
| `java.reflection.class_forname` | Medium | A | High |
| `java.reflection.method_invoke` | Medium | A | High |
| `java.sqli.execute_concat` | Medium | B | Medium |
@ -168,7 +170,7 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
| `php.crypto.rand` | Low | A | Medium |
| `php.crypto.sha1` | Low | A | Medium |
### Python: 13 patterns
### Python: 14 patterns
| Rule ID | Severity | Tier | Confidence |
|---|---|---|---|
@ -182,6 +184,7 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
| `py.code_exec.compile` | Medium | A | High |
| `py.deser.shelve_open` | Medium | A | High |
| `py.sqli.execute_format` | Medium | B | Medium |
| `py.sqli.text_format` | Medium | B | Medium |
| `py.xss.jinja_from_string` | Medium | A | High |
| `py.crypto.md5` | Low | A | Medium |
| `py.crypto.sha1` | Low | A | Medium |

View file

@ -19,8 +19,8 @@ use serde::{Deserialize, Serialize};
/// Bit-level abstract fact: known-zero and known-one masks.
///
/// - `top()` = `{known_zero: 0, known_one: 0}` no bits known
/// - `bottom()` = `{known_zero: MAX, known_one: MAX}` contradictory
/// - `top()` = `{known_zero: 0, known_one: 0}`, no bits known
/// - `bottom()` = `{known_zero: MAX, known_one: MAX}`, contradictory
/// - `from_const(n)` = all 64 bits known
///
/// Invariant: `known_zero & known_one == 0` for non-bottom values.
@ -253,7 +253,7 @@ impl AbstractDomain for BitFact {
}
}
/// Widen: same as join (finite lattice height 64 bits × 3 states).
/// Widen: same as join (finite lattice height, 64 bits × 3 states).
fn widen(&self, other: &Self) -> Self {
self.join(other)
}
@ -511,7 +511,7 @@ mod tests {
#[test]
fn right_shift_unknown_sign() {
// Sign bit unknown high bits after shift should be unknown
// Sign bit unknown, high bits after shift should be unknown
let a = BitFact {
known_zero: 0x0F,
known_one: 0,
@ -687,7 +687,7 @@ mod tests {
}
}
/// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b` meet is the greatest lower bound.
/// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b`, meet is the greatest lower bound.
#[test]
fn meet_is_lower_bound_bit() {
let xs = sample_bits();
@ -700,7 +700,7 @@ mod tests {
}
}
/// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b` join is the least upper bound.
/// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b`, join is the least upper bound.
#[test]
fn join_is_upper_bound_bit() {
let xs = sample_bits();

View file

@ -10,9 +10,9 @@ use serde::{Deserialize, Serialize};
/// Numeric interval: `[lo, hi]` inclusive bounds.
///
/// - `top()` = `[None, None]` any integer
/// - `bottom()` = `[1, 0]` empty / unsatisfiable (lo > hi)
/// - `exact(n)` = `[n, n]` singleton
/// - `top()` = `[None, None]`, any integer
/// - `bottom()` = `[1, 0]`, empty / unsatisfiable (lo > hi)
/// - `exact(n)` = `[n, n]`, singleton
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntervalFact {
pub lo: Option<i64>,
@ -278,7 +278,7 @@ impl IntervalFact {
/// - One non-negative singleton mask `m`: `[0, m]` regardless of other
/// operand's sign (two's complement AND with a non-negative mask always
/// produces a non-negative result bounded by the mask).
/// - Both non-negative: `[0, min(a.hi, b.hi)]` AND can only clear bits.
/// - Both non-negative: `[0, min(a.hi, b.hi)]`, AND can only clear bits.
pub fn bit_and(&self, other: &Self) -> Self {
if self.is_bottom() || other.is_bottom() {
return Self::bottom();
@ -330,7 +330,7 @@ impl IntervalFact {
/// - Singletons: exact computation.
/// - `x | 0` → `x`, `0 | x` → `x`.
/// - Both non-negative with known upper bounds: `[max(a.lo, b.lo),
/// next_pow2_minus1(max(a.hi, b.hi))]` OR can set any bit below
/// next_pow2_minus1(max(a.hi, b.hi))]`, OR can set any bit below
/// the highest set bit of either operand.
pub fn bit_or(&self, other: &Self) -> Self {
if self.is_bottom() || other.is_bottom() {
@ -1054,7 +1054,7 @@ mod tests {
let a = IntervalFact::exact(i64::MIN);
let b = IntervalFact::exact(-1);
let r = a.div(&b);
// Either bound becomes None (graceful) exact representation
// Either bound becomes None (graceful), exact representation
// depends on the impl, but we mainly assert no panic occurred
// and the result is a valid interval.
assert!(
@ -1078,7 +1078,7 @@ mod tests {
assert_eq!(r.hi, Some(2));
}
/// Modulo by an interval that *contains* zero must escape to Top
/// Modulo by an interval that *contains* zero must escape to Top:
/// modulo-by-zero is undefined and we cannot precise-narrow it.
#[test]
fn modulo_divisor_spans_zero_is_top() {
@ -1096,7 +1096,7 @@ mod tests {
/// `[i64::MIN, i64::MAX]` is the maximal interval. Any join with
/// any other interval must remain `[i64::MIN, i64::MAX]` (or Top
/// equivalent) this guards against accidental narrowing on join.
/// equivalent), this guards against accidental narrowing on join.
#[test]
fn full_range_is_join_absorbing() {
let full = IntervalFact {
@ -1347,7 +1347,7 @@ mod tests {
);
}
/// Modulo with exact-zero divisor must escape to Top.
/// Modulo with an exact-zero divisor must escape to Top.
#[test]
fn modulo_by_exact_zero_is_top() {
let a = IntervalFact {

View file

@ -45,7 +45,7 @@ pub fn is_enabled() -> bool {
/// Per-SSA-value abstract element: product of all subdomains.
///
/// Each subdomain is independent join, meet, widen, and leq are applied
/// Each subdomain is independent: join, meet, widen, and leq are applied
/// component-wise. Adding a new subdomain requires adding a field here
/// and updating the component-wise implementations.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@ -182,15 +182,15 @@ pub const MAX_LITERAL_PREFIX_LEN: usize = 64;
/// restricted so the summary size stays constant regardless of callee body
/// complexity:
///
/// * [`IntervalTransfer::Top`] no interval knowledge crosses (default).
/// * [`IntervalTransfer::Identity`] return = param (pass-through).
/// * [`IntervalTransfer::Affine`] return = param * `mul` + `add` with
/// * [`IntervalTransfer::Top`], no interval knowledge crosses (default).
/// * [`IntervalTransfer::Identity`], return = param (pass-through).
/// * [`IntervalTransfer::Affine`], return = param * `mul` + `add` with
/// `i64` constants; overflow defaults to Top at apply time.
/// * [`IntervalTransfer::Clamped`] return is always in `[lo, hi]` regardless
/// * [`IntervalTransfer::Clamped`], return is always in `[lo, hi]` regardless
/// of input. Captures callee-intrinsic bounds (e.g. `saturating` ops).
///
/// No unbounded expression trees, no nesting. A callee whose behaviour does
/// not fit one of these forms falls back to `Top` we never try to encode
/// not fit one of these forms falls back to `Top`, we never try to encode
/// richer algebra in the summary.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum IntervalTransfer {
@ -247,9 +247,9 @@ impl IntervalTransfer {
/// Mirrors [`IntervalTransfer`] for the string subdomain. Bounded by
/// [`MAX_LITERAL_PREFIX_LEN`] to keep summary size constant.
///
/// * [`StringTransfer::Unknown`] default.
/// * [`StringTransfer::Identity`] return = param.
/// * [`StringTransfer::LiteralPrefix`] return has this literal prefix
/// * [`StringTransfer::Unknown`], default.
/// * [`StringTransfer::Identity`], return = param.
/// * [`StringTransfer::LiteralPrefix`], return has this literal prefix
/// regardless of input (callee-intrinsic).
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum StringTransfer {
@ -325,7 +325,7 @@ impl StringTransfer {
/// caller's knowledge of each argument, without having to re-run the callee.
///
/// Composition rule: `apply(input) = (interval.apply, string.apply,
/// bits=top)`. The bit domain is always Top we do not track cross-file
/// bits=top)`. The bit domain is always Top, we do not track cross-file
/// bit transfers.
#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct AbstractTransfer {
@ -351,7 +351,7 @@ impl AbstractTransfer {
Self::default()
}
/// True when neither subdomain carries any information equivalent to
/// True when neither subdomain carries any information, equivalent to
/// "omit this entry entirely".
pub fn is_top(&self) -> bool {
is_interval_top(&self.interval) && is_string_unknown(&self.string)
@ -410,7 +410,7 @@ impl AbstractState {
/// Set abstract value for an SSA value. Drops Top values to save space.
pub fn set(&mut self, v: SsaValue, val: AbstractValue) {
if val.is_top() {
// Don't store Top it's the default
// Don't store Top, it's the default
if let Ok(idx) = self.values.binary_search_by_key(&v, |(id, _)| *id) {
self.values.remove(idx);
}
@ -422,7 +422,7 @@ impl AbstractState {
if self.values.len() < MAX_ABSTRACT_VALUES {
self.values.insert(idx, (v, val));
}
// Over budget: silently drop (conservative defaults to Top)
// Over budget: silently drop (conservative, defaults to Top)
}
}
}

View file

@ -15,7 +15,7 @@
//! Each axis is a three-value lattice [`Tri::No`] / [`Tri::Yes`] / [`Tri::Maybe`]
//! where `Maybe` is Top (unknown) and `No` / `Yes` are the two definite
//! refinements. A value is path-safe for a FILE_IO sink iff
//! `dotdot == No && absolute == No` i.e. we have proof that *no* `..`
//! `dotdot == No && absolute == No`, i.e. we have proof that *no* `..`
//! component and *no* absolute root can leak through. `normalized == Yes`
//! alone is not sufficient (canonicalising an absolute input still produces
//! an absolute path); prefix_lock is used separately to certify containment
@ -52,7 +52,7 @@ pub enum Tri {
No,
/// Proven present.
Yes,
/// Unknown no transfer or guard has proved the axis yet.
/// Unknown, no transfer or guard has proved the axis yet.
Maybe,
}
@ -367,12 +367,12 @@ impl AbstractDomain for PathFact {
/// narrowed axis can be proved safe.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PathRejection {
/// `x.contains("..")` false branch proves `dotdot = No` on the receiver.
/// `x.contains("..")`, false branch proves `dotdot = No` on the receiver.
DotDot,
/// `x.starts_with("/")` / `x.starts_with('\\')` false branch proves
/// `x.starts_with("/")` / `x.starts_with('\\')`, false branch proves
/// `absolute = No` on the receiver.
AbsoluteSlash,
/// `x.is_absolute()` / `Path::new(x).is_absolute()` false branch proves
/// `x.is_absolute()` / `Path::new(x).is_absolute()`, false branch proves
/// `absolute = No` on the argument/receiver.
IsAbsolute,
/// Not a path-rejection idiom.
@ -384,7 +384,7 @@ pub enum PathRejection {
/// the listed axis is refined.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PathAssertion {
/// `x.starts_with("<literal_root>")` true branch attaches
/// `x.starts_with("<literal_root>")`, true branch attaches
/// `prefix_lock = Some("<literal_root>")` to the receiver.
PrefixLock(String),
/// Not a path-assertion idiom.
@ -426,7 +426,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec
let clause = clause.trim();
// Multi-axis special case: `!filepath.IsLocal(p)` (Go).
// `filepath.IsLocal` returns true iff the path stays within the
// current directory no leading `/`, no `..` segments, no Windows
// current directory, no leading `/`, no `..` segments, no Windows
// drive root. Idiomatic Go path-traversal guard:
// `if !filepath.IsLocal(p) { return }`
// The TRUE branch terminates; the FALSE branch (where IsLocal is
@ -449,7 +449,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec
out
}
/// Detect `!filepath.IsLocal(<expr>)` Go's idiomatic path-traversal
/// Detect `!filepath.IsLocal(<expr>)`, Go's idiomatic path-traversal
/// guard. Whitespace-tolerant: `! filepath.IsLocal(`, `!filepath . IsLocal(`,
/// etc. Used by [`classify_path_rejection_axes`] to inject both
/// [`PathRejection::DotDot`] and [`PathRejection::IsAbsolute`] on the false
@ -475,7 +475,7 @@ fn has_negated_filepath_is_local(clause: &str) -> bool {
fn classify_path_rejection_atom(clause: &str) -> PathRejection {
// `.contains("..")` (Rust, Java) / `.includes("..")` (JS/TS) /
// `.include?("..")` (Ruby) / `strings.Contains(s, "..")` (Go) /
// `strstr(s, "..")` (C/C++) every form recognised by
// `strstr(s, "..")` (C/C++), every form recognised by
// `extract_contains_arg` returns `..` if the needle is the dotdot
// segment.
if let Some(needle) = extract_contains_arg(clause)
@ -483,7 +483,7 @@ fn classify_path_rejection_atom(clause: &str) -> PathRejection {
{
return PathRejection::DotDot;
}
// Python `".." in s` operator form. Look for `".." in <something>`
// Python `".." in s`, operator form. Look for `".." in <something>`
// anywhere in the clause text. Conservative: requires the literal
// `".." in ` substring (whitespace-tolerant).
if has_python_dotdot_in(clause) {
@ -681,7 +681,7 @@ pub fn classify_path_assertion(text: &str) -> PathAssertion {
/// * Must be non-empty.
/// * The leaf segment must begin with an ASCII uppercase letter
/// (Rust's variant / struct / type grammar).
/// * The leaf segment must be ASCII alphanumeric / underscore no
/// * The leaf segment must be ASCII alphanumeric / underscore, no
/// method call noise (parentheses, argument lists) survives here
/// because callees arrive in their normalised scoped-identifier
/// form.
@ -700,7 +700,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool {
// upper-camel-case names an enum variant or tuple struct (`Some`,
// `Ok`, `MyResult`). A scoped identifier whose *penultimate*
// segment is upper-camel-case names an associated constructor on
// that type `Box::new`, `Cell::from`, `PathBuf::with_capacity`,
// that type, `Box::new`, `Cell::from`, `PathBuf::with_capacity`,
// etc. The latter is the lower-leaf-case shape we want to admit
// alongside the bare-variant shape.
let segments: smallvec::SmallVec<[&str; 4]> =
@ -731,7 +731,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool {
/// PathFact of the receiver/first argument (the value being sanitised);
/// it is used as the baseline to which the call's effect is applied.
///
/// Returned [`None`] means the callee is not a recognised path primitive
/// Returned [`None`] means the callee is not a recognised path primitive;
/// the caller should leave the result at its pre-existing PathFact (Top).
///
/// Backwards-compatible wrapper around [`classify_path_primitive_rust`].
@ -743,7 +743,7 @@ pub fn classify_path_primitive(callee: &str, input_fact: &PathFact) -> Option<Pa
/// Per-language path-primitive dispatcher.
///
/// Routes to the language-specific classifier Rust, Python, JS/TS, Go,
/// Routes to the language-specific classifier: Rust, Python, JS/TS, Go,
/// Java, Ruby, PHP, or C/C++. Returns [`None`] for languages without a
/// classifier (or callees the language's classifier doesn't recognise).
pub fn classify_path_primitive_for_lang(
@ -784,7 +784,7 @@ pub fn is_structural_variant_ctor_for_lang(lang: crate::symbol::Lang, callee: &s
}
/// Per-language predicate for "this callee is a zero-arg fresh-allocation
/// constructor" used by the variant-rejection-path classifier so that
/// constructor", used by the variant-rejection-path classifier so that
/// `String::new()` (Rust) / `''` (Python/JS/Java/...) is recognised as a
/// no-attacker-content fresh value with cleared `dotdot`/`absolute` axes.
///
@ -803,7 +803,7 @@ pub fn is_zero_arg_allocator_for_lang(lang: crate::symbol::Lang, _callee: &str)
false
}
/// Rust path-primitive classifier `fs::canonicalize`, `Path::new`,
/// Rust path-primitive classifier, `fs::canonicalize`, `Path::new`,
/// `PathBuf::from`, identity-string conversions.
pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
// Accept both path-qualified (`std::fs::canonicalize`, `fs::canonicalize`)
@ -826,7 +826,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
// `Path::new(s)` / `PathBuf::from(s)`:
// pass-through of the input's PathFact so downstream `starts_with`
// checks against a Path/PathBuf value still see the underlying
// string's narrowed axes. No axis is forced wrapping does not
// string's narrowed axes. No axis is forced, wrapping does not
// sanitize on its own.
"new" | "from" => {
if callee_contains_segment(callee, "Path") || callee_contains_segment(callee, "PathBuf")
@ -837,8 +837,8 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
}
}
// Identity conversions on strings/paths. Each one re-binds the
// same logical value the converted String / PathBuf / OsString
// still describes the exact same filesystem path so the PathFact
// same logical value, the converted String / PathBuf / OsString
// still describes the exact same filesystem path, so the PathFact
// flows through unchanged. Without this, a sanitised `s: &str`
// would lose its narrowed axes the moment the helper returns
// `s.to_string()` / `s.to_owned()` / `String::from(s)`.
@ -849,7 +849,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
}
}
/// Python path-primitive classifier `os.path.normpath`, `os.path.realpath`,
/// Python path-primitive classifier, `os.path.normpath`, `os.path.realpath`,
/// `pathlib.Path.resolve`, `os.path.abspath`.
///
/// Pattern conventions: tree-sitter-python emits dotted attribute access as
@ -893,7 +893,7 @@ pub fn classify_path_primitive_python(callee: &str, input_fact: &PathFact) -> Op
}
}
/// JavaScript / TypeScript path-primitive classifier Node's `path` module:
/// JavaScript / TypeScript path-primitive classifier, Node's `path` module:
/// `path.normalize`, `path.resolve`, `path.join`.
pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
@ -920,7 +920,7 @@ pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option
}
}
/// Go path-primitive classifier `path/filepath` package:
/// Go path-primitive classifier, `path/filepath` package:
/// `filepath.Clean`, `filepath.Abs`.
pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
@ -947,7 +947,7 @@ pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option
}
}
/// Java path-primitive classifier `java.nio.file.Path.normalize` /
/// Java path-primitive classifier, `java.nio.file.Path.normalize` /
/// `Paths.get(s).normalize().toAbsolutePath()`.
pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
@ -980,7 +980,7 @@ pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Opti
}
}
/// Ruby path-primitive classifier `File.expand_path` / `Pathname#cleanpath`.
/// Ruby path-primitive classifier, `File.expand_path` / `Pathname#cleanpath`.
pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
match leaf {
@ -1005,13 +1005,13 @@ pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Opti
}
}
/// PHP path-primitive classifier `realpath`, `basename`.
/// PHP path-primitive classifier, `realpath`, `basename`.
pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
match leaf {
// `realpath($s)`:
// Resolves symlinks and `..`, returns absolute path. Returns
// `false` if the file doesn't exist but on the success path
// `false` if the file doesn't exist, but on the success path
// (which is what reaches a sink), it produces a clean absolute path.
"realpath" => {
let mut f = input_fact.clone();
@ -1021,7 +1021,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio
Some(f)
}
// `basename($s)`:
// Strips directory components guaranteed to contain no `..`
// Strips directory components, guaranteed to contain no `..`
// (basename of `..` is `..`, but basename of any traversal-
// prefixed path is just the leaf). Conservative: clear dotdot.
"basename" => {
@ -1034,7 +1034,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio
}
}
/// C / C++ path-primitive classifier POSIX `realpath`,
/// C / C++ path-primitive classifier, POSIX `realpath`,
/// `std::filesystem::canonical`.
pub fn classify_path_primitive_c_cpp(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
let leaf = rightmost_segment(callee);
@ -1089,7 +1089,7 @@ fn extract_contains_arg(text: &str) -> Option<String> {
"strstr(",
] {
if let Some(idx) = text.find(prefix) {
// Skip past the first argument (receiver) the literal needle
// Skip past the first argument (receiver), the literal needle
// is the second arg, separated by a comma. Find the comma at
// top level inside this call.
let inner = &text[idx + prefix.len()..];
@ -1123,7 +1123,7 @@ fn extract_starts_with_arg(text: &str) -> Option<String> {
return Some(s);
}
}
// Go free-function form `strings.HasPrefix(r, "/")` second arg.
// Go free-function form `strings.HasPrefix(r, "/")`, second arg.
if let Some(idx) = text.find("strings.HasPrefix(") {
let inner = &text[idx + "strings.HasPrefix(".len()..];
if let Some(comma_idx) = top_level_comma(inner) {
@ -1762,7 +1762,7 @@ mod tests {
assert!(is_structural_variant_ctor("Box::new"));
assert!(is_structural_variant_ctor("std::option::Option::Some"));
// User-defined upper-camel-case variant name participates the
// same way name list is not part of the contract.
// same way, name list is not part of the contract.
assert!(is_structural_variant_ctor("MyResult::Ok"));
assert!(is_structural_variant_ctor("Wrapper"));
}

View file

@ -1,6 +1,6 @@
//! String abstract domain for abstract interpretation.
//!
//! Tracks known prefix, suffix, and — when provably bounded — the finite set
//! Tracks known prefix, suffix, and, when provably bounded, the finite set
//! of possible concrete string values. Used for SSRF suppression (URL prefix
//! proves host is locked), command-injection suppression (lookup result
//! bounded to a safe set of literals), and general string analysis.
@ -78,7 +78,7 @@ impl StringFact {
/// the finite domain is `{s}`.
///
/// Empty prefix/suffix are normalised to `None` because "starts/ends with
/// the empty string" carries no constraint keeping `Some("")` would
/// the empty string" carries no constraint; keeping `Some("")` would
/// break join idempotence (`Some("")` ⊔ `Some("")` collapses to `None`).
pub fn exact(s: &str) -> Self {
let prefix = truncate_prefix(s);
@ -134,7 +134,7 @@ impl StringFact {
/// Inputs are sorted and deduped. If the cardinality exceeds
/// [`MAX_DOMAIN_SIZE`] or the input is empty, the domain collapses to
/// `None` (Top on this sub-field). The prefix/suffix sub-fields remain
/// unset callers can combine with [`Self::exact`] for single-element
/// unset, callers can combine with [`Self::exact`] for single-element
/// sets if tighter facts are desired.
pub fn finite_set(values: Vec<String>) -> Self {
let mut v = values;
@ -411,7 +411,7 @@ fn truncate_suffix(s: &str) -> String {
/// Longest common prefix of two strings, char-aligned.
///
/// Iterates by `char` rather than `byte` so multi-byte UTF-8 code points are
/// either kept whole or dropped a byte-wise comparison would slice into the
/// either kept whole or dropped; a byte-wise comparison would slice into the
/// middle of a code point and produce mojibake (`x as char` on a UTF-8
/// continuation byte yields a garbage Latin-1 character).
pub fn longest_common_prefix(a: &str, b: &str) -> String {
@ -746,7 +746,7 @@ mod tests {
let a = StringFact::from_prefix("https://api.example.com/");
let b = StringFact::from_prefix("https://db.example.com/");
let r = a.join(&b);
// Common prefix is "https://" anything past that diverges.
// Common prefix is "https://", anything past that diverges.
assert_eq!(
r.prefix.as_deref(),
Some("https://"),
@ -781,7 +781,7 @@ mod tests {
]
}
/// `x ⊔ x = x` join is idempotent across all sample shapes.
/// `x ⊔ x = x`, join is idempotent across all sample shapes.
#[test]
fn join_idempotent_string() {
for a in sample_strings() {
@ -789,7 +789,7 @@ mod tests {
}
}
/// `x ⊔ y = y ⊔ x` join is commutative.
/// `x ⊔ y = y ⊔ x`, join is commutative.
#[test]
fn join_commutative_string() {
let xs = sample_strings();
@ -806,7 +806,7 @@ mod tests {
}
}
/// `x ⊓ x = x` meet is idempotent.
/// `x ⊓ x = x`, meet is idempotent.
#[test]
fn meet_idempotent_string() {
for a in sample_strings() {
@ -814,7 +814,7 @@ mod tests {
}
}
/// `x ⊓ y = y ⊓ x` meet is commutative.
/// `x ⊓ y = y ⊓ x`, meet is commutative.
#[test]
fn meet_commutative_string() {
let xs = sample_strings();
@ -844,7 +844,7 @@ mod tests {
}
}
/// `x ⊑ x` leq is reflexive.
/// `x ⊑ x`, leq is reflexive.
#[test]
fn leq_reflexive_string() {
for a in sample_strings() {
@ -852,7 +852,7 @@ mod tests {
}
}
/// **Soundness**: `widen(a, b) ⊒ join(a, b)` widening must
/// **Soundness**: `widen(a, b) ⊒ join(a, b)`, widening must
/// over-approximate join, otherwise dataflow loses information.
#[test]
fn widen_over_approximates_join_string() {
@ -905,7 +905,7 @@ mod tests {
}
}
/// Empty-string exact value must distinguish from Top it is a
/// Empty-string exact value must distinguish from Top, it is a
/// singleton (`{""}`), not unconstrained. After the empty-prefix
/// normalisation, prefix/suffix are `None` (carry no extra info)
/// but the `domain` field still pins the value to exactly `""`.

View file

@ -127,12 +127,12 @@ use crate::utils::snippet::line_snippet as extract_line_snippet;
/// [`normalize_namespace`] convention) back to the absolute path the
/// diagnostic pipeline expects.
///
/// * Empty `file_rel` single-file scans normalize every namespace to
/// * Empty `file_rel`, single-file scans normalize every namespace to
/// `""`; treat that as "the file under analysis" and return
/// `fallback.to_string_lossy()`.
/// * `scan_root` absent we have no workspace root to resolve against;
/// * `scan_root` absent, we have no workspace root to resolve against;
/// return `file_rel` verbatim (it may already be absolute).
/// * Otherwise join `scan_root` with `file_rel`.
/// * Otherwise, join `scan_root` with `file_rel`.
fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String {
if file_rel.is_empty() {
return fallback.to_string_lossy().into_owned();
@ -163,7 +163,7 @@ fn build_taint_diag(
let source_info = cfg_graph.node_weight(finding.source);
// The reconstructed flow path is the authoritative view of where the
// taint started *in this body*. When present, prefer its first step's
// CFG span over `finding.source_span` which can be stale across
// CFG span over `finding.source_span`, which can be stale across
// multi-hop cross-body remaps (e.g. JS two-level solve where a
// callee-interior source gets its span rewritten to the enclosing
// body's entry node). Fall back to `source_span`, then to the source
@ -183,7 +183,7 @@ fn build_taint_diag(
// Prefer the source CFG node's callee string when it's a call expression
// (e.g. `os.getenv("X")`). For property-access sources like
// `navigator.userAgent` there is no callee fall back to the first flow
// `navigator.userAgent` there is no callee, fall back to the first flow
// step's `variable` (the SSA var name, e.g. "userAgent"), then to the
// source node's `taint.defines` / first `taint.uses` entry, before
// finally giving up and rendering "(unknown)".
@ -289,7 +289,7 @@ fn build_taint_diag(
// Convert raw flow steps to display FlowSteps. When the finding has a
// primary_location distinct from the call site, the last raw step is
// really the Call reclassify it and append a synthetic Sink step
// really the Call, reclassify it and append a synthetic Sink step
// pointing at the callee-internal dangerous instruction so analysts
// see both the call site and the final sink in the trace.
let mut flow_steps: Vec<FlowStep> = finding
@ -348,7 +348,7 @@ fn build_taint_diag(
.clone()
.or_else(|| Some(short_call_site.clone()));
// Resolved sink capability bits used by deduplication to distinguish
// Resolved sink capability bits, used by deduplication to distinguish
// sinks with different cap types on the same source line (e.g.
// `sink_sql(x); sink_shell(x);`).
let sink_caps_bits: u16 = cfg_graph[finding.sink]
@ -361,13 +361,33 @@ fn build_taint_diag(
})
.fold(0u16, |acc, b| acc | b);
// Phase C: when the sink's required caps include UNAUTHORIZED_ID — and
// the finding actually reached that sink via the taint engine — use a
// dedicated auth rule id so the finding is namespaced alongside the
// standalone `auth_analysis` subsystem's output instead of being folded
// into the generic `taint-unsanitised-flow` bucket.
let diag_id = if sink_caps_bits & crate::labels::Cap::UNAUTHORIZED_ID.bits() != 0 {
// Cap-specific rule-id routing.
//
// 1. `UNAUTHORIZED_ID`: namespace alongside the standalone `auth_analysis`
// subsystem's output so cross-tool aggregation lines up.
// 2. `DATA_EXFIL`: route to `taint-data-exfiltration` so SARIF surfaces a
// distinct rule id from SSRF, the two share callees (e.g. `fetch`)
// but represent different vulnerability classes.
//
// Prefer the per-finding `effective_sink_caps` (set by the multi-gate
// SSA dispatch) when populated; fall back to the union of all sink-label
// caps on the CFG node so legacy paths that build findings without
// setting `effective_sink_caps` still pick the right rule id.
let effective_caps = if finding.effective_sink_caps.is_empty() {
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
} else {
finding.effective_sink_caps
};
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
"rs.auth.missing_ownership_check.taint".to_string()
} else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::SSRF)
{
format!(
"taint-data-exfiltration (source {}:{})",
source_point.row + 1,
source_point.column + 1
)
} else {
format!(
"taint-unsanitised-flow (source {}:{})",
@ -452,7 +472,7 @@ fn build_taint_diag(
/// Resolve a file extension to a language slug (e.g. `"rust"`,
/// `"javascript"`). Public façade over [`lang_for_path`] for callers
/// that only need the slug used by the debug API to look up
/// that only need the slug, used by the debug API to look up
/// per-language rule enablement without re-parsing the file.
pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> {
lang_for_path(path).map(|(_, slug)| slug)
@ -467,7 +487,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
// use `.cc` / `.cxx` / `.hpp` / `.hh` / `.h++` rather than the
// `.cpp` synthetic-fixture extension. Without these mappings,
// the scanner silently skipped them. Headers (`.h` is omitted
// intentionally it's also valid C and disambiguating without a
// intentionally: it's also valid C and disambiguating without a
// build system is brittle).
Some("cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++") => {
Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp"))
@ -481,7 +501,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
"typescript",
)),
// TSX grammar is a superset of TypeScript plus JSX element/attribute
// nodes all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
// nodes; all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
// and JSX-specific sinks (e.g. `dangerouslySetInnerHTML`) layer on top
// via the same `typescript` slug.
Some("tsx") => Some((
@ -493,7 +513,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
"javascript",
)),
// JSX uses the same JavaScript grammar (tree-sitter-javascript handles
// JSX natively) slug "javascript" so all JS rules apply.
// JSX natively), slug "javascript" so all JS rules apply.
Some("jsx") => Some((
Language::from(tree_sitter_javascript::LANGUAGE),
"javascript",
@ -739,7 +759,7 @@ impl<'a> ParsedSource<'a> {
continue;
}
// Layer C: PHP `unserialize($x, ['allowed_classes' => [...]])`
// or `unserialize($x, ['allowed_classes' => false])`
// or `unserialize($x, ['allowed_classes' => false])`:
// PHP 7+ structural mitigation against object injection.
// When the call passes an `allowed_classes` option set to
// either `false` (no class instantiation) or an array
@ -762,7 +782,7 @@ impl<'a> ParsedSource<'a> {
// format-string contributes attacker-controlled length.
// When the source argument is a string literal (or a
// ternary of two string literals), the contributed length
// is statically bounded there is no overflow vector
// is statically bounded, there is no overflow vector
// for an attacker even if the destination buffer is
// mis-sized. Same principle for `sprintf` when the
// format string is a literal containing no bare `%s`
@ -818,7 +838,7 @@ impl<'a> ParsedSource<'a> {
/// Sort, dedup, and optionally downgrade severity for non-production paths.
///
/// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id,
/// line, col)` severity is NOT part of the key. Two diags that agree
/// line, col)`, severity is NOT part of the key. Two diags that agree
/// on (line, col, id) but differ in severity (e.g. a pattern-rule finding
/// plus a taint-pipeline finding on the same call) would otherwise survive
/// dedup here and crash the indexer with a UNIQUE constraint violation.
@ -854,7 +874,7 @@ impl<'a> ParsedFile<'a> {
// project-level `FrameworkContext` misses frameworks the file
// obviously imports. Augment the per-file rule set with any
// framework-conditional rules keyed off in-file import specifiers
// (e.g. `import fastify from 'fastify'`). Idempotent skips
// (e.g. `import fastify from 'fastify'`). Idempotent, skips
// frameworks already active from the manifest pass.
let in_file_fws =
crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug);
@ -931,13 +951,13 @@ impl<'a> ParsedFile<'a> {
self.source.lang_slug,
);
// Phase 6 (typed call-graph subtype awareness): every
// Every
// `FuncSummary` exported from this file carries a copy of the
// file's `hierarchy_edges` so the inheritance / impl /
// implements relationships persist through SQLite round-trips
// and re-merge into `crate::callgraph::TypeHierarchyIndex` at
// call-graph build time. Cheap (one clone per summary) and
// strictly additive `merge_summaries` deduplicates downstream.
// strictly additive, `merge_summaries` deduplicates downstream.
if !self.file_cfg.hierarchy_edges.is_empty() {
let edges = self.file_cfg.hierarchy_edges.clone();
for s in &mut out {
@ -982,7 +1002,7 @@ impl<'a> ParsedFile<'a> {
///
/// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`].
/// The `FuncKey` identity preserves `(lang, namespace, container, name,
/// arity, disambig, kind)` so two same-name definitions in this file
/// arity, disambig, kind)`, so two same-name definitions in this file
/// (e.g. a free `process` and a `Worker::process`, or overloads with
/// different arities) land on distinct entries instead of the later one
/// shadowing the earlier one.
@ -1003,7 +1023,7 @@ impl<'a> ParsedFile<'a> {
// Use the FileCfg path (same one `analyse_file` uses at taint time) so
// the SSA summaries stored cross-file match exactly what pass 2 will
// resolve against no NodeIndex-space or entry-detection drift.
// resolve against, no NodeIndex-space or entry-detection drift.
let locator = crate::summary::SinkSiteLocator {
tree: &self.source.tree,
bytes: self.source.bytes,
@ -1024,7 +1044,7 @@ impl<'a> ParsedFile<'a> {
/// Lower every function body in this file to SSA exactly once. Used by
/// [`analyse_file_fused`] to share the result between the taint engine
/// ([`run_cfg_analyses_with_lowered`]) and the SSA artifact filter
/// ([`build_eligible_bodies_from_lowered`]) the prior code path lowered
/// ([`build_eligible_bodies_from_lowered`]), the prior code path lowered
/// twice (once inside `analyse_file`, once inside
/// `extract_ssa_artifacts_from_file_cfg`) and accounted for ~24% of the
/// pass-2 wall-clock on the bench corpus.
@ -1038,7 +1058,7 @@ impl<'a> ParsedFile<'a> {
/// here populates `param_to_sink` with concrete coordinates that the
/// emission path then promotes into `Finding.primary_location`,
/// causing the same-file summary-resolved sink to be reported at the
/// callee-internal sink line instead of the call site which both
/// callee-internal sink line instead of the call site, which both
/// duplicates the intraprocedural finding the taint engine already
/// emits at that exact line and re-attributes the flow finding away
/// from the user-visible call site. Closure-capture, lambda, and
@ -1263,13 +1283,11 @@ impl<'a> ParsedFile<'a> {
state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang);
let mut all_state_findings = Vec::new();
for body in &self.file_cfg.bodies {
// Phase 2 of the pointer-analysis rollout: when
// `NYX_POINTER_ANALYSIS=1` is set, derive a `var_name →
// PtrProxyHint` map from the body's points-to facts so
// the proxy-acquire transfer can suppress SymbolId
// attribution on field-aliased receivers (e.g. `m :=
// c.mu; m.Lock()`). Strict-additive — `None` when the
// env-var is unset and behaviour matches today exactly.
// When `NYX_POINTER_ANALYSIS=1` is set, derive a
// `var_name → PtrProxyHint` map from the body's
// points-to facts so the proxy-acquire transfer can
// suppress SymbolId attribution on field-aliased
// receivers (e.g. `m := c.mu; m.Lock()`).
let body_pointer_hints = cfg_analysis::build_body_const_facts(body, caller_lang)
.as_ref()
.and_then(|f| {
@ -1379,15 +1397,11 @@ impl<'a> ParsedFile<'a> {
)
}
/// Build a per-file `var_name → TypeKind` map by running SSA + type
/// facts on each body and copying type facts for SSA values whose
/// definition recorded a source-level variable name. When the same
/// name resolves to different non-`Unknown` types across bodies the
/// entry is dropped — absence is safe because the auth analysis
/// sink gate simply falls back to its syntactic heuristics. Returns
/// `None` when no body produces any typed variable (non-Rust files
/// currently emit few `LocalCollection` / security-typed facts, but
/// this path is language-agnostic).
/// Build a per-file `var_name → TypeKind` map from SSA + type facts.
/// Conflicting non-`Unknown` types across bodies drop the entry;
/// absence is safe because the auth sink gate falls back to
/// syntactic heuristics. Returns `None` when no body produces a
/// typed variable.
fn collect_file_var_types(&self) -> Option<auth_analysis::VarTypes> {
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
let mut merged: std::collections::HashMap<String, crate::ssa::type_facts::TypeKind> =
@ -1492,7 +1506,7 @@ pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult<Option<(FileCf
/// Parse a file and return its `AuthorizationModel` for debug inspection.
///
/// Runs only the auth-extraction pipeline no taint, no CFG construction.
/// Runs only the auth-extraction pipeline, no taint, no CFG construction.
/// Returns `None` for binary files or unsupported languages. Used by the
/// `/api/debug/auth` route to surface the structured authorization model
/// (routes, units, sensitive operations, auth checks) in the debug UI.
@ -1607,7 +1621,7 @@ pub fn perf_stage_breakdown_fused(
/// Diagnostic stage-timing helper for the perf audit.
///
/// Times each stage of pass 2 internally and returns µs counts. Returns
/// `None` for unsupported languages. Not used in production just for
/// `None` for unsupported languages. Not used in production, just for
/// `tests/perf_breakdown.rs` to attribute time inside `run_rules_on_bytes`
/// without touching the hot path.
#[doc(hidden)]
@ -1651,7 +1665,7 @@ pub fn perf_stage_breakdown(
///
/// This is the shared pass-1 pipeline for indexed scans: parses once, builds
/// CFG once, and returns both summary types. Uses the same `ParsedFile`
/// pipeline as `analyse_file_fused` no divergent extraction path.
/// pipeline as `analyse_file_fused`, no divergent extraction path.
pub fn extract_all_summaries_from_bytes(
bytes: &[u8],
path: &Path,
@ -1727,7 +1741,7 @@ fn is_call_all_args_literal(node: tree_sitter::Node, bytes: &[u8]) -> bool {
}
// If the argument list is empty (no args), we conservatively do NOT
// suppress the danger may come from side effects, not arguments.
// suppress, the danger may come from side effects, not arguments.
has_any_arg
}
@ -1745,7 +1759,7 @@ fn find_enclosing_call(mut node: tree_sitter::Node) -> Option<tree_sitter::Node>
if kind == "function_call_expression" {
return Some(node);
}
// Stop at scope/statement boundaries don't cross into outer calls
// Stop at scope/statement boundaries, don't cross into outer calls
if kind.contains("block")
|| kind.contains("body")
|| kind == "program"
@ -1780,13 +1794,20 @@ fn find_arg_list(call: tree_sitter::Node) -> Option<tree_sitter::Node> {
fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool {
let kind = node.kind();
match kind {
// String literals (most languages)
// String literals, but Python's `string` node also covers
// f-strings, which carry `interpolation` children. An f-string
// with interpolation is *not* a literal: it embeds arbitrary
// expressions, so a sink call like `cursor.execute(f"…{x}")`
// must not be suppressed under Layer A's "all-literal args"
// shortcut. Same shape applies to any tree-sitter grammar
// that nests an `interpolation` (or `string_interpolation`)
// child inside a string node.
"string"
| "string_literal"
| "interpreted_string_literal"
| "raw_string_literal"
| "string_content"
| "string_fragment" => true,
| "string_fragment" => !has_interpolation(node),
// Numeric literals
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
@ -1901,7 +1922,7 @@ fn is_php_include_param_passthrough(include_node: tree_sitter::Node, bytes: &[u8
}
return true;
}
// Stop at class/program scope without a matching function bare
// Stop at class/program scope without a matching function, bare
// top-level `include $var` does not benefit from this guard.
"program" | "class_declaration" | "trait_declaration" | "interface_declaration" => {
return false;
@ -2011,7 +2032,7 @@ fn is_var_reassigned_before(
/// PHP-only: returns `true` when the captured `function_call_expression`
/// node is `unserialize($x, [..., 'allowed_classes' => <ARRAY|false>, ...])`.
/// This is the canonical PHP 7+ structural mitigation against object
/// injection explicitly restricting which classes the deserialiser may
/// injection, explicitly restricting which classes the deserialiser may
/// instantiate. Only suppress when the option is either:
///
/// - `'allowed_classes' => false` (no class instantiation), or
@ -2091,9 +2112,9 @@ fn is_php_unserialize_allowed_classes_restricted(
// Accept structural mitigation forms. The intent signal is
// "developer explicitly set allowed_classes to something other than
// `true`":
// - boolean `false` no class instantiation at all
// - array literal explicit allow-list
// - class-constant reference `self::ALLOWED_CLASSES` /
// - boolean `false`: no class instantiation at all
// - array literal: explicit allow-list
// - class-constant reference: `self::ALLOWED_CLASSES` /
// `Foo::CONSTANTS` resolved to
// a const array; engine cannot
// statically inspect, but the
@ -2126,7 +2147,7 @@ fn is_php_unserialize_allowed_classes_restricted(
/// `cpp.memory.*` mirrors) when the source argument can carry
/// attacker-controlled length. Calls whose source is a string literal
/// have a compile-time bound and cannot overflow due to attacker input
/// a too-small destination is a fixed developer bug (caught by
/// — a too-small destination is a fixed developer bug (caught by
/// compiler warnings / `-fstack-protector` / clang-tidy / ASan), not an
/// exploitable channel. Suppressing these literal-source calls is a
/// deliberate noise / false-positive reduction aligned with Nyx's scope
@ -2141,14 +2162,14 @@ fn is_php_unserialize_allowed_classes_restricted(
/// - `tests/fixtures/real_world/c/state/malloc_lifecycle.expect.json`
/// - `tests/fixtures/real_world/cpp/state/new_delete.expect.json`
/// - `tests/fixtures/real_world/cpp/state/malloc_branches.expect.json`
/// - Positive cases (suppression must NOT fire source is a parameter
/// - Positive cases (suppression must NOT fire: source is a parameter
/// or other attacker-reachable value) live as hard expectations
/// (`must_match: true`) in the taint fixtures:
/// - `tests/fixtures/real_world/c/taint/buffer_overflow.c`
/// - `tests/fixtures/real_world/cpp/taint/gets_strcpy.cpp`
///
/// Removing this function or weakening its predicate would be caught by
/// neither it would be caught by the unit tests below.
/// neither; it would be caught by the unit tests below.
///
/// Pattern rules `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf`
/// (and the `cpp.memory.*` mirrors) flag the call syntactically; their
@ -2173,7 +2194,7 @@ fn is_php_unserialize_allowed_classes_restricted(
/// - source / format is an identifier (could be tainted, e.g.
/// `sprintf(buf, fmt, …)`) → keep firing
/// - format is `concatenated_string` containing identifier macros (e.g.
/// `"%" PRId64`) we cannot statically expand the macro, so refuse
/// `"%" PRId64`), we cannot statically expand the macro, so refuse
/// - bare `%s` in format → keep firing (could read unbounded length)
fn is_c_buffer_call_literal_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
let kind = match rule_id {
@ -2226,7 +2247,7 @@ enum CBufferRule {
/// True for: a C/C++ string literal, OR a `conditional_expression` whose
/// consequence + alternative are both either string literals or ALL_CAPS
/// identifiers (the canonical preprocessor-macro naming convention for
/// string-constant `#define`s `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
/// string-constant `#define`s, `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
/// pervasively in postgres' `formatting.c::DCH_a_m`). Parenthesised forms
/// are unwrapped.
///
@ -2348,7 +2369,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
}
i += 1;
if i >= bytes.len() {
// trailing `%` malformed, refuse to suppress
// trailing `%`, malformed, refuse to suppress
return false;
}
if bytes[i] == b'%' {
@ -2391,7 +2412,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
let conv = bytes[i];
i += 1;
match conv {
// Numeric / char / pointer specifiers bounded output for any input
// Numeric / char / pointer specifiers, bounded output for any input
b'd' | b'i' | b'u' | b'o' | b'x' | b'X' | b'c' | b'e' | b'E' | b'f' | b'F' | b'g'
| b'G' | b'a' | b'A' | b'p' | b'n' => continue,
// String specifier: only safe when precision-bounded
@ -2494,7 +2515,7 @@ struct TaintSuppressionCtx {
/// distinguish "taint proved safe" from "taint failed to track".
taint_finding_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
/// Functions where the SSA engine emitted at least one
/// `all_validated` event every tainted input to *some* sink in
/// `all_validated` event, every tainted input to *some* sink in
/// the function passed through a recognised validation/
/// sanitisation predicate. Drained from
/// `take_all_validated_spans`; positive evidence that the engine
@ -2502,14 +2523,14 @@ struct TaintSuppressionCtx {
/// `taint-unsanitised-flow` finding fired and no Sanitizer label
/// is present. Covers validation, dominator-based pruning,
/// early-return guards, type-check predicates, and interprocedural
/// sanitiser wrappers all of which legitimately clear taint via
/// sanitiser wrappers, all of which legitimately clear taint via
/// SSA branch-narrowing rather than a labelled sanitiser node.
engine_validated_funcs: HashSet<Option<String>>,
/// Functions where some Source's defining variable is later
/// rebound to a literal RHS (carries `TaintMeta.const_text`) in
/// the same scope, with no Source label on the rebinding node.
/// Positive evidence that the engine's SSA renaming structurally
/// kills the source's taint before any sink can read it covers
/// kills the source's taint before any sink can read it, covers
/// `cmd = getenv(); cmd = "echo hello"; system(cmd)` patterns
/// where the rebind is what makes the code safe but the engine
/// has no `Sanitizer` label or `taint-unsanitised-flow` finding to
@ -2520,7 +2541,7 @@ struct TaintSuppressionCtx {
/// interprocedural analysis cleared the flow through a
/// user-defined wrapper (e.g. `def sanitize(s): return
/// shlex.quote(s)`). The current per-function `Sanitizer` check
/// only sees direct sanitisers in the *caller's* scope without
/// only sees direct sanitisers in the *caller's* scope, without
/// this signal, every helper-wrapped sanitiser fires as an
/// AST-pattern FP because the engine cleared the value via Phase
/// 11 inline analysis but the sink's enclosing scope has no
@ -2687,7 +2708,7 @@ impl TaintSuppressionCtx {
// an "interproc sanitiser caller" when its body invokes any
// helper whose own body contains a labelled Sanitizer. This
// handles wrappers like `def sanitize(s): return
// shlex.quote(s)` — the engine clears taint via Phase 11
// shlex.quote(s)`, the engine clears taint via
// inline analysis, but the caller's scope has no labelled
// Sanitizer of its own to satisfy Condition 4(b).
let mut interproc_sanitizer_callers: HashSet<Option<String>> = HashSet::new();
@ -2703,7 +2724,7 @@ impl TaintSuppressionCtx {
// each to its enclosing function via `sink_func_at_line`, and
// record the function as "engine-validated". The set was
// populated by `ssa_events_to_findings` whenever the engine
// emitted an `SsaTaintEvent { all_validated: true, .. }`
// emitted an `SsaTaintEvent { all_validated: true, .. }`,
// i.e. the engine reached a sink and proved every tainted
// input passed validation. This is the broadest form of
// engine-success evidence, covering predicate validation
@ -2762,7 +2783,7 @@ impl TaintSuppressionCtx {
// sink, since taint couldn't have evaluated a flow that doesn't exist.
let func = match self.sink_func_at_line.get(&line) {
Some(f) => f,
None => return false, // No CFG sink at this line taint had no opportunity to evaluate
None => return false, // No CFG sink at this line, taint had no opportunity to evaluate
};
match self.source_lines_by_func.get(func) {
Some(source_lines) => {
@ -2788,7 +2809,7 @@ impl TaintSuppressionCtx {
// OR
// (c) the SSA engine emitted at least one `all_validated`
// event in this function (engine reached *some* sink and
// proved every tainted input was validated covers
// proved every tainted input was validated, covers
// predicate validation, dominator early-return,
// type-check predicates, and interprocedural sanitiser
// wrappers that don't carry an explicit Sanitizer
@ -2796,18 +2817,18 @@ impl TaintSuppressionCtx {
// OR
// (d) the function rebinds a Source's defining variable to
// a literal RHS at a later line (engine's SSA renaming
// structurally kills taint before any sink reads it
// structurally kills taint before any sink reads it,
// covers `cmd = getenv(); cmd = "echo"; system(cmd)`),
// OR
// (e) the function calls a same-file helper whose body
// contains a labelled Sanitizer (interprocedural
// sanitiser wrapper covers `def sanitize(s): return
// sanitiser wrapper, covers `def sanitize(s): return
// shlex.quote(s)` patterns where the engine clears
// taint via Phase 11 inline analysis but the caller's
// taint via inline analysis but the caller's
// scope has no Sanitizer label of its own).
//
// When none hold, we can't distinguish silent engine failure
// from real safety e.g. Go points-to limitation on `&local`
// from real safety, e.g. Go points-to limitation on `&local`
// Decode destinations leaves the chain writeback fired but the
// field-cell propagation dead, suppressing legitimate
// AST-pattern findings on every Go CRUD handler whose Decode
@ -2854,7 +2875,7 @@ pub fn run_rules_on_bytes(
maybe_inject_test_panic(path);
let Some(source) = ParsedSource::try_new(bytes, path)? else {
// Not a recognized tree-sitter language try text-based patterns,
// Not a recognized tree-sitter language, try text-based patterns,
// but first surface a parse-timeout synthetic diag if that's what
// caused try_new to return None.
let mut out = scan_text_based_patterns(bytes, path, cfg);
@ -2964,7 +2985,7 @@ pub fn analyse_file_fused(
maybe_inject_test_panic(path);
let Some(source) = ParsedSource::try_new(bytes, path)? else {
// Not a recognized tree-sitter language try text-based patterns,
// Not a recognized tree-sitter language, try text-based patterns,
// and surface a parse-timeout synthetic diag if that's what caused
// try_new to return None.
let mut diags = scan_text_based_patterns(bytes, path, cfg);
@ -2995,7 +3016,7 @@ pub fn analyse_file_fused(
let (ssa_summaries, ssa_bodies) = if needs_cfg {
// Lower SSA exactly once and feed both the taint engine and the
// SSA-artifact extractor. Pre-fix, both consumers re-lowered the
// same `FileCfg` independently `lower_all_functions_from_bodies`
// same `FileCfg` independently, `lower_all_functions_from_bodies`
// accounted for ~20% of `analyse_file_fused` wall-clock on the
// bench corpus.
//
@ -3294,7 +3315,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"method param pass-through should be recognised"
);
// Local variable assigned from concat NOT a pass-through.
// Local variable assigned from concat, NOT a pass-through.
let code = b"<?php\nclass C { function f(string $base): void { $f = $base . '/x.php'; include $f; } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3303,7 +3324,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"concat-built local should NOT be treated as pass-through"
);
// Param reassigned before include NOT a pass-through.
// Param reassigned before include, NOT a pass-through.
let code = b"<?php\nfunction f($file) { $file = $_GET['x']; include $file; }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3312,7 +3333,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
"reassigned param should NOT be treated as pass-through"
);
// Top-level (no enclosing function) NOT a pass-through.
// Top-level (no enclosing function), NOT a pass-through.
let code = b"<?php\n$file = $_GET['x'];\ninclude $file;\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3357,7 +3378,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"allowed_classes => self::CONST should be recognised as safe"
);
// allowed_classes => true unsafe default, must NOT be suppressed
// allowed_classes => true, unsafe default, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d, ['allowed_classes' => true]);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3366,7 +3387,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"allowed_classes => true is the unsafe default, should NOT be suppressed"
);
// No second arg must NOT be suppressed
// No second arg, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3375,7 +3396,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
"single-arg unserialize should NOT be suppressed"
);
// Dynamic options variable must NOT be suppressed
// Dynamic options variable, must NOT be suppressed
let code = b"<?php\n$x = unserialize($d, $opts);\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
@ -3387,7 +3408,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
#[test]
fn sprintf_format_safety_classifier() {
// Numeric / char / pointer specifiers bounded by definition.
// Numeric / char / pointer specifiers, bounded by definition.
assert!(sprintf_format_is_safe(""));
assert!(sprintf_format_is_safe("hello world"));
assert!(sprintf_format_is_safe("%d"));
@ -3396,11 +3417,11 @@ fn sprintf_format_safety_classifier() {
assert!(sprintf_format_is_safe("%5d %x %llo"));
assert!(sprintf_format_is_safe("%%literal-percent"));
assert!(sprintf_format_is_safe("%p"));
// Precision-bounded `%s` / `%.*s` output capped at precision.
// Precision-bounded `%s` / `%.*s`, output capped at precision.
assert!(sprintf_format_is_safe(" %.*s"));
assert!(sprintf_format_is_safe("%.5s"));
assert!(sprintf_format_is_safe("[%-.10s]"));
// Bare `%s` / width-only `%5s` width is a *minimum*, length is
// Bare `%s` / width-only `%5s`, width is a *minimum*, length is
// unbounded. Must NOT be suppressed.
assert!(!sprintf_format_is_safe("%s"));
assert!(!sprintf_format_is_safe("hello %s world"));
@ -3441,7 +3462,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
let q_strcat = r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#;
let q_sprintf = r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#;
// strcpy(dst, "literal") postgres autoprewarm shape.
// strcpy(dst, "literal"), postgres autoprewarm shape.
let code = b"void f(char *d) { strcpy(d, \"pg_prewarm\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3450,7 +3471,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with string-literal source must be suppressed"
);
// strcpy(dst, cond ? "a" : "b") string-literal ternary.
// strcpy(dst, cond ? "a" : "b"), string-literal ternary.
let code = b"void f(char *s, int h) { strcpy(s, (h >= 12) ? \"p.m.\" : \"a.m.\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3459,7 +3480,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-literals source must be suppressed"
);
// strcpy(dst, cond ? P_M_STR : A_M_STR) postgres formatting.c
// strcpy(dst, cond ? P_M_STR : A_M_STR), postgres formatting.c
// shape with #define'd ALL_CAPS string-constant macros.
let code = b"#define P_M_STR \"p.m.\"\n#define A_M_STR \"a.m.\"\nvoid f(char *s, int h) { strcpy(s, (h >= 12) ? P_M_STR : A_M_STR); }\n";
let tree = parser.parse(code, None).unwrap();
@ -3469,7 +3490,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-ALL_CAPS-macros must be suppressed"
);
// strcpy(dst, cond ? var_a : var_b) lowercase variables, NOT a
// strcpy(dst, cond ? var_a : var_b), lowercase variables, NOT a
// recognisable preprocessor macro shape. Must NOT suppress.
let code = b"void f(char *s, int h, char *a, char *b) { strcpy(s, (h >= 12) ? a : b); }\n";
let tree = parser.parse(code, None).unwrap();
@ -3479,7 +3500,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with ternary-of-lowercase-vars must NOT be suppressed"
);
// strcat(dst, "literal") same principle as strcpy.
// strcat(dst, "literal"), same principle as strcpy.
let code = b"void f(char *d) { strcat(d, \" (done)\"); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcat);
@ -3488,7 +3509,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcat with string-literal source must be suppressed"
);
// sprintf(dst, "%lld%c", ...) numeric format string.
// sprintf(dst, "%lld%c", ...), numeric format string.
let code = b"void f(char *cp, long long v, char u) { sprintf(cp, \"%lld%c\", v, u); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3497,7 +3518,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"sprintf with numeric-only format must be suppressed"
);
// sprintf(str, " %.*s", N, x) precision-bounded `%s`.
// sprintf(str, " %.*s", N, x), precision-bounded `%s`.
let code = b"void f(char *str, int n, const char *x) { sprintf(str, \" %.*s\", n, x); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3506,7 +3527,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"sprintf with precision-bounded `%.*s` must be suppressed"
);
// strcpy(dst, src) where src is a non-literal must NOT suppress.
// strcpy(dst, src) where src is a non-literal, must NOT suppress.
let code = b"void f(char *d, char **a) { strcpy(d, a[1]); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_strcpy);
@ -3515,7 +3536,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"strcpy with non-literal source must NOT be suppressed"
);
// sprintf with bare `%s` must NOT suppress.
// sprintf with bare `%s`, must NOT suppress.
let code = b"void f(char *b, const char *u) { sprintf(b, \"%s\", u); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3525,7 +3546,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
);
// sprintf with non-literal format (concatenated_string with PRI* macro)
// must NOT suppress (engine cannot statically expand the macro).
// must NOT suppress (engine cannot statically expand the macro).
let code = b"void f(char *b, long long v) { sprintf(b, \"%\" PRId64, v); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_c_capture(&tree, code, q_sprintf);
@ -3543,3 +3564,51 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
"Layer D should only fire for buffer-overflow rule ids"
);
}
/// Regression guard for `is_literal_node` on Python strings.
///
/// A Python f-string is a `string` node containing `interpolation`
/// children and must NOT be classified as literal. Otherwise Layer A's
/// "all-args-literal → suppress Security finding" shortcut hides every
/// CVE that injects via `cursor.execute(f"…{x}…")` or `text(f"…{x}…")`.
/// Motivated by CVE-2025-69662 (geopandas SQLi via
/// `text(f"SELECT … '{geom_name}' …")`) and CVE-2025-24793
/// (snowflake-connector-python f-string-built CREATE STAGE / DROP).
#[test]
fn is_literal_node_rejects_python_fstring_with_interpolation() {
    let python = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&python).unwrap();

    // Parses a single `x = <string>` statement, checks that the right-hand
    // side is a `string` node, and reports how `is_literal_node` classifies it.
    let mut rhs_is_literal = |code: &[u8]| -> bool {
        let tree = parser.parse(code, None).unwrap();
        let assignment = tree
            .root_node()
            .child(0)
            .and_then(|stmt| stmt.child(0))
            .expect("assignment node");
        let rhs = assignment
            .child_by_field_name("right")
            .expect("RHS of assignment");
        assert_eq!(rhs.kind(), "string");
        is_literal_node(rhs, code)
    };

    // f-string with one interpolation segment: must be non-literal.
    assert!(
        !rhs_is_literal(b"x = f\"SELECT * WHERE y = '{u}'\"\n"),
        "f-string with interpolation must not be classified as literal"
    );

    // Plain string literal: must remain literal.
    assert!(
        rhs_is_literal(b"x = \"plain literal\"\n"),
        "plain string literal must be classified as literal"
    );
}

View file

@ -220,7 +220,7 @@ fn check_token_override_without_validation(
let mut findings = Vec::new();
for unit in &model.units {
// The rule reasons about "Token acceptance flow" by
// The rule reasons about "Token acceptance flow"; by
// construction, that is a user-facing handler that receives a
// token from the client and writes through token-bound state.
// Internal helpers, Celery / cron tasks, Django migrations,
@ -335,15 +335,12 @@ fn has_prior_subject_auth(
})
}
/// Phase A4 row-fetch exemption.
/// Row-fetch exemption.
///
/// Recognises the canonical "fetch-then-authorize" idiom in row-level
/// authz code: a route handler fetches a row by id (`let community =
/// Community::read(pool, data.community_id)?`), then calls a named
/// authorization function on the fetched row (`check_community_user_action(
/// &user, &community, ...)`). The authorization check appears
/// textually after the fetch, so the existing `check.line <= op.line`
/// rule cannot cover the fetch.
/// Recognises the "fetch-then-authorize" idiom: a handler fetches a
/// row by id then calls a named authorization function on it. The
/// check appears textually after the fetch, so the
/// `check.line <= op.line` rule cannot cover the fetch.
///
/// The exemption fires only when:
/// 1. `op` is the row-fetch operation itself (line == row let-line).
@ -353,7 +350,7 @@ fn has_prior_subject_auth(
/// Coverage is intentionally narrow: only the row-fetch operation is
/// exempted. Any sink that runs *between* the fetch and the check
/// (e.g. `delete(community)` before `check_*`) still flags, because
/// its subject is `community` itself — not a fetch arg — and we
/// its subject is `community` itself, not a fetch arg, and we
/// require the operation to be a row-fetch site to apply the
/// exemption.
fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool {
@ -374,8 +371,8 @@ fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool
// Look for any non-login auth check whose subjects mention the row.
// Match against the *root* of the subject's chain (`a.b.c` → `a`)
// so an auth check on a row's nested field e.g.
// `is_mod_or_admin(pool, &user, comment_view.community.id)`
// so an auth check on a row's nested field, e.g.
// `is_mod_or_admin(pool, &user, comment_view.community.id)`,
// still names the row var.
unit.auth_checks.iter().any(|check| {
if matches!(
@ -425,6 +422,32 @@ fn has_prior_collection_auth(
}
fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &AnalysisUnit) -> bool {
// **Route-level guard short-circuit.**
//
// A check declared at the route boundary (Flask `@requires_role`,
// FastAPI `dependencies=[Depends(requires_access_dag(method=
// "POST", access_entity=DagAccessEntity.RUN))]`, Django
// `@permission_required`, Spring `@PreAuthorize`, Rails
// `before_action :authorize`, axum `RequireAuthorizationLayer`)
// gates the entire handler. The decorator / dependency call is
// opaque to the engine, the inner `requires_access_dag` carries
// no per-arg `ValueRef` pointing back into the handler body, so
// the per-name subject coverage walk below cannot match it. The
// structural shape, however, is unambiguous: every value the
// handler receives, every row it fetches, and every sink it
// calls runs after the route-level check has decided
// authorization.
//
// `has_prior_subject_auth` already filters out
// `LoginGuard` / `TokenExpiry` / `TokenRecipient` kinds before
// calling this helper (login alone proves identity, not
// authorization), so by the time we land here the kind is
// `Other` / `Membership` / `Ownership` / `AdminGuard`, i.e. an
// authorization-bearing decorator-level check. Returning `true`
// unconditionally for those is the correct semantics.
if check.is_route_level {
return true;
}
let subject_key = canonical_subject_name(subject);
let subject_related_base = related_subject_base(subject);
// A2 + B3: walk the row-binding chain from this subject so a
@ -447,7 +470,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy
// check authorizes the resulting row (e.g. `check_community_user_action(
// &user, &community, ..)` after `let community = Community::read(
// pool, data.community_id)`), the check materially covers
// `data.community_id` too it gated access to the row that was
// `data.community_id` too, it gated access to the row that was
// fetched using that id, so any subsequent operation re-using the
// same id (read of a related view, mutation on the row itself) is
// within the scope of that authorization.
@ -527,7 +550,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy
/// to recover every ancestor row binding name. Cycle-safe via a
/// visited set; depth-bounded at 16 hops to keep the worst case
/// trivial. Returns a vec containing `start` followed by each
/// ancestor empty when `start` is empty.
/// ancestor, empty when `start` is empty.
fn row_binding_chain(unit: &AnalysisUnit, start: &str) -> Vec<String> {
let mut chain: Vec<String> = Vec::new();
if start.is_empty() {
@ -583,7 +606,7 @@ fn is_relevant_target_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
/// it to a literal constant (`id := "id"`, `let userId = 1`, etc.).
/// Such bindings cannot be user-controlled and so must not be
/// classified as scoped-identifier subjects. Only matches plain
/// `Identifier`-kind subjects (no base/field) member chains like
/// `Identifier`-kind subjects (no base/field), member chains like
/// `req.params.id` still pass through to the regular checks.
fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
if subject.base.is_some() || subject.field.is_some() {
@ -594,22 +617,22 @@ fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
/// True iff `subject` is a plain identifier that resolves to a
/// function parameter whose static type is a payload-incompatible
/// scalar (numeric or boolean see [`super::apply_typed_bounded_params`]).
/// scalar (numeric or boolean, see [`super::apply_typed_bounded_params`]).
/// Spring `@PathVariable Long userId`, Axum `Path<i64>`, NestJS
/// `@Param('id') id: number`, and FastAPI `user_id: int` all qualify.
///
/// Phase 6: also matches member-access subjects like `dto.userId`
/// Also matches member-access subjects like `dto.userId`
/// when `dto` is a typed-extractor parameter recognised by a Phase
/// 1-2 matcher AND the field's declared TypeKind is Int/Bool.
fn is_typed_bounded_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
if subject.base.is_none() && subject.field.is_none() {
return unit.typed_bounded_vars.contains(&subject.name);
}
// Phase 6: member-access shape `base.field` whose `base` is a
// Member-access shape `base.field` whose `base` is a
// typed-extractor parameter and whose field is declared as an
// Int/Bool in the same-file DTO definition. Per Hard Rule 3,
// only fires when the base param itself was recognised by a
// Phase 1-2 matcher — bare `dto.age` without a framework gate
// typed-extractor matcher, bare `dto.age` without a framework gate
// never lifts.
let Some(base) = subject.base.as_deref() else {
return false;
@ -645,7 +668,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
// A3: `V.id`-shape subjects where `V` is bound from a login-guard /
// auth-check call (or from a typed self-actor extractor parameter)
// are the caller's own id. `V.group_id` / `V.workspace_id` stay
// relevant only self-identifier fields trip this branch, so
// relevant, only self-identifier fields trip this branch, so
// foreign scoped ids on the same actor binding still flag.
if let Some(base) = subject.base.as_deref() {
let root = base.split('.').next().unwrap_or(base);
@ -657,7 +680,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
}
// Transitive copy of `V.id`: `let uid = user.id; query(.., &[uid])`
// the subject `uid` is a plain identifier with no base/field, but
// The subject `uid` is a plain identifier with no base/field, but
// was recorded as a self-actor id copy at extract time. Treat it
// as actor context.
if unit.self_actor_id_vars.contains(&subject.name) {
@ -810,15 +833,15 @@ fn is_id_like_name(name: &str) -> bool {
}
/// True when the analysis unit shows positive evidence of receiving
/// user-controlled input the precondition for any auth rule that
/// user-controlled input, the precondition for any auth rule that
/// reasons about "scoped identifier" or "token-acceptance flow"
/// shapes.
///
/// A unit qualifies if any of the following hold:
/// * It is a recognised framework route handler (`RouteHandler`
/// * It is a recognised framework route handler (`RouteHandler`,
/// the strongest signal: registered with a router).
/// * It accesses a request-shaped value (`request.body`, `req.params`,
/// `c.Query(..)`, etc.) populated as `context_inputs`.
/// `c.Query(..)`, etc.), populated as `context_inputs`.
/// * It declares at least one parameter whose name signals an
/// externally-supplied value (id-like, token-like, request-like).
/// Internal helpers that take only typed objects
@ -826,7 +849,7 @@ fn is_id_like_name(name: &str) -> bool {
/// `items`) are excluded.
///
/// Migrations, Celery tasks, pytest fixtures, conftest hooks, and
/// pure utility helpers fail all three conditions and are skipped
/// pure utility helpers fail all three conditions and are skipped;
/// they cannot, by construction, be the entry point of an
/// authentication-bearing flow.
fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
@ -843,7 +866,7 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
/// as part of its calling contract? Captures three classes of name:
/// * id-like (`*_id`, `*Id`, `id`, `*Ids`),
/// * token-like (`token`, `*_token`, `accessToken`),
/// * framework-request objects (`request`, `req`, `ctx` the
/// * framework-request objects (`request`, `req`, `ctx`, the
/// standard names used by Express/Django/Flask/Gin/Axum/NestJS
/// handlers as the parameter that carries the HTTP request).
///
@ -851,12 +874,26 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
/// functions that, while not registered as route handlers, are
/// clearly invoked with caller-supplied identifiers or request data.
fn is_external_input_param_name(name: &str) -> bool {
// Pytest / unittest.mock convention: parameters injected by
// `@mock.patch(...)` decorators are universally named
// `mock_<thing>` (`mock_project_id`, `mock_session`,
// `mock_user_id`). Their values are MagicMock instances created
// by the test framework, not user-supplied input, even when the
// suffix carries an id-shaped tail. Refusing the entire `mock_`
// prefix is structural (mirrors pytest's documented convention)
// and closes the airflow `tests/unit/google/cloud/hooks/`
// cluster where every test method takes
// `(self, get_conn, mock_project_id)` and the suffix tripped the
// id-like heuristic.
if name.starts_with("mock_") || name.starts_with("mocked_") {
return false;
}
if is_id_like_name(name) {
return true;
}
let lower = name.to_ascii_lowercase();
// Token-shaped: bare `token` or any `*_token` / `*Token` /
// `accessToken` / `refreshToken`-style suffix. Conservative —
// `accessToken` / `refreshToken`-style suffix. Conservative:
// only fires on explicit token-naming, not on incidental
// substrings.
if lower == "token" || lower.ends_with("_token") || lower.ends_with("token") {
@ -951,7 +988,7 @@ mod tests {
assert!(is_actor_context_subject(&member("user", "uid"), &unit));
// Pitfall guard: `user.group_id` / `user.workspace_id` stay
// relevant only self-identifier fields trip the widening.
// relevant, only self-identifier fields trip the widening.
assert!(!is_actor_context_subject(
&member("user", "group_id"),
&unit
@ -962,7 +999,7 @@ mod tests {
));
// Variables not in self_actor_vars fall back to the existing
// identity-key match `target.id` still flags.
// identity-key match, `target.id` still flags.
assert!(!is_actor_context_subject(&member("target", "id"), &unit));
}
@ -1036,7 +1073,7 @@ mod tests {
assert!(!is_relevant_target_subject(&plain("id"), &unit));
// Plain `id` NOT in the const-bound set still flags as
// relevant regression guard for the user-controlled case.
// relevant, regression guard for the user-controlled case.
let unit2 = empty_unit();
assert!(is_relevant_target_subject(&plain("id"), &unit2));
@ -1046,12 +1083,12 @@ mod tests {
assert!(is_relevant_target_subject(&member("req", "id"), &unit));
}
/// Phase 5 typed-bounded subject exclusion: a parameter whose
/// Typed-bounded subject exclusion: a parameter whose
/// static type was recovered as `Int`/`Bool` (Spring `Long userId`,
/// Axum `Path<i64>`, FastAPI `user_id: int`) has its name added to
/// `unit.typed_bounded_vars` by `apply_typed_bounded_params`. The
/// subject `userId` then must not be classified as a scoped
/// identifier the framework guarantees the value is numeric and
/// identifier, the framework guarantees the value is numeric and
/// cannot drive ownership-bypass.
#[test]
fn typed_bounded_plain_subjects_are_not_relevant() {
@ -1066,7 +1103,7 @@ mod tests {
assert!(is_relevant_target_subject(&plain("user_id"), &unit2));
// Member access `req.user_id` is unaffected (only plain
// identifiers are exempted fields/base remain regular
// identifiers are exempted, fields/base remain regular
// subjects so DTO-shape leaks still flag).
unit.typed_bounded_vars.insert("req".into());
assert!(is_relevant_target_subject(&member("req", "user_id"), &unit));
@ -1080,17 +1117,17 @@ mod tests {
#[test]
fn unit_user_input_evidence_recognises_external_inputs() {
// Function with no params and no context_inputs (Celery task
// shape) must NOT count as user-input-bearing.
// shape), must NOT count as user-input-bearing.
let mut unit = empty_unit();
assert!(!unit_has_user_input_evidence(&unit));
// Adding internal-typed params (apps, schema_editor Django
// Adding internal-typed params (apps, schema_editor, Django
// migration RunPython callback shape) keeps the gate closed.
unit.params.push("apps".into());
unit.params.push("schema_editor".into());
assert!(!unit_has_user_input_evidence(&unit));
// pytest hook shape: (config, items) gate stays closed.
// pytest hook shape: (config, items), gate stays closed.
let mut unit = empty_unit();
unit.params.push("config".into());
unit.params.push("items".into());
@ -1161,14 +1198,22 @@ mod tests {
assert!(!is_external_input_param_name("manager"));
// `c` alone is too common as a local variable to count.
assert!(!is_external_input_param_name("c"));
// Pytest / unittest.mock fixture-injected mocks: `mock_<x>` /
// `mocked_<x>` names are MagicMock instances, not user input,
// even when the suffix (`mock_project_id`) is id-shaped.
assert!(!is_external_input_param_name("mock_project_id"));
assert!(!is_external_input_param_name("mock_session"));
assert!(!is_external_input_param_name("mock_user_id"));
assert!(!is_external_input_param_name("mocked_request"));
assert!(!is_external_input_param_name("mocked_token"));
}
/// Phase A4 row-fetch exemption.
/// Row-fetch exemption.
///
/// Row var declared at line 10; auth check naming the row appears
/// at line 20. An operation at line 10 (the fetch) is exempted
/// because the auth check authorises the resulting row. Coverage
/// is intentionally narrow — operations between fetch (10) and
/// is intentionally narrow, operations between fetch (10) and
/// check (20) that are NOT row-fetch sites must still flag.
#[test]
fn row_fetch_exemption_covers_fetch_when_check_names_row() {
@ -1192,6 +1237,7 @@ mod tests {
line: 20,
args: Vec::new(),
condition_text: None,
is_route_level: false,
});
let fetch_op = SensitiveOperation {
@ -1206,7 +1252,7 @@ mod tests {
assert!(has_row_fetch_exemption(&unit, &fetch_op));
// Operation at a different line (between fetch and check) is
// NOT a row-fetch site exemption does not apply.
// NOT a row-fetch site, exemption does not apply.
let mid_op = SensitiveOperation {
kind: OperationKind::Mutation,
sink_class: None,
@ -1229,7 +1275,7 @@ mod tests {
"community".to_string(),
(10, vec![member("data", "community_id")]),
);
// No auth check pushed exemption must NOT apply.
// No auth check pushed, exemption must NOT apply.
let fetch_op = SensitiveOperation {
kind: OperationKind::Read,
@ -1256,7 +1302,7 @@ mod tests {
(10, vec![member("data", "community_id")]),
);
// Login-only check on the row should NOT exempt the row-fetch
// login proves identity, not authorization.
// because login proves identity, not authorization.
unit.auth_checks.push(AuthCheck {
kind: AuthCheckKind::LoginGuard,
callee: "require_login".into(),
@ -1265,6 +1311,7 @@ mod tests {
line: 20,
args: Vec::new(),
condition_text: None,
is_route_level: false,
});
let fetch_op = SensitiveOperation {
@ -1305,10 +1352,11 @@ mod tests {
line: 20,
args: Vec::new(),
condition_text: None,
is_route_level: false,
};
// Direct member subject `data.community_id` (the original
// request field) covered via reverse-walk.
// request field), covered via reverse-walk.
assert!(auth_check_covers_subject(
&check,
&member("data", "community_id"),
@ -1334,7 +1382,7 @@ mod tests {
/// Subject as plain identifier copied from the request
/// (`let community_id = data.community_id; let community =
/// Community::read(pool, community_id);`) must also benefit from
/// the reverse-walk `row_population_data["community"]` then
/// the reverse-walk, `row_population_data["community"]` then
/// records `[community_id]` (a plain identifier, not the
/// member-access shape).
#[test]
@ -1352,6 +1400,7 @@ mod tests {
line: 20,
args: Vec::new(),
condition_text: None,
is_route_level: false,
};
assert!(auth_check_covers_subject(
@ -1392,9 +1441,10 @@ mod tests {
line: 20,
args: Vec::new(),
condition_text: None,
is_route_level: false,
};
// Sink subject is the bare alias covered via the chain.
// Sink subject is the bare alias, covered via the chain.
assert!(auth_check_covers_subject(
&check,
&plain("community_id"),
@ -1412,4 +1462,73 @@ mod tests {
// Plain identifier with no alias entry must NOT be covered.
assert!(!auth_check_covers_subject(&check, &plain("post_id"), &unit));
}
/// Route-level guard short-circuit (FastAPI / Flask / Django / Spring /
/// Rails / axum decorator-level auth).
///
/// A decorator-level `@requires_role`,
/// `dependencies=[Depends(requires_access_dag(...))]`, or
/// `before_action :authorize` runs before the handler body and
/// authorizes every value the handler receives. Such a check has no
/// per-arg `ValueRef` pointing back into the body, so the per-name
/// subject coverage walk cannot model it. `auth_check_covers_subject`
/// therefore returns `true` for any authorization-bearing route-level
/// check (LoginGuard etc. are already filtered out by
/// `has_prior_subject_auth`).
#[test]
fn auth_check_covers_subject_route_level_short_circuits() {
    use crate::auth_analysis::model::{AuthCheck, AuthCheckKind};

    // Both checks under test are the same subject-less
    // `requires_access_dag` call; only the route-level flag differs.
    let make_check = |is_route_level: bool| AuthCheck {
        kind: AuthCheckKind::Other,
        callee: "requires_access_dag".into(),
        subjects: Vec::new(), // route-level checks carry no body subjects
        span: (0, 0),
        line: 0,
        args: Vec::new(),
        condition_text: None,
        is_route_level,
    };
    let unit = empty_unit();

    // Route-level: a path param, a request body field, and a row-fetch
    // receiver are all covered, although the per-name walk would have
    // rejected each of them.
    let route_check = make_check(true);
    let candidates = [plain("dag_id"), member("req", "dag_run_id"), plain("dag")];
    for subject in &candidates {
        assert!(auth_check_covers_subject(&route_check, subject, &unit));
    }

    // Sanity check: the same check without the route-level flag (the
    // prior shape) does NOT cover arbitrary subjects, because the empty
    // subjects vec makes the `check.subjects.iter().any(...)` walk fail
    // for every candidate.
    let in_body_check = make_check(false);
    assert!(!auth_check_covers_subject(
        &in_body_check,
        &plain("dag_id"),
        &unit
    ));
}
}

View file

@ -173,7 +173,7 @@ impl AuthAnalysisRules {
/// Does the LAST segment of the callee match a configured non-sink
/// method name (case-sensitive exact)? Used to recognise DOM-API
/// methods like `addEventListener` / `appendChild` regardless of
/// receiver `someElement.addEventListener` is just as
/// receiver, `someElement.addEventListener` is just as
/// categorically client-side as `document.addEventListener`.
pub fn callee_has_non_sink_method(&self, callee: &str) -> bool {
let last = bare_method_name(callee);
@ -200,19 +200,19 @@ impl AuthAnalysisRules {
/// Classify a call into a [`SinkClass`].
///
/// Dispatch order (first match wins):
/// 1. `InMemoryLocal` receiver is a known non-sink collection
/// 1. `InMemoryLocal`, receiver is a known non-sink collection
/// (tracked in `non_sink_vars` or matches a configured
/// non-sink prefix).
/// 2. `RealtimePublish` receiver first-segment matches a
/// 2. `RealtimePublish`, receiver first-segment matches a
/// configured realtime prefix (e.g. `realtime`, `pubsub`).
/// 3. `OutboundNetwork` receiver first-segment matches a
/// 3. `OutboundNetwork`, receiver first-segment matches a
/// configured outbound-network prefix (e.g. `http`, `reqwest`).
/// 4. `CacheCrossTenant` receiver first-segment matches a
/// 4. `CacheCrossTenant`, receiver first-segment matches a
/// configured cache prefix (e.g. `cache`, `redis`).
/// 5. `DbMutation` callee name matches `mutation_indicator_names`.
/// 6. `DbCrossTenantRead` callee name matches `read_indicator_names`.
/// 5. `DbMutation`, callee name matches `mutation_indicator_names`.
/// 6. `DbCrossTenantRead`, callee name matches `read_indicator_names`.
///
/// Returns `None` when the callee matches none of the above the
/// Returns `None` when the callee matches none of the above, the
/// call site is ignored by ownership-gap checks.
pub fn classify_sink_class(
&self,
@ -227,8 +227,8 @@ impl AuthAnalysisRules {
// (`el.addEventListener`, `parent.appendChild`) are categorically
// not data-layer auth-relevant operations. These shapes would
// otherwise prefix-match read/mutation indicators (`get`, `add`,
// `remove`) `getElementById` canonicalises to `getelementbyid`
// which `starts_with("get")` and falsely classify as
// `remove`), `getElementById` canonicalises to `getelementbyid`
// which `starts_with("get")`, and falsely classify as
// `DbCrossTenantRead` / `DbMutation`.
if self.callee_has_non_sink_global_receiver(callee)
|| self.callee_has_non_sink_method(callee)
@ -251,7 +251,7 @@ impl AuthAnalysisRules {
// receiver. When the receiver chain itself contains a call
// expression (`w.Header().Get(..)`, `r.URL.Query().Get(..)`,
// `db.Tx(..).Query(..)`), the receiver is the *return value of
// another call* its type is opaque to the auth analyser and
// another call*, its type is opaque to the auth analyser and
// the bare verb match is too speculative to assume a data-layer
// sink. The realtime/outbound/cache prefix dispatches above
// already match by the chain root; if none of them claimed the
@ -501,6 +501,13 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
"user_passes_test".into(),
"verify_access".into(),
"authorize".into(),
// FastAPI dependency-injection auth idiom: airflow uses
// `Depends(requires_access_dag(method="GET"))`,
// `requires_access_connection(...)`, etc. The unwrapped
// inner call name is `requires_access_<resource>`; the
// `requires_access` prefix matches all variants via
// `matches_name`.
"requires_access".into(),
],
mutation_indicator_names: vec![
"update".into(),
@ -615,7 +622,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
"verify_access!".into(),
"can_access?".into(),
"can?".into(),
// Rails per-record permission predicates the canonical
// Rails per-record permission predicates, the canonical
// "load by id, then check on the loaded record" idiom
// (see redmine `app/controllers/issues_controller.rb`,
// mastodon controllers, diaspora ApplicationController).
@ -961,7 +968,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
"can_access".into(),
"can_manage".into(),
// Common project-specific helpers seen in real Axum/Rocket
// codebases kept as defaults so user code that names
// codebases, kept as defaults so user code that names
// its membership helper after the resource still gets
// recognised. Users can extend via `nyx.toml`.
"require_group_member".into(),
@ -1045,7 +1052,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
"FxHashSet".into(),
"DashMap".into(),
"DashSet".into(),
// `serde_json::Map` (last-segment `Map`) common JSON
// `serde_json::Map` (last-segment `Map`), common JSON
// body builder where `m.insert("k", v)` is a string-key
// assignment on an in-memory object, not a DB write.
"Map".into(),
@ -1161,7 +1168,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
],
non_sink_receiver_types: Vec::new(),
non_sink_receiver_name_prefixes: Vec::new(),
// Browser/DOM globals calls on these receivers are
// Browser/DOM globals, calls on these receivers are
// categorically client-side (no server-side authorization
// semantics). Without this list, `document.getElementById`
// would prefix-match the read-indicator `get`,
@ -1196,7 +1203,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
"WeakMap".into(),
"WeakSet".into(),
],
// DOM-API methods when the LAST segment of the callee
// DOM-API methods, when the LAST segment of the callee
// matches, the call is non-data-layer regardless of receiver
// (`el.addEventListener`, `parent.appendChild`). These
// methods would otherwise prefix-match `add`, `remove`,
@ -1345,7 +1352,7 @@ pub fn first_receiver_segment(callee: &str) -> &str {
callee.split('.').next().unwrap_or(callee)
}
/// True when the callee's receiver chain contains a call expression
/// True when the callee's receiver chain contains a call expression,
/// i.e. the LAST segment is being invoked on the *return value* of an
/// earlier call (`w.Header().Get`, `r.URL.Query().Get`,
/// `db.Tx(opts).Query`). Detected as: the substring before the last
@ -1366,7 +1373,7 @@ pub fn receiver_is_chained_call(callee: &str) -> bool {
/// (`member`, `owner`, `admin`, `access`, `permission`, `manager`,
/// `editor`, `viewer`, `user`, `mod`). The resource segment is
/// project-specific (`trip`, `doc`, `project`, `community`, …) and
/// cannot be enumerated in the static defaults but the
/// cannot be enumerated in the static defaults, but the
/// prefix+role pattern is unambiguous enough that recognising it as
/// an authorization check is safe. Also accepts `is_<role>` /
/// `is_<role>_(or|and)_<role>...` predicate forms (`is_admin`,
@ -1398,7 +1405,7 @@ fn is_require_resource_role_call(name: &str) -> bool {
}
// Pattern 2: `is_<role>` and `is_<role>_(or|and)_<role>...`.
// Conservative role list excludes `user` / `staff` to avoid
// Conservative role list, excludes `user` / `staff` to avoid
// matching ambiguous predicates like `is_user`.
if let Some(rest) = lower.strip_prefix("is_")
&& !rest.is_empty()
@ -1682,7 +1689,7 @@ mod tests {
assert!(receiver_is_chained_call("r.URL.Query().Get"));
assert!(receiver_is_chained_call("db.Tx(opts).Query"));
assert!(receiver_is_chained_call("client.WithToken(t).Get"));
// Pure field/identifier chain no `(` anywhere.
// Pure field/identifier chain, no `(` anywhere.
assert!(!receiver_is_chained_call("repo.Find"));
assert!(!receiver_is_chained_call("c.Fs.Create"));
assert!(!receiver_is_chained_call("globalBatchJobsMetrics.save"));
@ -1701,7 +1708,7 @@ mod tests {
let empty: HashSet<String> = HashSet::new();
// Chained-call receiver: verb-name fallback is suppressed.
// The minio `w.Header().Get(constName)` cluster `Get` would
// The minio `w.Header().Get(constName)` cluster, `Get` would
// match the `Get` read indicator on a bare receiver but the
// chained-call shape masks the receiver type.
assert_eq!(rules.classify_sink_class("w.Header().Get", &empty), None);
@ -1742,7 +1749,7 @@ mod tests {
let rules = build_auth_rules(&cfg, "javascript");
let empty: HashSet<String> = HashSet::new();
// Globals receiver-first-segment match.
// Globals, receiver-first-segment match.
assert_eq!(
rules.classify_sink_class("document.getElementById", &empty),
Some(SinkClass::InMemoryLocal)
@ -1760,7 +1767,7 @@ mod tests {
Some(SinkClass::InMemoryLocal)
);
// Method allowlist last-segment match regardless of receiver.
// Method allowlist, last-segment match regardless of receiver.
assert_eq!(
rules.classify_sink_class("input.addEventListener", &empty),
Some(SinkClass::InMemoryLocal)
@ -1801,22 +1808,22 @@ mod tests {
assert!(rules.is_authorization_check("authz::require_trip_member"));
assert!(rules.is_authorization_check("self.require_album_editor"));
// Negatives random `require_*` calls without a known role
// Negatives, random `require_*` calls without a known role
// suffix do NOT count as authorization.
assert!(!rules.is_authorization_check("require_db"));
assert!(!rules.is_authorization_check("require_user"));
assert!(!rules.is_authorization_check("require_login"));
// Bare `require_member` / `require_owner` (no resource segment)
// aren't enough the resource segment is what makes the helper
// aren't enough, the resource segment is what makes the helper
// unambiguous.
assert!(!rules.is_authorization_check("require_member"));
assert!(!rules.is_authorization_check("require_owner"));
}
/// Phase A4 — broader verb / role / context-suffix shapes seen in
/// real-world Rust apps. `check_<resource>_<role>_action` is the
/// canonical lemmy idiom; verifying the `is_<role>` predicate
/// recogniser closes `is_mod_or_admin` style checks.
/// Broader verb / role / context-suffix shapes seen in real-world
/// Rust apps. `check_<resource>_<role>_action` is the canonical
/// lemmy idiom; the `is_<role>` predicate recogniser closes
/// `is_mod_or_admin` style checks.
#[test]
fn is_authorization_check_recognises_check_action_and_predicate_shapes() {
let cfg = Config::default();
@ -1847,7 +1854,7 @@ mod tests {
assert!(rules.is_authorization_check("is_admin_or_moderator"));
assert!(rules.is_authorization_check("is_member_and_owner"));
// Negatives predicates whose tokens are NOT known auth roles.
// Negatives, predicates whose tokens are NOT known auth roles.
assert!(!rules.is_authorization_check("is_user"));
assert!(!rules.is_authorization_check("is_logged_in"));
assert!(!rules.is_authorization_check("is_active"));

View file

@ -384,8 +384,8 @@ fn classify_rocket_param(
///
/// **Looser than [`super::common::is_self_actor_type_text`] by
/// design.** This recogniser runs only on the type of a route-bound
/// parameter appearing in a route handler signature is itself a
/// strong signal and a false positive here just over-credits the
/// parameter, appearing in a route handler signature is itself a
/// strong signal, and a false positive here just over-credits the
/// route with a login guard, which is conservative w.r.t. flagging.
/// `is_self_actor_type_text` runs on every parameter, including in
/// non-route functions, and a false positive there suppresses
@ -625,6 +625,11 @@ pub(crate) fn inject_guard_checks(
line,
args: call.args.clone(),
condition_text: None,
// Route-level guard injected from a tower / axum layer
// (`RequireAuthorizationLayer`, `axum_login::login_required!`,
// …). Tells `auth_check_covers_subject` to short-circuit
// for any non-login-guard match.
is_route_level: true,
});
}
}

File diff suppressed because it is too large Load diff

View file

@ -209,7 +209,12 @@ fn collect_class_based_routes(
}
let line = method_node.start_position().row + 1;
for call in &middleware_calls {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Django class-based-view decorators (`@method_decorator(login_required)`,
// `@permission_required(...)`) and DRF `permission_classes`
// are declared at the route boundary; mark route-level
// so coverage applies to the action body's operations.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}
@ -443,7 +448,14 @@ fn inject_middleware_auth(
return;
};
for call in middleware_calls {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Django decorators (`@login_required`, `@permission_required`,
// `@user_passes_test`, etc.) and DRF `permission_classes` are
// declared at the route boundary; mark route-level so
// `auth_check_covers_subject` short-circuits `true` for any
// non-login-guard match. See flask.rs / model.rs for the
// full rationale.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}

View file

@ -67,6 +67,15 @@ fn maybe_collect_flask_route(
for decorator in decorator_expressions(node) {
if let Some(mut specs) = parse_flask_route_decorator(decorator, bytes) {
route_specs.append(&mut specs);
// FastAPI puts route-level dependencies (auth checks +
// logging hooks) inside the route decorator's
// `dependencies=[Depends(...)]` keyword argument, instead
// of as separate `@decorator` lines like Flask. Walk the
// route decorator's keyword args for that shape and lift
// each `Depends(call(...))` element into the
// middleware_calls list, so the same `inject_middleware_auth`
// path that Flask uses also picks up FastAPI auth deps.
middleware_calls.extend(extract_fastapi_dependencies(decorator, bytes));
} else {
middleware_calls.extend(expand_decorator_calls(decorator, bytes));
}
@ -220,6 +229,75 @@ fn expand_decorator_calls(node: Node<'_>, bytes: &[u8]) -> Vec<CallSite> {
vec![call_site_from_node(node, bytes)]
}
/// Collect the auth-relevant callables from a FastAPI route decorator's
/// `dependencies=[Depends(call(...)), Depends(call), ...]` keyword
/// argument.
///
/// Every list element that unwraps as a `Depends(...)` marker yields
/// the inner callable as a `CallSite`, so it can flow through
/// `inject_middleware_auth` and be matched against the per-language
/// authorization-check / login-guard name lists. Elements that are not
/// calls, or `Depends(...)` wrappers without a recognised inner shape,
/// are dropped.
///
/// Returns an empty vec when `decorator_expr` is not a call, has no
/// `arguments` field, or has no `dependencies` keyword argument.
///
/// The lookup is purely structural and independent of Flask semantics
/// (Flask routes never use `dependencies=`). It lives in the flask
/// module because the Flask route-decorator parser already targets the
/// `@<router>.<method>(<path>, ...)` shape that FastAPI shares.
fn extract_fastapi_dependencies(decorator_expr: Node<'_>, bytes: &[u8]) -> Vec<CallSite> {
    if decorator_expr.kind() != "call" {
        return Vec::new();
    }
    decorator_expr
        .child_by_field_name("arguments")
        .and_then(|arguments| keyword_argument_value(arguments, bytes, "dependencies"))
        .map(|dependency_list| {
            named_children(dependency_list)
                .into_iter()
                .filter_map(|element| unwrap_depends_call(element, bytes))
                .collect::<Vec<CallSite>>()
        })
        .unwrap_or_default()
}
/// Turn a single element of a FastAPI `dependencies` list into the
/// `CallSite` of the callable it wraps.
///
/// Returns `None` unless `node` is a call whose callee is the
/// `Depends` marker (see `is_depends_callee`). For a `Depends(...)`
/// call, only the first named argument is inspected:
/// * `Depends(callee(arg1, arg2))`: the usual factory invocation; the
///   inner call is recorded.
/// * `Depends(callee)`: a bare identifier / attribute reference; the
///   reference itself is recorded.
/// * `Depends()` or any other first-argument shape: skipped.
fn unwrap_depends_call(node: Node<'_>, bytes: &[u8]) -> Option<CallSite> {
    if node.kind() != "call" {
        return None;
    }
    let wrapper = node.child_by_field_name("function")?;
    let wrapper_name = text(wrapper, bytes);
    if !is_depends_callee(&wrapper_name) {
        return None;
    }
    let inner = node
        .child_by_field_name("arguments")
        .and_then(|arguments| named_children(arguments).into_iter().next())?;
    // Call and bare-reference shapes are recorded the same way.
    matches!(
        inner.kind(),
        "call" | "identifier" | "attribute" | "scoped_identifier"
    )
    .then(|| call_site_from_node(inner, bytes))
}
/// Reports whether `callee` spells the FastAPI `Depends` marker.
///
/// Accepts the bare name and the two fully-qualified spellings
/// (`fastapi.Depends`, `fastapi.params.Depends`). Surrounding
/// whitespace is ignored; beyond that the comparison is an exact,
/// case-sensitive literal match with no canonicalisation.
fn is_depends_callee(callee: &str) -> bool {
    const DEPENDS_SPELLINGS: [&str; 3] = ["Depends", "fastapi.Depends", "fastapi.params.Depends"];
    DEPENDS_SPELLINGS.contains(&callee.trim())
}
fn inject_middleware_auth(
model: &mut AuthorizationModel,
unit_idx: usize,
@ -231,8 +309,48 @@ fn inject_middleware_auth(
return;
};
for call in middleware_calls {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Mark as route-level: the check is declared at the route
// boundary (Flask `@requires_role(...)` decorator, FastAPI
// `dependencies=[Depends(...)]`, or any custom-router
// equivalent) and semantically authorizes every value the
// handler receives, path param, body, query, downstream
// row fetches, the lot. `auth_check_covers_subject` reads
// `is_route_level` and short-circuits `true` for any
// non-login-guard match, which is the correct shape for a
// decorator-level guard whose inner call carries no
// per-arg subject ref pointing back into the handler body.
// LoginGuard / TokenExpiry / TokenRecipient kinds are
// already excluded by `has_prior_subject_auth`'s filter
// before they reach `auth_check_covers_subject`, so the
// flag is safe to set unconditionally here, it has no
// effect on those kinds.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}
}
#[cfg(test)]
mod fastapi_dependencies_tests {
    use super::is_depends_callee;

    /// `is_depends_callee` must match the FastAPI `Depends` marker and
    /// nothing else. Other wrapper calls inside `dependencies=[...]`
    /// are ignored: pulling an inner callee out of the wrong wrapper
    /// would misclassify logging hooks or filter callables as auth
    /// checks.
    #[test]
    fn is_depends_callee_recognises_canonical_forms() {
        // Canonical spellings, plus one padded form for whitespace
        // tolerance.
        let accepted = [
            "Depends",
            "fastapi.Depends",
            "fastapi.params.Depends",
            "  Depends  ",
        ];
        for form in accepted {
            assert!(is_depends_callee(form));
        }

        // Negatives.
        let rejected = [
            "Annotated",
            "Body",
            "Depends.something",
            "RequiresAuth",
            "",
        ];
        for form in rejected {
            assert!(!is_depends_callee(form));
        }
    }
}

View file

@ -61,5 +61,104 @@ pub fn extract_authorization_model(
}
}
// **Dedup units by span across extractors.** Multiple extractors
// (e.g. Flask + Django on a Python file) each call
// `collect_top_level_units`, producing one unit per top-level
// function. When one extractor also recognises a route on that
// function and promotes its copy to `RouteHandler` (with injected
// middleware auth checks), the *other* extractor's untouched
// `Function` copy still runs through `check_ownership_gaps` and
// emits the FP from a unit that never saw the middleware-derived
// auth check.
//
// This step keeps a single canonical unit per source span,
// preferring `RouteHandler` over `Function`, merging auth_checks
// and folding operation lists conservatively. Route registrations
// are remapped to the surviving unit index.
deduplicate_units_by_span(&mut model);
model
}
/// Collapse analysis units that share a source span into a single
/// canonical unit per span.
///
/// * The survivor for a span is the first-seen `RouteHandler` unit, or
///   the first-seen unit of any kind when no `RouteHandler` exists.
/// * Auth checks on dropped duplicates are appended to the survivor,
///   skipping any whose `(span, callee)` pair it already carries.
/// * Operations are not merged; the survivor's own list is kept as-is
///   (the extractors are expected to derive the same operations from
///   the same AST).
/// * Surviving units keep their relative order.
/// * Every route's `unit_idx` is rewritten to the survivor's new
///   index, including routes that pointed at a dropped duplicate, so
///   no route is left with a stale index into the shortened `units`
///   vec. A `unit_idx` that was already out of range is left alone.
fn deduplicate_units_by_span(model: &mut AuthorizationModel) {
    use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind, AuthCheck};
    use std::collections::HashMap;

    // Pick the winner for each span: first-seen unit, upgraded to the
    // first-seen `RouteHandler` if the current winner is not one.
    let mut winner_by_span: HashMap<(usize, usize), usize> = HashMap::new();
    for (idx, unit) in model.units.iter().enumerate() {
        let winner = winner_by_span.entry(unit.span).or_insert(idx);
        if unit.kind == AnalysisUnitKind::RouteHandler
            && model.units[*winner].kind != AnalysisUnitKind::RouteHandler
        {
            *winner = idx;
        }
    }

    // Resolve old index -> winning old index once, while the units are
    // still in place.
    let winner_of: Vec<usize> = model
        .units
        .iter()
        .enumerate()
        .map(|(idx, unit)| winner_by_span.get(&unit.span).copied().unwrap_or(idx))
        .collect();

    // Move the winners into the new layout (no cloning) and set aside
    // the losers' auth checks. A winner may sit *after* its losers, so
    // merging is deferred until every winner has its final slot.
    let old_units = std::mem::take(&mut model.units);
    let mut new_idx_of_winner: HashMap<usize, usize> =
        HashMap::with_capacity(winner_by_span.len());
    let mut surviving: Vec<AnalysisUnit> = Vec::with_capacity(winner_by_span.len());
    let mut loser_checks: Vec<(usize, Vec<AuthCheck>)> = Vec::new();
    for (old_idx, mut unit) in old_units.into_iter().enumerate() {
        let winner = winner_of[old_idx];
        if winner == old_idx {
            new_idx_of_winner.insert(old_idx, surviving.len());
            surviving.push(unit);
        } else {
            loser_checks.push((winner, std::mem::take(&mut unit.auth_checks)));
        }
    }

    // Fold loser auth checks into their winners, deduping by
    // (span, callee). Losers are visited in original index order.
    for (winner, checks) in loser_checks {
        let Some(&slot) = new_idx_of_winner.get(&winner) else {
            continue;
        };
        let merged = &mut surviving[slot].auth_checks;
        for check in checks {
            let already_present = merged
                .iter()
                .any(|existing| existing.span == check.span && existing.callee == check.callee);
            if !already_present {
                merged.push(check);
            }
        }
    }

    model.units = surviving;

    // Remap routes through the winner of whatever unit they pointed
    // at. Going via `winner_of` is what keeps a route registered on a
    // dropped duplicate attached to the surviving unit.
    for route in &mut model.routes {
        let remapped = winner_of
            .get(route.unit_idx)
            .and_then(|winner| new_idx_of_winner.get(winner));
        if let Some(&new_idx) = remapped {
            route.unit_idx = new_idx;
        }
    }
}

View file

@ -137,7 +137,14 @@ fn maybe_collect_controller(
let line = child.start_position().row + 1;
let middleware_calls = applicable_filters(&filter_directives, &action_name);
for call in &middleware_calls {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Rails `before_action :authorize_user`-style filter
// callbacks run before the action and authorize the
// entire request, same shape as FastAPI / Flask
// `dependencies=[Depends(...)]`. Mark route-level so
// `auth_check_covers_subject` covers the row-fetches
// and downstream sinks the action body performs.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}

View file

@ -114,7 +114,13 @@ fn maybe_collect_route(
);
let line = block.start_position().row + 1;
for call in before_filters {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Sinatra `before` filters run before the route handler
// body and authorize the request as a whole, same shape
// as Rails `before_action`. Route-level so coverage
// applies to the handler's row fetches and downstream
// sinks.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}

View file

@ -111,7 +111,15 @@ fn maybe_collect_controller(
rules,
);
for call in &middleware_calls {
if let Some(check) = auth_check_from_call_site(call, line, rules) {
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
// Spring `@PreAuthorize` / `@Secured` /
// `@RolesAllowed` annotations are declared at the
// method or class boundary and authorize the entire
// request, same shape as FastAPI / Flask
// `dependencies=[Depends(...)]`. Mark route-level
// so `auth_check_covers_subject` covers row fetches
// and downstream sinks in the handler body.
check.is_route_level = true;
unit.auth_checks.push(check);
}
}

View file

@ -1,3 +1,5 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/auth_analysis.md"))]
pub mod checks;
pub mod config;
pub mod extract;
@ -26,7 +28,7 @@ fn byte_offset_to_point(tree: &Tree, byte: usize) -> tree_sitter::Point {
/// source-level variable name. Built at `run_auth_analysis` call sites
/// by merging type facts across all bodies in the file; a variable name
/// with conflicting types in different bodies is dropped (absence is
/// safe the sink gate just falls back to name-based classification).
/// safe, the sink gate just falls back to name-based classification).
pub type VarTypes = HashMap<String, TypeKind>;
#[allow(clippy::too_many_arguments)]
@ -87,7 +89,7 @@ pub fn run_auth_analysis(
/// Used by pass 1 to persist per-file auth summaries for cross-file
/// helper lifting. Only returns summaries for units whose body
/// already proves at least one positional parameter under ownership /
/// membership / admin / authorization check i.e. the exact
/// membership / admin / authorization check, i.e. the exact
/// single-file lift set, so the cross-file variant does not widen what
/// counts as a helper.
pub fn extract_auth_summaries_by_key(
@ -198,7 +200,7 @@ fn build_unit_summary(unit: &model::AnalysisUnit) -> Option<model::AuthCheckSumm
/// Walk every `SensitiveOperation` in the model and, when the call's
/// receiver root variable has a known SSA type, override `sink_class`
/// to the type-implied class. Strictly additive only overrides
/// to the type-implied class. Strictly additive, only overrides
/// when the type map produces a definite class, otherwise leaves the
/// name/prefix-derived classification intact.
fn apply_var_types_to_model(
@ -229,11 +231,11 @@ fn apply_var_types_to_model(
/// reassignment from user input (`let id = req.params.id`) never gets
/// suppressed by accident.
///
/// Phase 6: when a parameter's type is a [`TypeKind::Dto`], lift each
/// When a parameter's type is a [`TypeKind::Dto`], lift each
/// of its `Int`/`Bool` fields as `typed_bounded_dto_fields[<param>]`
/// so member-access subjects like `dto.age` are recognised as
/// payload-incompatible. Only fires when the base param itself was
/// recognised as a typed extractor by a Phase 1-2 matcher — bare
/// recognised as a typed extractor by a typed-extractor matcher, bare
/// parameters with no framework gate never lift their fields.
fn apply_typed_bounded_params(model: &mut model::AuthorizationModel, var_types: &VarTypes) {
for unit in &mut model.units {
@ -310,7 +312,7 @@ fn sink_class_for_type(
///
/// When `global_summaries` is `Some`, cross-file helpers are looked up
/// via [`GlobalSummaries::get_auth`] after the same-file summary
/// gather this recovers the handler-in-file-A calling
/// gather, this recovers the handler-in-file-A calling
/// `require_owner`-in-file-B case that single-file lifting cannot see.
fn apply_helper_lifting(
model: &mut model::AuthorizationModel,
@ -408,7 +410,7 @@ fn build_helper_summaries(
let mut summary = AuthCheckSummary::default();
for check in &unit.auth_checks {
// We only lift checks that actively prove ownership /
// membership / admin-rights / authorize-helper login
// membership / admin-rights / authorize-helper, login
// and token-validity checks don't justify foreign-id
// mutations and we want to keep parity with
// `has_prior_subject_auth`'s filter.
@ -435,7 +437,7 @@ fn build_helper_summaries(
}
}
if !summary.param_auth_kinds.is_empty() {
// Deduplicate by last segment of the function name the
// Deduplicate by last segment of the function name, the
// lifting site matches the call's last segment too.
let last = name.rsplit('.').next().unwrap_or(name).to_string();
summaries
@ -492,7 +494,7 @@ fn stronger_check_kind(a: model::AuthCheckKind, b: model::AuthCheckKind) -> mode
/// For one unit, synthesise an `AuthCheck` at every call site that
/// targets a helper with a non-trivial summary. Subjects are taken
/// from `call_site.args_value_refs[K]` for each auth-checked param
/// position K these are the caller's concrete subjects passed at
/// position K, these are the caller's concrete subjects passed at
/// that arg slot, exactly what `auth_check_covers_subject` needs.
fn synthesise_checks_for_unit(
unit: &model::AnalysisUnit,
@ -501,7 +503,7 @@ fn synthesise_checks_for_unit(
let line_of = |span: (usize, usize)| -> usize {
// Span is byte offsets; we don't have direct access to a Tree
// here. Caller assigns line via `line` field on call_site
// through CallSite metadata absence fall back to the unit's
// through CallSite metadata absence, fall back to the unit's
// line since covers_subject uses `check.line <= op.line` and
// helper calls are typically near the unit start.
let _ = span;
@ -541,6 +543,7 @@ fn synthesise_checks_for_unit(
line,
args: call.args.clone(),
condition_text: None,
is_route_level: false,
});
}
out
@ -563,7 +566,7 @@ fn call_site_line(unit: &model::AnalysisUnit, call: &model::CallSite) -> Option<
None
}
/// Cross-file variant of [`synthesise_checks_for_unit`] for each
/// Cross-file variant of [`synthesise_checks_for_unit`], for each
/// call site in `unit`, resolve the callee against `GlobalSummaries`
/// and look up an `AuthCheckSummary` that was persisted by some other
/// file's pass-1 extraction. Skips call sites already handled by the
@ -589,7 +592,7 @@ fn synthesise_cross_file_checks_for_unit(
if unit.name.as_deref() == Some(last) {
continue;
}
// Skip if the single-file map already handled this callee
// Skip if the single-file map already handled this callee;
// that path has richer same-file context (existing
// summaries from sibling units in this model) and its
// synthesised check is strictly more precise.
@ -636,6 +639,7 @@ fn synthesise_cross_file_checks_for_unit(
line,
args: call.args.clone(),
condition_text: None,
is_route_level: false,
});
}
out
@ -767,7 +771,7 @@ mod tests {
Some(SinkClass::DbCrossTenantRead)
);
// DatabaseConnection: unrecognized verb (`execute`) → DbMutation
// (conservative default treat as write-shaped).
// (conservative default, treat as write-shaped).
assert_eq!(
sink_class_for_type(&TypeKind::DatabaseConnection, "conn.execute", &rules),
Some(SinkClass::DbMutation)
@ -819,7 +823,7 @@ mod tests {
)));
let var_types: VarTypes = HashMap::new();
apply_var_types_to_model(&mut model, &rules, &var_types);
// Unchanged no entry in var_types for `db`.
// Unchanged, no entry in var_types for `db`.
assert_eq!(
model.units[0].operations[0].sink_class,
Some(SinkClass::DbMutation)

View file

@ -55,7 +55,7 @@ pub enum OperationKind {
}
/// Classification of a sensitive operation by the resource it targets.
/// `check_ownership_gaps` only fires on the first five classes
/// `check_ownership_gaps` only fires on the first five classes;
/// `InMemoryLocal` is never authorization-relevant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SinkClass {
@ -76,7 +76,7 @@ pub enum SinkClass {
/// (Redis / memcache / distributed cache client).
CacheCrossTenant,
/// A method call against a local, in-memory collection (HashMap,
/// HashSet, Vec, …) never authorization-relevant.
/// HashSet, Vec, …), never authorization-relevant.
InMemoryLocal,
}
@ -133,6 +133,33 @@ pub struct AuthCheck {
pub line: usize,
pub args: Vec<String>,
pub condition_text: Option<String>,
/// True when the check was declared at the route boundary
/// (decorator / middleware / dependency-injection list) rather
/// than as a per-call check inside the handler body.
///
/// Route-level non-login-guard checks authorize the *entire*
/// handler, they gate every value the handler receives, every
/// row the handler fetches, and every operation downstream. An
/// in-body `auth_check_covers_subject` walk that requires a
/// per-name subject match cannot model that semantics: a
/// FastAPI `dependencies=[Depends(requires_access_dag(method=
/// "POST", access_entity=DagAccessEntity.RUN))]` is opaque to
/// the engine: the inner `requires_access_dag` call carries no
/// per-arg subject ref pointing to `dag_id` or `dag.id`. The
/// flag tells `auth_check_covers_subject` to short-circuit
/// `true` for any non-login-guard route-level check, leaving
/// only the LoginGuard / TokenExpiry / TokenRecipient kinds
/// (already excluded upstream by `has_prior_subject_auth`'s
/// filter) to be ignored.
///
/// Set by `inject_middleware_auth` (Django, Flask, FastAPI) at
/// the route-decorator entry point. Default `false` for
/// in-body checks (`require_membership(user, group_id)`,
/// `is_admin(user)`, etc.); those still flow through the
/// per-subject coverage logic so a check on
/// `community.creator_id` doesn't blanket-suppress every other
/// subject in the unit.
pub is_route_level: bool,
}
#[derive(Debug, Clone)]
@ -140,7 +167,7 @@ pub struct SensitiveOperation {
pub kind: OperationKind,
/// Sink classification. `None` means the operation was recorded
/// for taxonomy completeness but does not match any known resource
/// class defensive, and currently unused.
/// class, defensive, and currently unused.
pub sink_class: Option<SinkClass>,
pub callee: String,
pub subjects: Vec<ValueRef>,
@ -183,7 +210,7 @@ pub struct AnalysisUnit {
/// "fetch-then-authorize" exemption in `checks.rs`: if a row-fetch
/// operation produces variable `V` and SOME auth check elsewhere
/// in the unit names `V`, the row-fetch operation is considered
/// authorized even though the check appears textually after the
/// authorized, even though the check appears textually after the
/// fetch. This is the standard idiom in row-level authz code:
/// fetch the row first to extract the resource id, then call
/// `check_<resource>_<role>(&user, &row, ...)` to authorize it.
@ -199,7 +226,7 @@ pub struct AnalysisUnit {
/// copies of `V.id` / `V.user_id` / `V.uid` / `V.userId` for some
/// `V ∈ self_actor_vars`). Populated when the extractor sees
/// `let X = V.id` or `let X = (V.id as ..).into()` / `V.id.into()`
/// shapes anywhere a route-handler reduces the authenticated
/// shapes, anywhere a route-handler reduces the authenticated
/// principal to a scalar id and reuses it as a SQL parameter.
/// Consulted by `is_actor_context_subject` so subjects whose `name`
/// is in this set count as actor context, not foreign scoped IDs.
@ -217,7 +244,7 @@ pub struct AnalysisUnit {
/// one of these names.
pub authorized_sql_vars: HashSet<String>,
/// Local variables bound (by `let`, `:=`, `var`, `const`) to a
/// pure literal string, integer, float, or boolean. These are
/// pure literal: string, integer, float, or boolean. These are
/// developer-chosen constants and cannot be user-controlled, so
/// they must never trip `<lang>.auth.missing_ownership_check`
/// even when the variable name passes `is_id_like`. Closes the
@ -231,22 +258,21 @@ pub struct AnalysisUnit {
/// `is_typed_bounded_subject` so parameters like Spring `Long
/// userId`, Axum `Path<i64>`, or FastAPI `user_id: int` are not
/// classified as scoped-identifier subjects even when their name
/// passes `is_id_like` the framework guarantees the value is a
/// passes `is_id_like`, the framework guarantees the value is a
/// number that cannot carry a SQL/file/shell payload.
pub typed_bounded_vars: HashSet<String>,
/// Phase 6: per-DTO-extractor parameter, the field names whose
/// Per-DTO-extractor parameter: the field names whose
/// declared type is a payload-incompatible scalar. Map key is the
/// parameter name (e.g. `dto`), value is the list of field names
/// (e.g. `["age", "count"]`). Populated by
/// [`super::apply_typed_bounded_params`] only when the parameter
/// itself was recognised as a typed extractor by a Phase 1-2
/// matcher — bare parameters with no framework gate never lift
/// their fields.
/// itself was recognised as a typed extractor, bare parameters
/// with no framework gate never lift their fields.
pub typed_bounded_dto_fields: HashMap<String, Vec<String>>,
/// Per-unit dynamic session-base text set, supplementing the
/// hard-coded list in `is_self_scoped_session_base`. Populated by
/// the extractor when a parameter's static type signals a known
/// auth-context shape e.g. TRPC's `Options { ctx: { user:
/// auth-context shape, e.g. TRPC's `Options { ctx: { user:
/// NonNullable<TrpcSessionUser> } }` adds `<localCtx>.user` so
/// downstream `ctx.user.id` accesses count as actor context. Each
/// entry is the dotted base text (e.g. `"ctx.user"`,

View file

@ -28,7 +28,7 @@
pub enum SqlAuthClassification {
/// Query is auth-gated. The JOIN (or direct WHERE) pins returned
/// rows to the bound user. We don't track *which* bind position
/// here the caller treats whichever bind value flows into the
/// here, the caller treats whichever bind value flows into the
/// query as the user-id witness; that's safe because the caller
/// already requires the row binding to come from a `let X = …`
/// site we can name.
@ -37,12 +37,12 @@ pub enum SqlAuthClassification {
/// Classify `sql` as auth-gated under the configured ACL tables.
/// Returns `Some(Authorized)` when one of the recognized patterns
/// holds, `None` otherwise (conservative unknown shapes are treated
/// holds, `None` otherwise (conservative, unknown shapes are treated
/// as unauthorized).
pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option<SqlAuthClassification> {
let normalized = normalize_sql(sql);
if !normalized.trim_start().starts_with("select") {
// For B3 we only authorize SELECT queries INSERT/UPDATE/DELETE
// For B3 we only authorize SELECT queries, INSERT/UPDATE/DELETE
// need their own analysis and aren't in scope. (A literal
// `DELETE … WHERE user_id = ?N` could be safely authorized,
// but the call sites we care about for FP suppression are
@ -60,7 +60,7 @@ pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option<SqlAuthCla
}
/// `SELECT … FROM <T> [AS] <ALIAS>? JOIN <ACL> [AS] <GA>? ON … WHERE
/// <GA?>.user_id = ?N` verifies that an ACL table appears in a JOIN
/// <GA?>.user_id = ?N`, verifies that an ACL table appears in a JOIN
/// clause and that the WHERE clause contains a `<…>.user_id = ?` (or
/// bare `user_id = ?`) predicate. Order of the WHERE predicates
/// doesn't matter; AND/OR connectors are ignored.
@ -87,14 +87,14 @@ fn matches_join_through_acl(sql: &str, acl_tables: &[String]) -> bool {
where_clause_contains_user_id_bind(where_clause)
}
/// Direct ownership: `SELECT … FROM <T> WHERE … user_id = ?N` no
/// Direct ownership: `SELECT … FROM <T> WHERE … user_id = ?N`, no
/// JOIN. Covers single-table reads where the row already carries the
/// owning user id (`SELECT … FROM docs WHERE user_id = ?1`). We do
/// NOT require `id = ?M` to also be present; the `user_id = ?N`
/// predicate alone is sufficient, since any row returned must be
/// owned by the bound user.
///
/// Refuses to fire when a JOIN is present the JOIN target may not
/// Refuses to fire when a JOIN is present, the JOIN target may not
/// be in the ACL list, so the WHERE predicate (which may apply to
/// the joined table, e.g. `WHERE al.user_id = ?N` against an
/// `audit_log` JOIN) doesn't actually pin the primary rows to the
@ -125,7 +125,7 @@ fn where_clause_contains_user_id_bind(where_clause: &str) -> bool {
for (idx, _) in where_only.match_indices(needle) {
// Make sure this is a column boundary on the left side
// (avoid matching `posted_user_id` or `target_user_id`
// those don't pin to the actor).
// since those don't pin to the actor).
let before = where_only[..idx].chars().last();
if !is_column_boundary_left(before) {
continue;
@ -158,11 +158,11 @@ fn looks_like_bind_param(after_eq: &str) -> bool {
return false;
}
match bytes[0] {
// ?N (sqlite/sqlx anonymous) accept ?, ?1, ?2…
// ?N (sqlite/sqlx anonymous), accept ?, ?1, ?2…
b'?' => true,
// $N (postgres style) require a digit after.
// $N (postgres style), require a digit after.
b'$' => bytes.get(1).is_some_and(|b| b.is_ascii_digit()),
// :name (named bind) require an identifier char after.
// :name (named bind), require an identifier char after.
b':' => bytes
.get(1)
.is_some_and(|b| b.is_ascii_alphabetic() || *b == b'_'),
@ -277,7 +277,7 @@ mod tests {
#[test]
fn join_against_non_acl_table_is_not_authorized() {
// `audit_log` is not in the configured ACL list JOIN doesn't
// `audit_log` is not in the configured ACL list, JOIN doesn't
// pin rows to the bound user, so the query is unauthorized.
let sql = "SELECT d.* FROM docs d \
JOIN audit_log al ON al.doc_id = d.id \
@ -301,7 +301,7 @@ mod tests {
#[test]
fn similar_column_names_do_not_trip_user_id_match() {
// `posted_user_id` shouldn't satisfy the `user_id = ?` check
// `posted_user_id` shouldn't satisfy the `user_id = ?` check;
// that column doesn't pin to the actor.
let sql = "SELECT * FROM posts WHERE posted_user_id = ?1";
assert_eq!(classify_sql_query(sql, &acl()), None);

View file

@ -16,7 +16,7 @@ use std::path::{Path, PathBuf};
#[derive(Debug, Clone)]
pub struct CallEdge {
/// The raw callee string as it appeared in source (e.g. `"env::var"`).
/// Preserved for diagnostics **not** the normalized form used for resolution.
/// Preserved for diagnostics, **not** the normalized form used for resolution.
#[allow(dead_code)] // used for future diagnostics and path display
pub call_site: String,
}
@ -28,7 +28,7 @@ pub struct UnresolvedCallee {
pub callee_name: String,
}
/// A callee that matched multiple function definitions ambiguous.
/// A callee that matched multiple function definitions, ambiguous.
#[derive(Debug, Clone)]
pub struct AmbiguousCallee {
pub caller: FuncKey,
@ -168,14 +168,14 @@ pub(crate) fn callee_container_hint(raw: &str) -> &str {
///
/// Key design notes:
///
/// * Keys are **language-scoped** a Java `findById` and a Python
/// * Keys are **language-scoped**, a Java `findById` and a Python
/// `findById` never alias. Every other index in this module is also
/// language-scoped (`by_lang_name`, `by_lang_qualified`); keeping the
/// same partition here means devirtualisation's "subset of today's
/// targets" invariant is structurally preserved.
/// * The container key carries the [`FuncKey::container`] verbatim
/// (e.g. `"Repository"` or nested `"Outer::Inner"`). Empty containers
/// are not indexed in `by_container` free top-level functions live
/// are not indexed in `by_container`, free top-level functions live
/// only in `by_name` and are looked up via the `None` container path.
/// * `SmallVec` inline capacity is sized for the common case (≤ 2 same-
/// container overloads, ≤ 4 same-name candidates across containers);
@ -199,7 +199,7 @@ impl ClassMethodIndex {
/// Iteration is over every `FuncKey` in the map; each key is
/// inserted into `by_name` and (when its container is non-empty)
/// into `by_container`. No ordering guarantees on the candidate
/// vectors call sites that need determinism should sort downstream.
/// vectors, call sites that need determinism should sort downstream.
pub fn build(summaries: &GlobalSummaries) -> Self {
let mut by_container: HashMap<(Lang, String, String), SmallVec<[FuncKey; 2]>> =
HashMap::new();
@ -223,11 +223,11 @@ impl ClassMethodIndex {
/// Resolve `(container, method)` to its candidate target set.
///
/// * `container = Some(c)` return only candidates whose defining
/// * `container = Some(c)`, return only candidates whose defining
/// container equals `c`. Empty slice when no such target exists,
/// even if a same-name function lives in another container.
/// This is the **devirtualised** path: a hard subset of `by_name`.
/// * `container = None` return every same-name candidate in the
/// * `container = None`, return every same-name candidate in the
/// language. This is the **fallback** path used when the receiver
/// type is unknown; matches today's name-only behaviour.
///
@ -264,48 +264,19 @@ impl ClassMethodIndex {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Type hierarchy index — Phase 6 (subtype awareness)
// ─────────────────────────────────────────────────────────────────────────────
// ── Type hierarchy index ────────────────────────────────────────────────
/// Per-language `(super_type) → SmallVec<[sub_type]>` index built once
/// per call-graph construction from every merged
/// [`crate::summary::FuncSummary::hierarchy_edges`]. When a method
/// call's receiver is statically typed as a super-class / trait /
/// interface, the call-graph wedge fans out the edge to every concrete
/// implementer's matching method — recovering the dispatch precision
/// that would otherwise be lost to today's name-only resolution.
/// Per-language `(super_type) → sub-types` index built from every merged
/// [`crate::summary::FuncSummary::hierarchy_edges`]. Lets virtual
/// dispatch fan out to every concrete implementer's matching method.
///
/// Subtype semantics covered:
/// * Java `class X extends Y` / `class X implements I` / `interface
/// I extends J`
/// * Rust `impl Trait for Type`
/// * TypeScript `class X extends Y implements I` /
/// `interface I extends J`
/// * Python `class X(Base)` (excludes `object`)
/// * PHP, Ruby, C++ — see [`crate::cfg::hierarchy`] for the
/// per-language extraction rules.
/// Covers Java `extends`/`implements`, Rust `impl Trait for Type`, TS
/// `extends`/`implements`, Python `class X(Base)`, plus PHP/Ruby/C++
/// (see [`crate::cfg::hierarchy`]). Go's structural interfaces are
/// intentionally omitted; name-only resolution is used instead.
///
/// Go's structural / implicit interface satisfaction is intractable to
/// enumerate from per-file information and is **deliberately omitted**
/// — Go callers fall back to today's name-only resolution, so
/// precision is unchanged from the pre-Phase-6 baseline.
///
/// Key design notes
/// ────────────────
///
/// * **Language-scoped.** Mirrors [`ClassMethodIndex`]: a Java
/// `Repository` and a Python `Repository` never alias.
/// * **Bare container names.** No namespace qualification. When
/// container names alias across unrelated namespaces (rare in
/// practice, common in mono-repos) the resolver may over-fan-out;
/// that is conservative for *correctness* (a subset of dispatch
/// targets is unsafe — virtual dispatch may genuinely reach any
/// implementer) and may need namespace-qualified keying as a
/// Phase 6.5 follow-up if benchmark precision regresses.
/// * **`SmallVec` inline capacity.** 4 implementers per super-type
/// covers most real-world hierarchies without spillover; spillover
/// allocates but keeps lookups O(1) amortised.
/// Container names are bare (no namespace), so cross-namespace aliases
/// may over-fan-out. That is conservative for correctness.
#[derive(Debug, Default, Clone)]
pub struct TypeHierarchyIndex {
/// `(lang, super_type)` → distinct sub-type / impl container names.
@ -438,15 +409,11 @@ impl TypeHierarchyIndex {
/// 3. On ambiguity: use two-segment qualified name to narrow candidates
/// 4. Interop edges (explicit cross-language bridges)
///
/// **Phase 3 (typed call-graph devirtualisation):** when an SSA
/// summary on the caller carries a `(call_ordinal, container_name)`
/// entry in [`crate::summary::ssa_summary::SsaFuncSummary::typed_call_receivers`],
/// the matching call site is first resolved via [`ClassMethodIndex`]
/// restricted to the receiver-typed container. An exact match (after
/// arity filter) becomes the edge; a multi-candidate hit is fed back
/// into the standard resolver via `CalleeQuery.receiver_type`; a
/// zero-candidate hit falls through to today's name-only resolution
/// so receiver-type misclassifications never silently drop edges.
/// Typed-call devirtualisation: when the caller's SSA summary carries
/// a typed container for a call ordinal, that site is first resolved
/// via [`ClassMethodIndex`] restricted to the receiver type. Exact
/// match → edge; multi-candidate → fed back through
/// `CalleeQuery.receiver_type`; zero match → name-only fallback.
///
/// Unresolved and ambiguous callees are recorded for diagnostics but
/// do **not** create edges.
@ -460,7 +427,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
index.insert(key.clone(), idx);
}
// Phase 3: build a single `(lang, container, name) → candidates`
// Build a single `(lang, container, name) → candidates`
// index from the merged summaries. Used below to devirtualise
// every method-call edge whose receiver has a recoverable type
// fact. Cost is one allocation per FuncKey across the program;
@ -468,7 +435,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
// win on codebases with many same-name methods.
let method_index = ClassMethodIndex::build(summaries);
// Phase 6: build a sibling `(lang, super_type) → sub_types` index
// Build a sibling `(lang, super_type) → sub_types` index
// from every merged summary's `hierarchy_edges`. Consumed below
// to fan out method-call edges to all known concrete
// implementers when a receiver's static type is a super-class /
@ -497,7 +464,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
None
};
// Phase 3: per-caller `(call_ordinal → container_name)` map
// Per-caller `(call_ordinal → container_name)` map
// pulled from the caller's SSA summary, when one exists.
// Empty when the caller has no SSA summary (zero-param trivial
// bodies skip extraction unless they had typed receivers) or
@ -520,23 +487,15 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
let leaf = callee_leaf_name(raw_callee);
// Two-segment form for diagnostics / fallback disambiguation.
let qualified = normalize_callee_name(raw_callee);
// Structured arity carried per call site used to disambiguate
// Structured arity carried per call site, used to disambiguate
// same-name/different-arity overloads during resolution.
let arity_hint: Option<usize> = site.arity;
// Phase 3 devirtualisation entry point. Only fires for
// method calls (sites carrying a structured receiver) when
// the caller's SSA summary recorded a typed container for
// this ordinal. When `Some(container)` resolves to a
// single arity-matching target, we add the edge and skip
// the standard resolver. When it resolves to multiple,
// we fall through with the container hinted as
// `receiver_type` so `resolve_callee`'s authoritative
// step-1 picks the right one. When it resolves to zero,
// we fall through entirely so today's name-only path can
// still find the edge — preserving the
// "subset of today's targets, never a superset" rule
// even under type-fact misclassification.
// Devirtualisation: for method calls whose SSA summary
// recorded a typed container, resolve via ClassMethodIndex
// first. Single match → direct edge; multi → fall through
// with `receiver_type` set; zero → name-only fallback so
// misclassified receivers never silently drop edges.
let typed_container: Option<&str> = if site.receiver.is_some() {
typed_receivers.get(&site.ordinal).copied()
} else {
@ -544,12 +503,10 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
};
if let Some(container) = typed_container {
// Phase 6: resolve the typed container *plus* every
// known sub-type / impl in the hierarchy index, so a
// receiver typed as a super-class / trait / interface
// fans out to every concrete implementer. When the
// hierarchy has no matching super-type entry, this
// collapses to the Phase 3 direct-container lookup.
// Resolve the typed container plus every known
// sub-type / impl, so a super-class / trait / interface
// receiver fans out to every concrete implementer.
// No hierarchy entry → direct-container lookup.
let widened: Vec<FuncKey> = hierarchy.resolve_with_hierarchy(
&method_index,
caller_key.lang,
@ -575,8 +532,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
}
continue;
}
// Phase 6: multiple arity-filtered candidates means
// genuine virtual dispatch through a super-type fan
// Multiple arity-filtered candidates means
// genuine virtual dispatch through a super-type: fan
// out to *every* implementer. This widens edges
// (correctly: the call genuinely may target any
// implementer at runtime) so SCC sizes may grow on
@ -614,7 +571,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
continue;
}
// Either zero matches (fall through to legacy path) or
// multiple matches on the direct container let
// multiple matches on the direct container, let
// `resolve_callee` apply its authoritative
// receiver_type filter + tie-breakers.
if !arity_filtered.is_empty() {
@ -652,8 +609,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
// Rust callers with a module-qualified call (no receiver) go
// through the `use`-map aware resolver first. When the call has
// a structured receiver it is a method call the qualifier is
// an impl/trait name, not a module path so we fall back to the
// a structured receiver it is a method call, the qualifier is
// an impl/trait name, not a module path, so we fall back to the
// structured resolver. All other languages skip the use-map
// branch entirely.
let use_rust_path = caller_key.lang == Lang::Rust && site.receiver.is_none();
@ -671,11 +628,11 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
// categorize each hint so the resolver can apply the right
// policy:
//
// * `namespace_qualifier` structured module/namespace
// * `namespace_qualifier`, structured module/namespace
// prefix (`env` in `env::var`, `http` in `http.Get`).
// * `receiver_var` syntactic receiver variable (e.g.
// * `receiver_var`, syntactic receiver variable (e.g.
// `obj` in `obj.method`); used only as a last tie-break.
// * `caller_container` caller's own class/impl, so bare
// * `caller_container`, caller's own class/impl, so bare
// `foo()` inside a method resolves to the same class.
//
// The raw text-parsed container (legacy
@ -815,7 +772,7 @@ fn resolve_via_interop(
/// Compute SCC decomposition and topological ordering of the call graph.
///
/// `petgraph::algo::tarjan_scc` returns SCCs in *reverse* topological order
/// of the condensation DAG i.e. leaf SCCs (no outgoing cross-SCC edges)
/// of the condensation DAG, i.e. leaf SCCs (no outgoing cross-SCC edges)
/// come **first**. That is exactly the **callee-first** order suitable for
/// bottom-up taint propagation.
pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
@ -850,7 +807,7 @@ pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
/// [`crate::commands::scan::run_topo_batches`]. `cross_file` is a tighter
/// signal used by joint fixed-point convergence: it implies the
/// recursion involves at least one cross-file call edge, so the inline
/// cache and per-iteration findings need joint convergence not just
/// cache and per-iteration findings need joint convergence, not just
/// summary convergence.
pub struct FileBatch<'a> {
pub files: Vec<&'a PathBuf>,
@ -901,7 +858,7 @@ pub fn callers_of(cg: &CallGraph, callee: &FuncKey) -> Vec<FuncKey> {
/// result is a `HashSet<String>` suitable for membership checks while
/// filtering the batch's file list.
///
/// A changed callee's *own* namespace is also included if the
/// A changed callee's *own* namespace is also included, if the
/// callee's summary was refined, the file it lives in may itself
/// have been a caller (intra-file recursion) or may carry sibling
/// functions whose analysis should be re-run alongside the callee
@ -958,7 +915,7 @@ pub fn scc_file_batches_with_metadata<'a>(
// 2. Build file relative-path → (min topo index, has_mutual_recursion, cross_file).
// `cross_file` is set whenever the file participates in an SCC whose
// nodes span more than one namespace the cross-file signal.
// nodes span more than one namespace, the cross-file signal.
let mut file_topo: HashMap<&str, (usize, bool, bool)> = HashMap::new();
for (topo_pos, &scc_idx) in analysis.topo_scc_callee_first.iter().enumerate() {
let scc_recursive = analysis.sccs[scc_idx].len() > 1;
@ -1015,7 +972,7 @@ pub fn scc_file_batches_with_metadata<'a>(
/// of its functions appear. This ensures leaf callees are available as early
/// as possible for files that depend on them. Caller functions in the same
/// file that happen to be in a later SCC are no worse off than the current
/// fully-parallel approach they simply don't yet benefit from ordering,
/// fully-parallel approach, they simply don't yet benefit from ordering,
/// but nothing is lost.
///
/// Returns `(ordered_batches, orphan_files)` where orphan_files are paths
@ -1188,7 +1145,7 @@ mod tests {
fn same_name_python_and_rust() {
let py_foo = make_summary("foo", "handler.py", "python", 0, vec![]);
let rs_foo = make_summary("foo", "handler.rs", "rust", 0, vec![]);
// Python caller calls "foo" should only see the Python one
// Python caller calls "foo", should only see the Python one
let py_caller = make_summary("main", "app.py", "python", 0, vec!["foo"]);
let gs = merge_summaries(vec![py_foo, rs_foo, py_caller], None);
@ -1315,7 +1272,7 @@ mod tests {
let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.edge_count(), 0); // no edge ambiguous
assert_eq!(cg.graph.edge_count(), 0); // no edge, ambiguous
assert!(cg.unresolved_not_found.is_empty());
assert_eq!(cg.unresolved_ambiguous.len(), 1);
assert_eq!(cg.unresolved_ambiguous[0].callee_name, "helper");
@ -1728,7 +1685,7 @@ mod tests {
// Two "send" functions in different namespaces.
let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]);
let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]);
// Caller is in a third namespace, calling "http::send" leaf "send"
// Caller is in a third namespace, calling "http::send", leaf "send"
// is ambiguous, but "http" qualifier should match "src/http.rs".
let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["http::send"]);
@ -1766,7 +1723,7 @@ mod tests {
#[test]
fn unqualified_callee_stays_ambiguous() {
// Same setup but caller uses unqualified "send" no disambiguation
// Same setup but caller uses unqualified "send", no disambiguation
let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]);
let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]);
let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["send"]);
@ -1806,7 +1763,7 @@ mod tests {
// ── structured-metadata disambiguation (callee metadata) ─────────────
/// Helper: build a summary whose callees carry structured CalleeSite
/// metadata used by the tests below to exercise arity / receiver /
/// metadata, used by the tests below to exercise arity / receiver /
/// qualifier propagation into resolution.
fn summary_with_sites(
name: &str,
@ -1840,7 +1797,7 @@ mod tests {
// Two `encode` functions in the same file, different arities.
let encode1 = make_summary("encode", "src/codec.rs", "rust", 1, vec![]);
let encode2 = make_summary("encode", "src/codec.rs", "rust", 2, vec![]);
// Caller lives in *another* file so namespace does not disambiguate
// Caller lives in *another* file so namespace does not disambiguate;
// the only signal is the per-call-site arity.
let caller = summary_with_sites(
"driver",
@ -2007,7 +1964,7 @@ mod tests {
#[test]
fn legacy_string_callees_still_resolve() {
let helper = make_summary("helper", "src/lib.rs", "rust", 0, vec![]);
// make_summary already returns CalleeSite::bare entries i.e. the
// make_summary already returns CalleeSite::bare entries, i.e. the
// "lifted legacy" form with no arity or receiver metadata.
let caller = make_summary("main", "src/lib.rs", "rust", 0, vec!["helper"]);
let gs = merge_summaries(vec![helper, caller], None);
@ -2017,7 +1974,7 @@ mod tests {
assert!(cg.unresolved_ambiguous.is_empty());
}
// ── ClassMethodIndex (Phase 1: structural index, no behaviour wiring) ──
// ── ClassMethodIndex ────────────────────────────────────────────────
/// Helper: `(name, container)` pairs in the same file. Builds two
/// summaries with the same leaf name on different containers so the
@ -2058,7 +2015,7 @@ mod tests {
assert_eq!(cache_hits.len(), 1);
assert_eq!(cache_hits[0].container, "Cache");
// Bare-name lookup keeps both candidates fallback behaviour.
// Bare-name lookup keeps both candidates, fallback behaviour.
let bare_hits = idx.resolve(Lang::Rust, None, "findById");
assert_eq!(
bare_hits.len(),
@ -2070,7 +2027,7 @@ mod tests {
#[test]
fn class_method_index_falls_back_to_name_when_container_unknown() {
// `None` container or empty-string container both route to
// the bare-name index equivalent to today's name-only edge
// the bare-name index, equivalent to today's name-only edge
// insertion.
let svc = make_method_summary("process", "OrderService", "src/svc.rs", "rust", 1);
let helper = make_summary("process", "src/util.rs", "rust", 1, vec![]);
@ -2082,7 +2039,7 @@ mod tests {
let none_hits = idx.resolve(Lang::Rust, None, "process");
assert_eq!(none_hits.len(), 2);
// Empty string container behaves identically to None it is
// Empty string container behaves identically to None, it is
// not stored under any container key.
let empty_hits = idx.resolve(Lang::Rust, Some(""), "process");
assert_eq!(empty_hits.len(), 2);
@ -2107,7 +2064,7 @@ mod tests {
.is_empty()
);
// Right method, wrong container → empty (no fallback to bare-name
// when a container is supplied that's the whole devirtualisation
// when a container is supplied, that's the whole devirtualisation
// promise).
assert!(
idx.resolve(Lang::Rust, Some("OtherClass"), "findById")
@ -2140,7 +2097,7 @@ mod tests {
#[test]
fn class_method_index_handles_arity_overloads() {
// Two arity overloads on the same container are both kept under
// the same `(container, name)` key arity narrowing is the
// the same `(container, name)` key, arity narrowing is the
// caller's responsibility (today's resolver also does this).
let one = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 1);
let two = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 2);
@ -2156,7 +2113,7 @@ mod tests {
);
}
// ── Phase 3: devirtualised edge insertion via typed_call_receivers ──
// ── devirtualised edge insertion via typed_call_receivers ──
/// Two `findById` definitions live on different containers in
/// different files. A caller whose SSA summary records the
@ -2241,7 +2198,7 @@ mod tests {
use crate::summary::ssa_summary::SsaFuncSummary;
// Single `process` on `Worker`. No `process` exists on
// `Other` that's the receiver type the caller's SSA
// `Other`, that's the receiver type the caller's SSA
// summary will (incorrectly) record.
let worker = make_method_summary("process", "Worker", "src/worker.rs", "rust", 1);
let caller = summary_with_sites(
@ -2270,7 +2227,7 @@ mod tests {
gs.insert_ssa(
caller_key.clone(),
SsaFuncSummary {
// Wrong receiver type `Other::process` does not exist.
// Wrong receiver type, `Other::process` does not exist.
typed_call_receivers: vec![(0, "Other".to_string())],
..Default::default()
},
@ -2292,7 +2249,7 @@ mod tests {
);
}
// ── Phase 6: TypeHierarchyIndex ───────────────────────────────────
// ── TypeHierarchyIndex ───────────────────────────────────
/// Helper: build a hierarchy index from a list of
/// `(lang, sub, super)` edges by injecting them onto a single
@ -2334,7 +2291,7 @@ mod tests {
TypeHierarchyIndex::build(&gs)
}
/// B-1: Round-trip a hierarchy built from a small set of edges
/// B-1: Round-trip: a hierarchy built from a small set of edges
/// answers `subs_of` correctly and `super_keys_len` matches the
/// distinct super count.
#[test]
@ -2356,7 +2313,7 @@ mod tests {
assert_eq!(h.super_keys_len(), 2);
}
/// B-2: Java interface dispatch `Repository r; r.findById(...)`
/// B-2: Java interface dispatch, `Repository r; r.findById(...)`
/// fans out to every concrete implementer's `findById`.
#[test]
fn b2_java_interface_dispatch_fans_out_to_all_impls() {
@ -2421,7 +2378,7 @@ mod tests {
assert_eq!(targets.len(), 2, "B-2: exactly two fan-out edges expected");
}
/// B-3: Java extends `Base b; b.foo()` reaches Base AND Derived
/// B-3: Java extends, `Base b; b.foo()` reaches Base AND Derived
/// when Derived extends Base. Pins inheritance fan-out separately
/// from interface implements.
#[test]
@ -2479,7 +2436,7 @@ mod tests {
);
}
/// B-4: Rust trait dispatch `Box<dyn Repo>; r.find(...)` reaches
/// B-4: Rust trait dispatch, `Box<dyn Repo>; r.find(...)` reaches
/// every `impl Repo for X` `find`.
#[test]
fn b4_rust_trait_dispatch_fans_out_to_impls() {
@ -2536,10 +2493,9 @@ mod tests {
);
}
/// B-7: Empty hierarchy when the typed container has no recorded
/// B-7: Empty hierarchy, when the typed container has no recorded
/// sub-types, `resolve_with_hierarchy` collapses to the direct
/// `ClassMethodIndex::resolve` lookup. Pin: Phase 6 is a no-op
/// when no inheritance was extracted.
/// `ClassMethodIndex::resolve` lookup.
#[test]
fn b7_empty_hierarchy_falls_back_to_single_container() {
use crate::summary::ssa_summary::SsaFuncSummary;
@ -2561,7 +2517,7 @@ mod tests {
);
let mut gs = merge_summaries(vec![repo, cache, caller], None);
// No hierarchy_edges set anywhere Repository has no
// No hierarchy_edges set anywhere, Repository has no
// sub-types, so devirtualisation collapses to direct match.
let caller_key = FuncKey {
lang: Lang::Rust,
@ -2589,10 +2545,9 @@ mod tests {
assert_eq!(targets[0].container, "Repository");
}
/// B-8: Concrete sub-type when the receiver is typed as the
/// B-8: Concrete sub-type, when the receiver is typed as the
/// concrete sub-class (not the super-type), no hierarchy
/// expansion fires. Pin: Phase 6 narrows on concrete types
/// exactly like Phase 3.
/// expansion fires.
#[test]
fn b8_concrete_subtype_does_not_widen() {
use crate::summary::ssa_summary::SsaFuncSummary;
@ -2654,7 +2609,7 @@ mod tests {
assert_eq!(targets[0].container, "UserRepo");
}
/// B-9: Diamond multiple impls sharing a super-type, dedup
/// B-9: Diamond, multiple impls sharing a super-type, dedup
/// applied per call site so each FuncKey is edged at most once.
#[test]
fn b9_diamond_dedup_one_edge_per_funckey() {
@ -2662,7 +2617,7 @@ mod tests {
let a = make_method_summary("doIt", "A", "src/A.java", "java", 0);
let b = make_method_summary("doIt", "B", "src/B.java", "java", 0);
// A and B both extend Iface in two separate file emissions
// A and B both extend Iface in two separate file emissions, so
// hierarchy_edges duplicates across files; dedup expected.
let mut h1 = make_method_summary("__h", "Iface", "src/I1.java", "java", 0);
h1.hierarchy_edges = vec![
@ -2722,7 +2677,7 @@ mod tests {
assert!(containers.contains("A") && containers.contains("B"));
}
/// B-13: Stale hierarchy edge sub-type referenced by an edge
/// B-13: Stale hierarchy edge, sub-type referenced by an edge
/// no longer has a matching FuncKey. Resolver must not panic
/// and must still resolve to whatever IS present.
#[test]
@ -2730,7 +2685,7 @@ mod tests {
use crate::summary::ssa_summary::SsaFuncSummary;
// `Base` exists; `Derived` referenced by hierarchy_edges but
// its `foo` is never defined. Phase 6 must not panic and
// its `foo` is never defined. Resolver must not panic and
// must still emit the Base::foo edge.
let base = make_method_summary("foo", "Base", "src/Base.java", "java", 0);
let mut h = make_method_summary("__h", "X", "src/X.java", "java", 0);
@ -2815,7 +2770,7 @@ mod tests {
arity: Some(0),
..Default::default()
};
// A typed_call_receivers entry with ordinal=0 but since the
// A typed_call_receivers entry with ordinal=0, but since the
// site has receiver=None, this MUST be ignored.
gs.insert_ssa(
caller_key.clone(),

View file

@ -10,7 +10,7 @@ use tree_sitter::Node;
/// at the *case-level* shape `build_switch` sees here. Rust `match`, Go
/// `switch`, and Java arrow-switches qualify; classic Java/C/C++/JS switches
/// with fall-through do not. The check is per-language because Java mixes
/// arrow and classic shapes that's handled by inspecting the case kind in
/// arrow and classic shapes, that's handled by inspecting the case kind in
/// [`extract_case_literal_text`].
fn lang_has_exclusive_cases(lang: &str) -> bool {
matches!(lang, "rust" | "go")
@ -19,7 +19,7 @@ fn lang_has_exclusive_cases(lang: &str) -> bool {
/// Extract the scrutinee subtree from a switch-like AST node.
///
/// Returns the AST node referenced by the language's scrutinee field. Only
/// fires for Rust `match`, Go `switch`, and Java `switch` statements other
/// fires for Rust `match`, Go `switch`, and Java `switch` statements, other
/// languages return `None` so [`build_switch`] keeps its legacy behavior.
fn extract_scrutinee_node<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
let field = match lang {
@ -39,7 +39,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
let kind = case.kind();
match (lang, kind) {
("rust", "match_arm") => {
// Reject guarded arms `match x { y if cond => ... }`.
// Reject guarded arms, `match x { y if cond => ... }`.
if case.child_by_field_name("guard").is_some() {
return None;
}
@ -71,7 +71,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
text_of(inner, code)
}
("go", "expression_case") => {
// Go case `case v1, v2: ...` only handle exactly one expression.
// Go case `case v1, v2: ...`, only handle exactly one expression.
let value = case.child_by_field_name("value")?;
let mut named_children: Vec<Node> = Vec::new();
let mut cursor = value.walk();
@ -195,7 +195,7 @@ pub(super) fn extract_catch_param_name<'a>(
// -------------------------------------------------------------------------
/// Builds CFG for Ruby's `begin`/`rescue`/`ensure` blocks (and `body_statement`
/// with inline rescue). Ruby's `begin` has no `body` field the try-body
/// with inline rescue). Ruby's `begin` has no `body` field, the try-body
/// statements are direct children before `rescue`/`else`/`ensure` nodes.
#[allow(clippy::too_many_arguments)]
pub(super) fn build_begin_rescue<'a>(
@ -305,7 +305,7 @@ pub(super) fn build_begin_rescue<'a>(
vec![synth]
} else {
// No param name will wire exception edges to first rescue body node
// No param name, will wire exception edges to first rescue body node
Vec::new()
};
@ -333,7 +333,7 @@ pub(super) fn build_begin_rescue<'a>(
current_body_id,
)
} else {
// No body field build rescue node itself as a block.
// No body field, build rescue node itself as a block.
// Filter out meta-children (exceptions, exception_variable) by
// iterating and building only statement children.
let mut rescue_cursor = rescue_node.walk();
@ -407,7 +407,7 @@ pub(super) fn build_begin_rescue<'a>(
try_exits
};
// 6. Build ensure clause (Ruby's finally always runs)
// 6. Build ensure clause (Ruby's finally, always runs)
if let Some(ensure_node) = ensure_clause {
let mut ensure_preds: Vec<NodeIndex> = Vec::new();
ensure_preds.extend(&normal_exits);
@ -443,7 +443,7 @@ pub(super) fn build_begin_rescue<'a>(
}
// -------------------------------------------------------------------------
// switch handler multi-way dispatch with fallthrough
// switch handler, multi-way dispatch with fallthrough
// -------------------------------------------------------------------------
/// True for AST kinds that wrap a single switch case body.
@ -490,7 +490,7 @@ pub(super) fn case_has_default_label(case: Node<'_>) -> bool {
/// Build CFG for a switch statement.
///
/// The dispatch is decomposed into a chain of binary `StmtKind::If` headers
/// — one per non-default case — because the SSA terminator only models 0/1/2
/// (one per non-default case) because the SSA terminator only models 0/1/2
/// successors. A monolithic N-way header would otherwise be collapsed to
/// `Goto(first)` and silently drop every other case. Each header's True edge
/// reaches its case body; the False edge falls through to the next header (or
@ -544,7 +544,7 @@ pub(super) fn build_switch<'a>(
}
}
// Grammar didn't expose recognisable case nodes fall back to a single
// Grammar didn't expose recognisable case nodes, fall back to a single
// header + Block-style walk so nodes still get linked.
if cases.is_empty() {
let header = push_node(
@ -603,7 +603,7 @@ pub(super) fn build_switch<'a>(
// arrow-switch), pre-extract the scrutinee text + idents so the synthetic
// dispatch headers can carry a `<scrutinee> == <case_literal>` condition.
// Falls back to `None` when the scrutinee is structurally complex (calls,
// member chains, parenthesized expressions in Go) the existing first-
// member chains, parenthesized expressions in Go), the existing first-
// reachable behavior remains correct in that case.
let supports_exclusive_cases = lang_has_exclusive_cases(lang) || lang == "java";
let (scrutinee_text, scrutinee_idents) = if supports_exclusive_cases {
@ -647,7 +647,7 @@ pub(super) fn build_switch<'a>(
for (idx, (case, is_default)) in cases.iter().copied().enumerate() {
let is_last = idx + 1 == cases.len();
// Default at the chain tail doesn't get its own dispatch If the
// Default at the chain tail doesn't get its own dispatch If, the
// previous header's False edge already targets it directly.
let case_first_preds: Vec<NodeIndex> = if is_default && is_last {
// First node of the default body becomes the False target of the
@ -675,12 +675,13 @@ pub(super) fn build_switch<'a>(
);
// The dispatch header is purely structural (it stands in for the
// discriminant comparison). It must not inherit Sink/Source labels
// from the case body's text push_node uses `text_of(ast)` for
// from the case body's text, push_node uses `text_of(ast)` for
// non-call kinds, which would let the body text drive classification.
g[header].taint.labels.clear();
g[header].call.callee = None;
g[header].call.sink_payload_args = None;
g[header].call.destination_uses = None;
g[header].call.gate_filters.clear();
// For mutually-exclusive switch shapes with a single-ident
// scrutinee, synthesize a `<scrutinee> == <case_literal>`
// structured condition on the dispatch header so SSA lowering
@ -958,7 +959,7 @@ pub(super) fn build_try<'a>(
vec![synth]
} else {
// No param name wire exception edges directly to first catch body node
// No param name, wire exception edges directly to first catch body node
Vec::new()
};

View file

@ -43,7 +43,7 @@ fn js_try_catch_has_exception_edges() {
/// When a classifiable call (here `eval`, a built-in JS sink) is nested
/// inside a multi-line statement, the CFG node's `classification_span()`
/// should point at the inner call, not at the outer statement's start
/// should point at the inner call, not at the outer statement's start,
/// so finding display reports the line the dangerous call actually lives
/// on. `ast.span` must still cover the whole outer statement for
/// structural passes that need the statement grain.
@ -86,7 +86,7 @@ fn inner_call_override_narrows_classification_span() {
}
/// `classification_span()` must fall back to `ast.span` when no narrower
/// sub-expression was recorded so existing structural code paths keep
/// sub-expression was recorded, so existing structural code paths keep
/// working unchanged for nodes whose classification applies to the whole
/// outer node.
#[test]
@ -125,7 +125,7 @@ fn callee_span_unset_when_no_narrowing_is_possible() {
// A bare `eval(x);` on one line: `first_call_ident` finds the
// call_expression whose span is nearly the whole expression_statement
// (different by the trailing `;`). `classification_span` still
// returns a sensible line but the exact trimming is an
// returns a sensible line, but the exact trimming is an
// implementation detail. What we assert here is the invariant:
// if callee_span *is* set, it must be contained in ast.span.
let src = b"function f() { eval(x); }";
@ -708,7 +708,7 @@ fn python_if_and() {
#[test]
fn ruby_unless_and() {
// `unless a && b` chain built, branches swapped
// `unless a && b`, chain built, branches swapped
// Body should run when condition is false
let src = b"def f\n unless a && b\n x\n end\nend\n";
let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE);
@ -848,7 +848,7 @@ fn parse_tree(src: &[u8], ts_lang: Language) -> tree_sitter::Tree {
#[test]
fn first_call_ident_skips_lambda_body() {
// `process(lambda: eval(dangerous))` Python-style.
// `process(lambda: eval(dangerous))`, Python-style.
// first_call_ident should return "process", not "eval".
let src = b"process(lambda: eval(dangerous))";
let ts_lang = Language::from(tree_sitter_python::LANGUAGE);
@ -860,7 +860,7 @@ fn first_call_ident_skips_lambda_body() {
#[test]
fn first_call_ident_skips_arrow_function_body() {
// `process(() => eval(dangerous))` JS arrow function in argument.
// `process(() => eval(dangerous))`, JS arrow function in argument.
let src = b"process(() => eval(dangerous))";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -871,7 +871,7 @@ fn first_call_ident_skips_arrow_function_body() {
#[test]
fn first_call_ident_skips_named_function_in_arg() {
// `process(function inner() { eval(dangerous); })` named function expression in arg.
// `process(function inner() { eval(dangerous); })`, named function expression in arg.
let src = b"process(function inner() { eval(dangerous); })";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -882,7 +882,7 @@ fn first_call_ident_skips_named_function_in_arg() {
#[test]
fn first_call_ident_normal_nested_call() {
// `outer(inner(x))` inner is NOT behind a function boundary, should be reachable.
// `outer(inner(x))`, inner is NOT behind a function boundary, should be reachable.
let src = b"outer(inner(x))";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -895,7 +895,7 @@ fn first_call_ident_normal_nested_call() {
#[test]
fn first_call_ident_finds_call_not_blocked_by_function() {
// Ensure a call at the same level as a function literal is still found.
// `[function() {}, actual_call()]` array with function and call.
// `[function() {}, actual_call()]`, array with function and call.
let src = b"[function() {}, actual_call()]";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let tree = parse_tree(src, ts_lang);
@ -908,7 +908,7 @@ fn first_call_ident_finds_call_not_blocked_by_function() {
#[test]
fn callee_not_resolved_from_nested_function_arg() {
// `safe_wrapper(function() { eval(user_input); })` the CFG for the
// `safe_wrapper(function() { eval(user_input); })`, the CFG for the
// outer call should resolve the callee as "safe_wrapper", never "eval".
let src = b"function f() { safe_wrapper(function() { eval(user_input); }); }";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
@ -923,7 +923,7 @@ fn callee_not_resolved_from_nested_function_arg() {
assert!(has_safe, "expected a node with callee 'safe_wrapper'");
// The outer body should NOT have a node with callee "eval" attributed
// to the outer expression eval lives inside the nested function body.
// to the outer expression, eval lives inside the nested function body.
let outer_eval = body.graph.node_weights().any(|info| {
info.call.callee.as_deref() == Some("eval") && info.ast.enclosing_func.is_none()
});
@ -1117,6 +1117,7 @@ fn clone_preserves_all_sub_structs() {
kwargs: vec![("shell".into(), vec!["True".into()])],
arg_string_literals: vec![Some("lit".into())],
destination_uses: None,
gate_filters: Vec::new(),
},
taint: TaintMeta {
labels: {
@ -1399,7 +1400,7 @@ fn js_promisify_ignored_for_non_js_langs() {
#[test]
fn js_promisify_non_call_value_ignored() {
// RHS is not a promisify call no binding should be captured.
// RHS is not a promisify call, no binding should be captured.
let src = b"const execAsync = child_process.exec;";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang);
@ -1471,7 +1472,7 @@ fn cpp_function_extracts_param_names() {
// ── callee-site metadata extraction ──────────────────────────────────
/// Callees collected into `LocalFuncSummary` should now carry structured
/// arity, receiver, and qualifier fields not just a bare name.
/// arity, receiver, and qualifier fields, not just a bare name.
#[test]
fn local_summary_callees_carry_arity_and_receiver() {
// Two calls: one is a plain function call with 2 args, the other is
@ -1703,7 +1704,7 @@ fn local_summary_callees_have_distinct_ordinals() {
.find(|(k, _)| k.name == "outer")
.unwrap();
// Dedup key is (name, arity, receiver, qualifier, ordinal) the two
// Dedup key is (name, arity, receiver, qualifier, ordinal), the two
// `a()` sites have different ordinals, so both must appear.
let a_sites: Vec<_> = outer.callees.iter().filter(|c| c.name == "a").collect();
assert_eq!(
@ -1825,7 +1826,7 @@ fn anon_fn_named_from_short_var_decl_go() {
#[test]
fn iife_callee_resolves_to_anon_body_js() {
// `(function(arg){eval(arg);})(q)` the CallFn arm must produce
// `(function(arg){eval(arg);})(q)`, the CallFn arm must produce
// a synthetic anon callee name so that taint can match the
// inline body's FuncKey.
let src = b"(function(arg){ eval(arg); })(q);";
@ -1898,7 +1899,7 @@ fn strip_tags(s: &str) -> String {
#[test]
fn replace_chain_rejects_unrecognised_literals() {
// `.replace("foo", "bar")` contains no dangerous pattern must NOT be
// `.replace("foo", "bar")` contains no dangerous pattern, must NOT be
// credited as a sanitizer. Preserves the FP→TN guard: replace calls
// that don't strip anything dangerous must stay transparent to taint.
let src = br#"
@ -1916,7 +1917,7 @@ fn rewrite(s: &str) -> String {
#[test]
fn replace_chain_rejects_when_replacement_reintroduces_pattern() {
// `.replace("x", "..")` strips `x` but *reintroduces* `..` be
// `.replace("x", "..")` strips `x` but *reintroduces* `..`, be
// maximally conservative and abandon all credit for this chain.
let src = br#"
fn evil(s: &str) -> String {
@ -1933,7 +1934,7 @@ fn evil(s: &str) -> String {
#[test]
fn replace_chain_rejects_dynamic_arg() {
// `.replace(var, "")` search is not a literal; pattern analysis can
// `.replace(var, "")`, search is not a literal; pattern analysis can
// say nothing about what was stripped. Must not earn credit.
let src = br#"
fn dynamic(s: &str, needle: &str) -> String {
@ -1950,7 +1951,7 @@ fn dynamic(s: &str, needle: &str) -> String {
#[test]
fn replace_chain_rejects_non_identifier_base() {
// `get_s().replace("..", "")` innermost receiver is a call, not a
// `get_s().replace("..", "")`, innermost receiver is a call, not a
// parameter. We have no reason to believe `get_s()` returns a value
// that benefits the caller; refuse credit.
let src = br#"
@ -1976,7 +1977,7 @@ fn find_node_defining<'a>(cfg: &'a Cfg, var: &str) -> Option<&'a NodeInfo> {
#[test]
fn numeric_length_access_detected_on_js_property_read() {
// `var count = items.length` property access on a member expression
// `var count = items.length`, property access on a member expression
// should mark the CFG node as a numeric-length access so the
// type-fact analysis infers TypeKind::Int for `count`.
let src = br#"function f(items) {
@ -1994,7 +1995,7 @@ fn numeric_length_access_detected_on_js_property_read() {
#[test]
fn numeric_length_access_detected_on_js_zero_arg_method_call() {
// `var n = str.length()` zero-arg method call form (uncommon in JS
// `var n = str.length()`, zero-arg method call form (uncommon in JS
// but present in other languages). Detector should unwrap a
// zero-arg call around a member expression.
let src = br#"function f(list) {
@ -2012,7 +2013,7 @@ fn numeric_length_access_detected_on_js_zero_arg_method_call() {
#[test]
fn numeric_length_access_ignores_unrelated_properties() {
// `var v = arr.foo` arbitrary property reads must not be flagged.
// `var v = arr.foo`, arbitrary property reads must not be flagged.
let src = br#"function f(arr) {
var v = arr.foo;
return v;
@ -2028,7 +2029,7 @@ fn numeric_length_access_ignores_unrelated_properties() {
#[test]
fn numeric_length_access_ignores_method_calls_with_args() {
// `var r = s.indexOf('x')` the detector must reject any call with
// `var r = s.indexOf('x')`, the detector must reject any call with
// positional arguments because those aren't pure length reads.
let src = br#"function f(s) {
var r = s.indexOf('x');
@ -2043,7 +2044,7 @@ fn numeric_length_access_ignores_method_calls_with_args() {
);
}
// ── Pointer-Phase 6 / W5: subscript lowering tests ────────────────────────
// ── subscript lowering tests ──────────────────────────────────────────────
/// Scope for tests that flip `NYX_POINTER_ANALYSIS=1` so the CFG-side
/// subscript synthesis activates. The env-var is restored afterwards
@ -2290,7 +2291,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
);
}
/// JS switch fall-through (`case 1: a(); case 2: b();`) case 1's
/// JS switch fall-through (`case 1: a(); case 2: b();`), case 1's
/// exit should flow into case 2's body so taint from `first()`
/// reaches `second()`'s sinks.
///
@ -2301,7 +2302,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
/// structural shape.
/// (b) `first()` has a non-Back forward out-edge that lands inside
/// the case-2 sub-graph (the actual fall-through wire), so we
/// prove there *is* a fall-through edge not just an
/// prove there *is* a fall-through edge, not just an
/// Entry→…→Exit path that happens to walk through both calls
/// via the dispatch chain.
///
@ -2309,7 +2310,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
/// Seq passthrough nodes (one per surrounding scope), so the
/// fall-through edge from `first()` lands on the *first wrapper
/// Seq node* of case 2, not on `second()` itself. Asserting that
/// `second()` has ≥2 in-edges would therefore be wrong the True
/// `second()` has ≥2 in-edges would therefore be wrong, the True
/// edge from the case-2 dispatch If targets the wrapper node, and
/// only a single Seq chain leads from there to `second()`.
#[test]
@ -2800,7 +2801,7 @@ fn nested_loops_two_headers_two_back_edges() {
#[test]
fn loop_with_break_no_back_edge_from_break() {
// A `break` short-circuits the loop body its edge must NOT be a
// A `break` short-circuits the loop body, its edge must NOT be a
// back edge to the header (it leaves the loop entirely).
let src = b"function f() { while (cond()) { if (done()) break; body(); } }";
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
@ -2879,7 +2880,7 @@ fn chained_method_call_rebinds_to_inner_gated_sink() {
// no longer be the recorded callee for this node.
if callee.ends_with("https.get") {
// The inner-gate path must have populated sink_payload_args
// (the gate's payload arg is position 0 the URL string).
// (the gate's payload arg is position 0, the URL string).
assert!(
info.call.sink_payload_args.is_some(),
"expected sink_payload_args to be populated for chained \

View file

@ -4,6 +4,7 @@ use super::{
member_expr_text, push_node, text_of,
};
use crate::labels::{DataLabel, LangAnalysisRules, classify};
use crate::utils::snippet::truncate_at_char_boundary;
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
use tree_sitter::Node;
@ -72,20 +73,15 @@ pub(super) fn push_condition_node<'a>(
code: &'a [u8],
enclosing_func: Option<&str>,
) -> NodeIndex {
// Pass cond_ast as both args sub-conditions are never `unless` nodes
// Pass cond_ast as both args, sub-conditions are never `unless` nodes
let (inner, negated) = detect_negation(cond_ast, cond_ast, lang);
let mut vars = Vec::new();
collect_idents(inner, code, &mut vars);
vars.sort();
vars.dedup();
vars.truncate(MAX_COND_VARS);
let text = text_of(cond_ast, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
let text = text_of(cond_ast, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
let span = (cond_ast.start_byte(), cond_ast.end_byte());
g.add_node(NodeInfo {
kind: StmtKind::If,
@ -140,7 +136,7 @@ pub(super) fn detect_rust_let_match_guard<'a>(
/// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's
/// condition text and vars. The let-binding name is added to `condition_vars`
/// so `apply_branch_predicates` narrows validation to that specific variable
/// the variable that receives the arm's value and flows to downstream sinks.
/// (the variable that receives the arm's value and flows to downstream sinks).
pub(super) fn emit_rust_match_guard_if<'a>(
g: &mut Cfg,
guard: Node<'a>,
@ -154,13 +150,8 @@ pub(super) fn emit_rust_match_guard_if<'a>(
vars.sort();
vars.dedup();
vars.truncate(MAX_COND_VARS);
let text = text_of(guard, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
let text = text_of(guard, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
let span = (guard.start_byte(), guard.end_byte());
g.add_node(NodeInfo {
kind: StmtKind::If,
@ -181,7 +172,7 @@ pub(super) fn emit_rust_match_guard_if<'a>(
/// `lhs_text` is then synthesised by SSA lowering at the join.
///
/// The condition's identifiers live on the If node's `condition_vars`, **not**
/// on the branch `uses`. This is the whole point of the split cond is control
/// on the branch `uses`. This is the whole point of the split, cond is control
/// flow, branches are data flow.
///
/// Returns the exit frontier for downstream statement chaining (a single-element
@ -219,7 +210,7 @@ pub(super) fn build_ternary_diamond<'a>(
g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang);
connect_all(g, preds, cond_if, pred_edge);
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node)
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node);
// a nested ternary recurses and returns its own join node.
let true_exits = lower_ternary_branch(
cons_ast,
@ -332,7 +323,7 @@ pub(super) fn lower_ternary_branch<'a>(
analysis_rules,
);
// The branch expression's own `defines` (if any typically None for a
// The branch expression's own `defines` (if any, typically None for a
// pure value expression) is replaced with the outer LHS so that both
// branches agree on the target, driving phi insertion at the join.
g[node].taint.defines = Some(lhs_text.to_string());
@ -410,7 +401,7 @@ pub(super) fn classify_ternary_lhs(
.unwrap_or_default();
// Try the full dotted path first (e.g. "document.cookie"), then fall back
// to the property alone (e.g. "innerHTML") mirrors the LHS classification
// to the property alone (e.g. "innerHTML"), mirrors the LHS classification
// already performed in `push_node` for non-split assignments.
if let Some(l) = classify(lang, &lhs_text, extra) {
labels.push(l);
@ -429,7 +420,7 @@ pub(super) fn classify_ternary_lhs(
/// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes
/// with short-circuit edges.
///
/// Returns `(true_exits, false_exits)` the sets of nodes from which True/False
/// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False
/// edges should connect to the then/else branches.
pub(super) fn build_condition_chain<'a>(
cond_ast: Node<'a>,

View file

@ -5,7 +5,7 @@ use tree_sitter::Node;
///
/// Used by decorator extraction to reduce `login_required`, `permission_required(...)`,
/// `flask_login.login_required`, `hasRole('ADMIN')` to their first identifier
/// name the matcher target.
/// name, the matcher target.
fn leading_ident_text(node: Node<'_>, code: &[u8]) -> Option<String> {
let mut cur = node;
loop {
@ -56,7 +56,7 @@ fn normalize_decorator_name(raw: &str) -> String {
let trimmed = raw.trim();
let trimmed = trimmed.trim_start_matches(':').trim_start_matches('@');
// If a call syntax leaked through (e.g. `UseGuards(AuthGuard)`), keep only
// the head callers that want the arg handle it separately.
// the head, callers that want the arg handle it separately.
let head = trimmed
.split(['(', ' ', '\t', '\n'])
.next()
@ -115,7 +115,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
/// are `decorator` nodes containing an `identifier` or `call` expression.
/// - **JS/TS**: decorators attach to `method_definition` children or appear
/// as siblings inside `class_body`; stage-3 decorators use `decorator` nodes.
/// `@UseGuards(AuthGuard)` we include the call args too.
/// `@UseGuards(AuthGuard)`, we include the call args too.
/// - **Java**: annotations live in the `modifiers` child of `method_declaration`;
/// kinds are `marker_annotation` / `annotation`.
/// - **Rust**: `function_item` has `attribute_item` siblings (outer `#[..]`).
@ -127,7 +127,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
/// at class body scope applies to every method in the class. `only:` /
/// `except:` hash args scope the filter to the listed action names; the
/// filter is only recorded for the current method when the scope matches.
/// Conditional filters (`if:` / `unless:`) are not honored those require
/// Conditional filters (`if:` / `unless:`) are not honored, those require
/// predicate evaluation and are deferred.
pub(super) fn extract_auth_decorators<'a>(
func_node: Node<'a>,
@ -379,12 +379,12 @@ pub(super) fn extract_auth_decorators<'a>(
}
/// If a Ruby statement is `before_action :name` (or `before_filter :name`),
/// push the normalized filter name into `out` honoring any `only:` / `except:`
/// push the normalized filter name into `out`, honoring any `only:` / `except:`
/// hash arguments against `method_name`.
///
/// Positional symbol args (`before_action :a, :b, only: [:x]`) all share the
/// single trailing scope. Conditional filters (`if:` / `unless:`) are not
/// honored here those require predicate evaluation and are deferred.
/// honored here, those require predicate evaluation and are deferred.
fn collect_ruby_before_action(
node: Node<'_>,
code: &[u8],
@ -499,7 +499,7 @@ fn collect_ruby_before_action(
/// Parse a single `only:` / `except:` hash pair and append the symbol list into
/// the corresponding out-vec. Sets the `*_present` flag when the key is seen,
/// regardless of whether the value parses into any symbols treating
/// regardless of whether the value parses into any symbols, treating
/// `only: []` as "no actions match" is safer than ignoring the scope.
fn collect_ruby_filter_pair(
pair_node: Node<'_>,

View file

@ -1,26 +1,28 @@
//! Phase 6.1: per-language DTO definition collectors.
//! Per-language DTO definition collectors.
//!
//! Walks a parsed file's AST and emits `(class_name, DtoFields)` pairs
//! for class / interface / struct / Pydantic-model declarations whose
//! field types resolve to a recognised [`TypeKind`].
//!
//! Strictly additive: classes whose fields cannot be classified produce
//! a `DtoFields` with an empty `fields` map the caller must decide
//! a `DtoFields` with an empty `fields` map, the caller must decide
//! whether to use that as a "Dto with no inferred fields" or fall back
//! to the pre-Phase-6 Object/Unknown classification.
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use tree_sitter::Node;
use super::helpers::text_of;
use super::params::{java_type_to_kind, python_primitive_to_kind, ts_type_to_kind};
use super::params::{
java_type_to_kind, python_primitive_to_kind, ts_type_to_kind, ts_type_to_local_collection,
};
use crate::ssa::type_facts::{DtoFields, TypeKind};
/// Collect all DTO-shaped class definitions in a parsed file.
///
/// Dispatches per-language; returns an empty map for languages without
/// a Phase 6 collector (Go, Ruby, PHP, C/C++ — DTOs in those ecosystems
/// a collector (Go, Ruby, PHP, C/C++; DTOs in those ecosystems
/// either don't follow framework conventions Nyx tracks today, or are
/// already covered by other type-inference paths).
pub(super) fn collect_dto_classes(
@ -39,6 +41,55 @@ pub(super) fn collect_dto_classes(
out
}
/// Gather the names of same-file TS / JS type aliases whose right-hand
/// side is a local collection type (`type X = Map<...>`, `Set<...>`,
/// `T[]`).
///
/// The param classifier uses the returned set to resolve a parameter
/// typed `m: ElementsMap` (where `type ElementsMap = Map<K, V>`) to
/// [`TypeKind::LocalCollection`].
///
/// Returns an empty set for every `lang` other than `"typescript"`,
/// `"ts"`, `"javascript"` or `"js"`.
///
/// Cross-file aliases are not resolved here; that requires a multi-file
/// type-resolution pipeline that doesn't yet exist for TS. For example,
/// Excalidraw's `type ElementsMap = Map<...>` lives in
/// `packages/element/src/types.ts`, so users that import the alias
/// without a same-file copy still see the original FP. Most real-repo
/// aliases the FP cluster touched were declared in the same file as
/// their consumers (see fixture).
pub(super) fn collect_type_alias_local_collections(
    root: Node<'_>,
    lang: &str,
    code: &[u8],
) -> HashSet<String> {
    let mut aliases = HashSet::new();
    match lang {
        // Only the JS/TS family has `type X = ...` alias declarations to scan.
        "typescript" | "ts" | "javascript" | "js" => {
            collect_ts_type_alias_local_collections(root, code, &mut aliases);
        }
        _ => {}
    }
    aliases
}
/// Walk `root` and record in `out` the name of every
/// `type_alias_declaration` whose `value` text (trimmed) is accepted by
/// `ts_type_to_local_collection`.
///
/// Declarations that lack a `name` or `value` field, or whose text cannot
/// be read via `text_of`, are skipped.
fn collect_ts_type_alias_local_collections(root: Node<'_>, code: &[u8], out: &mut HashSet<String>) {
    walk(root, &mut |node| {
        if node.kind() != "type_alias_declaration" {
            return;
        }
        // Resolve the alias name first, then keep it only when the aliased
        // type classifies as a local collection.
        let collection_alias = node
            .child_by_field_name("name")
            .and_then(|name| text_of(name, code))
            .filter(|_| {
                node.child_by_field_name("value")
                    .and_then(|value| text_of(value, code))
                    .is_some_and(|target| ts_type_to_local_collection(target.trim()).is_some())
            });
        // `Option` iterates over zero or one item, so this inserts at most
        // one name.
        out.extend(collection_alias);
    });
}
// ─────────────────────────────────────────────────────────────────────
// Java
// ─────────────────────────────────────────────────────────────────────
@ -163,7 +214,7 @@ fn extract_ts_property<'a>(node: Node<'a>, code: &'a [u8]) -> Option<(String, Ty
let name_node = node.child_by_field_name("name")?;
let field_name = text_of(name_node, code)?;
let type_anno = node.child_by_field_name("type")?;
// type_annotation node text is `: T` walk to the inner type.
// type_annotation node text is `: T`, walk to the inner type.
let type_text = type_anno
.named_child(0)
.and_then(|t| text_of(t, code))
@ -193,7 +244,7 @@ fn collect_rust(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFields
return;
};
if body.kind() != "field_declaration_list" {
// Tuple struct or unit struct no named fields.
// Tuple struct or unit struct, no named fields.
return;
}
let mut fields = DtoFields::new(class_name.clone());
@ -291,7 +342,7 @@ fn collect_python(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFiel
/// Conservative supertype scan: returns true when the class definition
/// has a superclass list whose text mentions `BaseModel` (covers both
/// `BaseModel` and `pydantic.BaseModel`). No false positives on
/// non-Pydantic classes named `BaseModel`-something match is on the
/// non-Pydantic classes named `BaseModel`-something, match is on the
/// full token, not a substring.
fn python_inherits_basemodel<'a>(class_node: Node<'a>, code: &'a [u8]) -> bool {
let Some(supers) = class_node.child_by_field_name("superclasses") else {
@ -418,7 +469,7 @@ mod tests {
"#;
let dtos = collect("rust", src);
// Tuple structs have no named fields and must NOT produce a
// DtoFields entry — Phase 6 only handles named-field DTOs.
// DtoFields entry; this collector only handles named-field DTOs.
assert!(!dtos.contains_key("Wrap"));
}

View file

@ -19,11 +19,11 @@ pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option<String> {
///
/// For `Runtime.getRuntime().exec(cmd)`, the receiver of `exec` is the call
/// `Runtime.getRuntime()`. This function drills through that to return
/// `"Runtime"` the outermost non-call object. This lets labels like
/// `"Runtime"`, the outermost non-call object. This lets labels like
/// `"Runtime.exec"` match correctly.
pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<String> {
match lookup(lang, n.kind()) {
// The receiver is itself a call drill into ITS receiver.
// The receiver is itself a call, drill into ITS receiver.
// e.g. for `Runtime.getRuntime()`, the object is `Runtime`.
Kind::CallFn | Kind::CallMethod => {
let inner = n
@ -53,7 +53,7 @@ pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<Str
/// identifier (e.g. call expressions, subscripts, `this`/`self`, etc.).
pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
let mut cur = n;
// Bounded walk tree-sitter can nest deeply but we only need a handful
// Bounded walk, tree-sitter can nest deeply but we only need a handful
// of hops for real code.
for _ in 0..16 {
match cur.kind() {
@ -68,7 +68,7 @@ pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
cur = cur.child_by_field_name("value")?;
}
// Drill through nested calls / method chains to find the base
// identifier. E.g. `Connection::open(p).unwrap().execute(...)`
// identifier. E.g. `Connection::open(p).unwrap().execute(...)`:
// the receiver of `.execute` is the `.unwrap()` call whose
// object is `Connection::open(p)`; we want the leftmost plain
// identifier the chain resolves to (for SSA var_stacks lookup).
@ -212,7 +212,7 @@ pub(crate) fn first_call_ident_with_span<'a>(
return ident.map(|s| (s, span));
}
Kind::Function => {
// Do not descend into nested function/lambda bodies
// Do not descend into nested function/lambda bodies;
// they are separate scopes and should not contribute
// callee identifiers to the parent expression.
continue;
@ -240,7 +240,7 @@ pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> O
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
///
/// Returns `(callee_text, label, span)` where `span` is the byte range of the
/// inner call node itself used to populate `CallMeta.callee_span` so that
/// inner call node itself, used to populate `CallMeta.callee_span` so that
/// display sites can report the actual call location rather than the enclosing
/// statement's span.
pub(crate) fn find_classifiable_inner_call<'a>(
@ -251,7 +251,7 @@ pub(crate) fn find_classifiable_inner_call<'a>(
) -> Option<(String, DataLabel, (usize, usize))> {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
// Do not descend into Kind::Function nodes they will be extracted
// Do not descend into Kind::Function nodes, they will be extracted
// as separate BodyCfg entries and should not contribute inner callees
// to the parent expression.
if lookup(lang, c.kind()) == Kind::Function {
@ -329,7 +329,7 @@ pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option<String> {
match n.kind() {
"member_expression" | "attribute" | "selector_expression" => {
// Tree-sitter exposes the receiver under `object` (JS/TS, Python),
// `value` (Rust field_expression handled in the matching arm
// `value` (Rust field_expression, handled in the matching arm
// above), or `operand` (Go selector_expression). Without the
// `operand` fallback, Go member access like `r.Body` collapsed to
// just the trailing field (`Body`), so source rules keyed on the
@ -442,7 +442,7 @@ pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
/// This finds anonymous functions / arrow functions / closures that are
/// passed as arguments to a call and should be analysed as separate
/// function scopes. Only direct function-argument children are collected
/// (not functions nested inside other functions those get handled when
/// (not functions nested inside other functions, those get handled when
/// the outer function is recursed into).
pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
let mut funcs = Vec::new();
@ -558,7 +558,7 @@ pub(crate) fn derive_anon_fn_name_from_context<'a>(
}
// Python: `h = lambda: ...` parents as `assignment`, handled above.
// Python `default_parameter` assigning `def foo(x=lambda: 0)` ambiguous, skip.
// Python `default_parameter` assigning `def foo(x=lambda: 0)`, ambiguous, skip.
_ => {
// Some grammars wrap the RHS in an `expression`, `expression_list`,
// or similar node between the binding site and the function literal.
@ -709,7 +709,7 @@ pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
}
}
/// Pointer-Phase 6 / W5: AST kind names for subscript / index expressions
/// AST kind names for subscript / index expressions
/// across the languages whose container-element flow we model.
///
/// JS/TS use `subscript_expression`; Python uses `subscript`; Go uses
@ -724,7 +724,7 @@ pub(crate) fn is_subscript_kind(kind: &str) -> bool {
)
}
/// Pointer-Phase 6 / W5: when the LHS of an assignment statement is a
/// When the LHS of an assignment statement is a
/// subscript / index expression (or a single-element wrapper around
/// one), return that node. Returns `None` for multi-target Go
/// `expression_list`s, identifier LHSs, member-expression LHSs, etc.
@ -745,10 +745,10 @@ pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option<Node<'
None
}
/// Pointer-Phase 6 / W5: extract `(array_text, index_text)` from a
/// Extract `(array_text, index_text)` from a
/// subscript / index AST node.
///
/// Returns `None` when the array operand is not a plain identifier we
/// Returns `None` when the array operand is not a plain identifier, we
/// only synthesise `__index_get__` / `__index_set__` calls when the
/// receiver resolves cleanly to a SSA-renamed local, since the W2/W4
/// container hooks need a stable receiver var_name to drive
@ -771,7 +771,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
n.named_children(&mut cur).nth(1)
})?;
let arr_kind = arr.kind();
// Only proceed when the array is a plain identifier otherwise
// Only proceed when the array is a plain identifier, otherwise
// we can't bind a stable receiver name for the synth Call.
if !matches!(
arr_kind,
@ -780,7 +780,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
return None;
}
let arr_text = text_of(arr, code)?;
// PHP-style `$x` strip not needed here Go/JS/Python don't use it.
// PHP-style `$x` strip not needed here, Go/JS/Python don't use it.
let idx_text = text_of(idx, code)?;
Some((arr_text, idx_text))
}

View file

@ -1,4 +1,4 @@
//! Phase 6: per-language class / trait / interface hierarchy extraction.
//! Per-language class / trait / interface hierarchy extraction.
//!
//! Walks a parsed file's AST and emits `(sub_container, super_container)`
//! pairs for every declared inheritance / impl / implements relationship.
@ -47,7 +47,7 @@ pub(crate) fn collect_hierarchy_edges(
"php" => collect_php(root, code, &mut push),
"cpp" | "c++" => collect_cpp(root, code, &mut push),
// Go: structural / implicit interface satisfaction is intractable
// per-file; Phase 6 deliberately skips it.
// per-file; deliberately skipped.
// C: no inheritance.
_ => {}
}
@ -70,7 +70,7 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
let Some(sub) = text_of(name_node, code) else {
return;
};
// `superclass` field on class_declaration singular `extends Y`.
// `superclass` field on class_declaration, singular `extends Y`.
if let Some(superclass) = node.child_by_field_name("superclass") {
let mut cursor = superclass.walk();
for c in superclass.named_children(&mut cursor) {
@ -79,13 +79,13 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
}
}
}
// `interfaces` field on class_declaration `implements I, J`
// `interfaces` field on class_declaration, `implements I, J`
// wraps a `super_interfaces` → `type_list`.
if let Some(ifaces) = node.child_by_field_name("interfaces") {
collect_java_type_list(ifaces, code, &sub, push);
}
// `extends_interfaces` is an unnamed child on
// interface_declaration `extends Foo, Bar` for an
// interface_declaration, `extends Foo, Bar` for an
// interface. Walk children directly since it's not a field.
let mut cursor = node.walk();
for c in node.named_children(&mut cursor) {
@ -123,7 +123,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"type_identifier" | "identifier" => text_of(n, code),
"generic_type" => {
// `Foo<T>` the leading child is the bare type identifier.
// `Foo<T>`, the leading child is the bare type identifier.
let mut cursor = n.walk();
for c in n.named_children(&mut cursor) {
if matches!(
@ -136,7 +136,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
None
}
"scoped_type_identifier" => {
// `pkg.Foo` return last segment.
// `pkg.Foo`, return last segment.
text_of(n, code).map(|s| {
let last = s.rsplit('.').next().unwrap_or(&s);
last.to_string()
@ -152,7 +152,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
/// Walk for `impl_item` nodes and emit edges from the concrete type to
/// the trait being implemented. Inherent impls (`impl Foo {}`) emit
/// no edge there is no super-trait relationship to record.
/// no edge, there is no super-trait relationship to record.
fn collect_rust<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mut F) {
walk(root, &mut |node| {
if node.kind() != "impl_item" {
@ -179,7 +179,7 @@ fn rust_path_leaf(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"type_identifier" | "identifier" => text_of(n, code),
"scoped_type_identifier" | "scoped_identifier" => {
// `crate::foo::Bar` last segment.
// `crate::foo::Bar`, last segment.
let s = text_of(n, code)?;
Some(s.rsplit("::").next().unwrap_or(&s).to_string())
}
@ -286,12 +286,12 @@ fn collect_python<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &
let Some(superclasses) = node.child_by_field_name("superclasses") else {
return; // no parents
};
// `superclasses` is an `argument_list` each non-keyword
// `superclasses` is an `argument_list`, each non-keyword
// argument is a base class.
let mut cursor = superclasses.walk();
for arg in superclasses.named_children(&mut cursor) {
if let Some(t) = python_base_text(arg, code) {
// Skip Python `object` not informative.
// Skip Python `object`, not informative.
if t != "object" {
push(sub.clone(), t);
}
@ -304,7 +304,7 @@ fn python_base_text(n: Node<'_>, code: &[u8]) -> Option<String> {
match n.kind() {
"identifier" => text_of(n, code),
"attribute" => {
// `pkg.Base` last segment.
// `pkg.Base`, last segment.
let s = text_of(n, code)?;
Some(s.rsplit('.').next().unwrap_or(&s).to_string())
}
@ -474,7 +474,7 @@ mod tests {
let src = "interface Mine extends Foo, Bar {}";
let edges = collect("java", src);
// tree-sitter-java models `extends` on interface as `extends_interfaces`
// rooted at the same node at least one of the parents should land.
// rooted at the same node, at least one of the parents should land.
assert!(
edges.iter().any(|(s, _)| s == "Mine"),
"interface extends should emit at least one edge; got {edges:?}"
@ -516,8 +516,8 @@ mod tests {
#[test]
fn python_class_object_base_skipped() {
// Inheriting from `object` is not informative Python's
// implicit root. Phase 6 omits these edges to keep the
// Inheriting from `object` is not informative, Python's
// implicit root. We omit these edges to keep the
// hierarchy index focused on user-defined relationships.
let src = "class Plain(object):\n pass\n";
let edges = collect("python", src);

View file

@ -12,7 +12,7 @@ use tree_sitter::{Node, Tree};
/// - ES6: `import { A as B } from 'mod'` → B → ImportBinding { original: A, module: mod }
/// - CommonJS: `const { A: B } = require('mod')` → B → ImportBinding { original: A, module: mod }
///
/// Only aliased (renamed) bindings are recorded same-name imports (e.g.
/// Only aliased (renamed) bindings are recorded, same-name imports (e.g.
/// `import { exec }`) are already resolvable by their original name.
pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBindings {
let mut bindings = ImportBindings::new();
@ -149,7 +149,7 @@ pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBinding
continue;
}
// The alias is accessed via the "alias" field (a `name` node).
// The qualified name has no field find it by kind.
// The qualified name has no field, find it by kind.
let alias_node = clause.child_by_field_name("alias");
let mut c2 = clause.walk();
let qname_node = clause

View file

@ -45,7 +45,7 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
/// identifiers lifted from pair values whose key matches any entry in
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
/// pairs are present, returns `Some(vec![])` the sink is effectively
/// pairs are present, returns `Some(vec![])`, the sink is effectively
/// silenced because no destination identifier exists.
/// * `None` if the arg is absent, is not an object literal (plain string
/// / ident / expression), or has splat/spread children that break static
@ -77,7 +77,7 @@ pub(super) fn extract_destination_field_idents(
match child.kind() {
// `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't
// statically attribute spread contents to specific fields, so
// bail out caller falls back to the whole-arg filter, matching
// bail out, caller falls back to the whole-arg filter, matching
// the conservative posture used by arg_uses for splats.
"spread_element" | "dictionary_splat" => {
return None;
@ -107,7 +107,7 @@ pub(super) fn extract_destination_field_idents(
}
}),
// Computed keys like `[someVar]` can't be statically
// resolved skip (conservative: not a destination field).
// resolved, skip (conservative: not a destination field).
"computed_property_name" => continue,
_ => text_of(key_node, code),
};
@ -200,7 +200,7 @@ pub(super) fn extract_const_keyword_arg(
continue;
}
let value_node = child.child_by_field_name("value")?;
// Only return a literal identifiers / calls / complex exprs are
// Only return a literal, identifiers / calls / complex exprs are
// "dynamic" and must be reported as `None` so the gate can
// distinguish literal-safe from dynamic.
return match value_node.kind() {
@ -252,7 +252,7 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8])
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
/// `arg0`). Returns `None` when the call has no arguments.
///
/// Used by per-language shape-aware sink suppression for example, Ruby
/// Used by per-language shape-aware sink suppression, for example, Ruby
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
/// regardless of taint reaching that argument.
@ -268,7 +268,7 @@ pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bo
/// Walk a Java method-chain receiver looking for an inner `method_invocation`
/// whose method name matches one of `target_methods` (e.g. `createQuery`,
/// `prepareStatement`). Returns the kind of that inner call's arg 0 used
/// `prepareStatement`). Returns the kind of that inner call's arg 0, used
/// to verify the SQL-bearing call up-chain was given a string literal rather
/// than a concatenation / method call.
///
@ -307,7 +307,7 @@ pub(super) fn java_chain_arg0_kind_for_method(
/// method identifier matches one of `target_methods`, then return that
/// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node
/// represents a chained expression like `Model.where(...).preload(...).to_a`
/// the outermost call (`to_a`) has no arguments, so the shape suppressor
/// where the outermost call (`to_a`) has no arguments, so the shape suppressor
/// must reach down the chain to inspect `where`'s arg 0.
///
/// Conservative: returns `None` if the chain doesn't contain a matching
@ -353,6 +353,116 @@ fn subtree_has_interpolation(n: Node) -> bool {
n.named_children(&mut cursor).any(subtree_has_interpolation)
}
/// Search a JS/TS call chain, from the outermost call inwards along the
/// receiver side, for a `call_expression` whose callee names one of
/// `target_methods` (e.g. `query`, `execute`), and report the
/// `(kind, has_interp)` pair for that call's arg 0.
///
/// This recognises ORM-accessor chains in which a labelled SQL sink is the
/// receiver of a parameterised execute method, e.g.
/// `strapi.db.query('admin::api-token').findOne({...})`: the CFG node is
/// the outer `findOne` call, while the inner labelled `db.query` call
/// carries the literal model UID that proves the chain is parameterised.
///
/// A callee matches a target when its `property` field text equals the
/// target, or when its full source text equals the target or ends with
/// `.<target>`.
///
/// Conservative: yields `None` when no matching inner call is found, so
/// callers fall through to the no-suppression path.
pub(super) fn js_chain_arg0_kind_for_method(
    expr: Node,
    target_methods: &[&str],
    code: &[u8],
) -> Option<(String, bool)> {
    let call = unwrap_parens(expr);
    // tree-sitter-typescript / -javascript: a call_expression carries the
    // fields `function` (member_expression / identifier) and `arguments`.
    if call.kind() != "call_expression" {
        return None;
    }
    let callee = call.child_by_field_name("function")?;

    // `property` exists for member_expression callees; the full callee text
    // covers bare-identifier calls and dotted paths.
    let property = callee
        .child_by_field_name("property")
        .and_then(|p| text_of(p, code));
    let callee_text = text_of(callee, code);

    // Comparing the segment after the last `.` would add nothing here: that
    // segment equals a target exactly when the full text equals the target
    // or ends with `.<target>`, and both are checked below.
    let is_target = target_methods.iter().any(|&method| {
        property.as_deref() == Some(method)
            || callee_text.as_deref().is_some_and(|text| {
                text == method
                    || text
                        .strip_suffix(method)
                        .is_some_and(|head| head.ends_with('.'))
            })
    });
    if is_target {
        // This is the inner labelled call; describe its first argument.
        return arg0_kind_and_interpolation(call);
    }

    // Otherwise drill down the receiver spine: `function.object` is the
    // prior call in the chain.
    let receiver = callee.child_by_field_name("object")?;
    js_chain_arg0_kind_for_method(receiver, target_methods, code)
}
/// Find the method that is invoked directly on the result of an inner
/// labelled call in a JS/TS call chain.
///
/// Starting at `outer`, this follows the receiver spine
/// (`function.object`) through successive `call_expression`s. At the first
/// level whose receiver is a call to one of `target_inner` (e.g. `query`,
/// `execute`), it returns the `property` text of the call made on that
/// receiver (e.g. `findOne` for `db.query(LITERAL).findOne(...)`).
///
/// A receiver call matches a target when its callee's `property` field
/// text equals the target, or when the callee's full source text equals
/// the target or ends with `.<target>`.
///
/// Returns `None` when `outer` is not a `call_expression`, when its callee
/// has no `object` field, or when the receiver spine reaches a non-call
/// node before a `target_inner` call is found. In particular a bare
/// `connection.query("SELECT ...")` yields `None` because there is no
/// outer chain method.
///
/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain
/// shape `<inner>.query(LITERAL).<orm_method>(...)`.
pub(super) fn js_chain_outer_method_for_inner<'a>(
    outer: Node<'a>,
    target_inner: &[&str],
    code: &'a [u8],
) -> Option<String> {
    let call = unwrap_parens(outer);
    if call.kind() != "call_expression" {
        return None;
    }
    let callee = call.child_by_field_name("function")?;
    let receiver = callee.child_by_field_name("object")?;
    // NOTE(review): the receiver's kind is checked before any paren
    // unwrapping, so a receiver that is not itself a `call_expression` node
    // ends the search with `None` (the conservative outcome).
    if receiver.kind() != "call_expression" {
        return None;
    }

    // If the receiver is itself a call to one of `target_inner`, the method
    // immediately outside it is this level's `function.property`.
    if let Some(inner_callee) = receiver.child_by_field_name("function") {
        let inner_property = inner_callee
            .child_by_field_name("property")
            .and_then(|p| text_of(p, code));
        let inner_text = text_of(inner_callee, code);
        // Matching on the full text (exact, or with a `.<target>` suffix)
        // already covers the "last dotted segment equals target" case, so
        // no separate leaf string is built.
        let receiver_is_target = target_inner.iter().any(|&method| {
            inner_property.as_deref() == Some(method)
                || inner_text.as_deref().is_some_and(|text| {
                    text == method
                        || text
                            .strip_suffix(method)
                            .is_some_and(|head| head.ends_with('.'))
                })
        });
        if receiver_is_target {
            // `text_of` already yields an owned `String`; return it as-is.
            return callee
                .child_by_field_name("property")
                .and_then(|p| text_of(p, code));
        }
    }

    // Recurse: the chain may have more depth (`a.b().c().d()`: d is
    // outermost, c is next, the target may be at b or further in).
    js_chain_outer_method_for_inner(receiver, target_inner, code)
}
/// For a chained method call (`a.b().c().d()`), walk down the receiver
/// chain (`function.object`) and return the innermost call_expression
/// alongside its callee text (e.g. `"http.get"`).
@ -385,7 +495,7 @@ pub(super) fn find_chained_inner_call<'a>(
return None;
}
// Recurse: the inner call may itself be chained
// (`axios.get(u).then(h).catch(h)` innermost is `axios.get`).
// (`axios.get(u).then(h).catch(h)`, innermost is `axios.get`).
if let Some(inner) = find_chained_inner_call(object, lang, code) {
return Some(inner);
}
@ -398,7 +508,7 @@ pub(super) fn find_chained_inner_call<'a>(
.or_else(|| object.child_by_field_name("name"))?;
// Multi-line dotted member expressions (`http\n .get`) include
// formatting whitespace in the source-text slice. The labels map
// keys are literal `"http.get"` etc. strip whitespace so the
// keys are literal `"http.get"` etc., strip whitespace so the
// chained-call inner-gate rebinding fires for both single-line and
// multi-line chain styles. Also strips `\r` for CRLF sources.
// Motivated by upstream Parse Server CVE-2025-64430 which uses the
@ -410,18 +520,18 @@ pub(super) fn find_chained_inner_call<'a>(
/// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod
/// node) and yield each *named argument* of every inner call along the
/// way. Outer's own arguments are NOT included the caller already
/// way. Outer's own arguments are NOT included, the caller already
/// handles those via the standard `pre_emit_arg_source_nodes` pass over
/// `outer.arguments`.
///
/// For `json.NewDecoder(r.Body).Decode(emoji)`:
/// outer = `.Decode(emoji)` caller iterates `emoji`
/// inner = `json.NewDecoder(r.Body)` yielded arg: `r.Body`
/// outer = `.Decode(emoji)` -> caller iterates `emoji`
/// inner = `json.NewDecoder(r.Body)` -> yielded arg: `r.Body`
///
/// We only pull from each inner call's `arguments` field, never from its
/// `function`/`method`/receiver expressions. That distinction matters
/// because chained source-receivers like `r.URL.Query()` expose a
/// member-text path that classifies as a Source but it's the OUTER
/// member-text path that classifies as a Source, but it's the OUTER
/// chain text (`r.URL.Query.Get`) that already classifies, so emitting
/// a synth source for the inner-call's own callee would double-count.
///
@ -498,7 +608,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
return false;
}
let first_arg = named[0];
// Extract the raw text of arg 0 must be a string literal or
// Extract the raw text of arg 0, must be a string literal or
// template string without interpolation.
let query_text = match first_arg.kind() {
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
@ -511,7 +621,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
.named_children(&mut c)
.any(|ch| ch.kind() == "template_substitution")
{
return false; // dynamic not safe
return false; // dynamic, not safe
}
text_of(first_arg, code)
}
@ -534,7 +644,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
/// - `$1`, `$2`, …, `$N` (PostgreSQL positional)
/// - `?` (MySQL / SQLite positional)
/// - `%s` (Python DB-API / psycopg2)
/// - `:identifier` (Oracle / named parameters) requires the colon to be
/// - `:identifier` (Oracle / named parameters), requires the colon to be
/// preceded by a space or `=` (to avoid matching JS ternary / object
/// literals).
pub(super) fn has_sql_placeholders(s: &str) -> bool {
@ -559,7 +669,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
&& i + 1 < len
&& bytes[i + 1].is_ascii_alphabetic() =>
{
// :identifier must be preceded by whitespace/= to avoid
// :identifier, must be preceded by whitespace/= to avoid
// false positives on object literals or ternary operators.
return true;
}
@ -581,7 +691,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
#[allow(clippy::only_used_in_recursion)]
pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
match node.kind() {
// Scalar strings but reject if they contain interpolation
// Scalar strings, but reject if they contain interpolation
// (e.g. Ruby `"hello #{name}"`, Python f-strings).
"string"
| "string_literal"
@ -602,7 +712,7 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
// PHP encapsed_string: safe only if no variable interpolation
"encapsed_string" => !has_interpolation_cfg(node),
// Wrapper: PHP/Go wrap each arg in an `argument` node unwrap
// Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap
"argument" => {
node.named_child_count() == 1
&& node
@ -765,7 +875,7 @@ pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool {
return false;
}
}
// Zero-arg calls are not "all literal" taint can still flow via a
// Zero-arg calls are not "all literal", taint can still flow via a
// non-literal receiver (e.g. `tainted.readObject()`), and the sink-
// suppression gate (`info.all_args_literal`) must not skip these.
if !any_arg {
@ -781,7 +891,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
let kind = child.kind();
// Skip argument lists those are checked by the caller.
// Skip argument lists, those are checked by the caller.
if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" {
continue;
}
@ -804,7 +914,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
/// Returns one `Vec<String>` per argument (in parameter-position order).
/// Returns empty if argument list can't be found or contains spread/keyword args.
pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>> {
// Ruby `subshell` (backticks) has no `arguments` field its children are
// Ruby `subshell` (backticks) has no `arguments` field, its children are
// string fragments and `interpolation` nodes. Lift each interpolation's
// identifiers into a positional arg so taint flows from `#{var}` into the
// synthetic "subshell" sink.
@ -834,7 +944,7 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
for child in args_node.named_children(&mut cursor) {
let kind = child.kind();
// Named / keyword arguments are tracked separately in `CallMeta.kwargs`
// and do not participate in positional indexing skip them here so
// and do not participate in positional indexing, skip them here so
// `arg_uses` remains strictly positional. Splats (spread/dict splat)
// still invalidate positional mapping; bail out in that case.
if kind == "spread_element"
@ -1058,13 +1168,13 @@ pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
/// (non-method-chain) Go shape. The caller wires the resulting cap into
/// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the
/// taint engine consumes via the standard sanitizer pathway taint flows
/// taint engine consumes via the standard sanitizer pathway, taint flows
/// in on `s`, the matching cap is stripped from the result.
pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
if call_ast.kind() != "call_expression" {
return None;
}
// The call's `function` field is a `selector_expression` `operand`
// The call's `function` field is a `selector_expression`, `operand`
// is the package ident (`strings`), `field` is the method ident.
let func = call_ast.child_by_field_name("function")?;
if func.kind() != "selector_expression" {
@ -1085,7 +1195,7 @@ pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> O
let new_lit = extract_const_string_arg(call_ast, 2, code)?;
// If the replacement itself reintroduces a dangerous sequence, don't
// credit the strip matches the Rust chain detector's policy.
// credit the strip; this matches the Rust chain detector's policy.
if !caps_stripped_by_literal_pattern(&new_lit).is_empty() {
return None;
}
@ -1106,7 +1216,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
}
match lookup(lang, n.kind()) {
Kind::Function => {
// Function/closure expression passed as argument return the same
// Function/closure expression passed as argument, return the same
// synthetic anon name used by build_sub so callback_bindings and
// source_to_callback can match it to the extracted BodyCfg.
n.child_by_field_name("name")
@ -1155,7 +1265,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
/// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`].
///
/// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces
/// an empty vector positional indices past the splat are meaningless and
/// an empty vector, positional indices past the splat are meaningless and
/// downstream passes already treat an empty vector as "no info".
pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<Option<String>> {
let Some(args_node) = call_node.child_by_field_name("arguments") else {
@ -1175,7 +1285,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
return Vec::new();
}
// Named / keyword arguments are tracked separately in `kwargs` and
// don't participate in positional indexing skip them here so this
// don't participate in positional indexing, skip them here so this
// vector stays aligned with `arg_uses`.
if kind == "keyword_argument" || kind == "named_argument" {
continue;
@ -1198,7 +1308,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
| "raw_string_literal"
// PHP's double-quoted form (single-quoted maps to `string`).
// Only safe to lift when there is no `encapsed_string` /
// `embedded_expression` interpolation child checked below.
// `embedded_expression` interpolation child, checked below.
| "encapsed_string" => {
let raw = text_of(target, code);
raw.and_then(|s| strip_literal_quotes(&s, target, code))
@ -1212,7 +1322,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
/// Strip surrounding quotes from a syntactic string literal, resolving the
/// `string_content` child for Rust-style two-level string nodes. Returns the
/// raw inner text (no escape-sequence processing) sufficient for whitelist
/// raw inner text (no escape-sequence processing), sufficient for whitelist
/// matching against shell-metachar sets.
pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option<String> {
// Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child.
@ -1320,7 +1430,7 @@ pub(super) fn def_use(
// Python/Ruby `expression_statement` → `assignment`)
let mut cursor = ast.walk();
for child in ast.children(&mut cursor) {
// Only use left/right fields for actual assignment nodes binary
// Only use left/right fields for actual assignment nodes, binary
// expressions also have left/right but are not definitions.
let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
let child_name = child
@ -1403,7 +1513,7 @@ pub(super) fn def_use(
(defs, uses, vec![])
}
// iflet / whilelet the `let_condition` binds a variable from
// iflet / whilelet, the `let_condition` binds a variable from
// the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
// defines `cmd` and uses `env`, `var`, `CMD`.
Kind::If | Kind::While => {
@ -1418,7 +1528,7 @@ pub(super) fn def_use(
let mut tmp = Vec::<String>::new();
collect_idents(pat, code, &mut tmp);
// The first plain identifier in the pattern is the binding.
// Skip type identifiers (e.g. "Ok" in Ok(cmd)) take the
// Skip type identifiers (e.g. "Ok" in Ok(cmd)), take the
// last ident which is the inner binding name.
defs = tmp.into_iter().last();
}

View file

@ -14,6 +14,7 @@ use crate::labels::{
};
use crate::summary::FuncSummary;
use crate::symbol::{FuncKey, Lang};
use crate::utils::snippet::truncate_at_char_boundary;
use smallvec::SmallVec;
use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
@ -54,8 +55,8 @@ use literals::{
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, ruby_chain_arg0_for_method,
walk_chain_inner_call_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
use params::{
compute_container_and_kind, extract_param_meta, inject_framework_param_sources,
@ -74,7 +75,7 @@ pub fn extract_param_meta_for_test<'a>(
}
/// Test-only helper to populate the per-file DTO class map without
/// running `build_cfg`. Used by the Phase 6 audit harness in
/// running `build_cfg`. Used by the DTO audit harness in
/// `tests/typed_extractors_audit.rs` to verify that
/// `classify_param_type_*` resolves a same-file DTO via the
/// thread-local map.
@ -91,30 +92,26 @@ pub fn clear_dto_classes_for_test() {
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
}
// -------------------------------------------------------------------------
// Structural DFS index for function bodies
// -------------------------------------------------------------------------
//
// Per-file map of function-node start_byte → depth-first preorder index.
// Populated at the start of `build_cfg`, consumed by every site that
// previously formatted `<anon@{start_byte}>` or stored `start_byte` as
// the disambig. The DFS index is stable against edits elsewhere in the
// file (inserting a line above a function does not change its index).
//
// Thread-local is safe because `build_cfg` is not re-entrant within a
// single rayon worker: each file is parsed and CFG-built to completion
// before the next one starts.
// Per-file map of function-node start_byte → DFS preorder index. Stable
// against unrelated edits (inserting a line above a function doesn't
// change its index). Thread-local is safe because `build_cfg` is not
// re-entrant within a single rayon worker.
thread_local! {
/// Per-file map of function-node `start_byte` → DFS preorder index.
/// Read by `fn_dfs_index`, which `anon_fn_name` uses to build the
/// `<anon#N>` synthetic name.
static FN_DFS_INDICES: RefCell<HashMap<usize, u32>> = RefCell::new(HashMap::new());
/// Per-file DTO class definitions. Populated at the top of
/// [`build_cfg`] by [`dto::collect_dto_classes`] so per-parameter
/// classifiers can resolve typed extractors (`@RequestBody T dto` /
/// `Json<CreateUser>` / `Annotated[CreateUser, Body()]`) to a
/// [`crate::ssa::type_facts::TypeKind::Dto`] when the DTO type is
/// declared in the same file. Cleared at the end of `build_cfg`
/// so thread-local state never leaks between files.
pub(crate) static DTO_CLASSES: RefCell<HashMap<String, crate::ssa::type_facts::DtoFields>>
= RefCell::new(HashMap::new());
/// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` /
/// `Array<...>` / `T[]`) aliases, populated at the top of
/// [`build_cfg`]. Lets `classify_param_type_ts` resolve a
/// parameter typed `m: ElementsMap` to
/// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via
/// same-file alias lookup. Cross-file aliases are not yet
/// resolved.
pub(crate) static TYPE_ALIAS_LC: RefCell<std::collections::HashSet<String>>
= RefCell::new(std::collections::HashSet::new());
}
/// Populate the per-file DFS-index map from a preorder walk of the
@ -148,11 +145,8 @@ fn fn_dfs_index(start_byte: usize) -> Option<u32> {
FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied())
}
/// Synthetic name for an anonymous function. Uses the DFS index when
/// available (`<anon#N>`), falls back to the byte offset when the map
/// is empty (e.g. during tests that bypass `build_cfg`). The `#`
/// sigil is intentionally different from `@` so the two formats are
/// distinguishable by downstream consumers.
/// Synthetic name for an anonymous function: `<anon#N>` from the DFS
/// index when available, `<anon@OFFSET>` as fallback.
pub(crate) fn anon_fn_name(start_byte: usize) -> String {
match fn_dfs_index(start_byte) {
Some(idx) => format!("<anon#{idx}>"),
@ -160,9 +154,7 @@ pub(crate) fn anon_fn_name(start_byte: usize) -> String {
}
}
/// Prefix check that accepts both the new `<anon#...>` and legacy
/// `<anon@...>` formats. Used by code paths that classify whether a
/// function name came from anonymous synthesis.
/// True for any anonymous-function synthesis prefix.
pub(crate) fn is_anon_fn_name(name: &str) -> bool {
    // Both synthesis formats count: `<anon#N>` (DFS index) and
    // `<anon@OFFSET>` (byte-offset fallback).
    const ANON_PREFIXES: [&str; 2] = ["<anon#", "<anon@"];
    ANON_PREFIXES.iter().any(|&prefix| name.starts_with(prefix))
}
@ -235,9 +227,9 @@ pub struct CallMeta {
///
/// CFG construction does NOT populate this field today (callee already
/// carries the full path). It is the canonical place to read the original
/// textual callee for **debug/display only** analysis code should walk
/// SSA `FieldProj` receivers (Phase 4) or use the
/// [`crate::labels::bare_method_name`] textual fallback (Phase 5).
/// textual callee for **debug/display only**, analysis code should walk
/// SSA `FieldProj` receivers or use the
/// [`crate::labels::bare_method_name`] textual fallback.
#[doc(hidden)]
#[serde(default)]
pub callee_text: Option<String>,
@ -248,14 +240,14 @@ pub struct CallMeta {
pub outer_callee: Option<String>,
/// Byte span of the inner call that supplied the classification, when
/// `find_classifiable_inner_call` overrode the outer callee. `None` when
/// the classification came from the outer AST node directly in that
/// the classification came from the outer AST node directly, in that
/// case `AstMeta.span` already points at the classified expression.
///
/// Consumers that want the location of the *labeled* call (sink/source/
/// sanitizer display, flow-step rendering, taint origin attribution)
/// should use [`NodeInfo::classification_span`] rather than reading this
/// field directly. `AstMeta.span` remains the authoritative "whole
/// statement" span used by structural passes (unreachability,
/// statement" span, used by structural passes (unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup).
#[serde(default)]
pub callee_span: Option<(usize, usize)>,
@ -283,7 +275,7 @@ pub struct CallMeta {
/// only positional arguments.
pub kwargs: Vec<(String, Vec<String>)>,
/// String-literal value at each positional argument of this call, parallel
/// to `arg_uses` `Some(s)` when the argument is a syntactic string
/// to `arg_uses`, `Some(s)` when the argument is a syntactic string
/// literal, `None` otherwise. Empty for non-call nodes or when positional
/// boundaries can't be determined. Consumed by the static-map abstract
/// analysis (and future literal-aware passes) so they don't need the
@ -302,10 +294,41 @@ pub struct CallMeta {
///
/// Takes priority over `sink_payload_args` in the SSA sink scan: when a
/// call has an object-literal destination arg, only idents under the
/// listed fields may contribute sink findings not every ident in the
/// listed fields may contribute sink findings, not every ident in the
/// positional slot.
///
/// Legacy single-gate path: populated only when this call site matched
/// exactly one gate. When a callee carries multiple gates (e.g. `fetch`
/// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in
/// [`Self::gate_filters`] and this field is left `None`.
#[serde(default)]
pub destination_uses: Option<Vec<String>>,
/// Per-gate filters for callees that carry multiple gated-sink rules.
///
/// Each entry preserves one matching gate's `(label_caps, payload_args,
/// destination_uses)` so the SSA sink scan can attribute findings
/// per-cap. Empty when the call site matches zero or exactly one gate
/// (the single-gate case continues to use [`Self::sink_payload_args`] +
/// [`Self::destination_uses`]).
#[serde(default)]
pub gate_filters: Vec<GateFilter>,
}
/// One gate's contribution at a call site whose callee matches multiple
/// gates. The SSA taint engine processes each filter independently so a
/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF
/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each
/// carrying its own cap mask rather than a conflated union.
// NOTE(review): the visible part of `push_node` builds one `GateFilter` per
// matching gate, including when only a single gate matches — confirm whether
// the single-gate entry is dropped before it reaches `CallMeta::gate_filters`.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct GateFilter {
/// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`).
/// `push_node` stores `Cap::empty()` here when the gate's label is not a
/// `DataLabel::Sink`.
pub label_caps: crate::labels::Cap,
/// Argument positions that carry the tainted payload for this gate.
/// When the gate declares the `ALL_ARGS_PAYLOAD` sentinel, `push_node`
/// expands it to every positional index `0..arity` of the actual call.
pub payload_args: Vec<usize>,
/// Destination-aware filter: when `Some(names)`, the sink check only
/// considers SSA values whose `var_name` matches one of `names` (object-
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
/// `push_node` fills this from the first payload position for which
/// `extract_destination_field_idents` returns names.
pub destination_uses: Option<Vec<String>>,
}
/// Taint-classification and variable-flow metadata.
@ -349,7 +372,7 @@ pub struct NodeInfo {
///
/// This flag is scoped to taint-style sink suppression: it indicates
/// that no attacker-controlled data enters through the immediate
/// arguments. It does NOT mean the call is "safe" in general other
/// arguments. It does NOT mean the call is "safe" in general, other
/// detectors (resource lifecycle, structural analysis) may still
/// legitimately flag these calls.
pub all_args_literal: bool,
@ -411,7 +434,7 @@ pub struct NodeInfo {
pub is_eq_with_const: bool,
/// True when this node reads a numeric-length property on a container:
/// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()`
/// either as a pure property access or as a zero-arg method call.
/// either as a pure property access or as a zero-arg method call.
/// Populated by inspecting the AST in `push_node` across JS/TS, Python,
/// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return
/// an integer. Consumed by the type-fact analysis (`ssa::type_facts`)
@ -419,12 +442,12 @@ pub struct NodeInfo {
/// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric
/// payloads.
pub is_numeric_length_access: bool,
/// Phase 6.3: the field name read on the RHS of an assignment whose
/// The field name read on the RHS of an assignment whose
/// RHS is a single member-access expression (e.g. `let x = dto.email`).
/// Set to `Some("email")` for that shape; left `None` otherwise.
/// Consumed by the type-fact analysis (`ssa::type_facts`) so reads
/// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick
/// up the field's declared `TypeKind`. Strictly additive when
/// up the field's declared `TypeKind`. Strictly additive, when
/// `None`, the legacy copy-prop semantics apply.
pub member_field: Option<String>,
}
@ -442,7 +465,7 @@ impl NodeInfo {
/// lines, flow-step rendering, symbolic witness extraction, debug views.
///
/// Use `ast.span` directly for **structural grain**: unreachability,
/// resource lifecycle, guard byte scans, CFG/taint span dedup anywhere
/// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere
/// the enclosing statement is the meaningful unit.
#[inline]
pub fn classification_span(&self) -> (usize, usize) {
@ -514,7 +537,7 @@ pub struct BodyMeta {
/// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from
/// decorators / annotations / static type text at CFG construction
/// time. Same length as `params`; positions with no recoverable
/// type info are `None`. Strictly additive when every entry is
/// type info are `None`. Strictly additive, when every entry is
/// `None`, downstream behaviour is identical to the pre-Phase-1
/// engine.
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
@ -528,7 +551,7 @@ pub struct BodyMeta {
/// `LocalFuncSummary`. `None` for the synthetic top-level body.
///
/// All intra-file maps keyed on function identity (SSA summaries, callee
/// bodies, inline cache, callback bindings) use this key never the bare
/// bodies, inline cache, callback bindings) use this key, never the bare
/// leaf `name`, which is collision-prone across (container, arity,
/// disambig, kind).
pub func_key: Option<FuncKey>,
@ -589,7 +612,7 @@ pub struct FileCfg {
/// Promisify wrapper aliases: local name → wrapped callee name.
/// Only populated for JS/TS files.
pub promisify_aliases: PromisifyAliases,
/// Phase 6: per-file class / trait / interface hierarchy edges.
/// Per-file class / trait / interface hierarchy edges.
/// Each entry is `(sub_container, super_container)` after
/// language-specific normalisation. See
/// [`crate::cfg::hierarchy`] for the per-language extraction
@ -711,14 +734,10 @@ fn extract_condition_raw<'a>(
vars.dedup();
vars.truncate(MAX_COND_VARS);
// 4. Extract text, truncated.
let text = text_of(cond, code).map(|t| {
if t.len() > MAX_CONDITION_TEXT_LEN {
t[..MAX_CONDITION_TEXT_LEN].to_string()
} else {
t
}
});
// 4. Extract text, truncated. UTF-8-safe: gogs (Gurmukhi) /
// discourse (Cyrillic) trip raw byte slices on regex literals.
let text = text_of(cond, code)
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
(text, vars, negated)
}
@ -739,7 +758,7 @@ pub(super) fn detect_negation<'a>(
_if_ast: Node<'a>,
_lang: &str,
) -> (Node<'a>, bool) {
// Unwrap parenthesized_expression JS/Java/PHP wrap if-conditions in parens.
// Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens.
// This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`.
let cond = if cond.kind() == "parenthesized_expression" {
cond.child_by_field_name("expression")
@ -811,7 +830,7 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
"*" => Some(BinOp::Mul),
"/" => Some(BinOp::Div),
"%" => Some(BinOp::Mod),
// Bitwise (single-char tokens no conflict with && / ||)
// Bitwise (single-char tokens, no conflict with && / ||)
"&" => Some(BinOp::BitAnd),
"|" => Some(BinOp::BitOr),
"^" => Some(BinOp::BitXor),
@ -909,7 +928,7 @@ fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option<String>
/// `extract_template_prefix` for both assignment RHS and call arguments.
///
/// Also descends through `await` / `yield` wrappers and into the first
/// argument of a call expression this covers the common sink shape
/// argument of a call expression, this covers the common sink shape
/// `await axios.get(\`https://host/…${x}\`)` where the template literal lives
/// inside a call inside an `await` wrapper.
fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
@ -930,7 +949,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
}
"call_expression" | "call" | "new_expression" => {
// Descend into the first positional argument (e.g.
// `axios.get(\`https://…${x}\`)` the URL we want to lock
// `axios.get(\`https://…${x}\`)`, the URL we want to lock
// is the template-literal first argument of the call).
let args = cur
.child_by_field_name("arguments")
@ -942,7 +961,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
}
}
// Case 1: template literal `\`scheme://host/…${x}…\``.
// Case 1: template literal, `\`scheme://host/…${x}…\``.
if cur.kind() == "template_string" {
let mut w = cur.walk();
let first_child = cur.named_children(&mut w).next()?;
@ -957,7 +976,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
return None;
}
// Case 2: `"scheme://host/" + x` LHS is a string literal.
// Case 2: `"scheme://host/" + x`, LHS is a string literal.
if cur.kind() == "binary_expression" {
let mut w2 = cur.walk();
let mut ops = cur.children(&mut w2).filter(|c| !c.is_named());
@ -1028,7 +1047,7 @@ fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option<i64> {
}
}
// Try left, then right one of them should be a literal
// Try left, then right, one of them should be a literal
try_parse_number(left, code).or_else(|| try_parse_number(right, code))
}
@ -1067,7 +1086,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
.named_child(0)
.is_some_and(|c| is_boolean_eq_const_tree(c, lang)),
"unary_expression" | "not_operator" => {
// `!` / `not` operator is an anonymous child; operand is the
// `!` / `not`, operator is an anonymous child; operand is the
// single named child.
let mut w = node.walk();
let mut op_is_not = false;
@ -1084,7 +1103,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
.is_some_and(|c| is_boolean_eq_const_tree(c, lang))
}
"boolean_operator" => {
// Python `and`/`or` operands are named children.
// Python `and`/`or`, operands are named children.
let l = node.named_child(0);
let r = node.named_child(1);
l.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
@ -1137,9 +1156,9 @@ fn binary_operator_token(node: Node) -> Option<String> {
/// Property names whose value is provably an integer across the supported
/// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size`
/// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count`
/// (`str.count`, `list.count`, `tuple.count` all return int); Ruby `.length`
/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length`
/// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This
/// list is intentionally narrow only properties whose semantics across every
/// list is intentionally narrow, only properties whose semantics across every
/// host we scan return an integer, so the `TypeKind::Int` fact is sound.
fn is_numeric_length_property(name: &str) -> bool {
    // Accessor names accepted as integer-valued; matching is exact and
    // case-sensitive.
    const INTEGER_ACCESSORS: [&str; 5] = ["length", "size", "byteLength", "count", "len"];
    INTEGER_ACCESSORS.contains(&name)
}
/// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to
/// infer `TypeKind::Int` on the defined value so sink-cap suppression can
/// treat `"row " + arr.length` as a non-injectable payload.
/// Phase 6.3: when the RHS of an assignment / declaration is a single
/// When the RHS of an assignment / declaration is a single
/// member-access expression (`let x = dto.email`, `x = obj.field`,
/// `let x = obj["field"]`), return the property name. The CFG type-fact
/// analysis uses the recovered name to look up the field's declared
@ -1321,7 +1340,7 @@ fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
// Check if ast itself is a binary expression
if is_binary_expr_kind(ast_kind, lang) {
// Verify it has exactly 2 named children (left, right) no nesting
// Verify it has exactly 2 named children (left, right), no nesting
let named_count = ast.named_child_count();
if named_count == 2 {
// Ensure neither child is itself a binary expression (that would
@ -1435,7 +1454,7 @@ pub(super) fn push_node<'a>(
// (e.g. PHP `object_creation_expression` has positional children).
.or_else(|| find_constructor_type_child(ast))
.and_then(|n| {
// IIFE: `(function(x){...})(arg)` the called expression is a
// IIFE: `(function(x){...})(arg)`, the called expression is a
// function literal with no identifier. Bind the call to the
// anonymous body's synthetic name so resolve_callee can find
// the extracted BodyCfg/summary. Without this, text_of() would
@ -1512,7 +1531,7 @@ pub(super) fn push_node<'a>(
// If this is a declaration/expression wrapper or an assignment that
// *contains* a call, prefer the first inner call identifier instead of
// the whole line. Track the inner call's byte span so we can populate
// `CallMeta.callee_span` once the labels settle enabling narrow
// `CallMeta.callee_span` once the labels settle, enabling narrow
// source-location reporting when the classified call lives several lines
// below the enclosing statement (e.g. call inside a multi-line template
// literal).
@ -1546,9 +1565,9 @@ pub(super) fn push_node<'a>(
let mut labels = classify_all(lang, &text, extra);
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))` `str` is not a sink, but `eval` is.
// E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is.
// When the callee is overridden, save the original for container ops
// (e.g. `parts.add(req.getParameter(...))` callee becomes
// (e.g. `parts.add(req.getParameter(...))`, callee becomes
// "req.getParameter" but outer_callee preserves "parts.add").
let mut outer_callee: Option<String> = None;
let mut inner_callee_span: Option<(usize, usize)> = None;
@ -1568,7 +1587,7 @@ pub(super) fn push_node<'a>(
// For assignments like `element.innerHTML = value`, the inner-call heuristic
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
// If that didn't produce a label, check the LHS property name it may be a
// If that didn't produce a label, check the LHS property name, it may be a
// sink like `innerHTML`.
//
// This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper`
@ -1588,7 +1607,7 @@ pub(super) fn push_node<'a>(
if let Some(assign) = assign_node
&& let Some(lhs) = assign.child_by_field_name("left")
{
// Try full member expression first (e.g. "location.href") more
// Try full member expression first (e.g. "location.href"), more
// specific and avoids false positives on `a.href`.
if let Some(full) = member_expr_text(lhs, code) {
if let Some(l) = classify(lang, &full, extra) {
@ -1612,7 +1631,7 @@ pub(super) fn push_node<'a>(
// try to classify the member expression text as a source.
// This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python),
// and similar property-access-based source patterns.
// Skip when the assignment's RHS is itself a function/lambda literal
// Skip when the assignment's RHS is itself a function/lambda literal:
// labels found by `first_member_label` would come from inside the
// closure body and shouldn't tag the outer wrapper (e.g. Go's
// `run := func() { exec.Command(...) }` would otherwise inherit
@ -1687,7 +1706,7 @@ pub(super) fn push_node<'a>(
if labels.is_empty()
&& let Some(outer) = call_ast
&& let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code)
&& classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_some()
&& !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty()
{
call_ast = Some(inner);
outer_callee = Some(text.clone());
@ -1707,13 +1726,14 @@ pub(super) fn push_node<'a>(
// the outer statement `text`, so gate matcher names like `"fetch"` hit.
let mut sink_payload_args: Option<Vec<usize>> = None;
let mut destination_uses: Option<Vec<String>> = None;
let mut gate_filters: Vec<GateFilter> = Vec::new();
if labels.is_empty() {
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(cn) = gate_call {
let gate_callee_text = if call_ast.is_some() {
text.clone()
} else {
// Inner call reached via wrapper use the call-expression's
// Inner call reached via wrapper, use the call-expression's
// function name directly. Falls back to `text` so non-call-
// expression kinds (method calls, Ruby `call` nodes, macros)
// still have a usable callee string.
@ -1723,51 +1743,84 @@ pub(super) fn push_node<'a>(
.and_then(|f| text_of(f, code))
.unwrap_or_else(|| text.clone())
};
if let Some(gm) = classify_gated_sink(
let matches = classify_gated_sink(
lang,
&gate_callee_text,
|idx| extract_const_string_arg(cn, idx, code),
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
) {
labels.push(gm.label);
let payload = gm.payload_args;
if payload == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual call
// arity so `collect_tainted_sink_values` checks each one.
let arity = extract_arg_uses(cn, code).len();
if arity > 0 {
sink_payload_args = Some((0..arity).collect());
}
} else if !payload.is_empty() {
sink_payload_args = Some(payload.to_vec());
}
);
// Destination-aware gates (outbound HTTP clients): when the
// gate declares destination-bearing object fields and the
// positional destination arg at call time is an object
// literal, narrow sink-taint checks to identifiers under
// those fields. Non-object arg forms (string / ident /
// expression) return `None` from the extractor and fall
// through to whole-arg positional filtering.
//
// We only populate destination_uses for the FIRST payload
// position that is an object literal. For outbound HTTP
// gates `payload_args` is always a single position (arg 0)
// so this is exact.
if !gm.object_destination_fields.is_empty() {
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
cn,
pos,
gm.object_destination_fields,
code,
) {
destination_uses = Some(names);
break;
if !matches.is_empty() {
// Per-gate filter accumulation. Each match contributes:
// * its label (added to `labels` so `resolve_sink_caps`
// downstream sees the union),
// * a `GateFilter` carrying that gate's specific
// `(label_caps, payload_args, destination_uses)` so
// the SSA sink scan can attribute taint per-cap.
let mut union_payload: Vec<usize> = Vec::new();
for gm in &matches {
labels.push(gm.label);
let payload_vec: Vec<usize> =
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual
// call arity so `collect_tainted_sink_values` checks
// each one.
let arity = extract_arg_uses(cn, code).len();
(0..arity).collect()
} else {
gm.payload_args.to_vec()
};
// Destination-aware gates: when the gate declares
// destination-bearing object fields and a payload-position
// arg is an object literal at call time, narrow sink-taint
// checks to identifiers under those fields. Non-object
// arg forms return `None` from the extractor and the gate
// falls back to whole-arg positional filtering.
let mut dest_uses: Option<Vec<String>> = None;
if !gm.object_destination_fields.is_empty() {
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
cn,
pos,
gm.object_destination_fields,
code,
) {
dest_uses = Some(names);
break;
}
}
}
let label_caps = match gm.label {
crate::labels::DataLabel::Sink(c) => c,
_ => crate::labels::Cap::empty(),
};
for &p in &payload_vec {
if !union_payload.contains(&p) {
union_payload.push(p);
}
}
gate_filters.push(GateFilter {
label_caps,
payload_args: payload_vec,
destination_uses: dest_uses,
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path keeps `destination_uses` populated so
// the SSA fast-path (one filter) continues to work without
// consulting `gate_filters`. When multiple gates match,
// per-position filters live in `gate_filters` and the legacy
// field is intentionally left `None`.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
@ -1778,7 +1831,7 @@ pub(super) fn push_node<'a>(
// path-traversal or HTML metacharacters. The CFG collapses the whole
// chain into a single call node, so detection must inspect the AST of
// that node directly. Only fires when no Sanitizer label already
// classifies this node existing label rules win.
// classifies this node, existing label rules win.
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
if let Some(cn) = call_ast {
if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" {
@ -1815,7 +1868,7 @@ pub(super) fn push_node<'a>(
// `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation
// form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same
// methods are intrinsically parameterised when arg 0 is a hash, symbol,
// array, or non-interpolated string Rails escapes the values. Rather
// array, or non-interpolated string, Rails escapes the values. Rather
// than dropping the sink (which would lose the genuine TPs), synthesise
// a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears
// SQL taint at the call and reflexively dominates the sink, suppressing
@ -1825,7 +1878,7 @@ pub(super) fn push_node<'a>(
// Chained calls (`Model.where(...).preload(...).to_a`) collapse into a
// single CFG node whose outer `call_ast` may be `to_a` (no args). The
// shape inspection has to walk the receiver chain to reach the AR query
// call itself `ruby_chain_arg0_for_method` does that walk.
// call itself, `ruby_chain_arg0_for_method` does that walk.
if (lang == "ruby" || lang == "rb")
&& labels
.iter()
@ -1859,7 +1912,7 @@ pub(super) fn push_node<'a>(
// and `Statement.executeQuery(String)` overloads are real injection
// sinks when given a concatenated SQL string. But the same method
// names on JPA `javax.persistence.Query` and JDBC `PreparedStatement`
// are zero-arg they execute SQL that was bound upstream by
// are zero-arg, they execute SQL that was bound upstream by
// `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`,
// and any bind values went through `setParameter` / `setString`
// (which the JDBC/JPA driver escapes). Walk the receiver chain to
@ -1894,7 +1947,7 @@ pub(super) fn push_node<'a>(
// (`createQuery` / `createNativeQuery` / `prepareStatement`)
// and require its arg 0 to be a string literal. Anything
// else (binary concat, identifier, method call) leaves
// the sink in place we cannot prove the SQL is
// the sink in place, we cannot prove the SQL is
// parameterised, so the structural finding stands.
const JPA_BIND_METHODS: &[&str] = &[
"createQuery",
@ -1914,6 +1967,89 @@ pub(super) fn push_node<'a>(
}
}
// Shape-based sanitizer synthesis for JS/TS ORM-accessor chains.
// The static label table marks `db.query` / `connection.query` /
// `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)`
// because the bare `connection.query("SELECT ..." + name)` form is a
// real SQLi sink. But the same `db.query` method on Strapi-style ORMs
// takes a model UID literal and returns a chainable model accessor:
// `strapi.db.query('admin::api-token').findOne({ where: whereParams })`.
// The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)`
// calls are intrinsically parameterised, the actual SQL is generated
// by the ORM, and the per-call values arrive through field-keyed object
// literals that the ORM driver escapes.
//
// Recognition rule: when the CFG node's classified text reaches a sink
// with `SQL_QUERY` cap, walk the receiver chain looking for an inner
// `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal
// and whose result has at least one chained method call appended whose
// name is in the ORM-accessor whitelist. If both hold, synthesise a
// same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare
// `connection.query("SELECT ...")` (no chained method) and
// `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the
// sink in place, both are genuine SQLi shapes.
if (lang == "javascript"
|| lang == "js"
|| lang == "typescript"
|| lang == "ts"
|| lang == "tsx")
&& labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
&& !labels
.iter()
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
{
const QUERY_TARGETS: &[&str] = &["query", "execute"];
// ORM-accessor methods that take object-literal args and return
// promises of rows / row counts. Promise methods (`then`, `catch`,
// `finally`) deliberately excluded, they don't prove ORM shape.
const ORM_CHAIN_METHODS: &[&str] = &[
"findOne",
"findMany",
"findFirst",
"findUnique",
"findById",
"find",
"create",
"createMany",
"update",
"updateMany",
"upsert",
"delete",
"deleteMany",
"count",
"aggregate",
"distinct",
"save",
];
// Fall back to a deeper walk (up to 4 levels) for await/return-
// wrapped calls, e.g. `const x = await db.query(...).findOne(...)`,
// where the call sits at depth 3 (lexical_declaration >
// variable_declarator > await_expression > call_expression).
let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(call_node) = chain_call {
// Outer method must be in the ORM whitelist *and* the chain must
// have a deeper inner call to a `query`/`execute` whose arg 0 is
// a string literal. Both checks gate the synthesis.
let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code);
let outer_is_orm = outer_method
.as_deref()
.is_some_and(|m| ORM_CHAIN_METHODS.contains(&m));
if outer_is_orm
&& let Some((arg0_kind, has_interp)) =
js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code)
&& !has_interp
&& matches!(
arg0_kind.as_str(),
"string" | "string_fragment" | "template_string"
)
{
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
}
}
}
let span = (ast.start_byte(), ast.end_byte());
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
@ -2036,7 +2172,7 @@ pub(super) fn push_node<'a>(
// (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`).
//
// Two cases:
// 1. Kind::CallMethod native method call AST (Java method_invocation,
// 1. Kind::CallMethod, native method call AST (Java method_invocation,
// Rust method_call_expression, Ruby call, PHP member_call_expression).
// Receiver is exposed via "object"/"receiver"/"scope" field on the call.
// 2. Kind::CallFn whose function child is a member_expression (JS/TS) or
@ -2065,7 +2201,7 @@ pub(super) fn push_node<'a>(
// value, which is what type-qualified resolution
// anchors on. Falls back to `root_receiver_text` (which
// returns raw text like "conn.execute") only if drilling
// fails preserving prior behavior for types we can't
// fails, preserving prior behavior for types we can't
// structurally reduce.
root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code))
} else {
@ -2076,7 +2212,7 @@ pub(super) fn push_node<'a>(
// JS/TS `obj.method(x)`: call_expression.function = member_expression.
// Python `obj.method(x)`: call.function = attribute.
// Rust `obj.method(x)`: call_expression.function = field_expression
// (field on `value`, not `object` value can be another call
// (field on `value`, not `object`, value can be another call
// for chained forms like `Connection::open(p).unwrap().execute(...)`).
// Pull the receiver from the object/attribute-owner field.
let func_child = cn.child_by_field_name("function");
@ -2139,7 +2275,7 @@ pub(super) fn push_node<'a>(
// Python `with` and Java try-with-resources.
let is_raii_managed = is_raii_factory(lang, &text);
// Ruby block form auto-close: `File.open(path) { |f| f.read }`
// Ruby block form auto-close (`File.open(path) { |f| f.read }`):
// the block parameter receives the resource and Ruby guarantees close
// at block exit. If assigned (`f = File.open(p) { ... }`), the
// variable holds the block's return value, not an open resource.
@ -2156,7 +2292,7 @@ pub(super) fn push_node<'a>(
// Prefer the span of the call found by `find_classifiable_inner_call`
// (deeper, classification-driven) over the one from `first_call_ident`
// (shallower, text-override-driven). Only record `callee_span` when it
// actually narrows against `ast.span` storing a redundant copy would
// actually narrows against `ast.span`, storing a redundant copy would
// just bloat every labeled Call node.
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
@ -2174,6 +2310,7 @@ pub(super) fn push_node<'a>(
kwargs,
arg_string_literals,
destination_uses,
gate_filters,
},
taint: TaintMeta {
labels,
@ -2228,7 +2365,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
/// Pre-emit dedicated Source CFG nodes for call arguments that contain source
/// member expressions.
///
/// **Two-step API** Source nodes must be created *before* the Call node so
/// **Two-step API**, Source nodes must be created *before* the Call node so
/// they receive lower graph indices. This is critical because the If handler
/// uses `NodeIndex::new(g.node_count())` to capture the first node built in a
/// branch and wires a True/False edge to it. If the Source node has a lower
@ -2239,7 +2376,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
/// the branch body.
///
/// True when `ast` is an assignment / declaration whose RHS is a
/// function or lambda literal i.e. shapes like
/// function or lambda literal, i.e. shapes like
/// * Go `run := func() { ... }`
/// * JS/TS `var run = function() { ... }` / `const run = () => ...`
/// * Python `run = lambda x: ...`
@ -2311,7 +2448,7 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
false
}
/// Pointer-Phase 6 / W5: when `ast` is (or wraps) an assignment whose
/// When `ast` is (or wraps) an assignment whose
/// LHS is a single subscript / index expression with a plain-identifier
/// receiver, emit a synthetic `__index_set__` Call node and return its
/// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target
@ -2328,7 +2465,7 @@ fn try_lower_subscript_write(
enclosing_func: Option<&str>,
call_ordinal: &mut u32,
) -> Option<NodeIndex> {
// Locate the assignment node `ast` may be the assignment itself
// Locate the assignment node, `ast` may be the assignment itself
// (Go `assignment_statement`) or a wrapper (`expression_statement`
// containing JS `assignment_expression` / Python `assignment`).
let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
@ -2383,7 +2520,7 @@ fn try_lower_subscript_write(
/// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be
/// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`.
/// `uses_only_synth_names` carry synth names that should *only* be
/// appended to `taint.uses` used for chain-inner-arg sources where the
/// appended to `taint.uses`, used for chain-inner-arg sources where the
/// synth value is not a positional argument of the OUTER call but still
/// participates in the call's implicit dependency chain (e.g. `r.Body`
/// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver).
@ -2446,7 +2583,7 @@ fn pre_emit_arg_source_nodes(
for (pos, child) in children.iter().enumerate() {
let src_label = first_member_label(*child, lang, code, extra);
if let Some(DataLabel::Source(caps)) = src_label {
// Use the *current* node count as a unique token it equals the
// Use the *current* node count as a unique token, it equals the
// index the new Source node will receive.
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos);
let member_text = first_member_text(*child, code);
@ -2481,7 +2618,7 @@ fn pre_emit_arg_source_nodes(
continue;
}
// Pointer-Phase 6 / W5: pre-emit `__index_get__` Call nodes for
// Pre-emit `__index_get__` Call nodes for
// subscript / index-expression args when pointer analysis is
// enabled. This lets the W2/W4 container ELEM read hook fire
// on the synth call, propagating must/may/caps from the cell
@ -2489,7 +2626,7 @@ fn pre_emit_arg_source_nodes(
//
// Gated on `pointer::is_enabled()` so the env-var=0 path keeps
// CFG shapes bit-identical to today's output. Only fires when
// the array operand resolves to a plain identifier see
// the array operand resolves to a plain identifier, see
// `subscript_components` for the bail conditions.
if pointer_on
&& is_subscript_kind(child.kind())
@ -2539,7 +2676,7 @@ fn pre_emit_arg_source_nodes(
// Gated to Go and to writeback-shaped outer callees (`Decode` /
// `Unmarshal`) because the synth-source emission is only useful when
// a downstream writeback consumer reads from the chain's tainted
// receiver broader gating risks emitting synth sources whose taint
// receiver, broader gating risks emitting synth sources whose taint
// never propagates and whose presence trips Layer B AST-pattern
// suppression on unrelated sinks (see
// `tests/fixtures/real_world/go/taint/func_literal_capture.go`).
@ -2613,7 +2750,7 @@ fn pre_emit_arg_source_nodes(
/// Step 2: wire synthetic variable names from pre-emitted Source nodes into
/// the Call node's `arg_uses` and `uses`. `uses_only` synth names are
/// appended only to `taint.uses` used for chain-inner-arg sources whose
/// appended only to `taint.uses`, used for chain-inner-arg sources whose
/// synth value is not a positional outer-call argument.
fn apply_arg_source_bindings(
g: &mut Cfg,
@ -2724,7 +2861,7 @@ pub(super) fn build_sub<'a>(
.unwrap_or(false);
// Check for negation wrapping the entire condition (e.g. `!(a && b)`)
// if present, skip short-circuit decomposition (De Morgan out of scope).
// and, if present, skip short-circuit decomposition (De Morgan out of scope).
let has_short_circuit = has_short_circuit
&& cond_subtree.map_or(false, |c| {
let unwrapped = unwrap_parens(c);
@ -3424,7 +3561,7 @@ pub(super) fn build_sub<'a>(
// When the grammar-level name is anonymous, try to derive a binding
// name from the surrounding declaration or assignment. This lets
// `var h = function(x){...}` / `this.run = () => {...}` participate
// in callback resolution callers referencing `h` or `run` can
// in callback resolution, callers referencing `h` or `run` can
// find the body via `resolve_local_func_key` and intra-file calls
// like `h()` can resolve to the anonymous body's summary. Without
// this, the body is keyed with the synthetic anon name and there
@ -3731,7 +3868,7 @@ pub(super) fn build_sub<'a>(
// would lower the return as a plain `StmtKind::Call`, losing
// the return semantics and letting fall-through Seq edges
// survive into the SSA terminator (the OR-chain rejection-arm
// defect see `or_chain_rejection_block_terminates_with_return`).
// defect, see `or_chain_rejection_block_terminates_with_return`).
if let Some(inner) = ast.children(&mut cursor).find(|c| {
matches!(
lookup(lang, c.kind()),
@ -3788,7 +3925,7 @@ pub(super) fn build_sub<'a>(
);
}
// Pointer-Phase 6 / W5: subscript-write lowering when the
// Subscript-write lowering when the
// CallWrapper's inner expression is `arr[i] = v` (JS/TS,
// Python). See `try_lower_subscript_write` for shape +
// bail matrix.
@ -3824,7 +3961,7 @@ pub(super) fn build_sub<'a>(
// Pre-emit Source nodes for call arguments containing source
// member expressions (e.g. `req.body.returnTo` inside
// `res.redirect(req.body.returnTo)`). Created BEFORE the Call
// node so they get lower indices see doc comment on
// node so they get lower indices, see doc comment on
// `pre_emit_arg_source_nodes` for why this ordering matters.
let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call {
pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds)
@ -3984,7 +4121,7 @@ pub(super) fn build_sub<'a>(
// Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`)
Kind::Assignment => {
// JS/TS ternary-RHS split same rationale as the CallWrapper branch.
// JS/TS ternary-RHS split, same rationale as the CallWrapper branch.
if matches!(lang, "javascript" | "typescript" | "tsx")
&& let (Some(left), Some(right)) = (
ast.child_by_field_name("left"),
@ -4011,7 +4148,7 @@ pub(super) fn build_sub<'a>(
}
}
// Pointer-Phase 6 / W5: subscript-write lowering. See
// Subscript-write lowering. See
// `try_lower_subscript_write` for the per-language shape
// matrix and bail conditions.
if crate::pointer::is_enabled()
@ -4099,12 +4236,19 @@ pub(crate) fn build_cfg<'a>(
// function so thread-local state never leaks between files.
populate_fn_dfs_indices(tree, lang);
// Phase 6: harvest DTO class definitions before any param classifier
// runs. Empty for languages without a Phase 6 collector. Cleared
// Harvest DTO class definitions before any param classifier
// runs. Empty for languages without a collector. Cleared
// alongside the DFS map at end-of-build_cfg.
DTO_CLASSES.with(|cell| {
*cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code);
});
// harvest same-file `type X = Map<...>` / `Set<...>` / `T[]`
// aliases so JS/TS param classifiers resolve `m: ElementsMap`
// to `LocalCollection`. Empty for non-JS/TS languages.
TYPE_ALIAS_LC.with(|cell| {
*cell.borrow_mut() =
dto::collect_type_alias_local_collections(tree.root_node(), lang, code);
});
// Create the top-level body graph (BodyId(0)).
let (mut g, entry, exit) = create_body_graph(0, code.len(), None);
@ -4143,7 +4287,7 @@ pub(crate) fn build_cfg<'a>(
connect_all(&mut g, &[e], exit, EdgeKind::Seq);
}
debug!(target: "cfg", "CFG DONE top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
if cfg!(debug_assertions) {
for idx in g.node_indices() {
@ -4231,10 +4375,11 @@ pub(crate) fn build_cfg<'a>(
// Clear the per-file DFS-index map so it does not leak to the next
// file built on this thread.
clear_fn_dfs_indices();
// Phase 6: same hygiene for the DTO map.
// Same hygiene for the DTO and type-alias maps.
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear());
// Phase 6 (typed call-graph subtype awareness): collect every
// Collect every
// declared inheritance / impl / implements relationship in the
// file. Per-language extractor in `cfg::hierarchy`; empty for
// Go and C. Each `(sub, super)` pair gets duplicated onto every
@ -4289,14 +4434,14 @@ fn apply_promisify_labels(
/// Build a `CalleeSite` carrying the richer per-call-site metadata for a
/// CFG node.
///
/// * `arity` positional argument count. `None` when `extract_arg_uses`
/// * `arity`, positional argument count. `None` when `extract_arg_uses`
/// bailed out on splats/keyword-args (length 0 does not distinguish
/// zero-arg calls from unknown; we treat 0 as a concrete zero). The
/// receiver is a separate channel via `CallMeta.receiver` and is not
/// represented in `arg_uses`, so `arity == arg_uses.len()` for calls.
/// * `receiver` forwarded verbatim from `CallMeta.receiver` (already
/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already
/// normalized to the root identifier).
/// * `qualifier` the segment(s) before the leaf identifier of the callee.
/// * `qualifier`, the segment(s) before the leaf identifier of the callee.
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g.
/// `"crate::auth::token"` for `crate::auth::token::validate`) so that
/// cross-file `use`-map resolution in `callgraph.rs` has everything it
@ -4380,7 +4525,7 @@ pub(crate) fn export_summaries(
module_path: None,
rust_use_map: None,
rust_wildcards: None,
// Phase 6 hierarchy edges live on `FileCfg`, not on the
// Hierarchy edges live on `FileCfg`, not on the
// graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root`
// attaches them after this transform returns.
hierarchy_edges: Vec::new(),

View file

@ -8,7 +8,7 @@ use petgraph::graph::NodeIndex;
use smallvec::smallvec;
use tree_sitter::Node;
/// Phase 6.2 — resolve a syntactic class / struct / interface / model
/// Resolve a syntactic class / struct / interface / model
/// name against the per-file [`DTO_CLASSES`] map populated at the top
/// of `build_cfg`. Returns the [`TypeKind::Dto`] carrying the
/// per-field type map when the class is declared in the same file;
@ -21,7 +21,7 @@ fn lookup_dto_class(class_name: &str) -> Option<TypeKind> {
/// Extract parameter names + per-position [`TypeKind`] from a function
/// AST node. Each entry's second slot is `Some(TypeKind)` when the
/// parameter's decorator, attribute, or static type annotation maps to
/// a known kind, and `None` otherwise. Strictly additive when no
/// a known kind, and `None` otherwise. Strictly additive, when no
/// type info is recoverable, behaviour is identical to the names-only
/// path.
pub(super) fn extract_param_meta<'a>(
@ -109,7 +109,7 @@ pub(super) fn extract_param_meta<'a>(
// Python `typed_parameter`, `default_parameter`,
// `typed_default_parameter`): the wrapper node has no `name`
// field but contains the identifier as a child. Pick the
// *first* identifier that is the parameter name; subsequent
// *first* identifier, that is the parameter name; subsequent
// identifiers are part of the type annotation or default
// expression.
if !found {
@ -123,7 +123,7 @@ pub(super) fn extract_param_meta<'a>(
continue;
}
// Bare identifier children e.g. Rust untyped closure params `|cmd|`
// Bare identifier children, e.g. Rust untyped closure params `|cmd|`
// where the child is an `identifier` node, not a `parameter` wrapper.
if child.kind() == "identifier" {
if let Some(txt) = text_of(child, code) {
@ -137,8 +137,8 @@ pub(super) fn extract_param_meta<'a>(
/// Walk up from a function definition node and build a container path.
///
/// Records the names of enclosing classes / impls / modules / namespaces /
/// structs and, for anonymous / nested functions, the name of an enclosing
/// named function joined with `::`. Also returns a `FuncKind` guess
/// structs, and, for anonymous / nested functions, the name of an enclosing
/// named function, joined with `::`. Also returns a `FuncKind` guess
/// reflecting the structural role.
///
/// Returns `(container, kind)`.
@ -185,7 +185,7 @@ pub(super) fn compute_container_and_kind(
| "enum_item"
| "struct_specifier"
| "struct_item" => Some("name"),
// Rust impl blocks pick the type name, not the trait name.
// Rust impl blocks, pick the type name, not the trait name.
"impl_item" => Some("type"),
// Go / C++ / PHP namespaces and modules.
"namespace_definition" | "namespace_declaration" | "module_declaration" | "module" => {
@ -223,7 +223,7 @@ pub(super) fn compute_container_and_kind(
|| pk == "lambda_expression"
|| pk == "function_expression"
{
// Nested definition record the outer function's name and
// Nested definition, record the outer function's name and
// classify self as Closure even if we got a real name.
if let Some(name_node) = parent.child_by_field_name("name") {
if let Some(text) = text_of(name_node, code) {
@ -428,15 +428,15 @@ pub(super) fn inject_framework_param_sources(
/// no recognised pattern matches, returns `None` and the engine
/// behaves exactly as before.
///
/// Recognised patterns (Phase 2):
/// * Java (Spring) `@PathVariable`/`@RequestParam Long X` →
/// Recognised patterns:
/// * Java (Spring), `@PathVariable`/`@RequestParam Long X` →
/// [`TypeKind::Int`]; `@RequestBody T` → object (no kind today).
/// * TypeScript (NestJS) `@Param('id') id: number` →
/// * TypeScript (NestJS), `@Param('id') id: number` →
/// [`TypeKind::Int`]; `@Body() dto: T` / `@Query('q') q: string`.
/// * Rust (Axum / Rocket / Actix) `Path<i64>` / `Path<u32>` /
/// * Rust (Axum / Rocket / Actix), `Path<i64>` / `Path<u32>` /
/// `web::Path<i64>` → [`TypeKind::Int`]; `Path<String>` →
/// [`TypeKind::String`].
/// * Python (FastAPI) `def h(x: int)` → [`TypeKind::Int`];
/// * Python (FastAPI), `def h(x: int)` → [`TypeKind::Int`];
/// `Annotated[int, Path()]` → [`TypeKind::Int`].
pub(super) fn classify_param_type<'a>(
param: Node<'a>,
@ -453,9 +453,9 @@ pub(super) fn classify_param_type<'a>(
}
}
/// Java (Spring) recognise typed-extractor parameters via the
/// Java (Spring), recognise typed-extractor parameters via the
/// surrounding annotation. Per Hard Rule 3, plain `Long X` without a
/// known framework annotation is **not** treated as a typed extractor
/// known framework annotation is **not** treated as a typed extractor:
/// the parameter could be a regular function argument that the
/// framework never validates. Recognised annotations:
/// `@PathVariable`, `@RequestParam`, `@RequestBody`, `@RequestHeader`,
@ -473,7 +473,7 @@ fn classify_param_type_java<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
if let Some(k) = java_type_to_kind(&type_text) {
return Some(k);
}
// Phase 6.2: when the static type is a class name we don't classify
// When the static type is a class name we don't classify
// as a primitive (e.g. `@RequestBody CreateUser dto`), look up the
// class in the same-file DTO map. Strip any generics for the
// leading type so `Foo<Bar>` still resolves on `Foo`.
@ -527,7 +527,7 @@ fn has_java_framework_annotation(param: Node<'_>, code: &[u8]) -> bool {
}
/// Map a Java type-text fragment to a [`TypeKind`]. Public to the
/// `cfg` module so the Phase 6 DTO collector can reuse the same
/// `cfg` module so the DTO collector can reuse the same
/// classifier for class fields.
pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
let bare = t.trim().trim_start_matches('@').trim();
@ -546,7 +546,7 @@ pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
/// Map a TypeScript type-text fragment (already stripped of leading
/// `:` / whitespace) to a primitive [`TypeKind`]. Used by both the
/// per-parameter classifier and the Phase 6 DTO collector.
/// per-parameter classifier and the DTO collector.
pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
let head = t.split('<').next().unwrap_or(t).trim();
match head {
@ -557,13 +557,35 @@ pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
}
}
/// TypeScript (NestJS) recognise typed-extractor parameters via a
/// TypeScript (NestJS), recognise typed-extractor parameters via a
/// known NestJS decorator (`@Param`, `@Body`, `@Query`, `@Headers`,
/// `@Req`, `@Res`). Per Hard Rule 3, a bare `function h(id: number)`
/// is not a framework extractor without a NestJS decorator no
/// is not a framework extractor, without a NestJS decorator no
/// runtime gate is implied. Pipe coercions (`ParseIntPipe` /
/// `ParseBoolPipe`) override the static type.
///
/// Exception: parameters annotated as a known JS built-in collection
/// type (`Map<...>`, `Set<...>`, `WeakMap<...>`, `WeakSet<...>`,
/// `Array<...>` / `T[]` / `ReadonlyArray<...>`) resolve to
/// [`TypeKind::LocalCollection`] regardless of decorator presence.
/// `LocalCollection` is a *receiver-shape* claim, not a
/// framework-validated-input claim, it tells the auth analyser that
/// `param.get(k)` / `param.set(k, v)` / `param.find(p)` is a
/// container operation rather than a data-layer read/mutation. This
/// closes the Excalidraw FP cluster (`elementsMap: ElementsMap`,
/// `groupIdMapForOperation: Map<string, string>`) without affecting
/// any input-validation reasoning.
fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
let type_text = param
.child_by_field_name("type")
.and_then(|n| inner_ts_type_text(n, code));
if let Some(t) = type_text.as_deref()
&& let Some(k) = ts_type_to_local_collection(t.trim().trim_start_matches(':').trim())
{
return Some(k);
}
if !has_ts_decorator_argument(
param,
code,
@ -586,14 +608,12 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
if has_ts_decorator_argument(param, code, &["ParseBoolPipe"]) {
return Some(TypeKind::Bool);
}
let t = param
.child_by_field_name("type")
.and_then(|n| inner_ts_type_text(n, code))?;
let t = type_text?;
let stripped = t.trim().trim_start_matches(':').trim();
if let Some(k) = ts_type_to_kind(stripped) {
return Some(k);
}
// Phase 6.2: NestJS `@Body() dto: CreateUser` — when the static
// NestJS `@Body() dto: CreateUser`, when the static
// type is a class / interface name declared in the same file,
// resolve via the DTO map. Generic args dropped for the leading
// type so `Foo<Bar>` matches on `Foo`.
@ -601,8 +621,41 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
lookup_dto_class(head)
}
/// Classify a TypeScript / JavaScript type-text fragment as
/// [`TypeKind::LocalCollection`] when it denotes an in-memory
/// container; return `None` for everything else.
///
/// A leading `readonly ` modifier is ignored. The fragment is treated
/// as a collection when any of the following holds (checked in this
/// order, stopping at the first hit):
///
/// 1. It ends in `[]`, the array shorthand (`T[]`, `readonly T[]`).
/// 2. Its head (the text before the first `<`) is one of the built-in
///    container names: `Map`, `Set`, `WeakMap`, `WeakSet`, `Array`,
///    `ReadonlyArray`.
/// 3. Its head is registered in the per-file `TYPE_ALIAS_LC` set, i.e.
///    a same-file `type X = Map<...>`-style alias harvested by
///    `build_cfg`.
///
/// The built-in check is purely textual: a user-declared type that
/// shadows `Map` / `Set` / etc. is accepted as a collection too.
pub(super) fn ts_type_to_local_collection(t: &str) -> Option<TypeKind> {
    const BUILTIN_CONTAINERS: &[&str] = &[
        "Map",
        "Set",
        "WeakMap",
        "WeakSet",
        "Array",
        "ReadonlyArray",
    ];

    let ty = t.trim().trim_start_matches("readonly ").trim();
    // Head of the type: everything before the generic argument list.
    let base = ty.split('<').next().unwrap_or(ty).trim();

    // `||` short-circuits, so the thread-local alias set is consulted
    // only when neither syntactic check matched.
    let is_collection = ty.ends_with("[]")
        || BUILTIN_CONTAINERS.contains(&base)
        || super::TYPE_ALIAS_LC.with(|aliases| aliases.borrow().contains(base));

    is_collection.then_some(TypeKind::LocalCollection)
}
fn inner_ts_type_text<'a>(type_anno: Node<'a>, code: &'a [u8]) -> Option<String> {
// type_annotation node text is `: T` — unwrap to T.
// type_annotation node text is `: T`, unwrap to T.
if let Some(child) = type_anno.named_child(0) {
return text_of(child, code);
}
@ -643,10 +696,10 @@ fn has_ts_decorator_argument(param: Node<'_>, code: &[u8], wanted: &[&str]) -> b
false
}
/// Rust (Axum / Rocket / Actix) read the parameter's type text and
/// Rust (Axum / Rocket / Actix), read the parameter's type text and
/// look for `Path<i64>` / `Json<T>` / `Form<T>` / `Query<T>` shapes.
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)` without an
/// extractor wrapper) are **not** treated as typed extractors only
/// extractor wrapper) are **not** treated as typed extractors, only
/// framework-wrapped types qualify.
fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
if param.kind() != "parameter" {
@ -654,9 +707,121 @@ fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
}
let type_node = param.child_by_field_name("type")?;
let type_text = text_of(type_node, code)?;
// LocalCollection is a *receiver-shape* claim, not a
// framework-validated-input claim, Hard Rule 3's "bare primitives
// don't count" gate doesn't apply (mirrors `classify_param_type_ts`
// for the same reason). Captures `unsharded: RoaringBitmap`,
// `docids: &mut RoaringBitmap`, `params: HashMap<String, String>`,
// `new_shard_docids: &'a mut hashbrown::HashMap<...>` shapes from
// meilisearch/index-scheduler's bitmap bookkeeping where the
// verb-name dispatch (`is_mutation: insert/remove`) would otherwise
// classify these as DB writes.
if let Some(k) = rust_type_to_local_collection(&type_text) {
return Some(k);
}
rust_type_to_kind(&type_text)
}
/// Peel leading reference syntax off a Rust type-text fragment and
/// return the remaining slice, so the underlying type name sits at the
/// start of the result.
///
/// Removed, repeatedly and in any nesting, from the front of `t`:
/// * `&`, optionally followed by a lifetime label (`'a`, `'static`,
///   `'_`) and/or the `mut ` keyword: `&T`, `&mut T`, `&'a T`,
///   `&'a mut T`, `&&mut T`.
/// * a bare leading `mut ` keyword.
///
/// Surrounding whitespace is trimmed. `mut` is only recognised when it
/// is followed by a single space character, so identifiers that merely
/// start with `mut` are left untouched. Nothing past the leading
/// markers is inspected (references inside generic arguments stay).
fn strip_rust_ref_markers(t: &str) -> &str {
    let mut remaining = t.trim();
    loop {
        remaining = match remaining.strip_prefix('&') {
            Some(after_amp) => {
                let after_amp = after_amp.trim_start();
                // Skip an optional lifetime label: the tick plus the
                // identifier characters that follow it.
                let after_lifetime = match after_amp.strip_prefix('\'') {
                    Some(label) => label
                        .trim_start_matches(|c: char| c.is_alphanumeric() || c == '_')
                        .trim_start(),
                    None => after_amp,
                };
                // Skip an optional `mut` keyword.
                after_lifetime
                    .strip_prefix("mut ")
                    .unwrap_or(after_lifetime)
                    .trim_start()
            }
            None => match remaining.strip_prefix("mut ") {
                Some(after_mut) => after_mut.trim_start(),
                // Neither marker is present: the type head is exposed.
                None => return remaining,
            },
        };
    }
}
/// Map a Rust parameter / variable type-text to
/// [`TypeKind::LocalCollection`] when its *outermost* type names a
/// known in-memory container.
///
/// Processing order:
/// 1. Strip leading reference / lifetime / `mut` markers via
///    `strip_rust_ref_markers`.
/// 2. Treat anything that now starts with `[` (`[T; N]`, `[T]`) as a
///    collection.
/// 3. Cut the generic argument list at the first `<`, then drop the
///    module-path prefix (`std::collections::`, `hashbrown::`,
///    `roaring::`) by keeping the last `::` segment, and match that
///    head against the list below.
///
/// Recognises:
/// * Std: `Vec`, `HashMap`, `HashSet`, `BTreeMap`, `BTreeSet`,
///   `VecDeque`, `BinaryHeap`, `LinkedList`.
/// * Ecosystem: `IndexMap`, `IndexSet` (indexmap), `SmallVec`
///   (smallvec), `DashMap`, `DashSet` (dashmap), `FxHashMap`,
///   `FxHashSet` (rustc-hash / fxhash), `RoaringBitmap`,
///   `RoaringTreemap` (roaring).
/// * Array / slice shorthand: `[T; N]`, `[T]`.
///
/// Returns `None` for every other outermost head, including
/// `Database<...>` (heed/sled, persistent KV store, NOT a local
/// collection; keeping this `None` preserves real IDOR detection on
/// persistent-store calls), `Mutex<...>` / `RwLock<...>`
/// (synchronisation wrappers, not sink-shape claims), and bare
/// primitives. Only the outermost head decides: generic arguments are
/// never inspected, so `Mutex<std::collections::HashMap<K, V>>` is
/// `None` and `HashMap<String, std::string::String>` is a collection.
pub(super) fn rust_type_to_local_collection(t: &str) -> Option<TypeKind> {
    const TYPES: &[&str] = &[
        "Vec",
        "VecDeque",
        "BinaryHeap",
        "LinkedList",
        "HashMap",
        "HashSet",
        "BTreeMap",
        "BTreeSet",
        "IndexMap",
        "IndexSet",
        "SmallVec",
        "DashMap",
        "DashSet",
        "FxHashMap",
        "FxHashSet",
        "RoaringBitmap",
        "RoaringTreemap",
    ];

    let stripped = strip_rust_ref_markers(t);
    // Array / slice shorthand: `[T; N]` or `[T]` (the `&` was
    // already stripped).
    if stripped.starts_with('[') {
        return Some(TypeKind::LocalCollection);
    }
    // Cut the generic argument list BEFORE splitting on `::`. Splitting
    // first would let a path inside the arguments hijack the head:
    // `HashMap<String, std::string::String>` would yield `String>`, and
    // `Mutex<std::collections::HashMap<K, V>>` would yield `HashMap`.
    let outer_path = stripped.split('<').next().unwrap_or(stripped);
    // Drop the module-path prefix: keep only the last segment
    // (`std::collections::HashMap` -> `HashMap`).
    let head = outer_path.rsplit("::").next().unwrap_or(outer_path).trim();

    TYPES.contains(&head).then_some(TypeKind::LocalCollection)
}
fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
let stripped = t.trim();
// Reject reference / mutability noise so `&Path<i64>` still matches
@ -666,7 +831,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
.trim_start_matches('&')
.trim_start_matches("mut ")
.trim();
// Only framework wrapper extractors qualify bare primitives like
// Only framework wrapper extractors qualify, bare primitives like
// `i64` could be regular function parameters with no framework
// validation gate.
for wrap in [
@ -684,7 +849,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(rest) = stripped.strip_prefix(&prefix) {
if let Some(inner) = rest.strip_suffix('>') {
let inner = inner.trim();
// Tuple extractor `Path<(i64, String)>` first element wins.
// Tuple extractor `Path<(i64, String)>`, first element wins.
if inner.starts_with('(') {
let inside = inner.trim_start_matches('(').trim_end_matches(')');
let first = inside.split(',').next().unwrap_or("").trim();
@ -696,16 +861,16 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(k) = rust_primitive_to_kind(inner) {
return Some(k);
}
// Phase 6.2: `Json<T>` / `Form<T>` / `Query<T>` /
// `Path<T>` with a same-file struct type resolve via
// `Json<T>` / `Form<T>` / `Query<T>` /
// `Path<T>` with a same-file struct type, resolve via
// the DTO map. Strip nested generics so `Json<Foo<i64>>`
// matches on `Foo`.
let head = inner.split('<').next().unwrap_or(inner).trim();
if let Some(k) = lookup_dto_class(head) {
return Some(k);
}
// Custom struct outside the same file leave None
// (cross-file resolution is Phase 6.4).
// Custom struct outside the same file, leave None
// (cross-file resolution is a follow-up).
return None;
}
}
@ -714,7 +879,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
}
/// Map a Rust primitive / `String` / `&str` to a [`TypeKind`]. Public
/// to the `cfg` module so the Phase 6 DTO collector can reuse it for
/// to the `cfg` module so the DTO collector can reuse it for
/// `struct` field types.
pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
let t = t.trim();
@ -728,10 +893,10 @@ pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
}
}
/// Python (FastAPI) recognise typed-extractor parameters via the
/// Python (FastAPI), recognise typed-extractor parameters via the
/// `Annotated[X, Path()/Query()/Body()/Header()/Cookie()]` shape. Per
/// Hard Rule 3, a bare `def h(id: int)` is **not** a framework
/// extractor the function may be a plain Python function and the
/// extractor, the function may be a plain Python function and the
/// type annotation provides no runtime gate.
fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
let type_node = param.child_by_field_name("type")?;
@ -741,7 +906,7 @@ fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<Typ
fn python_type_to_kind(t: &str) -> Option<TypeKind> {
let stripped = t.trim();
// `Annotated[int, Path()]` only matches when one of the generic
// `Annotated[int, Path()]`, only matches when one of the generic
// args names a recognised FastAPI binding marker. Otherwise no
// framework gate is implied.
if let Some(inner) = stripped
@ -756,8 +921,8 @@ fn python_type_to_kind(t: &str) -> Option<TypeKind> {
if let Some(k) = python_primitive_to_kind(first) {
return Some(k);
}
// Phase 6.2: `Annotated[CreateUser, Body()]` with a same-file
// Pydantic model resolve via the DTO map. Generic args are
// `Annotated[CreateUser, Body()]` with a same-file
// Pydantic model, resolve via the DTO map. Generic args are
// dropped via the same head-split as `python_primitive_to_kind`.
let head = first.split('[').next().unwrap_or(first).trim();
return lookup_dto_class(head);
@ -773,7 +938,7 @@ fn contains_fastapi_marker(s: &str) -> bool {
}
/// Map a Python type expression to a primitive [`TypeKind`]. Used by
/// both the per-parameter classifier and the Phase 6 Pydantic-model
/// both the per-parameter classifier and the DTO Pydantic-model
/// field collector.
pub(super) fn python_primitive_to_kind(t: &str) -> Option<TypeKind> {
let head = t.trim().split('[').next().unwrap_or(t).trim();
@ -806,10 +971,70 @@ pub(super) fn is_configured_terminator(
mod typed_extractor_tests {
use super::{
contains_fastapi_marker, java_type_to_kind, python_primitive_to_kind, python_type_to_kind,
rust_primitive_to_kind, rust_type_to_kind,
rust_primitive_to_kind, rust_type_to_kind, rust_type_to_local_collection,
ts_type_to_local_collection,
};
use crate::ssa::type_facts::TypeKind;
// ── TypeScript / JavaScript local-collection types ───────────────────
#[test]
fn ts_built_in_collections_map_to_local_collection() {
    // Every entry must classify as an in-memory collection.
    let collection_types = [
        // The four keyed/unkeyed built-in container generics.
        "Map<string, number>",
        "Set<string>",
        "WeakMap<object, string>",
        "WeakSet<object>",
        // Array forms.
        "Array<string>",
        "ReadonlyArray<string>",
        "string[]",
        "readonly string[]",
        // Excalidraw-style keyed map with index-type generic args.
        "Map<ExcalidrawElement[\"id\"], ExcalidrawElement>",
    ];
    let expected = Some(TypeKind::LocalCollection);
    for ty in collection_types {
        assert_eq!(
            ts_type_to_local_collection(ty),
            expected,
            "{ty} should map to LocalCollection"
        );
    }
}
#[test]
fn ts_non_collection_types_return_none() {
    // None of these may be classified as a local collection.
    let non_collection_types = [
        // Plain primitives.
        "string",
        "number",
        "boolean",
        // Promise / Iterator / etc. are not LocalCollections.
        "Promise<string>",
        "Iterator<number>",
        // User types.
        "CreateUserDto",
        "ElementsMap",
    ];
    for ty in non_collection_types {
        assert_eq!(
            ts_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
// ── Java (Spring) ────────────────────────────────────────────────────
#[test]
@ -841,7 +1066,7 @@ mod typed_extractor_tests {
#[test]
fn java_request_body_dto_returns_none_until_phase_six() {
// @RequestBody CreateUserDto dto — no kind today; Phase 6 will
// @RequestBody CreateUserDto dto, no kind today; future passes will
// return DtoObject(name) once cross-file class resolution lands.
assert_eq!(java_type_to_kind("CreateUserDto"), None);
assert_eq!(java_type_to_kind("List<String>"), None);
@ -860,7 +1085,7 @@ mod typed_extractor_tests {
#[test]
fn rust_path_tuple_first_element_wins() {
// Path<(i64, String)> first slot is the int extractor that
// Path<(i64, String)>, first slot is the int extractor that
// matters for sink suppression.
assert_eq!(
rust_type_to_kind("Path<(i64, String)>"),
@ -876,15 +1101,15 @@ mod typed_extractor_tests {
#[test]
fn rust_json_dto_returns_none_until_phase_six() {
// Json<T> / Form<T> / Query<T> with a custom struct type no
// primitive resolution available; Phase 6 lifts to DTO.
// Json<T> / Form<T> / Query<T> with a custom struct type, no
// primitive resolution available; future passes will lift to DTO.
assert_eq!(rust_type_to_kind("Json<CreateUserDto>"), None);
assert_eq!(rust_type_to_kind("Form<CreateUserDto>"), None);
assert_eq!(rust_type_to_kind("Query<Filters>"), None);
}
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)`) are NOT
/// framework extractors only wrapper types (`Path<i64>` etc.)
/// framework extractors, only wrapper types (`Path<i64>` etc.)
/// imply a framework runtime gate. Bare i64 must return None.
#[test]
fn rust_bare_primitives_are_not_framework_extractors() {
@ -903,7 +1128,7 @@ mod typed_extractor_tests {
#[test]
fn python_bare_primitives_are_not_framework_extractors() {
// Per Hard Rule 3: bare `def h(id: int)` is NOT a typed
// extractor without an `Annotated[..., Path()/Query()/Body()]`
// extractor, without an `Annotated[..., Path()/Query()/Body()]`
// wrapper, no FastAPI gate is implied.
assert_eq!(python_type_to_kind("int"), None);
assert_eq!(python_type_to_kind("float"), None);
@ -936,7 +1161,7 @@ mod typed_extractor_tests {
#[test]
fn python_annotated_without_marker_returns_none() {
// Annotated without a FastAPI binding marker is a generic
// type-system tag not a framework extractor.
// type-system tag, not a framework extractor.
assert_eq!(python_type_to_kind("Annotated[int, str]"), None);
assert_eq!(python_type_to_kind("Annotated[int, MyMeta]"), None);
}
@ -954,4 +1179,128 @@ mod typed_extractor_tests {
assert!(contains_fastapi_marker("bytes, File()"));
assert!(!contains_fastapi_marker("int, str"));
}
// ── Rust local-collection types ──────────────────────────────────────
#[test]
fn rust_std_collections_map_to_local_collection() {
    // One representative instantiation per std container.
    const STD_COLLECTIONS: [&str; 8] = [
        "Vec<u32>",
        "HashMap<String, u32>",
        "HashSet<u64>",
        "BTreeMap<u32, String>",
        "BTreeSet<u32>",
        "VecDeque<u8>",
        "BinaryHeap<u32>",
        "LinkedList<i32>",
    ];
    let expected = Some(TypeKind::LocalCollection);
    STD_COLLECTIONS.iter().copied().for_each(|ty| {
        let actual = rust_type_to_local_collection(ty);
        assert_eq!(actual, expected, "{ty} should map to LocalCollection");
    });
}
#[test]
fn rust_ecosystem_collections_map_to_local_collection() {
    // Third-party containers (indexmap, smallvec, dashmap, fx hashers,
    // roaring), with and without generic arguments.
    const ECOSYSTEM_COLLECTIONS: [&str; 9] = [
        "IndexMap<String, u32>",
        "IndexSet<u64>",
        "SmallVec<[u32; 4]>",
        "DashMap<String, u32>",
        "DashSet<u64>",
        "FxHashMap<String, u32>",
        "FxHashSet<u64>",
        "RoaringBitmap",
        "RoaringTreemap",
    ];
    let expected = Some(TypeKind::LocalCollection);
    ECOSYSTEM_COLLECTIONS.iter().copied().for_each(|ty| {
        let actual = rust_type_to_local_collection(ty);
        assert_eq!(actual, expected, "{ty} should map to LocalCollection");
    });
}
#[test]
fn rust_module_qualified_collections_map_to_local_collection() {
    // Module-path prefixes: keep only the last segment for matching.
    let qualified_types = [
        "std::collections::HashMap<K, V>",
        "hashbrown::HashMap<String, RoaringBitmap>",
        "roaring::RoaringBitmap",
    ];
    let expected = Some(TypeKind::LocalCollection);
    for ty in qualified_types {
        assert_eq!(
            rust_type_to_local_collection(ty),
            expected,
            "{ty} should map to LocalCollection"
        );
    }
}
#[test]
fn rust_reference_and_lifetime_markers_stripped() {
    // `&T`, `&mut T`, `&'a T`, `&'a mut T`, `&'static T`, and
    // repeated `&` prefixes must all expose the underlying type head.
    const REFERENCE_FORMS: [&str; 7] = [
        "&RoaringBitmap",
        "&mut RoaringBitmap",
        "&'a RoaringBitmap",
        "&'a mut RoaringBitmap",
        "&'static RoaringBitmap",
        "&&mut RoaringBitmap",
        "&'a mut hashbrown::HashMap<String, RoaringBitmap>",
    ];
    let expected = Some(TypeKind::LocalCollection);
    REFERENCE_FORMS.iter().copied().for_each(|ty| {
        let actual = rust_type_to_local_collection(ty);
        assert_eq!(
            actual, expected,
            "{ty} should map to LocalCollection after ref stripping"
        );
    });
}
#[test]
fn rust_array_and_slice_shorthand_map_to_local_collection() {
    // `[T; N]` arrays and `[T]` slices are local containers, with or
    // without a leading reference.
    let bracket_types = ["[u32; 4]", "[u8]", "&[u32]", "&mut [u32]"];
    let expected = Some(TypeKind::LocalCollection);
    for ty in bracket_types {
        assert_eq!(
            rust_type_to_local_collection(ty),
            expected,
            "{ty} should map to LocalCollection"
        );
    }
}
#[test]
fn rust_persistent_db_and_sync_wrappers_return_none() {
    let rejected_types = [
        // heed / sled / rocksdb persistent-store handles are NOT local
        // collections; rejecting them preserves IDOR detection on real
        // DB calls.
        "Database<BEU32, SerdeJson<Task>>",
        "heed::Database<K, V>",
        "sled::Db",
        // Sync wrappers don't claim a sink shape.
        "Mutex<HashMap<K, V>>",
        "RwLock<Vec<u32>>",
        // Bare primitives.
        "u32",
        "&str",
        "String",
        // Unrelated user types.
        "MyDao<User>",
        "Connection",
    ];
    for ty in rejected_types {
        assert_eq!(
            rust_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
}

View file

@ -107,11 +107,11 @@ fn has_web_handler_params(ctx: &AnalysisContext, func_name: &str) -> bool {
/// Determine if a function qualifies as a web entrypoint (not just any entrypoint).
///
/// A web entrypoint must:
/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.) but NOT bare `main`
/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.), but NOT bare `main`
/// unless it has web-like parameters
/// 2. Have parameters resembling HTTP handler signatures
fn is_web_entrypoint(ctx: &AnalysisContext, func_name: &str) -> bool {
// "main" without web params is a CLI entrypoint skip
// "main" without web params is a CLI entrypoint, skip
if func_name == "main" {
return has_web_handler_params(ctx, func_name);
}
@ -163,7 +163,7 @@ impl CfgAnalysis for AuthGap {
fn run(&self, ctx: &AnalysisContext) -> Vec<CfgFinding> {
// Decorator/annotation/attribute auth on the body declaration
// already gates every sink in the body skip the
// already gates every sink in the body, skip the
// structural-call dominance check entirely when the framework
// enforces auth at the declaration level. Mirrors the
// `classify_auth_decorators` lookup the state engine uses to

View file

@ -14,7 +14,7 @@ use petgraph::visit::EdgeRef;
/// Returns true if the identifier is exactly `err` / `error` or a
/// snake-case error name (`err_x`, `error_x`, `x_err`, `x_error`).
/// CamelCase names (`isErrorEnabled`, `getError`, `errorMsg`) are
/// rejected the cost is occasional FNs on Java-style error fields,
/// rejected, the cost is occasional FNs on Java-style error fields,
/// which is acceptable for a precision fix.
fn is_error_var_ident(name: &str) -> bool {
let lower = name.to_ascii_lowercase();
@ -36,7 +36,7 @@ fn is_error_var_ident(name: &str) -> bool {
/// Used by the error-fallthrough rule to skip happy-path checks
/// like `if (!data.error && Array.isArray(results))` whose TRUE branch
/// is the success path and is not expected to return. The original
/// rule fires on `if (err) { warn(); } sink_after()` a positive
/// rule fires on `if (err) { warn(); } sink_after()`, a positive
/// error check whose body forgets to early-return.
fn contains_negated_err_identifier(text: &str) -> bool {
let bytes = text.as_bytes();
@ -46,7 +46,7 @@ fn contains_negated_err_identifier(text: &str) -> bool {
i += 1;
continue;
}
// Skip the `!=` / `!==` operators those are comparisons, not
// Skip the `!=` / `!==` operators, those are comparisons, not
// logical-not. Only treat a `!` followed by whitespace or an
// identifier-leading char as logical negation.
if i + 1 < bytes.len() && bytes[i + 1] == b'=' {
@ -57,7 +57,7 @@ fn contains_negated_err_identifier(text: &str) -> bool {
while j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
j += 1;
}
// Allow a leading `(` for `!(expr)` shapes peek past one open
// Allow a leading `(` for `!(expr)` shapes, peek past one open
// paren and continue capturing the identifier chain.
if j < bytes.len() && bytes[j] == b'(' {
j += 1;
@ -118,7 +118,95 @@ fn branch_terminates(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> bool {
false
}
/// Check if all paths from `node` reach a Return/Break/Continue before exiting scope.
/// Recognise calls that never return on the success path.
///
/// `cfg-error-fallthrough` looks for `if err != nil { … }` whose body
/// fails to terminate. A `return`/`break`/`continue`/`throw` is the
/// canonical terminator and already produces a `StmtKind::Return` /
/// `Throw` / `Break` / `Continue` node. But a large class of real
/// terminators arrives as a *call* whose callee is documented to abort
/// the goroutine, process, or test:
///
/// * Go testing: `t.Fatal`, `t.Fatalf`, `t.Fatalln`, `t.FailNow`
///   (and the same methods on `b` / `c` / any other receiver name).
/// * Go std-library and common loggers: `os.Exit`, `syscall.Exit`,
///   `runtime.Goexit`, `log.Fatal*`, `log.Panic*`, `slog.Fatal`,
///   `klog.Fatal*`, `klog.Exit*`.
/// * Go builtin: bare `panic(…)`.
/// * Rust: `process::exit`, `std::process::exit`, `process::abort`,
///   `std::process::abort`, `unreachable_unchecked`.
/// * C / C++ / imported Rust or Python: bare `abort(…)`, `std::abort`,
///   `libc::abort`.
/// * Python: `sys.exit`, `os._exit`, `os.abort`.
///
/// Two matching tiers are used:
///
/// 1. **Distinctive method names**, matched on the last segment after
///    `.` or `:` regardless of receiver (`Fatal*`, `FailNow`, `Panic*`,
///    `unreachable_unchecked`). The receiver type is unknown to this
///    rule; these names are distinctive enough that a user-defined
///    method sharing one is very unlikely to return.
/// 2. **Exact callee paths**, matched on the full callee text
///    (`panic`, `abort`, `os.Exit`, `std::process::exit`, …). This keeps
///    the recogniser from claiming arbitrary user-defined
///    `Exit(...)` / `exit(...)` / `panic(...)` methods as terminators.
///
/// `abort` deliberately lives in the second tier: as a *method* name
/// it is common on objects whose `abort()` returns normally (task
/// handles, `AbortController`, transactions). Treating `x.abort()` as
/// a terminator would make an error branch look as if it terminates
/// and hide a genuine fall-through.
///
/// Returns `false` for any node that is not a `StmtKind::Call` or that
/// carries no callee text. Conservative: only adds new terminators;
/// never removes the existing `Return`/`Throw`/`Break`/`Continue`
/// recognition.
fn call_never_returns(info: &crate::cfg::NodeInfo) -> bool {
    if info.kind != StmtKind::Call {
        return false;
    }
    let Some(callee) = info.call.callee.as_deref() else {
        return false;
    };

    // Tier 1: last path segment, any receiver.
    let method = callee.rsplit(['.', ':']).next().unwrap_or(callee);
    let distinctive_method = matches!(
        method,
        // Go testing
        "Fatal" | "Fatalf" | "Fatalln" | "FailNow"
        // Go log/slog-style terminating handlers
        | "Panic" | "Panicf" | "Panicln"
        // Rust `std::hint::unreachable_unchecked`
        | "unreachable_unchecked"
    );
    if distinctive_method {
        return true;
    }

    // Tier 2: exact callee text only, so `server.Exit`, `widget.panic`
    // and `task.abort` are NOT treated as terminators.
    matches!(
        callee,
        // Go builtin
        "panic"
        // Bare / std-qualified process abort (C, C++, Rust)
        | "abort" | "std::abort" | "libc::abort"
        // Go std and common loggers
        | "os.Exit" | "syscall.Exit" | "runtime.Goexit"
        | "log.Fatal" | "log.Fatalf" | "log.Fatalln"
        | "log.Panic" | "log.Panicf" | "log.Panicln"
        | "slog.Fatal"
        | "klog.Fatal" | "klog.Fatalf" | "klog.Exit" | "klog.Exitf"
        // Rust std
        | "process::exit" | "process::abort"
        | "std::process::exit" | "std::process::abort"
        // Python std
        | "sys.exit" | "os._exit" | "os.abort"
    )
}
/// Check if all paths from `node` reach a Return/Break/Continue (or a
/// known never-returning call) before exiting scope.
fn terminates_on_all_paths(
cfg: &crate::cfg::Cfg,
node: NodeIndex,
@ -142,10 +230,15 @@ fn terminates_on_all_paths(
}
_ => {}
}
if call_never_returns(info) {
// Documented never-returning call (`t.Fatalf`, `os.Exit`,
// `panic`, `runtime.Goexit`, …), this path terminates.
continue;
}
let successors: Vec<_> = cfg.neighbors(current).collect();
if successors.is_empty() {
// Reached a dead end without terminating — path does not terminate
// Reached a dead end without terminating, path does not terminate
return false;
}
@ -181,7 +274,7 @@ fn find_post_if_sinks(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> Vec<NodeInde
// Seed from the False edge only. If the if has no explicit False
// edge (some CFG shapes omit it for one-branch ifs), fall back to
// Seq edges from the if node but never follow True edges, which
// Seq edges from the if node, but never follow True edges, which
// lead into the body.
let mut stack: Vec<NodeIndex> = cfg
.edges(if_node)
@ -225,9 +318,9 @@ impl CfgAnalysis for IncompleteErrorHandling {
// Look for If nodes whose CONDITION involves "err" or "error".
// `info.taint.uses` for an If node contains identifiers from the
// whole if statement (condition + body) see
// whole if statement (condition + body), see
// `cfg::literals::extract_defs_uses_extra_defs` Kind::If branch
// so checking it would misfire on `if (!res.ok) { ... const
// so checking it would misfire on `if (!res.ok) { ... const
// err = await … ; return … }` shapes whose body happens to
// mention `err` even though the condition doesn't. Use
// `info.condition_vars`, which is populated strictly from the
@ -244,7 +337,7 @@ impl CfgAnalysis for IncompleteErrorHandling {
// Polarity gate: only fire when the condition POSITIVELY
// checks for an error. `if (!data.error && other)` is a
// happy-path check the TRUE branch is the success branch
// happy-path check, the TRUE branch is the success branch
// and is not expected to terminate. Detect by scanning the
// condition text for any `!` (logical-not, distinct from
// `!=`) preceding an identifier whose name contains "err".
@ -354,7 +447,7 @@ mod err_ident_tests {
fn rejects_camelcase_method_names() {
// Spring `logger.isErrorEnabled()` lifts `isErrorEnabled` into
// `condition_vars`; under the old `lower.contains("err")` check
// this fired the rule. The new strict check rejects it the
// this fired the rule. The new strict check rejects it, the
// condition is asking "is logging enabled", not "is there an
// error".
assert!(!is_error_var_ident("isErrorEnabled"));
@ -371,3 +464,103 @@ mod err_ident_tests {
assert!(!is_error_var_ident("perform"));
}
}
#[cfg(test)]
mod terminator_call_tests {
    use super::call_never_returns;
    use crate::cfg::{CallMeta, NodeInfo, StmtKind};

    /// Build a minimal `Call` node whose callee text is `callee`; every
    /// other field keeps its default value.
    fn call_node(callee: &str) -> NodeInfo {
        let call = CallMeta {
            callee: Some(callee.to_string()),
            ..Default::default()
        };
        NodeInfo {
            kind: StmtKind::Call,
            call,
            ..Default::default()
        }
    }

    /// Shorthand: does a call to `callee` count as never returning?
    fn terminates(callee: &str) -> bool {
        call_never_returns(&call_node(callee))
    }

    #[test]
    fn recognises_go_testing_fatal_methods() {
        let callees = [
            // Bare method name on any receiver, the canonical minio test
            // shape `c.Fatalf("bucket creat error: %v", err)`.
            "c.Fatalf",
            "t.Fatal",
            "t.Fatalf",
            "t.Fatalln",
            "b.Fatal",
            "t.FailNow",
            // Logger panics (handler-style fatal).
            "logger.Panic",
            "logger.Panicf",
        ];
        for callee in callees {
            assert!(terminates(callee));
        }
    }

    #[test]
    fn recognises_go_std_terminators() {
        let callees = [
            "os.Exit",
            "syscall.Exit",
            "runtime.Goexit",
            "log.Fatal",
            "log.Fatalf",
            "log.Fatalln",
            "log.Panic",
            "klog.Exit",
            // Bare builtin
            "panic",
        ];
        for callee in callees {
            assert!(terminates(callee));
        }
    }

    #[test]
    fn recognises_rust_and_python_std_terminators() {
        let callees = [
            "std::process::exit",
            "std::process::abort",
            "process::exit",
            "sys.exit",
            "os._exit",
        ];
        for callee in callees {
            assert!(terminates(callee));
        }
    }

    #[test]
    fn does_not_claim_user_defined_lookalikes() {
        let callees = [
            // Bare `Exit` on a custom receiver is a normal method, not the
            // process-level terminator. The bare callee path only matches
            // exact std-library forms.
            "server.Exit",
            "Exit",
            "session.exit",
            // Bare `panic` is a Go builtin; method `panic` is not.
            // The recogniser keys off the full callee path so
            // `widget.panic` does not match.
            "widget.panic",
            // Common helpers that *don't* terminate.
            "log.Print",
            "log.Println",
            "t.Errorf",
            "t.Logf",
            "c.Skip",
        ];
        for callee in callees {
            assert!(!terminates(callee));
        }
    }

    #[test]
    fn requires_call_kind() {
        // Only StmtKind::Call nodes are inspected; an If or Seq node
        // carrying the same callee text wouldn't ever come through
        // this path. Defensive: confirm the kind gate.
        for kind in [StmtKind::Seq, StmtKind::If] {
            let node = NodeInfo {
                kind,
                ..call_node("t.Fatal")
            };
            assert!(!call_never_returns(&node));
        }
    }

    #[test]
    fn missing_callee_does_not_panic() {
        // A Call node without callee text must be handled gracefully.
        let call = CallMeta {
            callee: None,
            ..Default::default()
        };
        let node = NodeInfo {
            kind: StmtKind::Call,
            call,
            ..Default::default()
        };
        assert!(!call_never_returns(&node));
    }
}

View file

@ -29,7 +29,7 @@ pub struct UnguardedSink;
/// receiver recorded as a compound identifier rather than a named binding).
fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
// Fast path: syntactic literal detection from CFG construction.
// Strictly weaker than the one-hop trace below serves as an
// Strictly weaker than the one-hop trace below, serves as an
// optimization for the common case of inline literal arguments.
if ctx.cfg[sink].all_args_literal {
return true;
@ -127,17 +127,17 @@ fn ssa_all_sink_operands_constant(
/// SSA-backed reassign-aware safety probe: every operand of the sink
/// resolves to a constant, callee fragment, OR a function parameter that
/// is not itself a Source. Used at the cfg-unguarded-sink site under
/// `!has_taint` the taint engine has already proved no source-tainted
/// `!has_taint`, the taint engine has already proved no source-tainted
/// data reaches the sink, so a non-source Param at operand position is
/// inert payload-wise (e.g. HTTP writer in `Fprintf(w, "<h1>", "Guest")`).
///
/// Gated on the function body actually exhibiting the reassign-to-constant
/// signature at least one named SSA def whose RHS is a literal Const
/// signature, at least one named SSA def whose RHS is a literal Const
/// (`name = "Guest"`). In a thin wrapper without a same-block named
/// const assignment (`fn wrap(p) { sink(p) }`, or C `popen(buf, "r")` where
/// `buf` is filled in-place by `sprintf` with no Const Assign on `buf`),
/// the bare Param at operand position IS the payload and the suppression's
/// rationale does not apply `cfg-unguarded-sink` must still fire.
/// rationale does not apply, `cfg-unguarded-sink` must still fire.
fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
let Some(facts) = ctx.body_const_facts else {
return false;
@ -165,13 +165,13 @@ fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex)
}
/// Return true if the SSA body contains a *named* variable whose definition
/// is a constant the SSA signature of an explicit `name = "literal"`
/// is a constant, the SSA signature of an explicit `name = "literal"`
/// reassignment. Used as the gate for the broader operand-Param suppression:
/// the suppression's purpose is the reassign-to-constant idiom, which by
/// definition has at least one named const assignment. In a thin wrapper
/// (`fn wrap(p) { sink(p) }` or `popen(buf, "r")` where `buf` is filled by
/// `sprintf`), no such named const assignment exists and the suppression's
/// rationale doesn't apply so the bare-Param structural finding fires.
/// rationale doesn't apply, so the bare-Param structural finding fires.
fn func_body_has_named_const_assign(facts: &BodyConstFacts) -> bool {
for block in &facts.ssa.blocks {
for inst in &block.body {
@ -228,7 +228,7 @@ fn ssa_operand_const_or_param(
// CFG-node-level Source label: when an SSA `Call` corresponds to a
// Source-labeled CFG node (e.g. `env::var(...)` whose callee
// matches a `LabelRule` Source matcher), the call's result is
// tainted user input refuse, regardless of how the SSA
// tainted user input, refuse, regardless of how the SSA
// happened to lower. Catches the `SsaOp::Call` lowering of
// labeled Source functions, which the `SsaOp::Source` arm only
// sees for callee-less pure sources like PHP `$_GET`.
@ -266,7 +266,7 @@ fn ssa_operand_const_or_param(
}
SsaOp::Source => return false,
SsaOp::Nop | SsaOp::Undef => {}
// FieldProj: walk the receiver `obj.f` is constant iff `obj`
// FieldProj: walk the receiver, `obj.f` is constant iff `obj`
// is constant under the same definition. The field name itself
// is structural and adds no runtime value.
SsaOp::FieldProj { receiver, .. } => stack.push(*receiver),
@ -321,7 +321,7 @@ fn ssa_operand_constant(
}
SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam | SsaOp::Source => {
// Only acceptable when the param's `var_name` is a callee
// fragment i.e. an identifier that only appears because
// fragment, i.e. an identifier that only appears because
// the CFG recorded name components of the dotted/chained
// callee as uses. Real parameters and sources are dynamic.
let name = inst.var_name.as_deref().unwrap_or("");
@ -333,7 +333,7 @@ fn ssa_operand_constant(
}
}
SsaOp::Nop => {}
// Undef is a non-user, non-dynamic sentinel treat like Const
// Undef is a non-user, non-dynamic sentinel, treat like Const
// (no additional operands to trace).
SsaOp::Undef => {}
// FieldProj: structural field read; constness reduces to the
@ -440,7 +440,7 @@ fn sink_args_typed_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap)
!is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts)
}
// Constant string literals used as inline args (e.g. `"listener"`,
// `"-c"`) are not user-controlled treat as non-real for the
// `"-c"`) are not user-controlled, treat as non-real for the
// "all int-typed" test so they don't block suppression.
SsaOp::Const(_) => false,
_ => true,
@ -477,7 +477,7 @@ fn type_facts_suppress(values: &[SsaValue], sink_caps: Cap, type_facts: &TypeFac
/// lookup idiom (e.g. `map.get(x).unwrap_or("safe")` over literal inserts)
/// should clear a command-injection sink.
///
/// Only fires for `Cap::SHELL_ESCAPE` SQL / path suppression from this
/// Only fires for `Cap::SHELL_ESCAPE`, SQL / path suppression from this
/// domain would require stronger reasoning (literal keys can still carry
/// SQL tokens if the inserts themselves contain them).
fn sink_args_static_map_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
@ -595,6 +595,71 @@ fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option<Ca
None
}
/// Detect the indirect-validator idiom behind an `if (X)` / `if (!X)`
/// condition and report which validator call produced `X`.
///
/// The idiom matches when the condition references exactly one
/// variable, and the textually-latest [`StmtKind::Call`] node that
/// `defines` that variable ahead of the condition (inside the same
/// enclosing function) has a `callee` accepted by
/// [`crate::ssa::type_facts::classify_input_validator_callee`].
///
/// Returns the validator callee name on a match. Returns `None`
/// (conservatively) when the condition has zero or several variables,
/// when no eligible defining call exists, or when the chosen callee is
/// not classified as a validator.
///
/// This mirrors the SSA branch-narrowing layer
/// ([`crate::taint::ssa_transfer::apply_input_validator_branch_narrowing`])
/// so that structural `cfg-unguarded-sink` suppression recognises the
/// same validators as the taint engine.
///
/// The lookup is driven by CFG `TaintMeta.defines` rather than per-body
/// SSA value-defs: nested arrow-function bodies are sometimes lowered
/// with empty SSA in the cfg-analysis context, whereas the CFG nodes
/// carry `defines` in every body.
fn cond_indirect_validator_callee(
    info: &crate::cfg::NodeInfo,
    ctx: &AnalysisContext,
) -> Option<String> {
    // The pattern requires exactly one condition variable.
    let target = match info.condition_vars.first() {
        Some(only) if info.condition_vars.len() == 1 => only.as_str(),
        _ => return None,
    };
    let owner_func = info.ast.enclosing_func.as_deref();
    let cond_start = info.ast.span.0;

    // Candidate definitions: Call nodes in the same function that define
    // `target` and start textually before the condition (a reassignment
    // after the `if` cannot be the def reaching it). Among those, keep
    // the one with the highest span start as a cheap latest-def proxy
    // that avoids a full dominator analysis. On equal span starts the
    // first node visited is kept.
    let latest_def = ctx
        .cfg
        .node_indices()
        .map(|idx| &ctx.cfg[idx])
        .filter(|node| {
            node.kind == crate::cfg::StmtKind::Call
                && node.taint.defines.as_deref() == Some(target)
                && node.ast.enclosing_func.as_deref() == owner_func
                && node.ast.span.0 < cond_start
        })
        .filter_map(|node| Some((node.ast.span.0, node.call.callee.as_deref()?)))
        .fold(None::<(usize, &str)>, |winner, candidate| match winner {
            Some(current) if current.0 >= candidate.0 => Some(current),
            _ => Some(candidate),
        });

    let (_, callee) = latest_def?;
    // Only report the callee when the classifier recognises it.
    crate::ssa::type_facts::classify_input_validator_callee(callee).map(|_| callee.to_owned())
}
/// Find all nodes in the CFG that are calls to guard functions.
fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
let guard_rules = rules::guard_rules(ctx.lang);
@ -620,6 +685,24 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
| PredicateKind::ValidationCall
) {
result.push((idx, Cap::all()));
} else if cond_indirect_validator_callee(info, ctx).is_some() {
// Indirect-validator pattern:
// const err = validate(x); if (err) throw …;
// const ok = isValid(x); if (!ok) throw …;
// The classifier returns Unknown / NullCheck / ErrorCheck
// because the if-condition is a bare result variable, not
// a direct call expression. `cond_indirect_validator_callee`
// handles that by scanning the CFG for nodes whose
// `TaintMeta.defines` matches the condition variable and
// checking whether any defining Call has an
// `is_input_validator_callee`-recognised callee. This keeps
// cfg-unguarded-sink suppression aligned with the same
// structural validator recognition the SSA branch-narrowing
// layer uses, without requiring the condition itself to be
// a direct call expression.
//
// Motivated by Novu CVE GHSA-4x48-cgf9-q33f.
result.push((idx, Cap::all()));
} else if matches!(
kind,
PredicateKind::ShellMetaValidated | PredicateKind::BoundedLength
@ -733,7 +816,7 @@ fn sink_arg_is_parameter_only(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
let sink_uses = &sink_info.taint.uses;
if sink_uses.is_empty() {
// No identifiable arguments could be a constant call like Command::new("ls")
// No identifiable arguments, could be a constant call like Command::new("ls")
return true; // treat as non-dangerous (constant arg)
}
@ -787,7 +870,7 @@ pub(crate) fn has_redirect_path_prefix(source_bytes: &[u8], span: (usize, usize)
false
}
/// Check if this sink is an internal redirect a `res.redirect` (SSRF sink)
/// Check if this sink is an internal redirect, a `res.redirect` (SSRF sink)
/// whose argument is a template literal or string starting with `/`, indicating
/// a server-relative path rather than an attacker-controlled URL.
fn is_internal_redirect(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
@ -896,7 +979,7 @@ impl CfgAnalysis for UnguardedSink {
let source_derived = sink_arg_is_source_derived(ctx, *sink);
// If sink args are all constants (including one-hop constant bindings)
// and taint didn't confirm, this is a false positive skip it.
// and taint didn't confirm, this is a false positive, skip it.
if is_all_args_constant(ctx, *sink) && !has_taint {
continue;
}
@ -904,7 +987,7 @@ impl CfgAnalysis for UnguardedSink {
// SSA latest-def suppression: when the taint engine has already
// proved no source-tainted data reaches this sink (`!has_taint`)
// and every SSA operand resolves to a constant, callee-fragment
// pseudo-name, OR a function parameter that is not a Source
// pseudo-name, OR a function parameter that is not a Source,
// the sink's actual arguments cannot carry an injection payload.
// Catches the reassign-to-constant idiom (`name := req.x; name =
// "Guest"; sink(name)`) where the latest SSA def is a literal
@ -919,7 +1002,7 @@ impl CfgAnalysis for UnguardedSink {
// Type-aware suppression: when all SSA operand values of the sink
// are proven to carry non-injectable types (e.g. integers parsed
// from a raw source), the arguments cannot form a payload for
// SHELL/SQL/FILE sinks. Skip the structural finding the taint
// SHELL/SQL/FILE sinks. Skip the structural finding, the taint
// engine already covers the source→sink flow via type-aware
// suppression. Unknown-typed or mixed operands fall through.
if !has_taint && sink_args_typed_safe(ctx, *sink, sink_caps) {
@ -936,13 +1019,13 @@ impl CfgAnalysis for UnguardedSink {
// Parameterized SQL queries: arg 0 is a string literal with
// placeholders ($1, ?, %s, :name) and a params argument exists.
// These are safe by construction the driver handles escaping.
// These are safe by construction, the driver handles escaping.
if sink_info.parameterized_query {
continue;
}
// Internal redirects: res.redirect(`/path/...`) with a path-prefix
// argument are server-relative not attacker-controlled URLs.
// argument are server-relative, not attacker-controlled URLs.
if is_internal_redirect(ctx, *sink, sink_caps) {
continue;
}
@ -953,10 +1036,10 @@ impl CfgAnalysis for UnguardedSink {
let (severity, confidence) = if has_taint || source_derived {
(Severity::High, Confidence::High)
} else if param_only && !in_entrypoint {
// Wrapper function with param-only args zero signal. Suppress.
// Wrapper function with param-only args, zero signal. Suppress.
continue;
} else if !ctx.taint_active {
// AST-only / cfg-only mode preserve as LOW (unchanged)
// AST-only / cfg-only mode, preserve as LOW (unchanged)
(Severity::Low, Confidence::Low)
} else {
// taint_active=true but found nothing.
@ -970,7 +1053,7 @@ impl CfgAnalysis for UnguardedSink {
// If the function containing the sink has no Source-labeled
// nodes AND no parameters (through which taint could flow
// from callers), taint ran and found nothing because there
// is nothing to find. Suppress the structural finding
// is nothing to find. Suppress, the structural finding
// is noise.
let sink_func = sink_info.ast.enclosing_func.as_deref();
let has_sources = ctx.cfg.node_indices().any(|n| {

View file

@ -1,3 +1,5 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))]
pub mod auth;
pub mod dominators;
pub mod error_handling;
@ -30,17 +32,15 @@ pub struct BodyConstFacts {
pub type_facts: TypeFactResult,
/// Field-sensitive Steensgaard points-to facts.
///
/// Computed only when [`crate::pointer::is_enabled()`] (i.e. the
/// `NYX_POINTER_ANALYSIS=1` env var is set). Phase 2 of the
/// pointer-analysis rollout consumes this in `state::transfer.rs`
/// to suppress proxy-acquire mis-attribution on field-aliased
/// locals like `m := c.mu`. When `None`, every consumer must fall
/// back to its existing pointer-unaware behaviour.
/// Computed only when [`crate::pointer::is_enabled()`].
/// `state::transfer.rs` consumes this to suppress proxy-acquire
/// mis-attribution on field-aliased locals like `m := c.mu`. When
/// `None`, consumers fall back to pointer-unaware behaviour.
pub pointer_facts: Option<crate::pointer::PointsToFacts>,
}
/// Lower a body to SSA and run constant propagation. Returns `None` when
/// lowering fails (empty CFG, invalid entry) callers treat absence as
/// lowering fails (empty CFG, invalid entry), callers treat absence as
/// "no SSA facts available" and fall back to the syntactic path.
pub fn build_body_const_facts(body: &crate::cfg::BodyCfg, lang: Lang) -> Option<BodyConstFacts> {
let mut ssa = crate::ssa::lower_to_ssa_with_params(
@ -116,13 +116,13 @@ pub struct AnalysisContext<'a> {
/// Structural analyses use it to suppress findings when a sink's argument
/// SSA values are proven to carry non-injectable types (e.g. integers
/// parsed from a raw source can't form SHELL/SQL/path payloads). Sourced
/// from `body_const_facts` when present keep both pointers coherent.
/// from `body_const_facts` when present, keep both pointers coherent.
pub type_facts: Option<&'a TypeFactResult>,
/// Decorators / annotations / attributes attached to the body's
/// declaration (e.g. Python `@login_required`, Java `@PreAuthorize`,
/// Symfony `#[IsGranted(...)]`). Consumed by the AuthGap analysis to
/// suppress `cfg-auth-gap` when the framework already enforces auth at
/// the function-declaration level the gap only matters when the
/// the function-declaration level, the gap only matters when the
/// auth call has to live inside the body.
pub auth_decorators: &'a [String],
}

View file

@ -25,7 +25,7 @@ fn find_acquire_nodes(
}
if let Some(callee) = &info.call.callee {
let callee_lower = callee.to_ascii_lowercase();
// Check exclusions first if the callee matches an exclude
// Check exclusions first, if the callee matches an exclude
// pattern, it is NOT an acquire even if it also matches an
// acquire pattern (e.g. `freopen` ends with `fopen`).
let excluded = exclude_patterns.iter().any(|p| {
@ -74,7 +74,7 @@ fn find_release_nodes(ctx: &AnalysisContext, release_patterns: &[&str]) -> Vec<N
/// `if (acquire_var)` (or `if (!acquire_var)`) and the edge represents
/// "acquire_var is null", the resource was never actually produced on that
/// path, so a release is unnecessary. This closes the canonical
/// `FILE *f = fopen(...); if (f) fclose(f);` idiom without this rule the
/// `FILE *f = fopen(...); if (f) fclose(f);` idiom, without this rule the
/// false edge of the null check provides a path acquire→exit that misses
/// the release, producing a may-leak FP.
fn release_on_all_exit_paths(
@ -103,8 +103,8 @@ fn release_on_all_exit_paths(
/// the resource handle is null/falsy and therefore not actually acquired.
///
/// Recognises:
/// * `if (var)` false edge means `var` is null
/// * `if (!var)` true edge means `var` is null
/// * `if (var)`, false edge means `var` is null
/// * `if (!var)`, true edge means `var` is null
///
/// Rejects comparisons (`if (var != NULL)`), method calls
/// (`if (var.is_valid())`), and composite conditions (`if (var && cond)`).
@ -198,7 +198,7 @@ fn all_paths_pass_through(
/// - `obj.field = var` (C dot / generic field store)
/// - `list->next = ...` (linked-list insertion)
///
/// If the variable is transferred, there is no leak the receiving struct is
/// If the variable is transferred, there is no leak, the receiving struct is
/// responsible for the lifetime.
fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
let acquired_var = match &ctx.cfg[acquire].taint.defines {
@ -258,7 +258,7 @@ fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
false
};
if !is_field_write {
continue; // genuine redefinition stop this path
continue; // genuine redefinition, stop this path
}
}
@ -343,7 +343,7 @@ fn is_consumed_by_owner(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
}
}
// Also check the span text for consuming calls handles cases where
// Also check the span text for consuming calls, handles cases where
// the call is embedded in a return statement (e.g. `return FileResponse(f)`)
if info.taint.uses.iter().any(|u| u == &acquired_var) {
let (start, end) = info.ast.span;

View file

@ -141,7 +141,7 @@ static JAVA_AUTH: &[AuthRule] = &[AuthRule {
"hasPermission",
"requireRole",
// Spring Security / JAX-RS annotation names (used by decorator
// detection see `extract_auth_decorators` in src/cfg.rs).
// detection, see `extract_auth_decorators` in src/cfg.rs).
"PreAuthorize",
"PostAuthorize",
"Secured",
@ -174,7 +174,7 @@ static JS_AUTH: &[AuthRule] = &[AuthRule {
"jwt.verify",
// NestJS-style decorators and guard class names (seeded by decorator
// arg extraction in `extract_auth_decorators`). `UseGuards` alone is
// too generic we still match on guard *argument* identifiers here.
// too generic, we still match on guard *argument* identifiers here.
"Authenticated",
"AuthGuard",
"JwtAuthGuard",
@ -268,7 +268,7 @@ static CPP_AUTH: &[AuthRule] = &[AuthRule {
"check_auth",
"verify_token",
"validate_token",
// Custom C++ attributes framework-defined, bare-name match.
// Custom C++ attributes, framework-defined, bare-name match.
"authenticated",
"require_auth",
"admin_only",
@ -287,7 +287,7 @@ static RUST_AUTH: &[AuthRule] = &[AuthRule {
"check_auth",
"verify_token",
"validate_token",
// Custom proc-macro attributes framework-defined, bare-name match.
// Custom proc-macro attributes, framework-defined, bare-name match.
"authenticated",
"require_auth",
"admin_only",

View file

@ -127,7 +127,7 @@ fn unreachable_code_detection_runs_without_panic() {
#[test]
fn all_branches_reachable_no_findings() {
// All branches reachable no unreachable-code findings
// All branches reachable, no unreachable-code findings
let src = br#"
use std::process::Command;
fn main() {
@ -282,7 +282,7 @@ fn ssa_const_prop_preserves_sink_on_dynamic_source_arg() {
#[test]
fn unguarded_sink_detected() {
// Sink with no validation should be flagged
// Sink with no validation, should be flagged
let src = br#"
use std::process::Command;
fn main() {
@ -335,6 +335,90 @@ fn guarded_sink_with_sanitizer_not_flagged() {
);
}
/// Regression guard for `cond_indirect_validator_callee`.
///
/// An earlier version selected the textually-last call defining the
/// condition variable anywhere in the function, which included
/// reassignments placed **after** the `if`. If that later call was not
/// a recognised validator, the indirect-validator pattern was missed and
/// the guarded sink was wrongly reported as `cfg-unguarded-sink`.
///
/// Shape under test:
///     let err = validateInput(cmd);   // real validator, before the if
///     if (err) throw …;               // sink-guarding branch
///     eval(cmd);                      // sink dominated by the guard
///     err = recordMetric();           // later reassignment, NOT a validator
///
/// `validateInput` is the definition that reaches the `if`; the
/// `recordMetric` reassignment sits downstream of the use and must not
/// shadow it.
#[test]
fn indirect_validator_ignores_reassignment_after_if() {
    let src = br#"
async function handler(req) {
const cmd = req.query.cmd;
let err = await validateInput(cmd);
if (err) {
throw new Error('blocked');
}
eval(cmd);
err = recordMetric();
}
"#;
    let language = Language::from(tree_sitter_javascript::LANGUAGE);
    let findings = parse_and_analyse(&guards::UnguardedSink, src, "javascript", language);
    // Only the structural unguarded-sink rule is relevant here.
    let unguarded = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-unguarded-sink")
        .collect::<Vec<_>>();
    assert!(
        unguarded.is_empty(),
        "later non-validator reassignment must not shadow the real validator def reaching the if; got {:?}",
        unguarded
    );
}
/// Baseline companion to `indirect_validator_ignores_reassignment_after_if`.
///
/// With no trailing reassignment, the same validate-then-check pattern
/// is already suppressed by `cond_indirect_validator_callee`. Pinning it
/// here means a future change to indirect-validator recognition cannot
/// silently break the baseline while the regression case still passes.
#[test]
fn indirect_validator_baseline_suppresses_dominated_sink() {
    let src = br#"
async function handler(req) {
const cmd = req.query.cmd;
const err = await validateInput(cmd);
if (err) {
throw new Error('blocked');
}
eval(cmd);
}
"#;
    let language = Language::from(tree_sitter_javascript::LANGUAGE);
    let findings = parse_and_analyse(&guards::UnguardedSink, src, "javascript", language);
    // Only the structural unguarded-sink rule is relevant here.
    let unguarded = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-unguarded-sink")
        .collect::<Vec<_>>();
    assert!(
        unguarded.is_empty(),
        "indirect-validator pattern (no reassignment) must suppress dominated sink; got {:?}",
        unguarded
    );
}
// ─── Auth gap tests ────────────────────────────────────────────────────
#[test]
@ -397,7 +481,7 @@ fn auth_check_before_sink_no_finding() {
#[test]
fn error_fallthrough_analysis_runs_on_go() {
// Go pattern: err check without return, followed by dangerous call.
// This is a heuristic analysis we verify it runs without panicking.
// This is a heuristic analysis, we verify it runs without panicking.
let src = br#"
package main
import "os/exec"
@ -422,7 +506,7 @@ fn error_fallthrough_analysis_runs_on_go() {
#[test]
fn proper_error_return_no_finding_go() {
// Go pattern: err check with return should not flag error fallthrough.
// Go pattern: err check with return, should not flag error fallthrough.
let src = br#"
package main
import "os/exec"
@ -820,6 +904,7 @@ fn taint_and_unguarded_sink_deduped() {
path_hash: 0,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
effective_sink_caps: crate::labels::Cap::empty(),
}];
let findings = parse_and_run_all_with_taint(
@ -949,7 +1034,7 @@ function readFile() {
#[test]
fn js_throw_terminates_block() {
// throw should act as a terminator code directly after throw in the same
// throw should act as a terminator, code directly after throw in the same
// block should be unreachable.
let src = br#"
function fail() {
@ -1031,7 +1116,7 @@ fn configured_terminator_stops_flow() {
"eval should be unreachable after process.exit terminator"
);
}
// If eval_nodes is empty it means the node wasn't created (also acceptable
// If eval_nodes is empty it means the node wasn't created (also acceptable,
// it's after a terminator so the CFG may not even emit it)
}
@ -1480,7 +1565,7 @@ void process() {
let reachable = dominators::reachable_set(cfg, entry);
// All nodes should be reachable the preproc recovery should prevent
// All nodes should be reachable, the preproc recovery should prevent
// the dangling-else from orphaning downstream code.
let unreachable_count = cfg.node_count() - reachable.len();
assert!(
@ -1515,7 +1600,7 @@ void process() {
let reachable = dominators::reachable_set(cfg, entry);
// All nodes should be reachable break exits the loop and post-loop
// All nodes should be reachable, break exits the loop and post-loop
// code (free(x)) should be connected.
let unreachable_count = cfg.node_count() - reachable.len();
assert!(
@ -1878,7 +1963,7 @@ def run():
#[test]
fn python_one_hop_constant_still_suppressed() {
// cmd = "ls"; os.system(cmd) `all_args_literal` is false (identifier arg),
// cmd = "ls"; os.system(cmd), `all_args_literal` is false (identifier arg),
// but should still be suppressed via existing one-hop constant trace in cfg_analysis.
let src = br#"
import os
@ -1959,7 +2044,7 @@ def run():
#[test]
fn python_constant_receiver_tainted_arg_produces_finding() {
// safe_obj.system(user_input) constant receiver is irrelevant, tainted arg must report
// safe_obj.system(user_input), constant receiver is irrelevant, tainted arg must report
let src = br#"
import os
import sys

View file

@ -26,7 +26,7 @@ fn event_handler_callbacks(ctx: &AnalysisContext) -> HashSet<String> {
.iter()
.any(|h| callee_lower.ends_with(&h.to_ascii_lowercase()));
if is_handler {
// The callback function is typically used within the call any function
// The callback function is typically used within the call, any function
// that appears as `uses` of this call node is a potential callback.
for u in &info.taint.uses {
callbacks.insert(u.clone());
@ -113,7 +113,7 @@ impl CfgAnalysis for UnreachableCode {
Severity::Medium,
)
} else {
// Plain unreachable code low severity
// Plain unreachable code, low severity
continue;
}
};

View file

@ -57,7 +57,7 @@ fn print_toml_with_highlights(toml_str: &str) {
continue;
}
// key = value lines (but not `[xxx]`). Split on the first `=`
// that isn't inside a quoted string TOML keys don't contain
// that isn't inside a quoted string, TOML keys don't contain
// `=` outside quotes, so a leading-segment split is safe enough
// for the common case. Continuation lines from multi-line
// arrays/strings won't have `=` and fall through to plain.
@ -149,7 +149,7 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option<tom
}
}
None => {
// Key absent in defaults keep entirely.
// Key absent in defaults, keep entirely.
out.insert(k.clone(), v.clone());
}
}
@ -160,9 +160,9 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option<tom
Some(toml::Value::Table(out))
}
}
// Identical leaf drop.
// Identical leaf, drop.
_ if effective == defaults => None,
// Differing leaf or shape change keep the effective value.
// Differing leaf or shape change, keep the effective value.
_ => Some(effective.clone()),
}
}
@ -180,13 +180,13 @@ fn count_top_level_keys(toml_str: &str) -> usize {
continue;
}
if trimmed.starts_with('[') {
// Section header not an override on its own. Reset
// Section header, not an override on its own. Reset
// any stuck multi-line state defensively.
in_multiline = false;
continue;
}
if in_multiline {
// Inside a multi-line array/inline table closing bracket
// Inside a multi-line array/inline table, closing bracket
// ends it, intermediate lines don't count.
if trimmed.starts_with(']') || trimmed.starts_with('}') {
in_multiline = false;

View file

@ -123,7 +123,7 @@ pub fn build_index_with_observer(
logs: Option<&Arc<ScanLogCollector>>,
) -> NyxResult<()> {
// Pass 1 of the indexed scan reads persisted summaries produced here, so
// framework context must be populated at index-build time otherwise
// framework context must be populated at index-build time, otherwise
// framework-conditional label rules never contribute to the summaries
// and indexed scans diverge from non-indexed ones. Matches the
// auto-fill in scan_filesystem_with_observer /
@ -152,7 +152,7 @@ pub fn build_index_with_observer(
let walk_start = std::time::Instant::now();
let (rx, handle) = spawn_file_walker(project_path, config);
// Drain the channel BEFORE joining the bounded channel will deadlock
// Drain the channel BEFORE joining, the bounded channel will deadlock
// if we join first and the walker blocks on send.
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
if let Err(err) = handle.join() {
@ -205,7 +205,7 @@ pub fn build_index_with_observer(
.try_for_each(|path| -> NyxResult<()> {
let mut idx = Indexer::from_pool(project_name, &pool)?;
// Read once, hash once pass bytes to both rule execution and
// Read once, hash once, pass bytes to both rule execution and
// summary extraction. Use pre-computed hash for upsert to avoid
// a redundant file read inside upsert_file.
let bytes = std::fs::read(&path)?;

View file

@ -21,7 +21,7 @@ pub fn handle_command(
// Resolve engine options once for the whole process. Scan overlays CLI
// flags below; other subcommands use the config values verbatim. The
// install is a no-op after the first call, so Scan's overlay must happen
// before we reach this point for its own call path we delay the install
// before we reach this point for its own call path, we delay the install
// to the Scan arm and gate non-scan commands behind a fallback install of
// the bare config values.
let install_from_config = |config: &Config| {
@ -378,7 +378,7 @@ fn print_engine_explanation(config: &Config, engine_profile: Option<EngineProfil
use console::style;
// Plain-text on/off, padded to 3 chars so the trailing column aligns
// regardless of which value is rendered. Colour is layered on top
// regardless of which value is rendered. Colour is layered on top;
// the visible width stays 3 characters because `console::style` emits
// zero-width ANSI codes (and nothing at all when NO_COLOR is set).
fn onoff(b: bool) -> String {

View file

@ -54,7 +54,7 @@ fn record_persist_error(errors: &Arc<Mutex<Vec<String>>>, message: String) {
/// When `enabled` is true, a panic inside `f` is caught, logged, and
/// converted into a `NyxError::Msg`; callers that already match on
/// `Err(_)` will gracefully skip the file. When `enabled` is false,
/// the panic propagates unchanged preserving the default behaviour
/// the panic propagates unchanged, preserving the default behaviour
/// for users who want to catch engine bugs loudly.
///
/// `AssertUnwindSafe` is load-bearing: closures over `&Config` /
@ -222,7 +222,7 @@ fn is_false(b: &bool) -> bool {
/// Framework detection drives framework-conditional label rules (e.g. actix /
/// axum / rocket handler-arg sources, Rails route helpers) and auth-analysis
/// extractors. If any scan entry point forgets to populate it, the indexed
/// and non-indexed paths silently diverge missing framework-specific
/// and non-indexed paths silently diverge, missing framework-specific
/// findings in whichever path skipped detection. This helper exists so the
/// auto-fill stays consistent across `scan_filesystem_with_observer`,
/// `scan_with_index_parallel_observer`, and `build_index_with_observer`.
@ -239,7 +239,7 @@ pub(crate) fn ensure_framework_ctx(root: &Path, cfg: &Config) -> Option<Config>
///
/// Drives the one-time `preview-tier scan` banner in `handle()`. Tracks
/// the extensions `lang_for_path` in `ast.rs` maps to the `"c"` and `"cpp"`
/// slugs keep this aligned with that mapping.
/// slugs, keep this aligned with that mapping.
pub(crate) fn is_preview_tier_path(path: &Path) -> bool {
matches!(
path.extension()
@ -514,14 +514,14 @@ pub fn retain_converged_findings(diags: &mut Vec<Diag>) {
/// the same function; tiebreak by source line asc, source col asc).
///
/// Rule IDs of the form `taint-unsanitised-flow (source L:C)` share a single
/// base `taint-unsanitised-flow`. The grouping key is column-agnostic
/// base `taint-unsanitised-flow`. The grouping key is column-agnostic:
/// multiple flows to the same sink line differing only in column or source
/// are collapsed to one. The rule_id preserves the source location, so the
/// kept representative still identifies which flow was reported.
///
/// The grouping key **includes the resolved sink capability bits** so that
/// two different sinks on the same line (e.g. `sink_sql(x); sink_shell(x);`)
/// are not collapsed into one finding they represent materially different
/// are not collapsed into one finding, they represent materially different
/// vulnerabilities and must surface independently. Findings with different
/// base rule IDs (e.g. `js.code_exec.eval`) or different severities are
/// left untouched per guardrails.
@ -560,7 +560,7 @@ pub(crate) fn deduplicate_taint_flows(diags: &mut Vec<Diag>) {
let src_col = src.map(|s| s.col).unwrap_or(u32::MAX);
// Same-function check: first flow_step (Source) and the step at the
// sink share an `enclosing_func`. If flow_steps are absent or the
// function markers are missing, treat as "unknown" worse than a
// function markers are missing, treat as "unknown", worse than a
// confirmed same-function match but better than a confirmed mismatch.
let same_function_flag: u32 = ev
.and_then(|e| {
@ -677,7 +677,7 @@ pub const SCC_UNCONVERGED_CROSS_FILE_NOTE_PREFIX: &str = "scc_unconverged:cross-
/// [`GlobalSummaries::snapshot_caps`] results.
///
/// Used by the Phase-B worklist to derive the next iteration's dirty
/// file set. Semantics match [`diff_cap_snapshots`] a key that
/// file set. Semantics match [`diff_cap_snapshots`], a key that
/// appears or disappears counts as changed.
fn changed_cap_keys_of(
before: &HashMap<crate::symbol::FuncKey, (u16, u16, u16, Vec<usize>)>,
@ -728,7 +728,7 @@ fn changed_ssa_keys_of(
///
/// Called once per unconverged batch (after the pass-2 rayon parallelism
/// has collected `iteration_diags`) so the cost is O(n) over the batch's
/// findings much cheaper than a per-finding `warn!`.
/// findings, much cheaper than a per-finding `warn!`.
///
/// Confidence is **capped** at `Low` rather than unconditionally set:
/// upstream analysis may have proven something particularly strong about
@ -795,7 +795,7 @@ fn tag_unconverged_findings(
/// Safety cap on SCC fixed-point iterations.
///
/// The convergence predicate is *snapshot equality* we break as soon as
/// The convergence predicate is *snapshot equality*, we break as soon as
/// an iteration leaves both `snapshot_caps()` and `snapshot_ssa()`
/// unchanged. The cap only triggers if something prevents monotone
/// progress (e.g. a non-monotone SSA summary refinement or an SCC larger
@ -809,7 +809,7 @@ fn tag_unconverged_findings(
/// SCC with `k` functions arranged in a chain, fresh taint introduced at
/// one end of the chain needs up to `k` iterations to reach the other
/// end. A hard cap of 3 was silently truncating propagation for any
/// SCC of 4+ cross-file functions findings vanished with no warning.
/// SCC of 4+ cross-file functions, findings vanished with no warning.
///
/// `FuncSummary` is a finite-height lattice (≤ 48 bits of caps + a
/// bounded vector of parameter indices) and `insert()` is strictly
@ -865,7 +865,7 @@ fn effective_scc_cap() -> usize {
/// persisted by non-recursive topo batches in the most recent
/// [`run_topo_batches`] invocation. Intended for the regression tests
/// that prove the topo-refinement pipeline is wired and producing
/// observable cross-batch state see
/// observable cross-batch state, see
/// `tests/topo_pass2_refinement_tests.rs`. Cheap relaxed load.
static LAST_TOPO_NONRECURSIVE_REFINEMENTS: AtomicUsize = AtomicUsize::new(0);
@ -905,7 +905,7 @@ fn topo_refine_enabled() -> bool {
///
/// When `call_graph` is missing an edge (e.g. a summary was inserted
/// after graph construction), we conservatively fall back to
/// re-analysing the full batch correctness is preserved at the cost
/// re-analysing the full batch, correctness is preserved at the cost
/// of the worklist optimisation for that iteration.
#[allow(clippy::too_many_arguments)]
fn run_topo_batches(
@ -1104,7 +1104,7 @@ fn run_topo_batches(
// A file becomes dirty for iteration N+1 iff it
// contains at least one caller of a FuncKey that
// changed in iteration N. If no key changed, the
// dirty set is empty which implies convergence (and
// dirty set is empty, which implies convergence (and
// matches `iter_converged` above).
let changed_cap_keys = changed_cap_keys_of(&snap_before, &snap_after);
let changed_ssa_keys =
@ -1124,7 +1124,7 @@ fn run_topo_batches(
// changed key. Fall back to the full batch when the
// call graph does not resolve any caller (e.g. all
// changes happened in leaf functions that no one in
// this batch calls rare but must not regress to
// this batch calls, rare but must not regress to
// missed analysis).
let namespaces_needing_reanalysis =
crate::callgraph::namespaces_for_callers(call_graph, &all_changed_keys);
@ -1165,7 +1165,7 @@ fn run_topo_batches(
}
if iter_converged {
// Snapshots equal but dirty_files non-empty is
// anomalous log and treat as converged
// anomalous, log and treat as converged
// (snapshot equality is the correctness-preserving
// signal).
tracing::debug!(
@ -1182,7 +1182,7 @@ fn run_topo_batches(
// After the loop, flatten per-file diags into the
// iteration_diags vector in batch order for deterministic
// output. Files that were in the batch but never made
// dirty (shouldn't happen — iter 0 runs all of them) are
// dirty (shouldn't happen, iter 0 runs all of them) are
// skipped silently.
let mut iteration_diags: Vec<Diag> = Vec::new();
for p in &batch.files {
@ -1268,7 +1268,7 @@ fn run_topo_batches(
// parallel section completes, persist those refinements into
// `global_summaries` sequentially. Subsequent batches in
// topo order (caller-most batches) then resolve their call
// sites against the refined cross-file context — the final
// sites against the refined cross-file context, the final
// step in the callee-first topo pipeline that pass-2
// sequencing was always meant to deliver.
//
@ -1455,7 +1455,7 @@ fn run_topo_batches(
}
}
// Orphan files (no functions in call graph) — process last, single pass.
// Orphan files (no functions in call graph), process last, single pass.
if !orphans.is_empty() {
let orphan_diags: Vec<Diag> = orphans
.par_iter()
@ -2099,7 +2099,7 @@ pub fn scan_with_index_parallel_observer(
if let Some(p) = &progress_ref {
p.set_current_file(&path.to_string_lossy());
}
// Read once, hash once — use the hash for the change check
// Read once, hash once, use the hash for the change check
// to avoid a second file read inside should_scan.
if let Ok(bytes) = std::fs::read(path) {
let hash = Indexer::digest_bytes(&bytes);
@ -2681,7 +2681,7 @@ pub fn scan_with_index_parallel_observer(
// pipeline intends to produce (taint + cfg-* + state-* from state
// analysis + auth.* when configured). A previous revision clipped this
// to `taint*`/`cfg-*` only, silently dropping state-model findings and
// breaking parity with `scan_filesystem` — fixed. Mode-scoped
// breaking parity with `scan_filesystem`, fixed. Mode-scoped
// filtering, if ever needed, belongs in the analysis layer, not here.
let post_process_start = std::time::Instant::now();
@ -3134,7 +3134,7 @@ mod dedup_taint_flow_tests {
#[test]
fn dedup_collapses_same_line_different_columns() {
// Two findings at line 10 but different columns — the widened key
// Two findings at line 10 but different columns, the widened key
// (path, line, severity) collapses them; the tighter source wins.
let mut diags = vec![
make_taint("a.rs", 10, 3, 4, 1),
@ -3151,7 +3151,7 @@ mod dedup_taint_flow_tests {
#[test]
fn dedup_does_not_drop_different_sink_caps_on_same_line() {
// Two findings at line 10, same column, same severity — but with
// Two findings at line 10, same column, same severity, but with
// different resolved sink capability bits (SQL vs SHELL). They must
// NOT collapse: different sink kinds are materially different
// vulnerabilities. Regression guard.
@ -3175,7 +3175,7 @@ mod dedup_taint_flow_tests {
#[test]
fn dedup_collapses_same_sink_caps_on_same_line() {
// Same line, same severity, same sink caps — this is the canonical
// Same line, same severity, same sink caps, this is the canonical
// dedup case (two flows to the same sink, differing only in source).
let mut diags = vec![
make_taint("a.rs", 10, 5, 3, 1),

View file

@ -88,7 +88,7 @@ pub fn handle(
// Invalidate the findings cache whenever a scan finishes so the next
// request rebuilds against fresh diags. The next-request rebuild keeps
// this hot-path simple — we only clear the slot here, never recompute.
// this hot-path simple, we only clear the slot here, never recompute.
let cache_for_invalidate = Arc::clone(&state.findings_cache);
let mut event_rx = event_tx.subscribe();
tokio::spawn(async move {
@ -152,7 +152,7 @@ async fn shutdown_signal() {
.expect("failed to listen for Ctrl+C");
eprintln!("\n Shutting down...");
// SSE connections block graceful shutdown indefinitely.
// Use a raw OS thread to force exit — tokio tasks may not
// Use a raw OS thread to force exit, tokio tasks may not
// run reliably during shutdown.
std::thread::spawn(|| {
std::thread::sleep(std::time::Duration::from_millis(250));

View file

@ -106,7 +106,7 @@ impl ConstValue {
if let Ok(i) = t.parse::<i64>() {
return Some(ConstValue::Int(i));
}
// Negative with space: "- 5" — not supported, conservative
// Negative with space: "- 5", not supported, conservative
None
}
}
@ -118,9 +118,9 @@ impl ConstValue {
pub struct TypeSet(u16);
impl TypeSet {
/// All 12 type bits set — no type constraint (Top).
/// All 12 type bits set, no type constraint (Top).
pub const TOP: Self = Self(0x0FFF);
/// No type bits — unsatisfiable (Bottom).
/// No type bits, unsatisfiable (Bottom).
pub const BOTTOM: Self = Self(0);
pub fn singleton(kind: &TypeKind) -> Self {
@ -149,7 +149,7 @@ impl TypeSet {
self == Self::TOP
}
/// Complement — all types NOT in this set.
/// Complement, all types NOT in this set.
pub fn complement(self) -> Self {
Self(!self.0 & Self::TOP.0)
}
@ -184,7 +184,7 @@ fn type_kind_index(kind: &TypeKind) -> u32 {
TypeKind::Url => 10,
TypeKind::HttpClient => 11,
TypeKind::LocalCollection => 12,
// Phase 6 DTO types carry per-field structural info that the
// DTO types carry per-field structural info that the
// bitset domain can't represent. Collapse to Unknown so callers
// still see "any type possible" rather than crashing on an
// unhandled variant. Same-file/cross-file Dto-aware paths read
@ -274,7 +274,7 @@ impl Nullability {
/// Boolean state lattice.
///
/// Same shape as [`Nullability`]. No `negate()` — negation is structural
/// Same shape as [`Nullability`]. No `negate()`, negation is structural
/// on [`ConditionExpr`](super::lower::ConditionExpr).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum BoolState {
@ -313,7 +313,7 @@ impl BoolState {
/// Abstract fact about a single SSA value.
///
/// Combines interval, constant, type, null, and boolean constraints.
/// There is intentionally no generic `negate()` on ValueFact — negation
/// There is intentionally no generic `negate()` on ValueFact, negation
/// is structural on [`ConditionExpr`](super::lower::ConditionExpr) and
/// then applied as atomic refinements by the solver.
#[derive(Clone, Debug, PartialEq, Eq)]
@ -857,14 +857,14 @@ impl PathEnv {
// `assume_neq`, and a few internal sites. Large generated inputs
// (thousands of short statements on one line) can drive millions
// of calls and overflow a plain u16 `refine_count`. Saturate to
// stay within bounds — the refinement pipeline is already
// stay within bounds, the refinement pipeline is already
// idempotent past the cap, so saturation is semantically a no-op.
self.refine_count = self.refine_count.saturating_add(1);
// Check size bound
let pos = self.facts.binary_search_by_key(&v, |(k, _)| *k);
if pos.is_err() && self.facts.len() >= MAX_PATH_ENV_ENTRIES {
return; // bounded don't grow
return; // bounded, don't grow
}
// Get meet count for widening
@ -963,7 +963,7 @@ impl PathEnv {
let ra = self.uf.find_immutable(a);
let rb = self.uf.find_immutable(b);
if ra == rb {
// Already known equal — contradiction
// Already known equal, contradiction
self.unsat = true;
return;
}
@ -1040,7 +1040,7 @@ impl PathEnv {
return;
}
// Step 4: dedup check — if this exact constraint already exists, skip
// Step 4: dedup check, if this exact constraint already exists, skip
let already_present = self
.relational
.iter()
@ -1052,7 +1052,7 @@ impl PathEnv {
if self.relational.len() < MAX_RELATIONAL {
self.relational.push((ra, op, rb));
}
// If at capacity, skip — conservative: losing a constraint only
// If at capacity, skip, conservative: losing a constraint only
// loses pruning power, never introduces unsoundness.
}
@ -1089,7 +1089,7 @@ impl PathEnv {
if has_strict || op == RelOp::Lt {
return true;
}
// All Le: a <= b <= ... <= a means all equal — satisfiable
// All Le: a <= b <= ... <= a means all equal, satisfiable
return false;
}
// Continue walking (take first outgoing edge)
@ -1181,11 +1181,11 @@ impl PathEnv {
while i < self.facts.len() && j < other.facts.len() {
match self.facts[i].0.cmp(&other.facts[j].0) {
std::cmp::Ordering::Less => {
// Only in self — drop (absent on other side = Top)
// Only in self, drop (absent on other side = Top)
i += 1;
}
std::cmp::Ordering::Greater => {
// Only in other — drop
// Only in other, drop
j += 1;
}
std::cmp::Ordering::Equal => {

View file

@ -8,10 +8,10 @@
//! 1. **Structural:** `condition_negated` (AST-level boolean)
//! 2. **Structural:** `condition_vars` (AST-extracted identifiers)
//! 3. **Structural:** compound decomposition (already handled by
//! `build_condition_chain` — each leaf is a separate Block/Branch)
//! 4. **Structural:** `value_defs` — resolve var names to [`SsaValue`]s
//! 5. **Structural:** `const_values` — augment with known constants
//! 6. **Text fallback:** `condition_text` — parse comparison operator and
//! `build_condition_chain`, each leaf is a separate Block/Branch)
//! 4. **Structural:** `value_defs`, resolve var names to [`SsaValue`]s
//! 5. **Structural:** `const_values`, augment with known constants
//! 6. **Text fallback:** `condition_text`, parse comparison operator and
//! literal operand. Necessary because individual comparisons are NOT
//! decomposed into separate SSA operations (condition nodes → `Nop`).
@ -82,7 +82,7 @@ impl CompOp {
/// Structured condition expression with SSA-resolved operands.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConditionExpr {
/// `lhs op rhs` — e.g., `x > 5`, `x == y`.
/// `lhs op rhs`, e.g., `x > 5`, `x == y`.
Comparison {
lhs: Operand,
op: CompOp,
@ -98,7 +98,7 @@ pub enum ConditionExpr {
},
/// Boolean truthiness test: `if (x)`.
BoolTest { var: SsaValue },
/// Could not parse or resolve — conservatively no refinement.
/// Could not parse or resolve, conservatively no refinement.
Unknown,
}
@ -240,7 +240,7 @@ pub fn lower_condition_with_stacks(
.map(|(name, val)| (name.as_str(), *val))
.collect();
// No const_values at lowering time — empty lookup
// No const_values at lowering time, empty lookup
let const_lookup: HashMap<SsaValue, super::domain::ConstValue> = HashMap::new();
let lower = text.to_ascii_lowercase();

View file

@ -1,6 +1,6 @@
//! Constraint solver: apply conditions to [`PathEnv`] and check satisfiability.
//!
//! The solver operates on structured [`ConditionExpr`] values — never on raw
//! The solver operates on structured [`ConditionExpr`] values, never on raw
//! text. Negation is always structural (via [`ConditionExpr::negate`] /
//! [`CompOp::negate`]), not via a generic "negate ValueFact" operation.
@ -13,7 +13,7 @@ use super::lower::{CompOp, ConditionExpr, Operand};
/// for the branch where the condition has the given polarity.
///
/// `polarity = true`: condition holds (true branch).
/// `polarity = false`: condition does NOT hold (false branch) — negate
/// `polarity = false`: condition does NOT hold (false branch), negate
/// the condition structurally, then apply.
pub fn refine_env(env: &PathEnv, cond: &ConditionExpr, polarity: bool) -> PathEnv {
if env.is_unsat() {
@ -97,7 +97,7 @@ fn apply_condition(env: &mut PathEnv, cond: &ConditionExpr) {
}
ConditionExpr::Unknown => {
// No information — no refinement
// No information, no refinement
}
}
}
@ -232,7 +232,7 @@ pub fn class_name_to_type_kind(name: &str) -> Option<TypeKind> {
"Boolean" => Some(TypeKind::Bool),
"List" | "ArrayList" | "Collection" | "Set" | "HashSet" => Some(TypeKind::Array),
"URL" | "URI" => Some(TypeKind::Url),
// Framework HTTP clients — also listed in JAVA_HIERARCHY (type_facts.rs)
// Framework HTTP clients, also listed in JAVA_HIERARCHY (type_facts.rs)
// for subtype resolution. Both locations needed: this function is called
// directly by the constraint solver, while the hierarchy provides
// is_subtype_of() for instanceof checks.

View file

@ -156,7 +156,7 @@ fn valuefact_widen_stable_bound() {
b.lo = Some(0);
b.lo_strict = true;
let w = a.widen(&b);
assert_eq!(w.lo, Some(0)); // stable preserved
assert_eq!(w.lo, Some(0)); // stable, preserved
assert!(w.lo_strict);
}
@ -357,7 +357,7 @@ fn pathenv_max_refine_per_block() {
let v = SsaValue(0);
// Reset counter
env.reset_refine_count();
// Refine many times — should stop after MAX_REFINE_PER_BLOCK
// Refine many times, should stop after MAX_REFINE_PER_BLOCK
for _ in 0..(MAX_REFINE_PER_BLOCK + 50) {
let mut f = ValueFact::top();
f.null = Nullability::NonNull;

View file

@ -1,69 +1,20 @@
//! Convergence-loop telemetry: per-batch and per-file JSONL sidecar.
//!
//! Records how many iterations each fix-point loop (cross-file SCC;
//! JS/TS in-file pass-2) actually used on real inputs, plus the
//! per-iteration change-set size trajectory, so we can tune caps on
//! evidence rather than by guess.
//!
//! # Why this module exists
//!
//! The SCC fix-point safety cap ([`crate::commands::scan::SCC_FIXPOINT_SAFETY_CAP`])
//! and the JS/TS pass-2 cap ([`crate::taint::JS_TS_PASS2_SAFETY_CAP`])
//! are both 64 iterations — chosen as "generous for every realistic
//! input we've seen". Neither value is backed by telemetry from a
//! production corpus (React, VSCode, Webpack, enterprise
//! monorepos). Without that data we cannot:
//!
//! * tell how often the cap actually fires under real workloads,
//! * distinguish tuneable-budget problems from non-monotonicity
//! regressions (Phase-D classifier addresses this on cap-hit, but
//! tells us nothing about the near-cap distribution),
//! * decide whether further Phase-B worklist optimisation is needed.
//!
//! The telemetry emitted here is consumed by offline analysis tools
//! (`tools/convergence_report.py`, not tracked here) that compute
//! P50/P95/P99 iteration counts per corpus.
//!
//! # Lifecycle
//!
//! Telemetry is **opt-in** via `NYX_CONVERGENCE_TELEMETRY=1` — production
//! scans are unaffected by default. When enabled:
//!
//! * [`is_enabled`] returns true.
//! * The SCC loop and JS/TS pass-2 loop each call [`record`] when
//! they terminate (early-convergence or cap-hit).
//! * On scan shutdown, the collected records are written to a JSONL
//! file alongside the SARIF output (or to the path specified by
//! `NYX_CONVERGENCE_TELEMETRY_PATH`).
//!
//! Records never touch the critical path — [`record`] is a cheap
//! push onto a `Mutex<Vec<_>>` and the write happens once at scan end.
//!
//! # Schema stability
//!
//! Records serialize as JSONL (one JSON object per line, newline
//! separated). The `kind` tag is snake_case and stable; adding new
//! fields is backwards-compatible because unknown fields are ignored
//! by downstream tooling. Removing fields, or changing existing
//! fields' types, is a **breaking change** — bump the schema version
//! in [`SCHEMA_VERSION`] if you must.
//! Opt-in via `NYX_CONVERGENCE_TELEMETRY=1`. Records iteration counts
//! and change-set trajectories for the cross-file SCC and JS/TS
//! pass-2 fix-point loops so caps can be tuned from evidence. Output
//! goes to `NYX_CONVERGENCE_TELEMETRY_PATH` or a SARIF-adjacent file.
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::sync::{Mutex, OnceLock};
/// Stable schema version for the JSONL records emitted by this module.
///
/// Bump when the record shape changes in a way that breaks downstream
/// consumers (field removed, type changed). Adding optional fields is
/// backwards-compatible and does not require a bump.
/// JSONL schema version. Bump on breaking shape changes; optional
/// fields don't require a bump.
pub const SCHEMA_VERSION: u32 = 1;
/// One convergence event: either a cross-file SCC batch or a JS/TS
/// in-file pass-2 run. The `kind` discriminator selects between them.
///
/// Serialized as JSON with `kind` as a snake_case tag so downstream
/// tooling can pattern-match without depending on Rust enum layout.
/// One convergence event, either a cross-file SCC batch or a JS/TS
/// in-file pass-2 run.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum ConvergenceEvent {
@ -98,7 +49,7 @@ pub struct SccBatchRecord {
/// True iff the batch reached the fixed point before the cap
/// fired.
pub converged: bool,
/// Per-iteration change-set size — the same trajectory the
/// Per-iteration change-set size, the same trajectory the
/// [`crate::engine_notes::CapHitReason`] classifier consumes. Empty
/// when the loop terminated on iteration 0 (pathological case).
pub trajectory: SmallVec<[u32; 4]>,
@ -130,20 +81,10 @@ pub struct InFilePass2Record {
pub trajectory: SmallVec<[u32; 4]>,
}
/// Global collector for convergence events recorded during a scan.
///
/// Stored behind a `OnceLock<Mutex<Vec<_>>>` so multiple rayon workers
/// can record events concurrently without a startup cost when
/// telemetry is disabled. The mutex contention is negligible because
/// each scan produces O(batches + JS/TS files) events, not per-task
/// events.
static COLLECTOR: OnceLock<Mutex<Vec<ConvergenceEvent>>> = OnceLock::new();
/// Returns true when telemetry collection is active for this process.
///
/// Controlled by the `NYX_CONVERGENCE_TELEMETRY` env var: any value
/// except `"0"`, `"false"`, or empty enables it. Cached on first
/// read so the env lookup is paid once per process.
/// True when `NYX_CONVERGENCE_TELEMETRY` is set to anything other than
/// `"0"`, `"false"`, or empty. Cached.
pub fn is_enabled() -> bool {
static ENABLED: OnceLock<bool> = OnceLock::new();
*ENABLED.get_or_init(|| match std::env::var("NYX_CONVERGENCE_TELEMETRY") {
@ -152,11 +93,7 @@ pub fn is_enabled() -> bool {
})
}
/// Record a convergence event. No-op when telemetry is disabled.
///
/// Safe to call from parallel rayon contexts — the underlying mutex
/// is reentrant-safe and the push is O(1). Events are retained in
/// memory until [`drain`] is called at scan end.
/// Record a convergence event. No-op when telemetry is disabled.
pub fn record(event: ConvergenceEvent) {
if !is_enabled() {
return;
@ -167,9 +104,7 @@ pub fn record(event: ConvergenceEvent) {
}
}
/// Drain and return all recorded events. Leaves the collector empty
/// so subsequent scans in the same process (e.g. integration tests)
/// do not see stale events.
/// Drain all recorded events.
pub fn drain() -> Vec<ConvergenceEvent> {
let Some(lock) = COLLECTOR.get() else {
return Vec::new();
@ -207,7 +142,7 @@ pub fn write_jsonl(path: &std::path::Path) -> std::io::Result<usize> {
/// Canonical sidecar path: uses `NYX_CONVERGENCE_TELEMETRY_PATH` if
/// set, otherwise derives from the current working directory.
///
/// The `_derive_from_root` hint is the scan root — when no explicit
/// The `_derive_from_root` hint is the scan root, when no explicit
/// path is configured we place the sidecar next to it as
/// `nyx-convergence.jsonl` so the file lands alongside the SARIF
/// output by default.
@ -230,7 +165,7 @@ mod tests {
static COLLECTOR_TEST_GUARD: Mutex<()> = Mutex::new(());
/// Clear the global collector so each test starts with a known
/// state. Does **not** force `is_enabled()` true — the unit
/// state. Does **not** force `is_enabled()` true, the unit
/// tests below bypass `record()` (which is a no-op unless
/// env-enabled) by pushing directly into the collector.
fn reset_and_enable_telemetry() {

View file

@ -202,16 +202,16 @@ pub mod index {
///
/// Bumped independently of `ENGINE_VERSION` whenever the serialized
/// layout or identity of a cached artefact changes in an incompatible
/// way — e.g. a `FuncKey` field semantic change that would cause old
/// way, e.g. a `FuncKey` field semantic change that would cause old
/// summaries to misbehave when rehydrated.
///
/// History:
/// * `"1"` — initial.
/// * `"2"` — 0.5.0: `FuncKey.disambig` changed from the function-node
/// * `"1"`, initial.
/// * `"2"`, 0.5.0: `FuncKey.disambig` changed from the function-node
/// byte offset to a depth-first structural index. Pre-0.5.0 caches
/// store byte-offset disambigs and would fail to match bodies built
/// by the new engine, so they are silently rebuilt on open.
/// * `"3"` — `ssa_function_bodies.body` changed from JSON TEXT to
/// * `"3"`, `ssa_function_bodies.body` changed from JSON TEXT to
/// bincode BLOB. Old JSON payloads cannot be deserialised by the
/// new engine, so they are silently rebuilt on open.
pub const SCHEMA_VERSION: &str = "3";
@ -432,7 +432,7 @@ pub mod index {
match stored {
Some(ref v) if v == current => {
// Schema version matches — nothing to do.
// Schema version matches, nothing to do.
}
_ => {
let old = stored.as_deref().unwrap_or("<none>");
@ -475,7 +475,7 @@ pub mod index {
match stored {
Some(ref v) if v == current => {
// Version matches — nothing to do.
// Version matches, nothing to do.
}
_ => {
let old = stored.as_deref().unwrap_or("<none>");
@ -601,10 +601,10 @@ pub mod index {
Ok(match row {
Some((stored_hash, stored_mtime)) => {
if stored_mtime != mtime {
// mtime changed — must re-scan
// mtime changed, must re-scan
true
} else {
// mtime matches — compare hash only if cheap
// mtime matches, compare hash only if cheap
// (the caller already read the file and can use
// should_scan_with_hash instead for full accuracy)
let digest = Self::digest_file(path)?;
@ -811,7 +811,7 @@ pub mod index {
/// Atomically replace all SSA function summaries for a single file.
///
/// The input tuple is
/// `(name, arity, lang, namespace, container, disambig, kind, summary)` —
/// `(name, arity, lang, namespace, container, disambig, kind, summary)`,
/// matching the fields required to reconstruct a full [`crate::symbol::FuncKey`]
/// on load.
pub fn replace_ssa_summaries_for_file(
@ -1040,7 +1040,7 @@ pub mod index {
/// Load symbol metadata (name, arity, lang, namespace, container, kind)
/// for a single file.
///
/// Lighter than `load_all_ssa_summaries` — skips JSON deserialization of
/// Lighter than `load_all_ssa_summaries`, skips JSON deserialization of
/// the full summary body and filters by file_path in the query. `kind`
/// is the [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`,
/// `"closure"`, ...) so consumers can distinguish anonymous functions
@ -1074,7 +1074,7 @@ pub mod index {
///
/// Persists cross-file callee bodies for interprocedural symex.
/// Bodies are serialized as MessagePack (rmp-serde, named-field
/// encoding) BLOBs — JSON proved too costly at indexing time on
/// encoding) BLOBs, JSON proved too costly at indexing time on
/// large SSA structures, and bincode's positional format trips
/// over the `#[serde(skip_serializing_if = ...)]` attributes
/// scattered through `OptimizeResult` and friends.
@ -1260,7 +1260,7 @@ pub mod index {
///
/// Mirrors [`Self::replace_ssa_summaries_for_file`]. Each input tuple
/// is `(name, arity, lang, namespace, container, disambig, kind, summary)`
/// — the full identity needed to reconstruct the callee's
/// i.e. the full identity needed to reconstruct the callee's
/// [`crate::symbol::FuncKey`] on load.
pub fn replace_auth_summaries_for_file(
&mut self,
@ -1326,7 +1326,7 @@ pub mod index {
/// [`Self::replace_ssa_summaries_for_file`],
/// [`Self::replace_ssa_bodies_for_file`] and
/// [`Self::replace_auth_summaries_for_file`] in sequence, but
/// issues a single fsync at commit instead of four — the
/// issues a single fsync at commit instead of four, the
/// dominant cost on large scans.
///
/// Behaviour parity with the four-call sequence:
@ -1376,7 +1376,7 @@ pub mod index {
let path_str = file_path.to_string_lossy();
let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
// function_summaries — always replace.
// function_summaries, always replace.
tx.execute(
"DELETE FROM function_summaries WHERE project = ?1 AND file_path = ?2",
params![self.project, path_str],
@ -1408,7 +1408,7 @@ pub mod index {
}
}
// ssa_function_summaries — only touched when non-empty.
// ssa_function_summaries, only touched when non-empty.
if !ssa_summaries.is_empty() {
tx.execute(
"DELETE FROM ssa_function_summaries
@ -1444,7 +1444,7 @@ pub mod index {
}
}
// ssa_function_bodies — only touched when non-empty.
// ssa_function_bodies, only touched when non-empty.
if !ssa_bodies.is_empty() {
tx.execute(
"DELETE FROM ssa_function_bodies
@ -1478,7 +1478,7 @@ pub mod index {
}
}
// auth_check_summaries — always replace, even when empty,
// auth_check_summaries, always replace, even when empty,
// so a helper that lost its ownership check no longer
// leaks lifts into subsequent pass-2 runs.
tx.execute(
@ -2203,7 +2203,7 @@ pub mod index {
Ok(rows)
}
/// Record the first time a finding fingerprint was observed. Idempotent —
/// Record the first time a finding fingerprint was observed. Idempotent:
/// the earliest call wins via INSERT OR IGNORE. Used by the overview
/// backlog-age computation; ts should be the originating scan's
/// `started_at` (RFC-3339).
@ -2246,7 +2246,7 @@ pub mod index {
if fingerprints.is_empty() {
return Ok(std::collections::HashMap::new());
}
// SQLite IN-clause cap is high but parameter count is bounded — chunk
// SQLite IN-clause cap is high but parameter count is bounded, chunk
// for safety with large fingerprint sets.
let mut out = std::collections::HashMap::with_capacity(fingerprints.len());
let conn = self.c();
@ -2590,7 +2590,7 @@ fn ssa_summaries_round_trip() {
/// asserts that `return_path_facts` survive serialise → SQLite persist →
/// load → deserialise. Regression guard for the per-return-path PathFact
/// decomposition that closes the rs-safe-014 / tar-rs / rs-safe-016 FP
/// cluster — without this round-trip working, cross-file callers lose
/// cluster, without this round-trip working, cross-file callers lose
/// the per-arm narrowing and inline-only callees regain the joined-fact
/// dilution.
#[test]
@ -2955,7 +2955,7 @@ fn ssa_bodies_replace_on_rescan() {
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1);
assert_eq!(idx.load_all_ssa_bodies().unwrap()[0].8.ssa.blocks.len(), 2);
// Store v2 with 5 blocks — should replace, not accumulate
// Store v2 with 5 blocks, should replace, not accumulate
let hash2 = index::Indexer::digest_bytes(b"v2");
let bodies2 = vec![(
"func".to_string(),
@ -3053,7 +3053,7 @@ fn ssa_bodies_removed_on_file_delete() {
idx.replace_ssa_bodies_for_file(&f, &hash, &bodies).unwrap();
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1);
// Delete file — should also remove bodies
// Delete file, should also remove bodies
idx.remove_file_and_related(&f).unwrap();
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 0);
}
@ -3215,7 +3215,7 @@ fn version_mismatch_triggers_reset() {
1
);
// Reopen — version mismatch should trigger full wipe
// Reopen, version mismatch should trigger full wipe
drop(pool);
let pool2 = index::Indexer::init(&db).unwrap();
@ -3286,7 +3286,7 @@ fn multiple_opens_no_repeated_resets() {
populate_project(&pool, "proj", td.path());
drop(pool);
// Second open — should preserve data
// Second open, should preserve data
let pool2 = index::Indexer::init(&db).unwrap();
assert_eq!(
index::Indexer::count_rows(&pool2, "function_summaries", "proj").unwrap(),
@ -3297,7 +3297,7 @@ fn multiple_opens_no_repeated_resets() {
populate_project(&pool2, "proj2", td.path());
drop(pool2);
// Third open — should still preserve both projects
// Third open, should still preserve both projects
let pool3 = index::Indexer::init(&db).unwrap();
assert_eq!(
index::Indexer::count_rows(&pool3, "function_summaries", "proj").unwrap(),
@ -3376,7 +3376,7 @@ fn missing_ssa_namespace_column_triggers_recreate() {
.unwrap();
}
// Open via init — should detect missing namespace and recreate
// Open via init, should detect missing namespace and recreate
let pool = index::Indexer::init(&db).unwrap();
// Verify the table now has the namespace column by inserting with it
@ -3405,12 +3405,12 @@ fn valid_schema_no_recreate() {
let td = tempfile::tempdir().unwrap();
let db = td.path().join("nyx.sqlite");
// First init — creates all tables
// First init, creates all tables
let pool = index::Indexer::init(&db).unwrap();
populate_project(&pool, "proj", td.path());
drop(pool);
// Second init — schema is valid, should NOT drop/recreate
// Second init, schema is valid, should NOT drop/recreate
let pool2 = index::Indexer::init(&db).unwrap();
// Data survives because schema was already correct
assert_eq!(
@ -3735,7 +3735,7 @@ fn metadata_table_survives_clear() {
assert_eq!(stored.as_deref(), Some(index::ENGINE_VERSION));
}
/// Pointer-Phase 5 / A3 audit: field_points_to round-trips through
/// field_points_to round-trips through
/// the SsaFuncSummary SQLite blob. Pin that the new field_points_to
/// records preserve param_field_reads, param_field_writes, the
/// receiver sentinel (`u32::MAX`), the container-element marker
@ -3817,7 +3817,7 @@ fn ssa_summaries_round_trip_preserves_field_points_to() {
}
/// Pre-Phase-5 blob compatibility: a summary serialised without
/// `field_points_to` deserialises with the empty default — no
/// `field_points_to` deserialises with the empty default, no
/// migration needed because the field is `#[serde(default)]`.
#[test]
fn ssa_summaries_pre_phase5_blob_decodes_with_empty_field_points_to() {

View file

@ -1,98 +1,43 @@
//! Provenance notes attached to findings when the engine has hit an
//! internal budget, widening, or lowering cap.
//!
//! The notes are surfaced through `Finding.engine_notes` (and
//! `Evidence.engine_notes` once the finding reaches the `Diag` layer) so
//! downstream consumers can tell "we found nothing" from "we stopped
//! looking".
//!
//! Each note carries a [`LossDirection`] classification that describes
//! *how* the engine deviated from a fully-converged analysis. The
//! direction drives two downstream behaviours:
//!
//! * [`crate::evidence::compute_confidence`] caps confidence at
//! `Medium` when any attached note has direction
//! [`LossDirection::OverReport`] or [`LossDirection::Bail`] (the
//! finding itself may be spurious).
//! * [`crate::rank`] applies a direction-aware `completeness` penalty
//! to the attack-surface score (see `rank.rs::completeness_penalty`).
//!
//! This replaces the earlier Phase-3 stance of "notes are purely
//! additive and never influence score". A release audit flagged that
//! users sorting thousands of findings by rank could not distinguish
//! converged analysis from capped analysis, which produced false
//! confidence in fragile findings. The direction-aware pipeline
//! preserves the observability goal while fixing the credibility gap.
//! Each note carries a [`LossDirection`] classification.
//! [`crate::evidence::compute_confidence`] caps confidence at `Medium`
//! for `OverReport`/`Bail` notes, and [`crate::rank`] applies a
//! direction-aware completeness penalty.
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
/// Classification of *why* a fix-point loop hit its safety cap.
///
/// The cap-hit alone is not actionable: "we ran 64 iterations and did
/// not detect convergence" can mean several very different things:
///
/// * the lattice is still shrinking but slowly (e.g. a 72-function chain
///   SCC that legitimately needs >64 iterations),
/// * the change set stabilised at a non-zero value, so the convergence
///   predicate keeps detecting change even though nothing is shrinking, or
/// * the lattice is oscillating (iterations alternating between
///   change-set sizes; this is a *bug*, not a tuning issue).
///
/// Recording the reason lets operators tell when "raise the cap" would
/// actually help from when they are looking at a summary
/// non-monotonicity regression.
///
/// Serialized as an internally tagged enum (`"kind"` field, snake_case
/// variant names) so SARIF/JSON consumers can pattern-match without
/// depending on Rust layout.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum CapHitReason {
/// The change-set size was still decreasing when the cap fired.
/// `trajectory` is the last N iteration deltas (most recent last).
/// Operators can safely raise the cap: the analysis is healthy but
/// the SCC is larger than the current budget.
MonotoneShrinking { trajectory: SmallVec<[u32; 4]> },
/// The change-set size held steady at a non-zero value for the last
/// ≥2 iterations. Every iteration is updating the *same* keys, which
/// suggests a summary flipping the same fields back and forth.
/// Raise the cap **and** investigate.
Plateau { delta: u32 },
/// The change-set size oscillated (the classifier looks for the
/// period-2 pattern a,b,a,b with a ≠ b). The analysis is not
/// monotone, convergence will never be reached, and raising the cap
/// will not help. File a bug with the fixture attached.
SuspectedOscillation {
period: u8,
trajectory: SmallVec<[u32; 4]>,
},
/// No trajectory was recorded (e.g. the cap fired after a single
/// iteration, so there is nothing to classify). Being the `Default`
/// variant, it is also what `#[serde(default)]` fields of this type
/// fall back to when an older serialized note lacks the field.
#[default]
Unknown,
}
impl CapHitReason {
/// Classify a trajectory of per-iteration change-set sizes.
///
/// `deltas` should carry the *changed-key counts* from the last N
/// iterations (most recent last). Classification rules:
///
/// 1. Fewer than 2 samples → `Unknown` (nothing to diff against).
/// 2. A period-2 pattern (a,b,a,b) with a ≠ b → `SuspectedOscillation`.
/// 3. Last two samples equal and non-zero → `Plateau`.
/// 4. Strictly decreasing tail → `MonotoneShrinking`.
/// 5. Otherwise → `Unknown` (inconclusive; rare in practice).
///
/// The function is pure — no allocation beyond the returned
/// [`SmallVec`] — so it is safe to call from within a hot loop when
/// a cap actually fires. Callers should accumulate deltas in a
/// fixed-size ring buffer to bound memory.
/// Classify a trajectory of per-iteration change-set sizes
/// (most recent last). Rules: <2 samples → `Unknown`; a,b,a,b with
/// a≠b → `SuspectedOscillation`; last two equal non-zero →
/// `Plateau`; strictly decreasing tail → `MonotoneShrinking`;
/// otherwise `Unknown`.
pub fn classify(deltas: &[u32]) -> CapHitReason {
if deltas.len() < 2 {
return CapHitReason::Unknown;
@ -161,44 +106,26 @@ impl CapHitReason {
}
/// Direction of precision loss encoded by an [`EngineNote`].
///
/// Every [`EngineNote`] variant declares a direction via
/// [`EngineNote::direction`].
///
/// Ordering matters: variants are declared in order of worsening impact
/// on a specific finding's credibility, and the derived `Ord` follows
/// that declaration order. [`combine`](Self::combine) relies on it to
/// merge directions when multiple notes are attached (it takes the max).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LossDirection {
/// The note is informational only. Analysis was fully converged;
/// the note records a harmless event such as a cache reuse.
Informational,
/// The analysis may have *missed* additional findings (e.g. the
/// worklist was capped before fully propagating taint). Findings
/// that *were* reported are still sound, but the result set is a
/// lower bound.
UnderReport,
/// The analysis may have reported a *spurious* finding (e.g.
/// predicate state was widened to top, so a validation guard that
/// would have suppressed the finding was lost). The finding is more
/// likely to be a false positive than one produced from converged
/// state.
OverReport,
/// Analysis of this finding's body aborted before producing a
/// trustworthy result (e.g. SSA lowering bailed, parse timed out).
/// A human reviewer should treat the finding as a starting point
/// rather than a confirmed flow.
Bail,
}
impl LossDirection {
/// Combine two directions, keeping whichever sorts later in `Ord`
/// (i.e. the more credibility-damaging one).
///
/// A body carrying both `UnderReport` and `OverReport` notes therefore
/// resolves to `OverReport`: over-reporting is the worse failure mode
/// for a specific emitted finding.
pub fn combine(self, other: LossDirection) -> LossDirection {
    if self > other { self } else { other }
}
@ -215,111 +142,46 @@ impl LossDirection {
}
/// A single provenance event recorded during analysis.
///
/// Serialized as an internally tagged enum (`"kind"` field, snake_case
/// variant names) so tooling can pattern-match across JSON and SARIF
/// output without depending on Rust enum layout.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum EngineNote {
/// The taint worklist hit its iteration budget before converging.
/// Direction: [`LossDirection::UnderReport`]. The fixpoint was
/// aborted, so some flows may have been missed, but emitted flows
/// are still backed by propagated taint.
WorklistCapped { iterations: u32 },
/// Origin tracking was truncated when a value exceeded the
/// per-value origin cap (`analysis.engine.max_origins`, default 32).
/// Direction: [`LossDirection::UnderReport`]. Each dropped origin
/// corresponds to a real source flow whose independent finding will
/// not be emitted; surviving origins still produce findings.
/// Truncation is deterministic: origins are sorted by source
/// location and the largest-by-location are dropped first, so the
/// survivor set is stable across runs and merge orderings.
OriginsTruncated { dropped: u32 },
/// JS/TS pass-2 in-file global propagation hit its iteration cap.
/// Direction: [`LossDirection::UnderReport`]. Global state may not
/// have reached fixpoint; cross-function flows could be missed.
///
/// `reason` classifies *why* the cap fired (monotone-but-slow,
/// plateau, suspected oscillation). Older serialized notes without
/// this field deserialize to [`CapHitReason::Unknown`] via
/// `#[serde(default)]`.
InFileFixpointCapped {
iterations: u32,
#[serde(default)]
reason: CapHitReason,
},
/// Cross-file SCC fixpoint hit `SCC_FIXPOINT_SAFETY_CAP`.
/// Direction: [`LossDirection::UnderReport`]. The iterative
/// cross-file join aborted; summaries for members of this SCC may
/// be incomplete.
///
/// `reason` classifies *why* the cap fired (monotone-but-slow,
/// plateau, suspected oscillation). Older serialized notes without
/// this field deserialize to [`CapHitReason::Unknown`] via
/// `#[serde(default)]`.
CrossFileFixpointCapped {
iterations: u32,
#[serde(default)]
reason: CapHitReason,
},
/// SSA lowering produced an empty body (parse failure or
/// unsupported shape). Direction: [`LossDirection::Bail`]. Any
/// finding attributed to this body is weakly supported.
SsaLoweringBailed { reason: String },
/// Tree-sitter parse exceeded the configured timeout.
/// Direction: [`LossDirection::Bail`]. Findings surfaced from the
/// partial tree should be treated as a human-review starting point.
ParseTimeout { timeout_ms: u32 },
/// Predicate state was widened to top to keep the lattice monotone.
/// Direction: [`LossDirection::OverReport`]. Validation guards that
/// would have suppressed the finding may have been lost, so the
/// finding is more likely to be a false positive.
PredicateStateWidened,
/// Path-environment constraints exceeded an internal cap and were
/// widened to top. Direction: [`LossDirection::OverReport`], for
/// the same reason as [`Self::PredicateStateWidened`]: dropped path
/// constraints can only make infeasible paths look feasible.
PathEnvCapped,
/// Inline cache reused a cached body summary; origins were
/// re-attributed. Direction: [`LossDirection::Informational`].
/// The cache hit does not affect precision, but surfacing it helps
/// explain why origin locations move between runs that share a
/// body signature.
InlineCacheReused,
/// Points-to analysis dropped heap object members when an
/// intra-procedural points-to set exceeded
/// `analysis.engine.max_pointsto` (default 32).
/// Direction: [`LossDirection::UnderReport`]. Taint entering the
/// dropped abstract heap objects via this alias path will not reach
/// downstream sinks; other aliasing paths to the same objects still
/// propagate normally.
PointsToTruncated { dropped: u32 },
}
impl EngineNote {
/// Classify this note by direction of precision loss.
///
/// The match is exhaustive: every `EngineNote` variant must declare
/// a direction. When adding a new cap site, pick the direction
/// that most honestly describes the impact on an emitted finding:
///
/// * `Informational` — analysis fully converged; note is a
/// provenance breadcrumb (e.g. cache reuse).
/// * `UnderReport` — analysis was cut short, but anything emitted
/// is still backed by real propagation.
/// * `OverReport` — precision was widened, so the emitted finding
/// is *more* likely to be a false positive than the baseline.
/// * `Bail` — analysis of this body aborted; the finding is weakly
/// supported.
/// Direction of precision loss for this note. New variants must
/// declare one explicitly.
pub fn direction(&self) -> LossDirection {
match self {
EngineNote::WorklistCapped { .. } => LossDirection::UnderReport,
@ -335,23 +197,15 @@ impl EngineNote {
}
}
/// Reports whether this note signals a deviation from a fully
/// converged analysis, i.e. its direction is anything other than
/// [`LossDirection::Informational`].
///
/// Drives the `confidence_capped` SARIF property.
pub fn lowers_confidence(&self) -> bool {
    !matches!(self.direction(), LossDirection::Informational)
}
}
/// Compute the worst direction across a slice of notes.
///
/// Returns `None` when `notes` is empty or contains only
/// [`LossDirection::Informational`] notes. Returns `Some(dir)` with
/// the most impactful direction otherwise — this is what downstream
/// consumers (rank, confidence) use to decide how to degrade a finding.
/// Worst non-informational direction across a slice of notes, or
/// `None` if the slice is empty or only carries informational notes.
pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> {
let mut worst: Option<LossDirection> = None;
for note in notes {
@ -367,9 +221,7 @@ pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> {
worst
}
/// Deduplicating push: does not append if an identical note is already
/// present. Used to keep per-finding note lists small when a cap site
/// fires repeatedly inside the same body.
/// Push-if-not-present.
pub fn push_unique(notes: &mut smallvec::SmallVec<[EngineNote; 2]>, note: EngineNote) {
if !notes.iter().any(|n| n == &note) {
notes.push(note);

View file

@ -289,7 +289,7 @@ pub struct StateEvidence {
/// (validation guards may have been lost, so the finding is more
/// likely to be a false positive); `Bail` means analysis of the body
/// aborted before producing a trustworthy result. `UnderReport` notes
/// (e.g. `WorklistCapped`) do *not* cap confidence the reported flow
/// (e.g. `WorklistCapped`) do *not* cap confidence, the reported flow
/// is still real, just surrounded by an incomplete result set.
pub fn compute_confidence(diag: &Diag) -> Confidence {
// Degraded analysis caps confidence
@ -343,7 +343,7 @@ fn apply_engine_notes_cap(diag: &Diag, base: Confidence) -> Confidence {
| crate::engine_notes::LossDirection::Bail => base.min(Confidence::Medium),
// UnderReport: result set is a lower bound, but the emitted
// finding itself remains as credible as the analysis decided.
// Do not cap the rank completeness penalty is the right lever
// Do not cap, the rank completeness penalty is the right lever
// for that case (see rank.rs::completeness_penalty).
crate::engine_notes::LossDirection::UnderReport => base,
// Informational is filtered out upstream by `worst_direction`,
@ -600,7 +600,7 @@ pub fn generate_explanation(diag: &Diag) -> Option<String> {
/// Extract a vulnerability category label from the Diag (used in explanation text).
fn extract_category_from_id(id: &str) -> String {
// Rule IDs like "taint-unsanitised-flow (source 3:1)" category comes
// Rule IDs like "taint-unsanitised-flow (source 3:1)", category comes
// from the finding category field, but we approximate from the ID here.
if id.contains("sql") || id.contains("SQL") {
"SQL injection".to_string()
@ -680,7 +680,7 @@ pub fn compute_confidence_limiters(diag: &Diag) -> Vec<String> {
"Backwards demand-driven analysis exceeded its budget (verdict not reached)".into(),
);
}
// Confirmation is *not* a limiter it is a positive signal. The
// Confirmation is *not* a limiter, it is a positive signal. The
// taint-confidence scorer picks it up separately.
let _ = NOTE_CONFIRMED;
}
@ -976,7 +976,7 @@ mod tests {
#[test]
fn confidence_capped_at_medium_by_over_report() {
// OverReport (PredicateStateWidened) means validation predicates
// were lost the emitted finding is more likely to be spurious.
// were lost, the emitted finding is more likely to be spurious.
let d = with_notes(
taint_high_confidence_diag(),
vec![crate::engine_notes::EngineNote::PredicateStateWidened],
@ -995,7 +995,7 @@ mod tests {
#[test]
fn confidence_cap_does_not_upgrade_low() {
// `base.min(Medium)` is what caps it must not *raise* a Low
// `base.min(Medium)` is what caps, it must not *raise* a Low
// baseline to Medium. Use a taint finding with weak evidence so
// the points scorer gives us Low, then attach a Bail note.
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::Low);

View file

@ -31,7 +31,7 @@ pub fn render_console(
}
for (path, issues) in &grouped {
// File path header dim blue, never brighter than severity.
// File path header, dim blue, never brighter than severity.
out.push_str(&format!("{}\n", style(path).blue().dim().underlined()));
for d in issues {
out.push_str(&render_diag(d, width));
@ -261,7 +261,7 @@ fn render_diag(d: &Diag, width: usize) -> String {
// Engine provenance notes: show count + worst direction so a user
// scanning the console can see "this finding is from capped analysis"
// at a glance. Direction tags ("under-report", "over-report", "bail")
// are stable strings from `LossDirection::tag()` kept in sync with
// are stable strings from `LossDirection::tag()`, kept in sync with
// the SARIF `result.properties.engine_notes[].kind` serialization so
// downstream tooling can cross-reference console and SARIF output.
// Informational-only notes (e.g. InlineCacheReused) are not surfaced
@ -453,7 +453,7 @@ fn state_remediation_hint(rule_id: &str) -> Option<&'static str> {
/// Colored severity tag with icon. The tag is the visual anchor of each finding.
///
/// - HIGH: bold red
/// - MEDIUM: bold 208 (orange) distinct from yellow
/// - MEDIUM: bold 208 (orange), distinct from yellow
/// - LOW: dim 67 (muted blue-gray)
fn severity_tag(sev: Severity) -> String {
match sev {
@ -503,7 +503,7 @@ fn collapse_chain_spacing(s: &str) -> String {
// Collapse: emit `.` directly after `)`
continue;
} else {
// Not a chain continuation emit the whitespace we skipped
// Not a chain continuation, emit the whitespace we skipped
for c in &chars[ws_start..i] {
out.push(*c);
}

View file

@ -18,7 +18,7 @@ pub struct CallSiteKey {
/// An explicit cross-language bridge edge.
///
/// Connects a call site in one language to a function definition in another.
/// Without an `InteropEdge`, cross-language resolution is never attempted
/// Without an `InteropEdge`, cross-language resolution is never attempted;
/// this prevents false positives from name collisions across languages.
#[derive(Clone, Debug)]
pub struct InteropEdge {

View file

@ -115,8 +115,8 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
/// Benchmark-driven output-parameter source positions for known C APIs.
///
/// Maps callee name → argument positions that receive Source taint.
/// Each callee is listed exactly once.
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[
    ("fgets", &[0]),    // fgets(buf, size, stream): buf receives input
    ("gets", &[0]),     // gets(buf): buf receives input
    ("recv", &[1]),     // recv(fd, buf, len, flags)
    ("recvfrom", &[1]), // recvfrom(fd, buf, len, flags, ...)
];

View file

@ -120,7 +120,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
// and extract them as separate bodies. Without these, a
// `class_specifier` / `struct_specifier` falls through to the
// generic `_ =>` arm in `build_sub`, which records a leaf `Seq`
// node and never walks the body so inline member-function
// node and never walks the body, so inline member-function
// definitions (and methods of nested classes) are silently dropped.
"declaration_list" => Kind::Block,
"field_declaration_list" => Kind::Block,
@ -160,7 +160,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
/// Benchmark-driven output-parameter source positions for known C++ APIs.
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[
("getline", &[1]), // std::getline(stream, str) str receives input
("getline", &[1]), // std::getline(stream, str), str receives input
("std::getline", &[1]),
("fgets", &[0]),
("gets", &[0]),

View file

@ -66,7 +66,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: false,
},
// fmt.Printf/Sprintf write to stdout or build strings in memory not
// fmt.Printf/Sprintf write to stdout or build strings in memory, not
// security sinks. fmt.Fprintf writes to an io.Writer (often http.ResponseWriter)
// so it IS a security sink for XSS.
LabelRule {
@ -110,7 +110,7 @@ pub static RULES: &[LabelRule] = &[
// Idiomatic Go SSRF sinks (Owncast CVE-2023-3188) use the
// `http.DefaultClient.Get(url)` form rather than the bare
// `http.Get(url)` helper, so the suffix-matched callee text needs
// an explicit entry here bare `Get/Post/Do/Head` would
// an explicit entry here, bare `Get/Post/Do/Head` would
// over-match unrelated method names.
"http.DefaultClient.Get",
"http.DefaultClient.Post",

View file

@ -53,13 +53,13 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::URL_ENCODE),
case_sensitive: false,
},
// OWASP ESAPI input validator validates and canonicalizes input
// OWASP ESAPI input validator, validates and canonicalizes input
LabelRule {
matchers: &["Validator.getValidInput"],
label: DataLabel::Sanitizer(Cap::all()),
case_sensitive: false,
},
// Type-check sanitizers parsing to a primitive erases taint
// Type-check sanitizers, parsing to a primitive erases taint
LabelRule {
matchers: &[
"Integer.parseInt",
@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::CODE_EXEC),
case_sensitive: false,
},
// HTTP response sinks println/print are broad (also match System.out)
// HTTP response sinks, println/print are broad (also match System.out)
// but necessary to catch response.getWriter().println() via suffix matching.
LabelRule {
matchers: &["println", "print"],
@ -107,7 +107,7 @@ pub static RULES: &[LabelRule] = &[
case_sensitive: false,
},
// openConnection() is the standard java.net.URL API for initiating a connection.
// It is the correct interception point the URL is already set on the object.
// It is the correct interception point, the URL is already set on the object.
LabelRule {
matchers: &[
"openConnection",
@ -153,9 +153,9 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
},
// NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks
// NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks;
// logging format injection is not a real security vulnerability in Java.
// String.format also removed it builds strings in memory (not a sink);
// String.format also removed, it builds strings in memory (not a sink);
// the real sink is wherever the formatted string is used (SQL, HTTP, etc.).
// ─── JNDI injection sinks ───
LabelRule {

View file

@ -36,7 +36,7 @@ pub static RULES: &[LabelRule] = &[
case_sensitive: false,
},
// `encodeURIComponent` percent-encodes every character outside the
// ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'` so
// ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'`, so
// the result is safe to embed in HTML text content and HTML
// attribute values, not just URL components. Treating it as
// covering both URL_ENCODE and HTML_ESCAPE caps avoids FPs when a
@ -92,7 +92,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE),
case_sensitive: false,
},
// he library HTML entity encoding
// he library, HTML entity encoding
LabelRule {
matchers: &["he.encode", "he.escape"],
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
@ -148,16 +148,16 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
},
// ── Outbound HTTP clients modeled as destination-aware gated sinks ──
// ── Outbound HTTP clients, modeled as destination-aware gated sinks ──
// Flat-Sink modeling of fetch/axios/got/undici/http.request was producing
// a dominant FP class where any tainted body/payload arg appeared as SSRF
// (e.g. `fetch("/api/telemetry", { body: navigator.userAgent })`). SSRF
// semantics require attacker control over the *destination*, not the
// payload. The gated entries in `GATED_SINKS` below narrow activation to
// URL / host / path / origin arguments or object fields. Taint flowing
// only to body / data / json / headers is no longer flagged as SSRF —
// cross-boundary data-exfiltration detection is a separate future
// capability (`Cap::DATA_EXFIL`, not yet introduced).
// payload. The gated entries in `GATED_SINKS` below narrow SSRF
// activation to URL / host / path / origin arguments or object fields.
// Taint flowing only to body / data / json / headers is captured by a
// *separate* gate class (`Cap::DATA_EXFIL`) so the two can coexist on
// the same callee without one over-flagging the other.
// Express response sinks
LabelRule {
matchers: &["res.send", "res.json"],
@ -222,6 +222,21 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// ── Cross-boundary data exfiltration (DATA_EXFIL) ─────────────────────
//
// `XMLHttpRequest.prototype.send(body)`, when the receiver type is
// tracked back to `new XMLHttpRequest()`, the SSA engine's type-qualified
// resolver converts `xhr.send` to `HttpClient.send`; matching that form
// fires DATA_EXFIL on tainted body flow. The explicit
// `XMLHttpRequest.prototype.send.apply(...)` form is also covered. The
// `fetch` body / headers / json case is covered by the gated entry in
// `GATED_SINKS` (so SSRF on the URL and DATA_EXFIL on the payload can
// coexist on a single call site).
LabelRule {
matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
// ─────────── SQL injection sinks ─────────────
// Database drivers: mysql, mysql2, pg, better-sqlite3
LabelRule {
@ -314,7 +329,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
// only to body / data / json / headers / payload is silenced. See the
// commentary at the top of RULES for the rationale.
//
// `fetch(input, init)` arg 0 can be a URL string OR a Request/config
// `fetch(input, init)`, arg 0 can be a URL string OR a Request/config
// object with `url`. Per WHATWG Fetch, when `input` is a dictionary, the
// URL field is canonically `url`. Init-object body/headers at arg 1 are
// *not* destination-bearing.
@ -332,7 +347,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["url"],
},
},
// `axios(config)` / `axios.request(config)` config object exposes
// `axios(config)` / `axios.request(config)`, config object exposes
// `url` and `baseURL`. Body-ish fields (`data`, `params`, `headers`)
// are excluded.
SinkGate {
@ -363,7 +378,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["url", "baseURL"],
},
},
// `axios.get(url[, config])` arg 0 is URL; arg 1 is config.
// `axios.get(url[, config])`, arg 0 is URL; arg 1 is config.
SinkGate {
callee_matcher: "axios.get",
arg_index: 0,
@ -378,7 +393,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &[],
},
},
// `axios.post(url, data[, config])` arg 0 is URL; `data` at arg 1 is
// `axios.post(url, data[, config])`, arg 0 is URL; `data` at arg 1 is
// the request body and must NOT activate SSRF.
SinkGate {
callee_matcher: "axios.post",
@ -394,7 +409,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &[],
},
},
// `axios.put / axios.patch / axios.delete` follow the same shape
// `axios.put / axios.patch / axios.delete` follow the same shape
// (url, data?, config?). Keep the model consistent across verbs.
SinkGate {
callee_matcher: "axios.put",
@ -438,7 +453,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &[],
},
},
// `got(url[, options])` / `got(options)` options exposes `url` and
// `got(url[, options])` / `got(options)`, options exposes `url` and
// `prefixUrl`. Body-ish fields (`body`, `json`, `form`, `searchParams`,
// `headers`) are excluded.
SinkGate {
@ -455,7 +470,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["url", "prefixUrl"],
},
},
// `undici.request(url | opts[, opts])` opts exposes `origin` and
// `undici.request(url | opts[, opts])`, opts exposes `origin` and
// `path`. Body-ish fields (`body`, `headers`) are excluded.
SinkGate {
callee_matcher: "undici.request",
@ -471,11 +486,11 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["origin", "path"],
},
},
// Node `http.request(options[, cb])` / `https.request(options[, cb])`
// Node `http.request(options[, cb])` / `https.request(options[, cb])`:
// options exposes `host`, `hostname`, `path`, `protocol`, `port`,
// `origin`. Body is sent via `.write()`/`.end()` on the returned
// ClientRequest, so it never appears as a positional arg here.
// Arg 0 may also be a URL string the "whole arg is destination"
// Arg 0 may also be a URL string, the "whole arg is destination"
// fallback (triggered when arg 0 is not an object literal) covers that.
SinkGate {
callee_matcher: "http.request",
@ -505,7 +520,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
},
},
// Node `http.get(options[, cb])` / `https.get(options[, cb])`
// Node `http.get(options[, cb])` / `https.get(options[, cb])`:
// convenience wrappers around `.request()` that auto-call `.end()`.
// Same destination semantics as `.request`. Motivated by
// CVE-2025-64430 (Parse Server SSRF via http.get(uri)).
@ -537,6 +552,31 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
},
},
// ── Cross-boundary data exfiltration ──────────────────────────────────
//
// Sensitive data flowing into the *payload* of an outbound request is a
// distinct vulnerability class from SSRF: the destination is fixed but
// attacker-influenced bytes leave the process via the request body /
// headers / json field. These gates fire on the body-bearing positions
// and emit `Cap::DATA_EXFIL`, which is intentionally separate from
// `Cap::SSRF` so a `fetch(taintedUrl, {body: tainted})` site reports
// both classes independently.
//
// `fetch(input, init)`, `init` at arg 1 carries body / headers / json.
SinkGate {
callee_matcher: "fetch",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["body", "headers", "json"],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -32,7 +32,7 @@ pub struct LabelRule {
/// expands it to `(0..arity)` using the actual call arity.
///
/// The value `usize::MAX` is used because `args.get(usize::MAX)` is a guaranteed
/// miss for any real argument list an accidental direct-lookup would be a no-op
/// miss for any real argument list, an accidental direct-lookup would be a no-op
/// rather than silently aliasing position 0.
pub const ALL_ARGS_PAYLOAD: &[usize] = &[usize::MAX];
@ -54,7 +54,7 @@ pub enum GateActivation {
/// arg selects the MIME type).
ValueMatch,
/// Destination-bearing flow activation. The gate fires when taint reaches
/// a declared destination location at the call site no literal
/// a declared destination location at the call site, no literal
/// inspection, no prefix heuristic.
///
/// For callees whose destination is a positional argument (e.g. `fetch`'s
@ -80,7 +80,7 @@ pub enum GateActivation {
}
/// Argument-sensitive sink activation. Whether a call becomes a sink is
/// determined by the gate's [`GateActivation`] mode literal-value matching
/// determined by the gate's [`GateActivation`] mode, literal-value matching
/// for traditional role-selector APIs, or destination-flow activation for
/// outbound HTTP clients and other APIs where a specific location in the
/// call carries the attacker-controlled destination.
@ -144,6 +144,13 @@ bitflags! {
/// carrier cap for folding `auth_analysis` into the SSA/taint
/// engine.
const UNAUTHORIZED_ID = 0b0001_0000_0000_0000; // bit 12
/// Cross-boundary data-exfiltration: tainted sensitive data flowing
/// into outbound request bodies, headers, or other payload-bearing
/// fields of network egress APIs. Distinct from `SSRF` (attacker
/// control over the destination URL), `DATA_EXFIL` fires when the
/// destination is fixed but attacker-influenced data leaves the
/// process via the request payload.
const DATA_EXFIL = 0b0010_0000_0000_0000; // bit 13
}
}
@ -192,7 +199,7 @@ pub enum Kind {
/// reachability does not depend on sibling-case execution order.
Switch,
Trivia,
/// Simple sequential expression (e.g. cast/type-assertion) treated like
/// Simple sequential expression (e.g. cast/type-assertion), treated like
/// any other sequential statement in the CFG but explicitly classified so
/// code that inspects `Kind` can recognise it.
Seq,
@ -472,9 +479,9 @@ pub enum SourceKind {
FileSystem,
/// Database query results
Database,
/// Caught exception may carry user-controlled data
/// Caught exception, may carry user-controlled data
CaughtException,
/// Could not determine treat conservatively
/// Could not determine, treat conservatively
Unknown,
}
@ -511,7 +518,7 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
// File system patterns
if cl.contains("read") || cl.contains("fopen") || cl.contains("open") {
// Distinguish from db reads file reads typically have FILE_IO cap
// Distinguish from db reads, file reads typically have FILE_IO cap
if caps.contains(Cap::FILE_IO) {
return SourceKind::FileSystem;
}
@ -570,6 +577,7 @@ pub fn parse_cap(s: &str) -> Option<Cap> {
"code_exec" => Some(Cap::CODE_EXEC),
"crypto" => Some(Cap::CRYPTO),
"unauthorized_id" => Some(Cap::UNAUTHORIZED_ID),
"data_exfil" | "data_exfiltration" => Some(Cap::DATA_EXFIL),
"all" => Some(Cap::all()),
_ => None,
}
@ -621,7 +629,7 @@ pub fn build_lang_rules(
Vec::new()
};
// Phase C: fold `auth_analysis` into the taint engine by injecting
// Fold `auth_analysis` into the taint engine by injecting
// `Cap::UNAUTHORIZED_ID` sink/sanitizer rules. Gated by config; default
// OFF so the standalone `auth_analysis` subsystem remains authoritative.
if config.scanner.enable_auth_as_taint {
@ -636,7 +644,7 @@ pub fn build_lang_rules(
}
}
/// Return Phase C auth-as-taint rules for a given language (currently Rust-only).
/// Return the auth-as-taint rules for a given language (Rust-only).
fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec<RuntimeLabelRule> {
match lang_slug {
"rust" | "rs" => rust::phase_c_auth_rules(),
@ -718,7 +726,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool {
if exact_only {
// `=foo` matchers fire only when `text` IS `foo` (no `Mod.foo`,
// `Class::foo`, or any preceding namespace). Lets a label rule
// distinguish bare `Kernel#open` from `File.open` the former
// distinguish bare `Kernel#open` from `File.open`, the former
// shells out on `|cmd`, the latter never does (CVE-2020-8130).
start == 0
} else {
@ -731,7 +739,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool {
/// Strip an optional `=` "exact-match" sigil from the start of a matcher.
/// Matchers prefixed with `=` (e.g. `"=open"`) only fire when the candidate
/// text equals the matcher exactly the boundary-`.`-or-`:` allowance is
/// text equals the matcher exactly, the boundary-`.`-or-`:` allowance is
/// suppressed. Used to distinguish bare-callee Ruby/Python builtins from
/// methods of the same name on a typed receiver.
#[inline]
@ -767,7 +775,7 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
let full_normalized = normalize_chained_call(text);
let full_norm_bytes = full_normalized.as_bytes();
// ── Check runtime (config) rules first they take priority ──────
// ── Check runtime (config) rules first, they take priority ──────
if let Some(extras) = extra {
// Pass 1: exact / suffix
for rule in extras {
@ -865,7 +873,7 @@ pub fn classify_all(
}
}
// ── Check runtime (config) rules first they take priority ──────
// ── Check runtime (config) rules first, they take priority ──────
if let Some(extras) = extra {
// Pass 1: exact / suffix
for rule in extras {
@ -941,7 +949,7 @@ pub fn classify_all(
/// (or [`ALL_ARGS_PAYLOAD`] for dynamic-activation conservative fallback).
/// `object_destination_fields`, when non-empty, restricts sink-taint checks
/// to identifiers found under those field names within an object-literal
/// positional argument used by destination-aware outbound-HTTP gates so
/// positional argument, used by destination-aware outbound-HTTP gates so
/// `fetch({url, body})` fires only when taint reaches `url`, not `body`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GateMatch {
@ -952,9 +960,13 @@ pub struct GateMatch {
/// Classify a call against gated sink rules.
///
/// Returns `Some(GateMatch)` if the callee matches a gated rule AND the
/// activation conditions fire. Returns `None` if the callee doesn't match
/// any gated rule, or matches but the activation is provably safe.
/// Returns every gate whose callee matches AND whose activation conditions
/// fire. An empty result means the callee did not match any gated rule, or
/// every match was provably safe. Multiple matches are possible when the
/// same callee carries gates for different sink classes, e.g. `fetch` is
/// both an SSRF gate (URL flow) and a `DATA_EXFIL` gate (body / headers /
/// json flow); each gate carries its own [`GateMatch`] so downstream code
/// can attribute findings per-cap.
///
/// `const_arg_at` extracts positional argument values.
/// `const_keyword_arg` extracts keyword argument values (for languages like Python).
@ -964,11 +976,15 @@ pub fn classify_gated_sink(
const_arg_at: impl Fn(usize) -> Option<String>,
const_keyword_arg: impl Fn(&str) -> Option<String>,
kwarg_present: impl Fn(&str) -> bool,
) -> Option<GateMatch> {
let gates = GATED_REGISTRY.get(lang).or_else(|| {
) -> SmallVec<[GateMatch; 2]> {
let mut out: SmallVec<[GateMatch; 2]> = SmallVec::new();
let gates = match GATED_REGISTRY.get(lang).or_else(|| {
let key = lang.to_ascii_lowercase();
GATED_REGISTRY.get(key.as_str())
})?;
}) {
Some(g) => g,
None => return out,
};
let callee_bytes = callee_text.as_bytes();
@ -985,11 +1001,12 @@ pub fn classify_gated_sink(
object_destination_fields,
} = gate.activation
{
return Some(GateMatch {
out.push(GateMatch {
label: gate.label,
payload_args: gate.payload_args,
object_destination_fields,
});
continue;
}
// ── ValueMatch activation (legacy) ───────────────────────────────
@ -1012,7 +1029,7 @@ pub fn classify_gated_sink(
any_dangerous = true;
break;
}
// Present with a safe literal continue checking other kwargs.
// Present with a safe literal, continue checking other kwargs.
}
None => {
any_dynamic_present = true;
@ -1020,23 +1037,25 @@ pub fn classify_gated_sink(
}
}
if any_dangerous {
return Some(GateMatch {
out.push(GateMatch {
label: gate.label,
payload_args: gate.payload_args,
object_destination_fields: &[],
});
continue;
}
if any_dynamic_present {
// Dynamic kwarg value we can't prove safe. Conservatively
// Dynamic kwarg value, we can't prove safe. Conservatively
// flag every positional arg so the activation pathway isn't
// silently narrowed to the gate's declared `payload_args`.
return Some(GateMatch {
out.push(GateMatch {
label: gate.label,
payload_args: ALL_ARGS_PAYLOAD,
object_destination_fields: &[],
});
continue;
}
return None; // all listed kwargs absent or safe-literal → suppress
continue; // all listed kwargs absent or safe-literal → suppress
}
// Single-kwarg / positional gate path (original semantics).
@ -1058,22 +1077,22 @@ pub fn classify_gated_sink(
.iter()
.any(|p| lower.starts_with(&p.to_ascii_lowercase()));
if is_dangerous {
return Some(GateMatch {
out.push(GateMatch {
label: gate.label,
payload_args: gate.payload_args,
object_destination_fields: &[],
});
}
return None; // safe constant → suppress
// safe constant → suppress (no push)
}
// Unknown / dynamic activation arg: the gate fires conservatively,
// but we can't prove that only the declared `payload_args` carry
// risk a tainted activation arg (e.g. `setAttribute(userAttr, …)`
// risk, a tainted activation arg (e.g. `setAttribute(userAttr, …)`
// where `userAttr` is user-controlled) is itself a vulnerability
// path. Return ALL_ARGS_PAYLOAD so downstream sink scanning
// considers every positional argument.
None => {
return Some(GateMatch {
out.push(GateMatch {
label: gate.label,
payload_args: ALL_ARGS_PAYLOAD,
object_destination_fields: &[],
@ -1081,7 +1100,7 @@ pub fn classify_gated_sink(
}
}
}
None
out
}
/// Public wrapper for [`normalize_chained_call`] so callers outside the module
@ -1090,25 +1109,11 @@ pub fn normalize_chained_call_for_classify(text: &str) -> String {
normalize_chained_call(text)
}
/// Return the bare method-name segment of a callee text.
///
/// Centralised replacement for the textual `callee.rsplit('.').next().unwrap_or(callee)`
/// pattern that used to be scattered across the codebase.
///
/// Behaviour-preserving across the Phase 2 SSA chain decomposition rollout:
/// - When SSA lowering rewrites a chained-receiver call (`c.mu.Lock()` →
/// `Call("Lock", [v_mu])`), the call's `callee` is already the bare method
/// name, so this helper is a no-op pass-through.
/// - For 1-dot callees (`obj.method`) and for languages where Phase 2 lowering
/// doesn't run yet (PHP/Ruby) the helper still extracts the trailing method
/// from the textual form, exactly as the old per-callsite split did.
/// - For bare callees (no dot), it returns the input unchanged.
///
/// Use this helper when you need the *terminal* method name from a callee
/// string regardless of whether the call had a chained receiver. When you
/// have an `SsaOp::Call` in hand, prefer reading `callee` directly and
/// walking `receiver` through `FieldProj` ops — that's the precise path.
/// This helper is the textual fallback for callsites that only see a `&str`.
/// Extract the final `.`-separated segment of a callee string.
///
/// A callee without any `.` is returned unchanged, and a callee ending in
/// `.` yields the empty string. This is the textual fallback for callsites
/// that only see a `&str`; when an `SsaOp::Call` is available, prefer
/// reading its `callee` directly and walking `receiver` through
/// `FieldProj` ops instead.
pub fn bare_method_name(callee: &str) -> &str {
    match callee.rfind('.') {
        // `.` is a single byte, so `last_dot + 1` is always a char boundary.
        Some(last_dot) => &callee[last_dot + 1..],
        None => callee,
    }
}
@ -1314,19 +1319,15 @@ mod tests {
fn bare_method_name_strips_chain() {
// No-dot input → returned as-is.
assert_eq!(bare_method_name("foo"), "foo");
// 1-dot → trailing segment (Phase 2 leaves these alone in SSA).
// 1-dot → trailing segment.
assert_eq!(bare_method_name("obj.method"), "method");
// Multi-dot → trailing segment (matches AST-only callees from
// PHP/Ruby and any pre-Phase-2 textual paths kept around in
// `callee_text` for display).
// Multi-dot → trailing segment.
assert_eq!(bare_method_name("a.b.c.method"), "method");
// Trailing dot → empty trailing segment, matching the legacy
// `rsplit('.').next()` behaviour bit-for-bit.
// Trailing dot → empty trailing segment.
assert_eq!(bare_method_name("foo."), "");
// Empty input.
assert_eq!(bare_method_name(""), "");
// Phase 2 invariant: when SSA decomposed a chain, `callee` is
// the bare method already and the helper is a no-op.
// SSA-decomposed chains pass through untouched.
assert_eq!(bare_method_name("Lock"), "Lock");
}
@ -1399,7 +1400,7 @@ mod tests {
#[test]
fn classify_bare_href_is_none() {
// Bare "href" should NOT be a sink only "location.href" and variants
// Bare "href" should NOT be a sink, only "location.href" and variants
let result = classify("javascript", "href", None);
assert_eq!(result, None);
}
@ -1497,7 +1498,7 @@ mod tests {
#[test]
fn classify_go_user_client_get_is_not_ssrf_sink() {
// `client.Get` on a user-named *http.Client variable should NOT
// match the Go SSRF set is restricted to the stdlib package
// match, the Go SSRF set is restricted to the stdlib package
// helper `http.DefaultClient`. Type-aware resolution would be the
// path to a broader rule, not a bare-name match.
let result = classify("go", "client.Get", None);
@ -1530,7 +1531,7 @@ mod tests {
#[test]
fn classify_ruby_io_open_is_not_shell_escape_sink() {
// `IO.open` takes a file descriptor never pipes. The bare-
// `IO.open` takes a file descriptor, never pipes. The bare-
// open CMDI rule must leave it alone.
let result = classify("ruby", "IO.open", None);
assert_ne!(result, Some(DataLabel::Sink(Cap::SHELL_ESCAPE)));
@ -1572,7 +1573,7 @@ mod tests {
#[test]
fn classify_cpp_sto_family_is_sanitizer() {
// Phase 1: full `std::sto*` family (including 64-bit and `long
// Full `std::sto*` family (including 64-bit and `long
// double` variants) clears every taint cap that flows through it,
// matching the existing `std::stoi`/`std::stol` rule.
for callee in [
@ -1621,6 +1622,16 @@ mod tests {
false
}
/// Return the first entry of `matches` whose label is a `DataLabel::Sink`
/// with caps intersecting `caps`, or `None` when no entry qualifies.
/// Labels that are not sinks never match. Lets a test pick out one
/// specific gate when a callee carries several (e.g. `fetch` is both an
/// SSRF and a `DATA_EXFIL` gate).
fn find_match_with_caps(matches: &[GateMatch], caps: Cap) -> Option<GateMatch> {
    for candidate in matches {
        if let DataLabel::Sink(sink_caps) = candidate.label {
            if sink_caps.intersects(caps) {
                return Some(*candidate);
            }
        }
    }
    None
}
#[test]
fn gated_sink_dangerous_exact() {
let result = classify_gated_sink(
@ -1631,12 +1642,12 @@ mod tests {
no_kw_present,
);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::HTML_ESCAPE),
payload_args: [1usize].as_slice(),
object_destination_fields: &[],
})
}]
);
}
@ -1650,12 +1661,12 @@ mod tests {
no_kw_present,
);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::HTML_ESCAPE),
payload_args: [1usize].as_slice(),
object_destination_fields: &[],
})
}]
);
}
@ -1668,24 +1679,24 @@ mod tests {
no_kw,
no_kw_present,
);
assert_eq!(result, None);
assert!(result.is_empty());
}
#[test]
fn gated_sink_dynamic_conservative() {
// Dynamic activation (e.g. `setAttribute(attrVar, val)`) returns the
// ALL_ARGS_PAYLOAD sentinel so callers expand payload tracking to
// every positional arg the activation arg itself is a vulnerability
// every positional arg, the activation arg itself is a vulnerability
// path when attacker-controlled.
let result =
classify_gated_sink("javascript", "setAttribute", |_| None, no_kw, no_kw_present);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::HTML_ESCAPE),
payload_args: ALL_ARGS_PAYLOAD,
object_destination_fields: &[],
})
}]
);
}
@ -1698,7 +1709,7 @@ mod tests {
no_kw,
no_kw_present,
);
assert_eq!(result, None);
assert!(result.is_empty());
}
#[test]
@ -1711,7 +1722,7 @@ mod tests {
no_kw,
no_kw_present,
);
assert_eq!(result.unwrap().payload_args, &[1]);
assert_eq!(result[0].payload_args, &[1]);
// parseFromString: payload is arg 0
let result = classify_gated_sink(
@ -1727,7 +1738,7 @@ mod tests {
no_kw,
no_kw_present,
);
assert_eq!(result.unwrap().payload_args, &[0]);
assert_eq!(result[0].payload_args, &[0]);
}
#[test]
@ -1745,7 +1756,7 @@ mod tests {
no_kw,
no_kw_present,
);
assert_eq!(result, None);
assert!(result.is_empty());
}
#[test]
@ -1764,12 +1775,12 @@ mod tests {
|kw| kw == "shell",
);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
payload_args: [0usize].as_slice(),
object_destination_fields: &[],
})
}]
);
}
@ -1788,7 +1799,7 @@ mod tests {
},
|kw| kw == "shell",
);
assert_eq!(result, None);
assert!(result.is_empty());
}
#[test]
@ -1797,12 +1808,12 @@ mod tests {
// literal available → unknown activation → ALL_ARGS_PAYLOAD sentinel.
let result = classify_gated_sink("python", "Popen", |_| None, |_| None, no_kw_present);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
payload_args: ALL_ARGS_PAYLOAD,
object_destination_fields: &[],
})
}]
);
}
@ -1825,12 +1836,12 @@ mod tests {
|kw| kw == "shell",
);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
payload_args: [0usize].as_slice(),
object_destination_fields: &[],
})
}]
);
}
@ -1850,7 +1861,7 @@ mod tests {
},
|kw| kw == "shell",
);
assert_eq!(result, None);
assert!(result.is_empty());
}
/// `subprocess.run(cmd)` → no shell kwarg → presence-aware gate suppresses.
@ -1864,7 +1875,7 @@ mod tests {
|_| None,
no_kw_present,
);
assert_eq!(result, None);
assert!(result.is_empty());
}
/// `subprocess.run(cmd, shell=flag)` → shell kwarg present but dynamic →
@ -1880,12 +1891,12 @@ mod tests {
|kw| kw == "shell",
);
assert_eq!(
result,
Some(GateMatch {
result.as_slice(),
&[GateMatch {
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
payload_args: ALL_ARGS_PAYLOAD,
object_destination_fields: &[],
})
}]
);
}
@ -1893,18 +1904,18 @@ mod tests {
/// verbatim for the caller to apply object-literal field filtering.
#[test]
fn gated_sink_destination_positional_always_fires() {
// `fetch(url)` arg 0 is the URL (positional destination) OR an
// `fetch(url)`, arg 0 is the URL (positional destination) OR an
// object with a `url` field. The gate fires unconditionally, with
// `url` declared as the object-literal destination-field for the
// `fetch({url, body})` shape.
let result = classify_gated_sink(
"javascript",
"fetch",
|_| None, // no literal Destination mode doesn't inspect it
|_| None, // no literal, Destination mode doesn't inspect it
no_kw,
no_kw_present,
);
let m = result.expect("fetch gate should fire");
let m = find_match_with_caps(&result, Cap::SSRF).expect("fetch SSRF gate should fire");
assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
assert_eq!(m.payload_args, &[0]);
assert_eq!(m.object_destination_fields, &["url"]);
@ -1914,10 +1925,13 @@ mod tests {
/// the CFG caller to drive object-literal field filtering.
#[test]
fn gated_sink_destination_object_fields_surfaced() {
// `http.request(opts, cb)` opts is an object with destination fields.
// `http.request(opts, cb)`, opts is an object with destination fields.
let result =
classify_gated_sink("javascript", "http.request", |_| None, no_kw, no_kw_present);
let m = result.expect("http.request gate should fire");
let m = result
.first()
.copied()
.expect("http.request gate should fire");
assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
assert_eq!(m.payload_args, &[0]);
assert!(
@ -1929,6 +1943,27 @@ mod tests {
);
}
/// A single `classify_gated_sink` call for `fetch` must yield both of its
/// gates: the SSRF gate (payload arg 0, `url` destination field) and the
/// `DATA_EXFIL` gate (payload arg 1, destination fields including `body`).
/// Downstream CFG code relies on receiving both to build per-cap filters.
#[test]
fn gated_sink_fetch_emits_ssrf_and_data_exfil() {
    let matches = classify_gated_sink("javascript", "fetch", |_| None, no_kw, no_kw_present);

    // URL-flow gate.
    let ssrf_gate = find_match_with_caps(&matches, Cap::SSRF).expect("SSRF gate fires");
    assert_eq!(ssrf_gate.label, DataLabel::Sink(Cap::SSRF));
    assert_eq!(ssrf_gate.payload_args, &[0]);
    assert_eq!(ssrf_gate.object_destination_fields, &["url"]);

    // Payload-flow gate.
    let exfil_gate =
        find_match_with_caps(&matches, Cap::DATA_EXFIL).expect("DATA_EXFIL gate fires");
    assert_eq!(exfil_gate.label, DataLabel::Sink(Cap::DATA_EXFIL));
    assert_eq!(exfil_gate.payload_args, &[1]);
    let exfil_fields = exfil_gate.object_destination_fields;
    assert!(
        exfil_fields.contains(&"body"),
        "expected body in DATA_EXFIL destination fields, got {:?}",
        exfil_fields,
    );
}
#[test]
fn classify_all_single_label() {
let result = classify_all("javascript", "innerHTML", None);

View file

@ -106,6 +106,19 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::URL_ENCODE),
case_sensitive: false,
},
// SQLAlchemy bound-parameter sanitizer. Values passed as keyword
// arguments to `text("…:name…").bindparams(name=value)` are bound
// by the driver, so injection cannot break out of the literal
// context. The accompanying SQL-string check (py.sqli.text_format)
// already flags the `text(f"…")` shape at construction, so this
// sanitizer only clears flow when the SQL is a literal and the
// values reach the engine via bindparams. Recognises both the
// method form (`text(…).bindparams(...)`) and the bare call form.
LabelRule {
matchers: &["bindparams", ".bindparams"],
label: DataLabel::Sanitizer(Cap::SQL_QUERY),
case_sensitive: false,
},
// Path canonicalization
LabelRule {
matchers: &["os.path.abspath", "os.path.normpath"],
@ -119,7 +132,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::CODE_EXEC),
case_sensitive: false,
},
// Jinja2 / string.Template tainted template string enables SSTI
// Jinja2 / string.Template, tainted template string enables SSTI
LabelRule {
matchers: &["Template"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
@ -141,7 +154,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// Flask Markup bypasses auto-escaping
// Flask Markup, bypasses auto-escaping
LabelRule {
matchers: &["Markup"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
@ -216,7 +229,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// aiohttp HTTP client SSRF sinks
// aiohttp HTTP client, SSRF sinks
LabelRule {
matchers: &[
"aiohttp.get",
@ -228,6 +241,30 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// Type-qualified SSRF sinks: when the receiver is tracked as
// TypeKind::HttpClient (e.g. `client = requests.Session()`,
// `client = httpx.Client()`, or `s = aiohttp.ClientSession()`),
// resolve_type_qualified_labels() constructs `"HttpClient.<method>"`
// call texts so the receiver-name is no longer load-bearing. Matches
// the existing Rust HttpClient.<method> sink set so both languages
// stay in step on the type-aware SSRF model. Motivated by the
// upstream LMDeploy CVE-2026-33626 shape:
// client = requests.Session()
// response = client.get(url, ...)
LabelRule {
matchers: &[
"HttpClient.get",
"HttpClient.post",
"HttpClient.put",
"HttpClient.delete",
"HttpClient.patch",
"HttpClient.head",
"HttpClient.request",
"HttpClient.send",
],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
LabelRule {
matchers: &[
"pickle.loads",
@ -256,7 +293,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
dangerous_kwargs: &[],
activation: GateActivation::ValueMatch,
},
// subprocess.run(cmd, shell=True) multi-kwarg gate using the new
// subprocess.run(cmd, shell=True), multi-kwarg gate using the new
// presence-aware mechanism. Payload is arg 1 (after receiver offset
// applied by the CFG layer when the call is modelled method-style).
SinkGate {
@ -361,7 +398,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
let mut rules = Vec::new();
if ctx.has(DetectedFramework::Django) {
// QuerySet.extra() raw SQL injection risk.
// QuerySet.extra(), raw SQL injection risk.
// Framework-conditional because `extra` is too generic as a static matcher.
rules.push(RuntimeLabelRule {
matchers: vec!["extra".into()],

View file

@ -14,7 +14,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// Rails request object user-controlled HTTP request data.
// Rails request object, user-controlled HTTP request data.
// Dotted matchers work via push_node receiver.method text construction
// (confirmed by existing Net::HTTP.get matcher in ssrf_net_http fixture).
LabelRule {
@ -75,7 +75,7 @@ pub static RULES: &[LabelRule] = &[
},
// Bare `Kernel#open(path)` interprets a path beginning with `|` as a
// shell command (`open("|cmd")` runs `cmd`). `=open` exact-matcher
// syntax limits this rule to the bare call `File.open`, `IO.open`,
// syntax limits this rule to the bare call, `File.open`, `IO.open`,
// `URI.open` etc. each have their own non-pipe semantics and are
// covered by their own labels (or intentionally not labeled as CMDI).
// CVE-2020-8130 (rake `Rake::FileList#egrep`) was the canonical
@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[
// File I/O sinks: user-controlled paths flowing into File.open/File.new
// are a path-traversal / arbitrary-read vector. File.open also participates
// in the resource-lifecycle acquire/release pair (cfg_analysis::RUBY_RESOURCES),
// so this entry is additive it does not disturb resource-leak detection.
// so this entry is additive, it does not disturb resource-leak detection.
LabelRule {
matchers: &["File.open", "File.new", "File.read", "IO.read"],
label: DataLabel::Sink(Cap::FILE_IO),
@ -115,7 +115,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// URI.open is the network-capable Kernel#open wrapper more specific than
// URI.open is the network-capable Kernel#open wrapper, more specific than
// plain `open` (excluded to avoid file I/O false positives).
LabelRule {
matchers: &[
@ -140,7 +140,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::DESERIALIZE),
case_sensitive: false,
},
// Reflection / dynamic class resolution arbitrary class instantiation from
// Reflection / dynamic class resolution, arbitrary class instantiation from
// user-controlled names enables gadget chains (similar risk profile to
// deserialization). Rails adds `constantize`/`safe_constantize` to String.
LabelRule {
@ -157,7 +157,7 @@ pub static RULES: &[LabelRule] = &[
// SQL injection: ActiveRecord query methods that accept raw SQL strings.
// `where` and `order` are the most common Rails SQLi vectors when called
// with string interpolation (e.g., User.where("name = '#{params[:name]}'")).
// Broad matchers verified against fixture fallout.
// Broad matchers, verified against fixture fallout.
LabelRule {
matchers: &["where", "order", "group", "having", "joins", "pluck"],
label: DataLabel::Sink(Cap::SQL_QUERY),
@ -240,7 +240,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
/// ActiveRecord query methods that the static [`RULES`] table classifies as
/// `Sink(Cap::SQL_QUERY)`. These are SQL injection vectors only when arg 0
/// is a string with interpolation (`#{x}`) or a non-literal identifier the
/// is a string with interpolation (`#{x}`) or a non-literal identifier, the
/// hash form (`where(id: x)`) and the parameterised form (`where("a = ?", x)`)
/// are intrinsically safe because Rails escapes the values.
const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "joins", "pluck"];
@ -249,7 +249,7 @@ const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "j
/// shape-safe. Hash literals (`pair`, `hash`), symbol literals
/// (`simple_symbol`, `hash_key_symbol`), array literals (`array`), and pure
/// string literals without `#{...}` interpolation are all safe. Strings WITH
/// interpolation and identifiers / method calls are *not* in this list
/// interpolation and identifiers / method calls are *not* in this list;
/// callers must check `has_interpolation` and the kind separately.
const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[
"pair",
@ -270,15 +270,15 @@ const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[
/// `cfg-unguarded-sink` (sanitiser dominates the sink reflexively).
///
/// Real-world FP shapes this closes (redmine, mastodon, diaspora):
/// * `Issue.where(:id => params[:id])` hash form
/// * `Model.where(id: x, name: y)` keyword-shorthand pairs
/// * `Project.order(:created_at)` symbol literal
/// * `Issue.pluck(:id, :name)` symbol literals
/// * `Model.where("active = ?", x)` parameterised string
/// * `Issue.where(:id => params[:id])`, hash form
/// * `Model.where(id: x, name: y)`, keyword-shorthand pairs
/// * `Project.order(:created_at)`, symbol literal
/// * `Issue.pluck(:id, :name)`, symbol literals
/// * `Model.where("active = ?", x)`, parameterised string
///
/// Real-world TPs preserved:
/// * `User.where("name = '#{name}'")` string with interpolation
/// * `Model.where(some_string_var)` dynamic identifier (conservative)
/// * `User.where("name = '#{name}'")`, string with interpolation
/// * `Model.where(some_string_var)`, dynamic identifier (conservative)
pub fn ar_query_safe_shape(callee_text: &str, arg0_kind: &str, has_interpolation: bool) -> bool {
// Match the callee's last segment ("Model.where" → "where", "where" → "where").
let leaf = callee_text.rsplit(['.', ':']).next().unwrap_or(callee_text);
@ -297,7 +297,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
let mut rules = Vec::new();
if ctx.has(DetectedFramework::Rails) {
// Strong parameters permit/require sanitize user input
// Strong parameters, permit/require sanitize user input
rules.push(RuntimeLabelRule {
matchers: vec!["permit".into(), "require".into()],
label: DataLabel::Sanitizer(Cap::all()),
@ -306,7 +306,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
}
if ctx.has(DetectedFramework::Sinatra) {
// Sinatra template rendering user content flows to rendered output
// Sinatra template rendering, user content flows to rendered output
rules.push(RuntimeLabelRule {
matchers: vec!["erb".into(), "haml".into()],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
@ -323,7 +323,7 @@ mod ar_query_tests {
#[test]
fn hash_form_is_safe() {
// Model.where(:id => x) pair node directly in argument_list
// Model.where(:id => x): pair node directly in argument_list
assert!(ar_query_safe_shape("Model.where", "pair", false));
// Model.where(id: x)
assert!(ar_query_safe_shape("where", "pair", false));
@ -338,32 +338,32 @@ mod ar_query_tests {
#[test]
fn parameterised_string_is_safe() {
// Model.where("a = ?", x) first arg is a string literal w/o interpolation
// Model.where("a = ?", x): first arg is a string literal w/o interpolation
assert!(ar_query_safe_shape("where", "string", false));
assert!(ar_query_safe_shape("where", "string_literal", false));
}
#[test]
fn interpolated_string_is_dangerous() {
// Model.where("a = #{x}") string node WITH interpolation child
// Model.where("a = #{x}"): string node WITH interpolation child
assert!(!ar_query_safe_shape("where", "string", true));
}
#[test]
fn dynamic_identifier_is_dangerous() {
// Model.where(some_var) kind is identifier, not in safe list
// Model.where(some_var), kind is identifier, not in safe list
assert!(!ar_query_safe_shape("where", "identifier", false));
}
#[test]
fn array_form_is_safe() {
// Model.pluck([:id, :name]) uncommon but valid
// Model.pluck([:id, :name]), uncommon but valid
assert!(ar_query_safe_shape("pluck", "array", false));
}
#[test]
fn non_ar_method_is_never_suppressed() {
// find_by_sql is a real raw-SQL sink never suppress.
// find_by_sql is a real raw-SQL sink, never suppress.
assert!(!ar_query_safe_shape("find_by_sql", "string", false));
assert!(!ar_query_safe_shape("connection.execute", "pair", false));
}

View file

@ -168,7 +168,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"expression_statement" => Kind::CallWrapper,
"assignment_expression" => Kind::Assignment,
// struct expressions recurse so env::var() calls inside field
// struct expressions, recurse so env::var() calls inside field
// initialisers produce Source-labelled CFG nodes (needed for summaries).
"struct_expression" => Kind::Block,
"field_initializer_list" => Kind::Block,
@ -287,7 +287,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
rules
}
/// Phase C: auth-as-taint label rules for Rust. Gated by
/// Auth-as-taint label rules for Rust. Gated by
/// `config.scanner.enable_auth_as_taint`; appended to the runtime rule set
/// when the flag is enabled. These declare **sinks** (state-changing or
/// outbound operations that should not be reached by an un-checked
@ -343,10 +343,8 @@ pub fn phase_c_auth_rules() -> Vec<RuntimeLabelRule> {
case_sensitive: false,
},
// ── Sanitizers clearing Cap::UNAUTHORIZED_ID ──
// Ownership and membership guards from the auth_analysis default
// `authorization_check_names` list. Phase C consumes these via
// call-site argument sanitization (see
// `is_auth_as_taint_arg_sanitizer` in ssa_transfer).
// Ownership and membership guards consumed via call-site
// argument sanitization (see `is_auth_as_taint_arg_sanitizer`).
RuntimeLabelRule {
matchers: vec![
"check_ownership".into(),

View file

@ -86,7 +86,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE),
case_sensitive: false,
},
// he library HTML entity encoding
// he library, HTML entity encoding
LabelRule {
matchers: &["he.encode", "he.escape"],
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
@ -131,7 +131,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
},
// ── Outbound HTTP clients modeled as destination-aware gated sinks ──
// ── Outbound HTTP clients, modeled as destination-aware gated sinks ──
// See GATED_SINKS below; rationale mirrors javascript.rs.
LabelRule {
matchers: &[
@ -206,6 +206,14 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// ── Cross-boundary data exfiltration (DATA_EXFIL) ─────────────────────
// See javascript.rs for rationale. `xhr.send(body)` resolves to
// `HttpClient.send` via type-qualified resolution.
LabelRule {
matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
// ─────────── SQL injection sinks ─────────────
// Database drivers: mysql, mysql2, pg, better-sqlite3
LabelRule {
@ -283,7 +291,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
dangerous_kwargs: &[],
activation: GateActivation::ValueMatch,
},
// ── Outbound HTTP clients (SSRF) see javascript.rs for rationale ────
// ── Outbound HTTP clients (SSRF), see javascript.rs for rationale ────
SinkGate {
callee_matcher: "fetch",
arg_index: 0,
@ -452,6 +460,24 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
},
},
// ── Cross-boundary data exfiltration ──────────────────────────────────
// `fetch(input, init)`, payload-bearing fields of `init` (arg 1) flow
// into the request body / headers / json, distinct from SSRF on the URL
// (arg 0). See javascript.rs for full rationale.
SinkGate {
callee_matcher: "fetch",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["body", "headers", "json"],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -1,43 +1,14 @@
//! # Nyx Scanner
//! Multi-language static vulnerability scanner. Tree-sitter parsing, petgraph
//! CFGs, SSA-based dataflow, and cross-file taint analysis with a
//! capability-based sanitizer system. Supports Rust, C, C++, Java, Go, PHP,
//! Python, Ruby, TypeScript, and JavaScript.
//!
//! A multi-language static vulnerability scanner. Nyx parses source files with
//! [tree-sitter](https://tree-sitter.github.io/), builds intra-procedural
//! control-flow graphs ([petgraph](https://docs.rs/petgraph)), and runs
//! cross-file taint analysis with a capability-based sanitizer system.
//!
//! ## Architecture
//!
//! Nyx uses a **two-pass architecture**:
//!
//! 1. **Pass 1 — Summary extraction**: Parse each file, build a CFG per function,
//! and export a [`summary::FuncSummary`] capturing source/sanitizer/sink capabilities,
//! taint propagation behavior, and callee lists. Summaries are persisted to SQLite.
//!
//! 2. **Pass 2 — Analysis**: Load all summaries into a [`summary::GlobalSummaries`] map,
//! re-parse files, and run taint analysis with cross-file callee resolution. CFG
//! structural analysis checks for auth gaps, unguarded sinks, and resource leaks.
//!
//! ## Four Detector Families
//!
//! - **Taint** ([`taint`]) — Monotone forward dataflow tracking source-to-sink flows
//! - **CFG Structural** ([`cfg_analysis`]) — Dominator-based guard and auth-gap detection
//! - **State Model** ([`state`]) — Resource lifecycle and authentication state lattices
//! - **AST Patterns** ([`patterns`]) — Tree-sitter structural queries per language
//!
//! ## Supported Languages
//!
//! Rust, C, C++, Java, Go, PHP, Python, Ruby, TypeScript, JavaScript.
//!
//! ## Entry Points
//!
//! - [`scan_no_index`] — Run a two-pass scan without indexing (for tests)
//! - [`commands::scan::scan_filesystem`] — Filesystem scan with optional indexing
//! - [`commands::scan::scan_with_index_parallel`] — Index-backed parallel scan
//!
//! ## Documentation
//!
//! See the [`docs/`](https://github.com/elicpeter/nyx/tree/master/docs) directory
//! for user and contributor documentation.
//! The handbook below is embedded verbatim from
//! [`docs/how-it-works.md`](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md).
//! Per-detector documentation lives on the [`taint`], [`cfg_analysis`],
//! [`state`], [`patterns`], and [`auth_analysis`] modules. The primary
//! library entry point for tests and embedders is [`scan_no_index`].
#![doc = include_str!(concat!(env!("OUT_DIR"), "/lib_intro.md"))]
pub mod abstract_interp;
pub mod ast;

View file

@ -69,7 +69,7 @@ fn main() -> NyxResult<()> {
let quiet = config.output.quiet || cli.command.is_structured_output(&config);
// Print config note before scanning (human-readable mode only). Pure
// informational commands suppress it too their output is usually
// informational commands suppress it too; their output is usually
// piped or grepped and the preamble is noise.
if let Some(note) = config_note.filter(|_| !quiet && !is_info) {
eprint!("{note}");

View file

@ -47,14 +47,28 @@ fn cfg_rule_description(id: &str) -> Option<&'static str> {
}
}
/// Look up a human-readable description for any rule ID.
fn rule_description(id: &str) -> &str {
// Strip taint-specific suffix for lookup (e.g. "taint-unsanitised-flow:foo.rs:42" → base)
let base_id = if id.starts_with("taint-") {
/// Normalise a finding's id to the base SARIF rule id.
///
/// Findings carry source-location-suffixed ids like
/// `"taint-unsanitised-flow (source 12:3)"` so identical (source, sink)
/// pairs can be deduped, but SARIF wants a single rule per category.
/// Cap-specific taint rule classes (e.g. `taint-data-exfiltration`) are
/// preserved as distinct bases so consumers can filter on them rather than
/// folding everything into `taint-unsanitised-flow`.
fn sarif_base_id(id: &str) -> &str {
if id.starts_with("taint-data-exfiltration") {
"taint-data-exfiltration"
} else if id.starts_with("taint-") {
"taint-unsanitised-flow"
} else {
id
};
}
}
/// Look up a human-readable description for any rule ID.
fn rule_description(id: &str) -> &str {
// Collapse a location-suffixed id (e.g. "taint-unsanitised-flow (source 12:3)") to its base rule id for lookup.
let base_id = sarif_base_id(id);
if let Some(desc) = PATTERN_DESCRIPTIONS.get(base_id) {
return desc;
@ -62,10 +76,13 @@ fn rule_description(id: &str) -> &str {
if let Some(desc) = cfg_rule_description(base_id) {
return desc;
}
if base_id == "taint-unsanitised-flow" {
return "Unsanitised data flows from source to sink";
match base_id {
"taint-unsanitised-flow" => "Unsanitised data flows from source to sink",
"taint-data-exfiltration" => {
"Sensitive data flows into the payload of an outbound network request"
}
_ => id,
}
id
}
fn severity_to_level(sev: Severity) -> &'static str {
@ -83,11 +100,7 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
let mut rule_index_map: HashMap<String, usize> = HashMap::new();
for d in diags {
let base = if d.id.starts_with("taint-") {
"taint-unsanitised-flow".to_string()
} else {
d.id.clone()
};
let base = sarif_base_id(&d.id).to_string();
if !rule_index_map.contains_key(&base) {
let idx = rule_ids.len();
rule_index_map.insert(base.clone(), idx);
@ -108,15 +121,11 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
let results: Vec<Value> = diags
.iter()
.map(|d| {
let base = if d.id.starts_with("taint-") {
"taint-unsanitised-flow"
} else {
&d.id
};
let base = sarif_base_id(&d.id);
let rule_index = rule_index_map[base];
// Make path relative to scan root. Fall back to a deterministic
// sentinel instead of the absolute path SARIF must not leak
// sentinel instead of the absolute path; SARIF must not leak
// home-directory or host-specific prefixes.
let uri = match Path::new(&d.path).strip_prefix(scan_root) {
Ok(p) => p.to_string_lossy().to_string(),
@ -213,17 +222,17 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
props.insert("relatedFindings".into(), json!(d.alternative_finding_ids));
}
// Engine provenance notes surface any cap-hit / lowering
// Engine provenance notes, surface any cap-hit / lowering
// bail / timeout signals recorded by the analysis engine so
// downstream consumers can tell "nothing found" from "engine
// stopped looking".
//
// Three properties are emitted together:
// * `engine_notes` raw list of {kind, ...} entries
// * `confidence_capped` true iff any non-informational
// * `engine_notes`: raw list of {kind, ...} entries
// * `confidence_capped`: true iff any non-informational
// note is present (back-compat
// boolean; drives legacy dashboards)
// * `loss_direction` worst `LossDirection` across
// * `loss_direction`: worst `LossDirection` across
// the list ("under-report",
// "over-report", "bail"). Absent
// when only informational notes
@ -590,7 +599,7 @@ mod tests {
#[test]
fn build_sarif_path_outside_scan_root_is_redacted() {
// Absolute host paths leak home-directory information SARIF must
// Absolute host paths leak home-directory information; SARIF must
// substitute a deterministic token when a finding falls outside the
// scan root.
let mut diag = make_diag("rule-x", Severity::High);

View file

@ -43,7 +43,7 @@ pub fn scan_ejs_file(path: &Path, bytes: &[u8]) -> Vec<Diag> {
// Advance past this match for the next iteration.
search_from = abs_end + 2; // skip "%>"
// Skip <%- include(...) %> EJS partial inclusion, not user-controlled.
// Skip <%- include(...) %>, EJS partial inclusion, not user-controlled.
if is_include_call(expr) {
continue;
}

View file

@ -12,7 +12,7 @@ pub const PATTERNS: &[Pattern] = &[
Pattern {
id: "java.deser.readobject",
description: "ObjectInputStream.readObject() performs unsafe deserialization",
// Match any .readObject() call the method name is specific enough.
// Match any .readObject() call, the method name is specific enough.
query: r#"(method_invocation
name: (identifier) @id (#eq? @id "readObject"))
@vuln"#,
@ -21,6 +21,46 @@ pub const PATTERNS: &[Pattern] = &[
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier A: SnakeYAML deserialization (CVE-2022-1471) ──────────────
// `new Yaml()` constructed without a `SafeConstructor` argument
// accepts arbitrary YAML tags (`!!javax.script.ScriptEngineManager`,
// `!!java.net.URLClassLoader`, …) and instantiates any class via
// reflection. SnakeYAML 2.0 swapped the default to SafeConstructor
// but pre-2.0 deployments stay vulnerable until call sites are
// patched. We match the empty-arg form `new Yaml()` only, so the
// explicit-SafeConstructor remediation form
// `new Yaml(new SafeConstructor(new LoaderOptions()))` is silent.
Pattern {
id: "java.deser.snakeyaml_unsafe_constructor",
description: "new Yaml() without SafeConstructor accepts arbitrary class tags (CVE-2022-1471)",
query: r#"(object_creation_expression
type: (type_identifier) @t (#eq? @t "Yaml")
arguments: (argument_list) @args (#eq? @args "()"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier A: Apache Commons Text Text4Shell (CVE-2022-42889) ────────
// `StringSubstitutor.createInterpolator()` enables `script:`,
// `dns:`, and `url:` lookups by default, `${script:js:…}`
// evaluates JavaScript via the JSR-223 ScriptEngineManager. The
// factory call is itself the structural bug; the recommended app-
// side mitigation builds a `StringSubstitutor` directly with a
// restricted lookup map.
Pattern {
id: "java.code_exec.text4shell_interpolator",
description: "StringSubstitutor.createInterpolator() enables script:/dns:/url: evaluation (CVE-2022-42889)",
query: r#"(method_invocation
object: (identifier) @c (#eq? @c "StringSubstitutor")
name: (identifier) @id (#eq? @id "createInterpolator"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "java.cmdi.runtime_exec",

View file

@ -1,42 +1,4 @@
//! # AST Pattern Conventions
//!
//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
//!
//! ## ID format
//!
//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
//!
//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
//!
//! ## Tiers
//!
//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
//! arg, format-string with variable first arg).
//!
//! ## Severity
//!
//! * **High** — command exec, deserialization, banned C functions.
//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
//!
//! Note: the default `min_severity` filter skips Low patterns; they only appear when
//! the user explicitly lowers the threshold.
//!
//! ## No-duplicate rule
//!
//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
//!
//! ## Adding a new pattern
//!
//! 1. Pick the language file under `src/patterns/<lang>.rs`.
//! 2. Choose tier, category, severity per the rules above.
//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))]
pub mod c;
pub mod cpp;
@ -68,7 +30,7 @@ pub enum Severity {
impl Severity {
/// Bracketed, colored, fixed-width tag for aligned console output.
///
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` always 8 visible characters
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"`, always 8 visible characters
/// so the column after the tag lines up regardless of severity.
#[allow(dead_code)] // public API for lib consumers
pub fn colored_tag(self) -> String {
@ -123,9 +85,9 @@ impl FromStr for Severity {
/// A parsed severity filter expression.
///
/// Supports three forms:
/// - Single level: `"HIGH"` matches only that level
/// - Comma list: `"HIGH,MEDIUM"` matches any listed level
/// - Threshold: `">=MEDIUM"` matches that level and above
/// - Single level: `"HIGH"`, matches only that level
/// - Comma list: `"HIGH,MEDIUM"`, matches any listed level
/// - Threshold: `">=MEDIUM"`, matches that level and above
///
/// Parsing is case-insensitive and tolerates whitespace around tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
@ -242,7 +204,7 @@ impl PatternCategory {
/// One AST pattern with a tree-sitter query and meta-data.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub struct Pattern {
/// Unique identifier `<lang>.<category>.<specific>` preferred.
/// Unique identifier, `<lang>.<category>.<specific>` preferred.
pub id: &'static str,
/// Human-readable explanation.
pub description: &'static str,

View file

@ -5,7 +5,7 @@ use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
///
/// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`,
/// and `cursor.execute`. AST patterns here add coverage for **deserialization**,
/// **subprocess shell=True** (Tier B taint doesn't check keyword args), and
/// **subprocess shell=True** (Tier B, taint doesn't check keyword args), and
/// **code execution** sinks that taint cannot structurally verify.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
@ -121,14 +121,45 @@ pub const PATTERNS: &[Pattern] = &[
confidence: Confidence::High,
},
// ── Tier B: SQL injection (format/concat heuristic) ────────────────
// Catches both `cursor.execute(query + user)` (binary_operator concat)
// and `cursor.execute(f"... {user} ...")` (f-string with interpolation).
// f-strings appear as a `string` node with `interpolation` children in
// tree-sitter-python; the alternation lets the same pattern cover both
// the historical % / + concat shapes and the modern f-string SQLi shape
// that surfaces in CVE-2025-24793 (snowflake-connector-python),
// CVE-2025-69662 (geopandas), and dozens of similar cursor.execute
// call sites across the corpus.
Pattern {
id: "py.sqli.execute_format",
description: "cursor.execute with string concatenation risks SQL injection",
description: "cursor.execute with string concatenation or f-string risks SQL injection",
query: r#"(call
function: (attribute
attribute: (identifier) @fn (#eq? @fn "execute"))
arguments: (argument_list
(binary_operator) @arg))
[(binary_operator)
(string (interpolation))] @arg))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// SQLAlchemy `text(<concat-or-fstring>)`, same Tier B heuristic
// applied to the SQLAlchemy raw-SQL constructor. Catches the
// CVE-2025-69662 (geopandas) shape:
// connection.execute(text(f"SELECT … '{geom_name}' …"))
// where the f-string interpolation is the injection point and the
// surrounding `connection.execute` would otherwise hide the unsafe
// construction from the simple execute_format pattern.
Pattern {
id: "py.sqli.text_format",
description: "sqlalchemy text() with f-string or string concat risks SQL injection",
query: r#"(call
function: [(identifier) @fn (attribute attribute: (identifier) @fn)]
(#eq? @fn "text")
arguments: (argument_list
[(binary_operator)
(string (interpolation))] @arg))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,

View file

@ -1,33 +1,8 @@
//! Field-sensitive Steensgaard points-to analysis driver.
//!
//! Walks the SSA body once per fixpoint pass, emitting equality
//! constraints for each instruction. The constraints are resolved
//! via standard union-find with path compression and union-by-rank;
//! propagation through `FieldProj` requires a worklist because the
//! representative of a receiver may change after the field projection
//! is first visited.
//!
//! The analysis is flow-insensitive (Steensgaard) — every assignment
//! that joins two values unifies their points-to sets across the
//! entire body. Field sensitivity is recovered by representing each
//! `obj.f` access as a structural [`AbsLoc::Field`] location with a
//! distinct identity per `(parent_loc, field)` pair.
//!
//! ## Phase 1 scope
//!
//! - Field READS via [`SsaOp::FieldProj`] — sufficient for Phase 2's
//! resource-lifecycle attribution fix (the gin/`context.go` proxy
//! acquire FP).
//! - Param/SelfParam → fresh caller-relative locations.
//! - Phi/Assign → Steensgaard unification.
//! - Call results → fresh allocation-site locations (one per call
//! instruction, keyed by SSA value).
//! - Source/Const/Nop/Undef → empty (scalars don't reach the heap).
//!
//! Field WRITES land in Phase 3 alongside the cross-method field-flow
//! consumer; they require careful handling of the synthetic
//! base-update `Assign` instructions emitted by SSA lowering and are
//! not load-bearing for Phase 1's "no behaviour change" gate.
//! Flow-insensitive union-find over SSA values; field sensitivity comes
//! from representing each `obj.f` access as a structural
//! [`AbsLoc::Field`] keyed by `(parent_loc, field)`.
use std::collections::HashMap;
@ -41,13 +16,9 @@ use super::domain::{AbsLoc, LOC_TOP, LocId, LocInterner, PointsToSet, PtrProxyHi
/// in a small number of passes for any well-formed body.
const MAX_FIXPOINT_ITERS: usize = 8;
/// Pointer-Phase 4: container-read callees that pull a single element
/// out of a collection without indexing through a key. Recognised
/// across the languages nyx supports as a cross-cut surface — exact
/// per-language specialisation is intentionally skipped for the
/// minimum-viable rollout. The receiver-side projection through
/// [`FieldId::ELEM`] is conservative: a callee not in this list still
/// gets the existing fresh-alloc behaviour and does not lose precision.
/// Container-read callees that pull a single element out of a
/// collection without a key. Cross-language; non-listed callees still
/// get fresh-alloc behaviour, so the list is conservative.
fn is_container_read_callee(callee: &str) -> bool {
let bare = match callee.rsplit_once('.') {
Some((_, m)) => m,
@ -67,19 +38,12 @@ fn is_container_read_callee(callee: &str) -> bool {
| "dequeue"
| "remove"
| "popleft"
// Pointer-Phase 6 / W5: synthetic callee emitted by CFG
// lowering for subscript / index-expression reads
// (`arr[i]`, `map[k]`, `cmds[0]`).
// synthetic callee for subscript reads (`arr[i]`, `map[k]`)
| "__index_get__"
)
}
/// Pointer-Phase 4: container-write callees that store an element into
/// a collection. Mirror of [`is_container_read_callee`]. The pointer
/// analysis itself doesn't track stored values (the Steensgaard
/// receiver/result aliasing already covers the common cases), but the
/// helper is exposed so the taint engine's ELEM-cell write hook can
/// share a single classifier with the points-to pass.
/// Container-write callees, mirror of [`is_container_read_callee`].
pub fn is_container_write_callee(callee: &str) -> bool {
let bare = match callee.rsplit_once('.') {
Some((_, m)) => m,
@ -97,37 +61,34 @@ pub fn is_container_write_callee(callee: &str) -> bool {
| "insert"
| "enqueue"
| "unshift"
// Pointer-Phase 6 / W5: synthetic callee emitted by CFG
// lowering for subscript / index-expression writes
// (`arr[i] = v`, `map[k] = v`).
// synthetic callee for subscript writes (`arr[i] = v`, `map[k] = v`)
| "__index_set__"
)
}
/// Pointer-Phase 4: callee-name aware container-read recognition.
/// Public for unit tests + reuse from the taint engine.
/// Public re-export of [`is_container_read_callee`] for the taint engine.
pub fn is_container_read_callee_pub(callee: &str) -> bool {
// Pure forwarding call: `is_container_read_callee` is declared without
// `pub`, so this wrapper is what exposes the classifier publicly.
is_container_read_callee(callee)
}
/// Pointer-Phase 5: derive a [`crate::summary::points_to::FieldPointsToSummary`]
/// from per-body points-to facts.
/// Derive a [`crate::summary::points_to::FieldPointsToSummary`] from
/// per-body points-to facts.
///
/// Records two channels:
///
/// 1. **Reads** walks every [`SsaOp::FieldProj`] in the body; for
/// 1. **Reads**: walks every [`SsaOp::FieldProj`] in the body; for
/// each `loc ∈ pt(receiver)` that resolves to a parameter
/// location ([`AbsLoc::Param`] / [`AbsLoc::SelfParam`]), records
/// the projected field name into the summary's
/// `param_field_reads`.
/// 2. **Writes** walks the body's [`SsaBody::field_writes`] side-
/// 2. **Writes**: walks the body's [`SsaBody::field_writes`] side-
/// table (populated by SSA lowering's W1 synth-Assign hook) and
/// records each `(receiver, FieldId)` pair against the receiver's
/// pt set the same way reads are recorded.
///
/// Field name resolution goes through the body's
/// [`SsaBody::field_interner`] because [`crate::ssa::ir::FieldId`]
/// is body-local names are the only stable cross-file identity.
/// is body-local; names are the only stable cross-file identity.
///
/// Receiver (`SelfParam`) reads/writes are recorded under the
/// [`u32::MAX`] sentinel parameter index, mirroring the convention in
@ -226,7 +187,7 @@ pub fn extract_field_points_to(
/// Per-body points-to result.
///
/// Owns the body-local [`LocInterner`] and a flat `SsaValue → PointsToSet`
/// table. The table is dense — one slot per SSA value — so lookups
/// table. The table is dense, one slot per SSA value, so lookups
/// are O(1).
#[derive(Clone, Debug)]
pub struct PointsToFacts {
@ -242,7 +203,7 @@ pub struct PointsToFacts {
}
impl PointsToFacts {
/// Empty result every value points to nothing. Used by callers
/// Empty result: every value points to nothing. Used by callers
/// that need a "no facts" placeholder when the analysis is
/// disabled or the body could not be analysed.
pub fn empty(body: BodyId) -> Self {
@ -288,11 +249,6 @@ impl PointsToFacts {
/// [`PtrProxyHint::FieldOnly`] iff every member is an
/// [`AbsLoc::Field`].
///
/// Phase 2 consumer: the resource-lifecycle proxy attribution in
/// `state::transfer.rs` uses `FieldOnly` to recognise locals like
/// `m` in `m := c.mu` and route the proxy entry through
/// `chain_proxies` instead of marking the local as a leakable
/// SymbolId-keyed resource.
pub fn proxy_hint(&self, v: SsaValue) -> PtrProxyHint {
let set = self.pt(v);
if set.is_empty() || set.is_top() {
@ -310,7 +266,7 @@ impl PointsToFacts {
/// Build a `var_name → PtrProxyHint` map by scanning the body's
/// value defs for the latest definition of each named variable.
/// Names that resolve to no variable, or whose latest definition is
/// `Other`, are omitted only `FieldOnly` entries appear.
/// `Other`, are omitted; only `FieldOnly` entries appear.
///
/// Iterates over [`SsaBody::value_defs`] in *reverse* order so the
/// last (post-renaming) SSA definition for each name wins. Used by
@ -340,13 +296,13 @@ impl PointsToFacts {
/// Analyse a single body and return its [`PointsToFacts`].
///
/// `body_id` is used as the disambiguator inside the abstract
/// locations supplying a stable id (e.g. the file's
/// locations; supplying a stable id (e.g. the file's
/// `BodyMeta.id`) lets callers compare facts emitted by different
/// bodies in the same file.
pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
let mut state = AnalysisState::new(body_id, body.num_values());
// Pass 1 emit constraints from ops that don't depend on
// Pass 1, emit constraints from ops that don't depend on
// representative resolution (Param, SelfParam, Call result,
// etc.). These produce the "leaf" points-to sets.
for block in &body.blocks {
@ -355,7 +311,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
}
}
// Pass 2+ propagate through field projections, phis, and
// Pass 2+, propagate through field projections, phis, and
// assignments until a fixpoint. Field projections need iteration
// because a `FieldProj` whose receiver's representative changes
// (via a later unification) must re-emit its constraint with the
@ -377,7 +333,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
// ── Constraint solver internals ────────────────────────────────────
/// Mutable analysis state the interner, points-to table, and
/// Mutable analysis state: the interner, points-to table, and
/// union-find arrays. Lives inside `analyse_body` only.
struct AnalysisState {
/// Body-id forwarded to [`PointsToFacts::body`] when the analysis
@ -457,7 +413,7 @@ impl AnalysisState {
/// `pt(rep_a) = pt(rep_b)`. Caller is responsible for passing
/// already-resolved representatives if it wants Steensgaard
/// unification see `union` for that.
/// unification, see `union` for that.
fn copy_pt(&mut self, dst: u32, src: u32) -> bool {
let dr = self.find(dst);
let sr = self.find(src);
@ -486,7 +442,7 @@ impl AnalysisState {
self.add_loc(v, loc);
}
SsaOp::CatchParam => {
// Exception bindings come from the runtime model as
// Exception bindings come from the runtime; model them as
// an opaque allocation-site keyed by the SSA value.
let loc = self.interner.intern_alloc(body_id, v);
self.add_loc(v, loc);
@ -494,14 +450,14 @@ impl AnalysisState {
SsaOp::Call {
callee, receiver, ..
} => {
// Pointer-Phase 4: container element retrieval ops
// container element retrieval ops
// (`shift`, `pop`, `peek`, `front`, …) project through
// the abstract `Field(pt(receiver), ELEM)` cell so
// per-element taint flows independently of the SSA
// value referencing the container. The receiver's
// points-to set may not be fully resolved on this
// pass, so we *also* add a fresh allocation site as a
// fallback the fixpoint pass below absorbs the
// fallback, the fixpoint pass below absorbs the
// proper Field projection once the receiver's set
// converges.
let loc = self.interner.intern_alloc(body_id, v);
@ -538,7 +494,7 @@ impl AnalysisState {
}
}
SsaOp::FieldProj { .. } => {
// Resolved during the fixpoint pass see
// Resolved during the fixpoint pass, see
// `propagate_inst`.
}
SsaOp::Source | SsaOp::Const(_) | SsaOp::Nop | SsaOp::Undef => {
@ -548,7 +504,7 @@ impl AnalysisState {
}
/// Fixpoint-pass transfer. Re-runs constraints whose result
/// depends on the current set of representatives i.e. field
/// depends on the current set of representatives, i.e. field
/// projections, phis, and assignments may need to absorb new
/// members emitted after the first pass. Returns `true` when
/// any points-to set changed.
@ -608,7 +564,7 @@ impl AnalysisState {
}
/// Materialise the dense `SsaValue → PointsToSet` table. Each
/// value's set is the set of its representative values in the
/// value's set is the set of its representative; values in the
/// same Steensgaard class share the same set.
fn into_facts(mut self) -> PointsToFacts {
let mut by_value = Vec::with_capacity(self.pt.len());
@ -714,7 +670,7 @@ mod tests {
}
}
/// `let c = self; let m = c.mu;` pt(m) must be `{Field(SelfParam, mu)}`,
/// `let c = self; let m = c.mu;`: pt(m) must be `{Field(SelfParam, mu)}`,
/// distinct from pt(c) = `{SelfParam}`.
#[test]
fn field_subobject_distinct_from_receiver() {
@ -762,7 +718,7 @@ mod tests {
}
}
/// `let y = x;` y and x share the same points-to set.
/// `let y = x;`: y and x share the same points-to set.
#[test]
fn copy_propagation_unifies() {
let mut b = BodyBuilder::new();
@ -783,7 +739,7 @@ mod tests {
assert!(!facts.pt(y).is_empty());
}
/// `if (cond) z = a; else z = b;` phi at the merge unifies
/// `if (cond) z = a; else z = b;`: phi at the merge unifies
/// `pt(z)` with both `pt(a)` and `pt(b)`.
#[test]
fn phi_unifies_branches() {
@ -793,7 +749,7 @@ mod tests {
let b_v = b.fresh(Some("b"));
b.emit(b_v, SsaOp::Param { index: 1 }, Some("b"));
// Phi(0: a, 0: b) predecessor block ids are placeholders.
// Phi(0: a, 0: b), predecessor block ids are placeholders.
let z = b.fresh(Some("z"));
b.emit(
z,
@ -812,7 +768,7 @@ mod tests {
assert_eq!(pt_z.len(), 2);
}
/// `node = node.next;` the `FieldProj` self-cycle must
/// `node = node.next;`, the `FieldProj` self-cycle must
/// terminate via the union-find / depth bound, not loop.
#[test]
fn self_referential_field_chain_terminates() {
@ -847,7 +803,7 @@ mod tests {
let facts = analyse_body(&body, body_id());
let pt_node = facts.pt(node);
// Either we converge to a non-empty set including a Field chain,
// or we saturate to Top either is a valid termination outcome.
// or we saturate to Top, either is a valid termination outcome.
assert!(!pt_node.is_empty());
}
@ -864,7 +820,7 @@ mod tests {
assert!(facts.pt(s).is_empty());
}
/// `Call` produces a fresh allocation-site location for its result
/// `Call` produces a fresh allocation-site location for its result,
/// distinct from its arguments.
#[test]
fn call_result_is_fresh_alloc() {
@ -901,7 +857,7 @@ mod tests {
/// Driver smoke-test: the analysis runs on an SsaBody produced by
/// the real lowering pipeline without panicking. This pins the
/// "no behaviour change" gate analysis runs to completion on
/// "no behaviour change" gate, analysis runs to completion on
/// representative input.
#[test]
fn smoke_runs_on_lowered_body() {
@ -929,12 +885,10 @@ mod tests {
assert!(facts.is_trivial());
assert_eq!(facts.len(), 0);
// Suppress unused-import warning for `Cfg` — it's exposed for
// future Phase 1.b tests that need a real CFG.
let _ = std::marker::PhantomData::<Cfg>;
}
/// Pointer-Phase 2 contract pin: a value defined by a `FieldProj`
/// Contract pin: a value defined by a `FieldProj`
/// classifies as [`PtrProxyHint::FieldOnly`]. Consumed by the
/// resource-lifecycle pass to recognise field-aliased locals.
#[test]
@ -965,7 +919,7 @@ mod tests {
assert_eq!(facts.proxy_hint(c), crate::pointer::PtrProxyHint::Other);
}
/// Pointer-Phase 4: container-read callee classifier covers a
/// container-read callee classifier covers a
/// representative sample across nyx's languages. Pinned because
/// the taint engine relies on the same classifier.
#[test]
@ -992,7 +946,7 @@ mod tests {
}
}
/// Pointer-Phase 4: container-write classifier (mirror).
/// Container-write classifier (mirror).
#[test]
fn container_write_callee_classifier() {
for c in [
@ -1014,7 +968,7 @@ mod tests {
}
}
/// Pointer-Phase 4: a `Call("shift", receiver=container)` projects
/// A `Call("shift", receiver=container)` projects
/// `Field(pt(container), ELEM)` into the result, alongside the
/// fresh allocation site that fall-back paths still emit.
#[test]
@ -1023,7 +977,7 @@ mod tests {
// `arr` is the parameter container.
let arr = b.fresh(Some("arr"));
b.emit(arr, SsaOp::Param { index: 0 }, Some("arr"));
// `e := arr.shift()` container read.
// `e := arr.shift()`, container read.
let e = b.fresh(Some("e"));
b.emit(
e,
@ -1055,7 +1009,7 @@ mod tests {
);
}
/// Pointer-Phase 5: `extract_field_points_to` records a field
/// `extract_field_points_to` records a field
/// READ on the parameter index when a `FieldProj` traces back to
/// an `AbsLoc::Param`.
#[test]
@ -1064,7 +1018,7 @@ mod tests {
// `obj` is parameter 0.
let obj = b.fresh(Some("obj"));
b.emit(obj, SsaOp::Param { index: 0 }, Some("obj"));
// `let n = obj.name;` field projection from a param.
// `let n = obj.name;`, field projection from a param.
let name_field = b.intern_field("name");
let n = b.fresh(Some("n"));
b.emit(
@ -1088,7 +1042,7 @@ mod tests {
assert!(entry.1.iter().any(|s| s == "name"));
}
/// Pointer-Phase 5 / W3: `extract_field_points_to` records field
/// `extract_field_points_to` records field
/// WRITES from the body's `field_writes` side-table populated by
/// SSA lowering. A synth Assign whose receiver traces back to
/// `AbsLoc::Param` produces a `param_field_writes` entry.
@ -1124,7 +1078,7 @@ mod tests {
);
}
/// Pointer-Phase 5 / W3: writes through the receiver (`this.f =
/// Writes through the receiver (`this.f =
/// rhs`) are recorded under the same `u32::MAX` sentinel as
/// reads.
#[test]
@ -1151,7 +1105,7 @@ mod tests {
assert!(entry.1.iter().any(|s| s == "cache"));
}
/// Pointer-Phase 5 / W3: container-element writes (`<elem>`
/// Container-element writes (`<elem>`
/// marker) flow through the same channel as named-field writes
/// when the synth Assign carries `FieldId::ELEM`.
#[test]
@ -1180,7 +1134,7 @@ mod tests {
);
}
/// Pointer-Phase 5: receiver projections are recorded under the
/// Receiver projections are recorded under the
/// `u32::MAX` sentinel parameter index (mirror of
/// `SsaFuncSummary::receiver_to_*`).
#[test]
@ -1233,7 +1187,7 @@ mod tests {
assert!(is_container_write_callee("arr.__index_set__"));
}
/// W5: regression guard neither synth name should match the
/// W5: regression guard, neither synth name should match the
/// opposite predicate, otherwise the W2 read/write hooks would
/// double-fire on the same call.
#[test]
@ -1245,10 +1199,10 @@ mod tests {
#[test]
fn name_proxy_hints_collects_field_only_locals() {
let mut b = BodyBuilder::new();
// `c` is the receiver root location, hint=Other.
// `c` is the receiver, root location, hint=Other.
let c = b.fresh(Some("c"));
b.emit(c, SsaOp::SelfParam, Some("c"));
// `m := c.mu` field projection, hint=FieldOnly.
// `m := c.mu`, field projection, hint=FieldOnly.
let mu = b.intern_field("mu");
let m = b.fresh(Some("m"));
b.emit(

View file

@ -2,7 +2,7 @@
//!
//! Locations are interned to compact `LocId(u32)` handles so the
//! union-find resolver can operate on dense integer keys. Field
//! locations are keyed structurally by `(parent_loc_id, field_id)`
//! locations are keyed structurally by `(parent_loc_id, field_id)`:
//! interning a `Field(parent, f)` always returns the same `LocId` no
//! matter how many times the same `(parent, f)` pair is requested.
@ -29,14 +29,14 @@ pub const MAX_POINTSTO_MEMBERS: usize = 16;
/// Compact handle for an interned [`AbsLoc`].
///
/// All abstract locations referenced by a single body share one
/// [`LocInterner`] `LocId`s are only meaningful relative to that
/// [`LocInterner`], `LocId`s are only meaningful relative to that
/// interner. IDs are assigned densely from 0 and are stable for the
/// lifetime of the interner so the union-find can index parent / rank
/// arrays directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct LocId(pub u32);
/// Sentinel "anywhere" location. Always `LocId(0)` the interner
/// Sentinel "anywhere" location. Always `LocId(0)`, the interner
/// reserves the first slot at construction so callers can compare
/// against it cheaply.
pub const LOC_TOP: LocId = LocId(0);
@ -48,7 +48,7 @@ pub const LOC_TOP: LocId = LocId(0);
/// is exceeded the chain folds to [`AbsLoc::Top`].
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum AbsLoc {
/// "Anywhere" the over-approximation used when precision is
/// "Anywhere", the over-approximation used when precision is
/// unrecoverable (e.g. a value sourced from outside the analysed
/// body, or a points-to set that exceeded the cap).
Top,
@ -60,7 +60,7 @@ pub enum AbsLoc {
/// file. The interned `u32` is the `SsaValue.0` of the call /
/// constructor instruction.
Alloc(BodyId, u32),
/// Function parameter the abstract identity of the value
/// Function parameter, the abstract identity of the value
/// supplied by the caller for parameter `index`. The receiver
/// (`self` / `this`) uses [`AbsLoc::SelfParam`] instead.
Param(BodyId, usize),
@ -69,7 +69,7 @@ pub enum AbsLoc {
/// receiver" sentinel index.
SelfParam(BodyId),
/// Heap field of a parent location: `parent.f`. `parent` is
/// itself a [`LocId`] chains of field accesses produce nested
/// itself a [`LocId`], chains of field accesses produce nested
/// `Field` locations. Depth is bounded by [`MAX_FIELD_DEPTH`].
Field { parent: LocId, field: FieldId },
}
@ -130,7 +130,7 @@ impl LocInterner {
}
/// Resolve a [`LocId`] back to its [`AbsLoc`]. Panics on out-of-
/// range ids only ids the interner produced are valid.
/// range ids, only ids the interner produced are valid.
#[inline]
pub fn resolve(&self, id: LocId) -> &AbsLoc {
&self.locs[id.0 as usize]
@ -202,7 +202,7 @@ impl LocInterner {
}
/// Coarse classification of a value's points-to set, used by consumers
/// (Phase 2: resource lifecycle) that don't need full set membership but
/// (e.g. the resource-lifecycle pass) that don't need full set membership but
/// do need to know "is this value's heap identity a *field* of some
/// other value, or does it stand on its own?".
///
@ -213,7 +213,7 @@ impl LocInterner {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PtrProxyHint {
/// Every member of the points-to set is an [`AbsLoc::Field`]. The
/// value is a sub-object alias e.g. `m` in `m := c.mu`.
/// value is a sub-object alias, e.g. `m` in `m := c.mu`.
FieldOnly,
/// Anything else: the set is empty, contains a root location
/// ([`AbsLoc::SelfParam`] / [`AbsLoc::Param`] / [`AbsLoc::Alloc`]),
@ -242,7 +242,7 @@ impl Default for PointsToSet {
}
impl PointsToSet {
/// Empty set the value points to nothing tracked by the
/// Empty set, the value points to nothing tracked by the
/// analysis (e.g. a scalar constant).
pub fn empty() -> Self {
Self {
@ -257,7 +257,7 @@ impl PointsToSet {
Self { ids }
}
/// `{Top}` the universal over-approximation.
/// `{Top}`, the universal over-approximation.
pub fn top() -> Self {
Self::singleton(LOC_TOP)
}
@ -313,7 +313,7 @@ impl PointsToSet {
}
}
/// Set-union, in place. Returns `true` when `self` changed
/// Set-union, in place. Returns `true` when `self` changed;
/// the constraint solver uses the bit to decide whether the
/// containing equivalence class needs another pass.
pub fn union_in_place(&mut self, other: &PointsToSet) -> bool {

View file

@ -1,24 +1,12 @@
//! Field-sensitive Steensgaard alias / points-to analysis.
//!
//! Sibling pass to [`crate::ssa::heap`]. Where `heap.rs` tracks per-value
//! container identity for taint propagation through container element
//! abstractions, this module tracks **field-sensitive** points-to so the
//! engine can distinguish a receiver from one of its sub-fields:
//!
//! - `c.mu.Lock()` — the lock is acquired on `Field(c, mu)`, not on `c`
//! itself. Without this distinction the resource-lifecycle pass
//! mis-attributes the acquire to the receiver and emits a spurious
//! "leakable resource" finding (the gin / `context.go` FP class).
//! - Cross-method field flow — method A writes `this.cache`, method B
//! reads `this.cache`; both observe a shared abstract location
//! `Field(SelfParam, cache)` only when fields have a stable identity
//! independent of the parent value.
//!
//! Phase 1 of the rollout (this commit) ships the analysis but no
//! consumer. Behaviour is unchanged whether `NYX_POINTER_ANALYSIS=1` is
//! set or not — the analysis is opt-in and only computed when callers
//! ask for it. Phase 2 (resource lifecycle) and Phase 3 (taint engine)
//! will start consuming the resulting facts.
//! Sibling to [`crate::ssa::heap`]: where heap tracks per-value
//! container identity for element abstractions, this module tracks
//! field-sensitive points-to so the engine can distinguish a receiver
//! from a sub-field. `c.mu.Lock()` acquires on `Field(c, mu)`, not `c`,
//! so the resource-lifecycle pass doesn't mis-attribute the acquire.
//! Cross-method field flow (method A writes `this.cache`, method B
//! reads it) observes the shared `Field(SelfParam, cache)` location.
pub mod analysis;
pub mod domain;
@ -29,12 +17,8 @@ pub use analysis::{
};
pub use domain::{AbsLoc, LocId, LocInterner, PointsToSet, PtrProxyHint};
/// Returns whether the field-sensitive pointer analysis is enabled at runtime.
///
/// Default: enabled (post-Phase-6 flip on 2026-04-26). Set
/// `NYX_POINTER_ANALYSIS=0` (or `false`) to disable for one release
/// cycle so customer scans can compare baselines. The env-var
/// override is removed entirely in the next release.
/// Returns whether the field-sensitive pointer analysis is enabled.
/// Set `NYX_POINTER_ANALYSIS=0` (or `false`) to disable.
#[inline]
pub fn is_enabled() -> bool {
!matches!(

View file

@ -97,14 +97,14 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank {
// direction of precision loss is classified by
// `EngineNote::direction()` and drives a bounded penalty:
//
// * `Bail` analysis aborted on this body → -8.0
// * `OverReport` widening may have produced a false positive → -8.0
// * `UnderReport` fixpoint was cut short but this finding is
// * `Bail`: analysis aborted on this body → -8.0
// * `OverReport`: widening may have produced a false positive → -8.0
// * `UnderReport`: fixpoint was cut short but this finding is
// still a real flow → -3.0
// * `Informational` no penalty (cache reuse etc.)
// * `Informational`: no penalty (cache reuse etc.)
//
// The penalty is the *worst* direction across all attached notes
// not additive so a body with ten `OriginsTruncated` notes is not
// The penalty is the *worst* direction across all attached notes
// (not additive), so a body with ten `OriginsTruncated` notes is not
// ranked below a body with one `ParseTimeout`. Magnitudes are
// chosen so that `High + capped` (60 - 8 = 52) still exceeds
// `Medium + taint + UserInput` (30 + 10 + 6 = 46), preserving the
@ -125,7 +125,7 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank {
///
/// `None` when the finding has no evidence struct, no engine notes, or
/// only informational notes. Uses `worst_direction` so the penalty is
/// the single most credibility-damaging direction present adding more
/// the single most credibility-damaging direction present, adding more
/// notes of the same direction does not compound the penalty.
struct CompletenessPenalty {
value: f64,
@ -289,16 +289,16 @@ fn source_kind_priority(source_value: &str) -> f64 {
// Strong user-input signals
6.0
} else if lower.contains("env") || lower.contains("var(") || lower.contains("getenv") {
// Environment / config still attacker-controllable in many deployments
// Environment / config, still attacker-controllable in many deployments
5.0
} else if lower.contains("read") || lower.contains("file") || lower.contains("open") {
// File system needs indirect vector
// File system, needs indirect vector
3.0
} else if lower.contains("query") || lower.contains("fetch") || lower.contains("select") {
// Database needs prior injection
// Database, needs prior injection
2.0
} else {
// Unknown / unrecognised treat as moderately exploitable
// Unknown / unrecognised, treat as moderately exploitable
4.0
}
}
@ -931,7 +931,7 @@ mod tests {
#[test]
fn completeness_penalty_is_not_additive_across_notes() {
// Ten OriginsTruncated notes must produce the same penalty as one
// Ten OriginsTruncated notes must produce the same penalty as one;
// the penalty reflects the worst direction, not a count.
let mut d_many = clean_diag_with_evidence();
let many = (0..10)

View file

@ -3,11 +3,11 @@
//! This module is entirely Rust-flavored helpers for the cross-file call graph.
//! Other languages do not need it. The two pieces are:
//!
//! * [`derive_module_path`] given a Rust source file path and an optional
//! * [`derive_module_path`], given a Rust source file path and an optional
//! crate root, produce its canonical crate-relative module path
//! (`src/foo/bar.rs` → `"foo::bar"`, `src/lib.rs` → `""`).
//!
//! * [`parse_rust_use_map`] walk the top-level `use_declaration` nodes of a
//! * [`parse_rust_use_map`], walk the top-level `use_declaration` nodes of a
//! parsed tree and produce a [`RustUseMap`] mapping local aliases to fully
//! qualified paths plus a list of wildcard imports.
//!
@ -27,7 +27,7 @@
//! * Macro-expanded `use` statements
//! * `pub use` re-exports across modules
//! * `extern crate alias_name;`
//! * Self-prefixed imports (`use self::sub::foo;`) treated as `self::sub::foo`
//! * Self-prefixed imports (`use self::sub::foo;`), treated as `self::sub::foo`
//!
//! These are flagged in the final pass-1 telemetry but do not block resolution.
@ -102,7 +102,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option<
let mut segments: Vec<&str> = rel.iter().filter_map(|s| s.to_str()).collect();
// Strip a leading `src` directory if present. Files outside `src/` (e.g.
// tests, examples, build.rs) get a `None` here we do not have a stable
// tests, examples, build.rs) get a `None` here, we do not have a stable
// module path for them and resolution should fall back to file-based.
match segments.first().copied() {
Some("src") => {
@ -145,7 +145,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option<
/// [`RustUseMap`].
///
/// The walk only inspects direct children of the source root. Nested `use`s
/// inside functions or impls are deliberately skipped their scope is local
/// inside functions or impls are deliberately skipped, their scope is local
/// and does not influence the cross-file call graph at the module level.
pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
let mut map = RustUseMap::default();
@ -160,7 +160,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
Some(n) => n,
None => {
// tree-sitter-rust 0.24 sometimes exposes the body as a named
// child instead of a field fall back to the first named child.
// child instead of a field, fall back to the first named child.
match child.named_child(0) {
Some(n) => n,
None => continue,
@ -179,7 +179,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
/// `b::c` inside `a::{b::c}` is flattened to `a::b::c`).
fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut RustUseMap) {
match node.kind() {
// `crate::auth::token::validate` terminal scoped path, leaf is the alias.
// `crate::auth::token::validate`, terminal scoped path, leaf is the alias.
"scoped_identifier" => {
let segments = scoped_segments(node, src);
if segments.is_empty() {
@ -191,7 +191,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
map.aliases.insert(leaf, full);
}
}
// `validate` bare identifier (e.g. `use foo::validate`).
// `validate`, bare identifier (e.g. `use foo::validate`).
"identifier" => {
let name = node_text(node, src).to_string();
if name.is_empty() {
@ -201,7 +201,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
segs.push(name.clone());
map.aliases.insert(name, segs.join("::"));
}
// `crate::auth::token::{validate, verify}` left side is the prefix,
// `crate::auth::token::{validate, verify}`, left side is the prefix,
// right side is a use_list of further use clauses.
"scoped_use_list" => {
// path field carries the prefix; the list field carries the body.
@ -239,7 +239,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
collect_use_paths(c, src, prefix, map);
}
}
// `crate::auth::token::validate as ok` alias the leaf identifier.
// `crate::auth::token::validate as ok`, alias the leaf identifier.
"use_as_clause" => {
let path_node = node
.child_by_field_name("path")
@ -256,7 +256,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
map.aliases.insert(alias_name, full);
}
}
// `crate::auth::token::*` record the prefix as a wildcard import.
// `crate::auth::token::*`, record the prefix as a wildcard import.
"use_wildcard" => {
// The wildcard's child is the path being wildcarded.
let path_node = node.named_child(0);
@ -270,7 +270,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
}
_ => {
// Unknown/unsupported form (e.g. macro_invocation in use position,
// attribute-prefixed clauses) flag in pass-1 telemetry, skip
// attribute-prefixed clauses), flag in pass-1 telemetry, skip
// here to keep the walk total.
}
}
@ -452,7 +452,7 @@ mod tests {
#[test]
fn module_path_no_cargo_toml_with_scan_root() {
// No Cargo.toml anywhere fall back to scan root.
// No Cargo.toml anywhere, fall back to scan root.
let dir = PathBuf::from("/tmp/nyx_mp_test_no_cargo");
std::fs::create_dir_all(dir.join("src")).ok();
// Make sure no Cargo.toml exists.
@ -535,7 +535,7 @@ mod tests {
#[test]
fn use_map_malformed_does_not_panic() {
// Truncated input must not panic.
// Truncated input, must not panic.
let src = b"use crate::auth::";
let tree = parse(std::str::from_utf8(src).unwrap());
let _ = parse_rust_use_map(src, &tree);

View file

@ -72,7 +72,7 @@ pub struct AppState {
pub findings_cache: Arc<RwLock<Option<CachedFindings>>>,
}
/// 50 MiB cap on request bodies generous for config uploads, tight
/// 50 MiB cap on request bodies: generous for config uploads, tight
/// enough to prevent OOM from a rogue client.
const MAX_BODY_BYTES: usize = 50 * 1024 * 1024;
@ -286,7 +286,7 @@ mod tests {
}
/// Panic inside a thread that holds a write guard on the shared config lock.
/// With `parking_lot::RwLock`, the lock must remain usable afterwards
/// With `parking_lot::RwLock`, the lock must remain usable afterwards;
/// this is the poison-recovery contract we rely on in every route handler.
#[tokio::test]
async fn config_lock_survives_panic_in_write_guard() {

View file

@ -782,7 +782,7 @@ pub struct FuncSummaryView {
/// Enclosing container path (class / impl / module / outer function).
/// Empty for free top-level functions.
pub container: String,
/// Structural [`crate::symbol::FuncKind`] slug `"fn"`, `"method"`,
/// Structural [`crate::symbol::FuncKind`] slug, `"fn"`, `"method"`,
/// `"closure"`, etc. Lets the UI distinguish anonymous closures from
/// named functions for filtering.
pub func_kind: String,
@ -934,10 +934,10 @@ pub struct PointerView {
pub locations: Vec<PointerLocationView>,
pub values: Vec<PointerValueView>,
/// Field reads attributed to params/receiver via the field-points-to
/// extractor (Phase 5).
/// extractor.
pub field_reads: Vec<PointerFieldEntryView>,
/// Field writes attributed to params/receiver via the field-points-to
/// extractor (Phase 5).
/// extractor.
pub field_writes: Vec<PointerFieldEntryView>,
/// Number of distinct interned locations beyond the reserved Top sentinel.
pub location_count: usize,
@ -998,7 +998,7 @@ impl PointerView {
});
}
// Per-value pt sets emit only values with non-empty sets to keep
// Per-value pt sets, emit only values with non-empty sets to keep
// the payload focused on interesting facts.
let mut values: Vec<PointerValueView> = Vec::new();
for v in 0..ssa.num_values() as u32 {
@ -1064,12 +1064,12 @@ pub struct TypeFactDetailView {
pub ssa_value: u32,
pub var_name: Option<String>,
pub line: usize,
/// Type kind tag matches the [`TypeKind`] discriminant
/// Type kind tag, matches the [`TypeKind`] discriminant
/// (`String`, `Int`, `HttpClient`, `Dto`, …).
pub kind: String,
/// True when the value is allowed to be null/None.
pub nullable: bool,
/// Container/class name set for `HttpClient`, `DatabaseConnection`,
/// Container/class name, set for `HttpClient`, `DatabaseConnection`,
/// `Dto`, etc. Mirrors [`TypeKind::container_name`].
#[serde(skip_serializing_if = "Option::is_none")]
pub container: Option<String>,
@ -1437,7 +1437,7 @@ pub fn function_list(analysis: &FileAnalysis) -> Vec<FunctionInfo> {
/// Lower a single function to SSA and optimize it.
///
/// Returns the per-function body graph alongside the SSA. SSA is lowered
/// against `body.graph`, whose `NodeIndex` space is body-local the file's
/// against `body.graph`, whose `NodeIndex` space is body-local, the file's
/// top-level CFG (`analysis.cfg()`) has a different index space, so any
/// downstream analysis that indexes by `inst.cfg_node` must use the returned
/// `&Cfg`, not `analysis.cfg()`.
@ -1638,7 +1638,7 @@ pub fn analyse_file_summaries(
/// Run the file-level authorization extraction pipeline for the debug UI.
///
/// Returns the structured `AuthorizationModel` (routes, units, sensitive
/// operations, auth checks) plus the file bytes and an `enabled` flag
/// operations, auth checks) plus the file bytes and an `enabled` flag;
/// the bytes drive line-number resolution in the view, and `enabled`
/// surfaces "auth analysis is off for this language" without conflating
/// it with an empty result.
@ -1651,7 +1651,7 @@ pub fn analyse_file_auth(
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
.ok_or(StatusCode::BAD_REQUEST)?;
// Determine whether the auth rules were actually enabled for this
// file's language `extract_auth_model_for_debug` returns an empty
// file's language, `extract_auth_model_for_debug` returns an empty
// model both when the rules are disabled and when the file just
// happens to have no routes. The view distinguishes the two so the
// UI can show "analysis disabled" instead of "no routes found".
@ -2122,7 +2122,7 @@ fn main() {
// Belt-and-suspenders: assert that calling with the wrong (top-level)
// CFG would have panicked. We can't catch the panic across rayon
// worker threads here, but we can confirm at least one `inst.cfg_node`
// index lies outside `analysis.cfg()`'s range that's what triggers
// index lies outside `analysis.cfg()`'s range, that's what triggers
// the OOB indexing inside `transfer_inst`.
let toplevel_count = analysis.cfg().node_count();
let max_inst_idx = ssa

View file

@ -1,4 +1,4 @@
//! Health-score scoring engine v3.5.
//! Health-score scoring engine, v3.5.
//!
//! Pure-function scoring over a `HealthInputs` struct. Documented in
//! `docs/health-score-audit.md` (calibration, rationale) and
@ -15,7 +15,7 @@
//!
//! 2. **HIGH-count guardrails.** The *qualitative* axis: HIGH counts
//! cap the maximum grade and floor "no HIGH" to at least C. These
//! are non-negotiable promises even a perfect-everywhere-else
//! are non-negotiable promises, even a perfect-everywhere-else
//! repo with 6 confirmed HIGHs grades F.
//!
//! Modifiers (triage, trend, stale, regression, suppression hygiene)
@ -27,17 +27,17 @@
//! * Verdict-weighted credibility (`Confirmed > NotAttempted >
//! Inconclusive > Infeasible`). This is the structural protection
//! against false-positive-driven F grades while the scanner is
//! still maturing it auto-tightens as symex coverage grows.
//! still maturing, it auto-tightens as symex coverage grows.
//! * Cross-file vs intra-file vs AST-only weighting via
//! `context_factor`.
//! * Test-path downweight (0.3×) a HIGH in a test fixture is
//! * Test-path downweight (0.3×), a HIGH in a test fixture is
//! genuinely less concerning than one in a request handler.
//! * Effective HIGH count for ceilings the HIGH-count caps key on
//! * Effective HIGH count for ceilings, the HIGH-count caps key on
//! credibility-adjusted HIGHs, not raw HIGHs. A repo with 5
//! low-confidence HIGHs that got `NotAttempted` from symex doesn't
//! pay the same ceiling cost as a repo with 5 `Confirmed` HIGHs.
//! * Tighter modifier ranges so they can't flip a band.
//! * No `parse_success_rate` (it's actually a cache-miss metric
//! * No `parse_success_rate` (it's actually a cache-miss metric;
//! see `project_parse_success_rate_misnomer.md`).
use crate::commands::scan::Diag;
@ -48,11 +48,11 @@ use crate::server::models::{BacklogStats, FindingSummary, HealthComponent, Healt
// ── Tunables ─────────────────────────────────────────────────────────────────
//
// Calibrated for v0.5.0 scanner FP rate. As Nyx symex coverage and
// rule precision improve, the HIGH ceilings should tighten see
// rule precision improve, the HIGH ceilings should tighten, see
// `docs/health-score-audit.md` "Calibration trajectory" for the
// roadmap.
/// Below this file count, we floor the size divisor at 1.0 tiny
/// Below this file count, we floor the size divisor at 1.0, tiny
/// repos can't claim infinite per-LOC dilution from one finding.
const FILES_FLOOR: f64 = 100.0;
@ -66,7 +66,7 @@ const QUALITY_DRAG_PER_FINDING: f64 = 0.05;
const QUALITY_DRAG_CAP: f64 = 15.0;
/// Below this finding count, the Triage component contributes
/// weight 0 we don't punish fresh users for not having triaged
/// weight 0, we don't punish fresh users for not having triaged
/// what didn't need triaging.
const TRIAGE_FLOOR: usize = 20;
@ -77,7 +77,7 @@ const STALE_PENALTY_CAP: f64 = 10.0;
// ── Public API ───────────────────────────────────────────────────────────────
/// Pure inputs to the health-score calculation. No app state, no DB
/// handles those upstream concerns are flattened into primitives the
/// handles, those upstream concerns are flattened into primitives the
/// scorer actually consumes.
#[derive(Debug, Clone, Copy)]
pub struct HealthInputs<'a> {
@ -120,7 +120,7 @@ pub fn compute(inp: &HealthInputs<'_>) -> HealthScore {
let quality_drag = quality_drag(weighted.quality_count);
let base_after_drag = (base_score - quality_drag).clamp(0.0, 100.0);
// Step 5: HIGH-count guardrails keyed on *effective* HIGH count
// Step 5: HIGH-count guardrails, keyed on *effective* HIGH count
// (credibility-weighted), not raw count. This is what protects
// users from FP-driven F grades while the scanner is maturing.
let ceiling = high_total_ceiling(weighted.effective_high);
@ -161,9 +161,9 @@ struct WeightedAggregate {
/// context_factor` across security findings. Quality lints are
/// handled separately via `quality_drag`.
raw_weight: f64,
/// Number of `*.quality.*` findings drives `quality_drag`.
/// Number of `*.quality.*` findings, drives `quality_drag`.
quality_count: usize,
/// Credibility-adjusted HIGH count (rounded) drives the HIGH
/// Credibility-adjusted HIGH count (rounded), drives the HIGH
/// ceiling and floor. A low-confidence + Inconclusive HIGH might
/// contribute 0.2; five of them would round to 1.
effective_high: usize,
@ -171,10 +171,10 @@ struct WeightedAggregate {
raw_high: usize,
raw_medium: usize,
raw_low_security: usize,
/// Confidence rate (high+medium*0.5)/total drives the
/// Confidence rate (high+medium*0.5)/total, drives the
/// confidence component. 100 if no findings.
confidence_rate: f64,
/// Symex coverage % of taint findings with any non-NotAttempted
/// Symex coverage, % of taint findings with any non-NotAttempted
/// verdict. Surfaced in component detail; not currently in score.
symex_coverage: f64,
}
@ -218,7 +218,7 @@ fn aggregate_findings(findings: &[Diag]) -> WeightedAggregate {
_ => 0.0,
};
// Symex coverage tracking only meaningful for findings with
// Symex coverage tracking, only meaningful for findings with
// taint-flow evidence (the ones symex even attempts).
if let Some(ev) = f.evidence.as_ref()
&& ev.symbolic.is_some()
@ -294,7 +294,7 @@ fn context_factor(f: &Diag) -> f64 {
return 0.3;
}
let Some(ev) = f.evidence.as_ref() else {
return 0.75; // No evidence at all pattern match
return 0.75; // No evidence at all, pattern match
};
if ev.flow_steps.is_empty() {
return 0.75;
@ -351,7 +351,7 @@ fn quality_drag(quality_count: usize) -> f64 {
(quality_count as f64 * QUALITY_DRAG_PER_FINDING).min(QUALITY_DRAG_CAP)
}
// ── HIGH guardrails calibrated for v0.5.0 FP rate ──────────────────────────
// ── HIGH guardrails, calibrated for v0.5.0 FP rate ──────────────────────────
/// Final-score ceiling keyed on *effective* HIGH count (credibility-
/// weighted, not raw). See module docstring for the rationale.
@ -398,7 +398,7 @@ fn build_components(
let sev_score = base_after_drag.round().clamp(0.0, 100.0) as u8;
let sev_detail = severity_detail(weighted, size_divisor, inp.repo_files, inp.backlog);
// Confidence component high-conf rate scaled into 0..=100.
// Confidence component, high-conf rate scaled into 0..=100.
let conf_score = weighted.confidence_rate.round().clamp(0.0, 100.0) as u8;
let conf_detail = format!(
"High-confidence rate {:.0}% across {} security finding{}",
@ -407,7 +407,7 @@ fn build_components(
plural_s(total - weighted.quality_count)
);
// Trend component only contributes weight when has_history.
// Trend component, only contributes weight when has_history.
let net = inp.fixed_since_last as i64 - inp.new_since_last as i64;
let trend_score = (50 + net * 5).clamp(0, 100) as u8;
let trend_weight = if inp.has_history { 0.20 } else { 0.0 };
@ -420,7 +420,7 @@ fn build_components(
"Not applicable: no prior scan to compare against (re-scan to populate)".into()
};
// Triage drops out when total < TRIAGE_FLOOR.
// Triage, drops out when total < TRIAGE_FLOOR.
let triage_active = total >= TRIAGE_FLOOR;
let triage_score = (inp.triage_coverage * 100.0).round().clamp(0.0, 100.0) as u8;
let triage_weight = if triage_active { 0.20 } else { 0.0 };
@ -470,7 +470,7 @@ fn build_components(
HealthComponent {
label: "Severity pressure".into(),
score: sev_score,
weight: 1.0, // Severity is the *base*, not a modifier full weight in the blend.
weight: 1.0, // Severity is the *base*, not a modifier, full weight in the blend.
detail: sev_detail,
},
HealthComponent {
@ -770,7 +770,7 @@ mod tests {
.collect();
let s = summary_of(&findings);
let h = compute(&first_scan(&s, &findings, 0.0, 100));
// The score reflects credibility should NOT crater to F.
// The score reflects credibility, should NOT crater to F.
assert!(
h.score >= 60,
"low-credibility HIGHs shouldn't crater to F, got {}",

View file

@ -65,7 +65,7 @@ pub struct JobManager {
job_order: Mutex<Vec<String>>,
active_job_id: Mutex<Option<String>>,
max_jobs: usize,
/// Dedicated rayon pool for scans keeps the global pool (and tokio
/// Dedicated rayon pool for scans, keeps the global pool (and tokio
/// worker threads) free so the web UI stays responsive during a scan.
scan_pool: rayon::ThreadPool,
}

View file

@ -632,7 +632,7 @@ pub struct HealthScore {
pub struct HealthComponent {
/// Human label (e.g. "Severity pressure", "Trend", "Triage").
pub label: String,
/// 0–100 already inverted so higher = healthier.
/// 0–100, already inverted so higher = healthier.
pub score: u8,
/// Weight applied when blending into the final score (0.0–1.0).
pub weight: f64,
@ -662,7 +662,7 @@ pub struct BacklogStats {
pub median_age_days: Option<u32>,
/// Findings older than 30 days that remain open.
pub stale_count: usize,
/// Histogram buckets (label, count) fixed 5 buckets.
/// Histogram buckets (label, count), fixed 5 buckets.
pub age_buckets: Vec<OverviewCount>,
}
@ -691,12 +691,12 @@ pub struct ConfidenceDistribution {
pub struct ScannerQuality {
pub files_scanned: u64,
pub files_skipped: u64,
/// 0.01.0 files_scanned / (files_scanned + files_skipped).
/// 0.0-1.0, files_scanned / (files_scanned + files_skipped).
pub parse_success_rate: f64,
pub functions_analyzed: u64,
pub call_edges: u64,
pub unresolved_calls: u64,
/// 0.01.0 call_edges / (call_edges + unresolved_calls).
/// 0.0-1.0, call_edges / (call_edges + unresolved_calls).
pub call_resolution_rate: f64,
/// % of taint findings that received a symbolic verdict (Confirmed|Infeasible|Inconclusive).
pub symex_verified_rate: f64,
@ -712,7 +712,7 @@ pub struct IssueCategoryBucket {
pub count: usize,
}
/// "Hot sink" a single callee that absorbs many findings.
/// "Hot sink", a single callee that absorbs many findings.
#[derive(Debug, Clone, Serialize)]
pub struct HotSink {
/// Callee name (best-effort; from flow_steps last Sink).
@ -723,7 +723,7 @@ pub struct HotSink {
/// One OWASP Top-10 (2021) bucket.
#[derive(Debug, Clone, Serialize)]
pub struct OwaspBucket {
/// "A01:2021 Broken Access Control" etc.
/// "A01:2021 - Broken Access Control" etc.
pub code: String,
pub label: String,
pub count: usize,

View file

@ -41,7 +41,7 @@ pub async fn observe(mut request: Request, next: Next) -> Response {
response.headers_mut().insert(REQUEST_ID_HEADER, value);
}
// Skip noisy SSE channel long-lived stream pollutes logs.
// Skip noisy SSE channel, long-lived stream pollutes logs.
if path != "/api/events" {
if status.is_server_error() {
tracing::error!(

View file

@ -1,15 +1,15 @@
//! Static rule-id → OWASP Top-10 (2021) mapping for the dashboard.
//!
//! Rule IDs follow the convention `{lang}.{family}.{name}` (e.g. `js.xss.outer_html`).
//! The family segment is what determines the bucket. Conservative when in doubt,
//! The family segment is what determines the bucket. Conservative: when in doubt,
//! map to the closest fit; rules with no obvious bucket are left unbucketed.
use crate::server::models::OwaspBucket;
use std::collections::HashMap;
/// Extract the family token from a rule ID. Handles two ID shapes:
/// 1. `lang.family.name` typical (e.g. `js.xss.outer_html`)
/// 2. `family-subname` or single-segment engine-emitted (e.g.
/// 1. `lang.family.name`, typical (e.g. `js.xss.outer_html`)
/// 2. `family-subname` or single-segment, engine-emitted (e.g.
/// `state-resource-leak`, `taint-unsanitised-flow`, `cfg-error-fallthrough`)
fn extract_family(rule_id: &str) -> &str {
if let Some(idx) = rule_id.find('.') {
@ -33,23 +33,23 @@ pub fn owasp_bucket_for(rule_id: &str) -> Option<(&'static str, &'static str)> {
}
Some(match family {
// A01 Broken Access Control
// A01, Broken Access Control
"auth" | "csrf" | "mass_assign" | "path" | "redirect" => ("A01", "Broken Access Control"),
// A02 Cryptographic Failures
// A02, Cryptographic Failures
"crypto" | "secrets" => ("A02", "Cryptographic Failures"),
// A03 Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection,
// A03, Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection,
// and engine-level taint findings without a more specific family tag).
"sqli" | "xss" | "cmdi" | "code_exec" | "template" | "nosql" | "ldap" | "reflection"
| "taint" => ("A03", "Injection"),
// A05 Security Misconfiguration (TLS verify off, cookie flags, prototype pollution)
// A05, Security Misconfiguration (TLS verify off, cookie flags, prototype pollution)
"config" | "transport" | "prototype" => ("A05", "Security Misconfiguration"),
// A08 Software and Data Integrity Failures
// A08, Software and Data Integrity Failures
"deser" => ("A08", "Software and Data Integrity Failures"),
// A09 Logging & Monitoring Failures
// A09, Logging & Monitoring Failures
"log" => ("A09", "Logging and Monitoring Failures"),
// A10 SSRF
// A10, SSRF
"ssrf" => ("A10", "Server-Side Request Forgery"),
// Memory-safety + state-machine resource lifecycle bugs closest OWASP fit is
// Memory-safety + state-machine resource lifecycle bugs, closest OWASP fit is
// A04 Insecure Design (defensive depth).
"memory" | "state" => ("A04", "Insecure Design"),
// Quality findings (e.g. rs.quality.unwrap) and CFG structural issues
@ -162,7 +162,7 @@ mod tests {
fn malformed_rule_returns_none() {
// single-segment "not" → family "not" → unmapped → None
assert_eq!(owasp_bucket_for("not-a-rule"), None);
// "js.onlytwo" family is "onlytwo" which is unmapped
// "js.onlytwo", family is "onlytwo" which is unmapped
assert_eq!(owasp_bucket_for("js.onlytwo"), None);
}

View file

@ -282,7 +282,7 @@ async fn remove_terminator(
// ── Sources / Sinks / Sanitizers (by kind) ───────────────────────────────────
fn list_by_kind(state: &AppState, target_kind: &str) -> Vec<LabelEntryView> {
// Built-in rules live on /api/rules keep this endpoint focused on the
// Built-in rules live on /api/rules, keep this endpoint focused on the
// user's own additions in nyx.local.
let target_rule_kind = match target_kind {
"source" => RuleKind::Source,

View file

@ -306,8 +306,8 @@ async fn get_type_facts(
}
/// GET /api/debug/auth?file=<path>
/// Return the file-scoped authorization model routes, units,
/// sensitive operations, and auth checks for the debug UI.
/// Return the file-scoped authorization model (routes, units,
/// sensitive operations, and auth checks) for the debug UI.
async fn get_auth(
State(state): State<AppState>,
Query(q): Query<FileQuery>,

View file

@ -55,7 +55,7 @@ struct TreeEntry {
struct SymbolEntry {
name: String,
/// Legacy display kind (`"function"` / `"method"`) used by existing CSS
/// classes in the frontend. Kept for backward-compat new consumers
/// classes in the frontend. Kept for backward-compat, new consumers
/// should prefer `func_kind`.
kind: String,
/// Structural [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`,
@ -291,7 +291,7 @@ async fn get_symbols(
let entries: Vec<SymbolEntry> = symbols
.into_iter()
.map(|(name, arity, _lang, namespace, container, func_kind)| {
// Legacy `kind` field still used by existing CSS classes
// Legacy `kind` field, still used by existing CSS classes
// (`symbol-kind-method`, `symbol-kind-function`). Map any
// method-like FuncKind onto `"method"` and everything else
// onto `"function"` so the rendered icon stays sensible.

View file

@ -73,7 +73,7 @@ fn load_latest_findings_internal(state: &AppState) -> LoadedFindings {
/// Build (or fetch from cache) the per-scan derived views.
///
/// Returns clones of `Arc`s so callers can drop the lock immediately and work
/// without contention. Triage state is *not* baked into the cached views it
/// without contention. Triage state is *not* baked into the cached views; it
/// changes on a different cadence and is overlaid per request.
fn cached_for_latest(state: &AppState) -> CachedFindings {
let loaded = load_latest_findings_internal(state);
@ -85,7 +85,7 @@ fn cached_for_latest(state: &AppState) -> CachedFindings {
}
}
// Slow path: rebuild. Guard against concurrent rebuilds of the same key
// Slow path: rebuild. Guard against concurrent rebuilds of the same key:
// a second writer that finds the cache already populated for our key
// simply returns it.
let mut guard = state.findings_cache.write();

View file

@ -29,7 +29,7 @@ pub fn routes() -> Router<AppState> {
.route("/overview/baseline/{scan_id}", post(set_baseline_path))
}
/// GET /api/overview aggregated dashboard data.
/// GET /api/overview, aggregated dashboard data.
async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
// 1. Load latest findings (in-memory → DB fallback)
let findings = crate::server::routes::findings::load_latest_findings(&state);
@ -121,7 +121,7 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
new_since_last,
fixed_since_last,
reintroduced: reintroduced_count,
// Files-scanned proxy for repo size used for size-aware
// Files-scanned proxy for repo size, used for size-aware
// severity dampening in `health::compute`. See
// `docs/health-score-audit.md` for calibration data.
repo_files: scanner_quality
@ -129,10 +129,10 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
.map(|q| q.files_scanned)
.filter(|&f| f > 0),
backlog: backlog.as_ref(),
// Trend is meaningless without ≥2 completed scans
// Trend is meaningless without ≥2 completed scans;
// matches the first-scan check `compare_to_current` uses.
has_history: history.scans.len() >= 2,
// Suppression-hygiene modifier populated when the
// Suppression-hygiene modifier, populated when the
// suppression panel was computable for this scan.
blanket_suppression_rate: suppression_hygiene.as_ref().map(|s| s.blanket_rate),
},
@ -173,7 +173,7 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
})
}
/// GET /api/overview/trends scan-over-scan finding counts.
/// GET /api/overview/trends, scan-over-scan finding counts.
async fn overview_trends(State(state): State<AppState>) -> Json<Vec<TrendPoint>> {
let mut points = Vec::new();
@ -218,7 +218,7 @@ struct BaselineBody {
scan_id: String,
}
/// POST /api/overview/baseline { scan_id } pin a scan as the baseline for drift comparison.
/// POST /api/overview/baseline { scan_id }, pin a scan as the baseline for drift comparison.
async fn set_baseline(
State(state): State<AppState>,
Json(body): Json<BaselineBody>,
@ -226,7 +226,7 @@ async fn set_baseline(
set_baseline_inner(&state, &body.scan_id)
}
/// POST /api/overview/baseline/:scan_id convenience path-form for clients without a JSON body.
/// POST /api/overview/baseline/:scan_id, convenience path-form for clients without a JSON body.
async fn set_baseline_path(
State(state): State<AppState>,
AxPath(scan_id): AxPath<String>,
@ -248,7 +248,7 @@ fn set_baseline_inner(state: &AppState, scan_id: &str) -> Result<StatusCode, Sta
Ok(StatusCode::NO_CONTENT)
}
/// DELETE /api/overview/baseline clear the pinned baseline.
/// DELETE /api/overview/baseline, clear the pinned baseline.
async fn clear_baseline(State(state): State<AppState>) -> Result<StatusCode, StatusCode> {
let pool = state
.db_pool
@ -381,7 +381,7 @@ impl ScanHistory {
(new_count, fixed_count, reintroduced)
}
/// Trend slope across the last N totals 1.0 means strictly improving,
/// Trend slope across the last N totals: 1.0 means strictly improving,
/// -1.0 strictly regressing, 0.0 unchanged. Returns None with <3 points.
fn trend_slope(&self) -> Option<f64> {
if self.scans.len() < 3 {
@ -712,7 +712,7 @@ fn compute_cross_file_ratio(findings: &[Diag]) -> f64 {
cross as f64 / findings.len() as f64
}
/// Hot sinks are *only* meaningful for taint findings counting AST rule IDs
/// Hot sinks are *only* meaningful for taint findings, counting AST rule IDs
/// (e.g. `rs.quality.unwrap`) here just duplicates the Top Rules table. So we
/// deliberately require a real Sink-step callee (or a parsable sink snippet)
/// and skip everything else. Empty result → frontend hides the card.
@ -751,7 +751,7 @@ fn compute_hot_sinks(findings: &[Diag], limit: usize) -> Vec<HotSink> {
rows
}
/// Pull the leading identifier from a sink snippet a best-effort heuristic
/// Pull the leading identifier from a sink snippet, a best-effort heuristic
/// for the dashboard's "hot sinks" list.
fn extract_callee_from_snippet(s: &str) -> String {
let trimmed = s.trim();
@ -932,7 +932,7 @@ fn compute_suppression_hygiene(state: &AppState, findings: &[Diag]) -> Suppressi
}
fn compute_backlog(state: &AppState, findings: &[Diag], history: &ScanHistory) -> BacklogStats {
// No useful aging data on the first scan every fingerprint was first-seen
// No useful aging data on the first scan, every fingerprint was first-seen
// today by definition. Avoid the misleading "0d / 0d / 0" display.
if history.scans.len() <= 1 {
return BacklogStats {
@ -1046,7 +1046,7 @@ fn build_posture(
current_total: usize,
) -> PostureSummary {
// First-scan case: no prior data to diff against. Saying "stable / no change"
// is misleading we genuinely don't know yet.
// is misleading; we genuinely don't know yet.
if history.scans.len() <= 1 {
return PostureSummary {
trend: "unknown".into(),

View file

@ -61,7 +61,7 @@ fn build_rule_list(state: &AppState) -> Vec<RuleInfo> {
rules
}
/// GET /api/rules list all rules with finding counts.
/// GET /api/rules, list all rules with finding counts.
async fn list_rules(State(state): State<AppState>) -> Json<Vec<RuleListItem>> {
let rules = build_rule_list(&state);
@ -99,7 +99,7 @@ async fn list_rules(State(state): State<AppState>) -> Json<Vec<RuleListItem>> {
Json(items)
}
/// GET /api/rules/:id full detail for one rule.
/// GET /api/rules/:id, full detail for one rule.
async fn get_rule(
State(state): State<AppState>,
Path(id): Path<String>,
@ -140,7 +140,7 @@ async fn get_rule(
}))
}
/// POST /api/rules/:id/toggle enable/disable a rule.
/// POST /api/rules/:id/toggle, enable/disable a rule.
async fn toggle_rule(
State(state): State<AppState>,
Path(id): Path<String>,
@ -162,7 +162,7 @@ async fn toggle_rule(
Ok(Json(serde_json::json!({ "status": "ok", "rule_id": id })))
}
/// POST /api/rules/clone clone a built-in rule to custom.
/// POST /api/rules/clone, clone a built-in rule to custom.
async fn clone_rule(
State(state): State<AppState>,
Json(body): Json<serde_json::Value>,

View file

@ -213,7 +213,7 @@ async fn delete_scan(
Json(serde_json::json!({ "error": msg })),
));
}
// "Scan not found" in memory is fine may be DB-only
// "Scan not found" in memory is fine, may be DB-only
}
// Delete from DB (CASCADE handles metrics + logs)

View file

@ -3,8 +3,8 @@
//! This file is designed to be committed to version control so that triage
//! decisions travel with the code and are shared across team members.
//!
//! The file uses **portable fingerprints** computed with paths relative to the
//! project root so they match across machines regardless of where the repo is
//! The file uses **portable fingerprints**, computed with paths relative to the
//! project root, so they match across machines regardless of where the repo is
//! checked out.
use crate::commands::scan::Diag;

View file

@ -59,7 +59,7 @@ impl BaseAliasResult {
///
/// For each entry `(dst_val, src_val)` where copy prop replaced `dst` with
/// `src`, looks up the original variable names. If both are plain identifiers
/// (no dots i.e. not field paths), they are registered as base aliases.
/// (no dots, i.e. not field paths), they are registered as base aliases.
/// Transitive closure is computed so `b = a; c = b` yields group `{a, b, c}`.
pub fn compute_base_aliases(
copy_map: &HashMap<SsaValue, SsaValue>,
@ -103,7 +103,7 @@ pub fn compute_base_aliases(
let ra = find(parent, a);
let rb = find(parent, b);
if ra != rb {
// Arbitrary root choice alphabetically smaller becomes root
// Arbitrary root choice, alphabetically smaller becomes root
// for determinism.
if ra < rb {
parent.insert(rb, ra);
@ -130,7 +130,7 @@ pub fn compute_base_aliases(
None => continue,
};
// Only alias plain idents dotted paths (field accesses) are tracked
// Only alias plain idents, dotted paths (field accesses) are tracked
// independently in SSA and handled by field-aware suppression.
if dst_name.contains('.') || src_name.contains('.') {
continue;

Some files were not shown because too many files have changed in this diff Show more