mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-06 19:35:13 +02:00
Python fp and docs updates (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics * feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks * feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks * refactor: Simplify code formatting for better readability in multiple files * refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration * feat: Update Java and Python patterns to include new security rules * refactor: Improve comment clarity and consistency across multiple Rust files * refactor: Simplify code formatting for improved readability in integration tests and module files * refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
parent
4db0805de6
commit
a438886217
291 changed files with 9485 additions and 3851 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -5,6 +5,7 @@
|
|||
/.idea
|
||||
/frontend/node_modules
|
||||
/src/server/assets/dist
|
||||
/marketing
|
||||
/.nyx
|
||||
/logs
|
||||
/book
|
||||
|
|
|
|||
|
|
@ -33,6 +33,13 @@ pkg-url = "{ repo }/releases/download/v{ version }/nyx-{ target }{ archive-suffi
|
|||
pkg-fmt = "zip"
|
||||
bin-dir = "target/{ target }/release/{ bin }{ binary-ext }"
|
||||
|
||||
# docs.rs builds the `serve` feature (default) so the server module renders.
|
||||
# `smt` is left off — bundled Z3 takes too long on docs.rs builders, and
|
||||
# `smt-system-z3` needs a system library that isn't available there.
|
||||
[package.metadata.docs.rs]
|
||||
features = ["serve"]
|
||||
rustdoc-args = ["--cfg", "docsrs"]
|
||||
|
||||
[features]
|
||||
default = ["serve"]
|
||||
serve = ["dep:axum", "dep:tokio", "dep:tokio-stream", "dep:tower-http"]
|
||||
|
|
|
|||
|
|
@ -152,6 +152,12 @@ The corpus also holds a small set of vulnerable/patched pairs extracted from pub
|
|||
|
||||
Fixtures live under [`tests/benchmark/cve_corpus/`](tests/benchmark/cve_corpus/) with upstream attribution headers.
|
||||
|
||||
<!--
|
||||
### Real-world findings
|
||||
|
||||
- **Nextcloud server**, [PR #59979](https://github.com/nextcloud/server/pull/59979), merged. The runtime decoder for this column already restricted `allowed_classes`, but the repair routine called `unserialize()` without it, so magic methods on referenced classes could still run. Fix matches the runtime path.
|
||||
-->
|
||||
|
||||
---
|
||||
|
||||
## How it works
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 16 MiB After Width: | Height: | Size: 15 MiB |
295
build.rs
295
build.rs
|
|
@ -1,7 +1,9 @@
|
|||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
render_docs_for_rustdoc();
|
||||
|
||||
// Only relevant when the serve feature is active
|
||||
if std::env::var("CARGO_FEATURE_SERVE").is_err() {
|
||||
return;
|
||||
|
|
@ -14,11 +16,11 @@ fn main() {
|
|||
println!("cargo:rerun-if-changed=src/server/assets/dist/index.html");
|
||||
|
||||
if index_html.exists() {
|
||||
// Dist already built — nothing to do
|
||||
// Dist already built, nothing to do
|
||||
return;
|
||||
}
|
||||
|
||||
// Dist missing — try to build frontend
|
||||
// Dist missing, try to build frontend
|
||||
let frontend_dir = Path::new("frontend");
|
||||
if !frontend_dir.join("package.json").exists() {
|
||||
emit_placeholder_and_warn(dist_dir);
|
||||
|
|
@ -56,6 +58,293 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rustdoc / docs.rs: render docs/*.md into $OUT_DIR with relative .md links
|
||||
// rewritten to absolute github.com/elicpeter/nyx URLs so they resolve when the
|
||||
// markdown is embedded in rustdoc via #![doc = include_str!(...)].
|
||||
//
|
||||
// Source of truth stays in docs/. Files that don't exist (published-crate
|
||||
// builds where docs/ wasn't packaged) fall back to a one-line stub so rustdoc
|
||||
// still compiles.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Absolute URL prefix that relative `docs/` links are rewritten against.
const GH_DOCS_BASE: &str = "https://github.com/elicpeter/nyx/blob/master/docs";

/// One markdown file to render: where it lives under `docs/` and what it is
/// called once written to `$OUT_DIR`.
struct DocSpec {
    /// Path under docs/, e.g. "how-it-works.md" or "detectors/taint.md".
    src: &'static str,
    /// Output filename in $OUT_DIR.
    out: &'static str,
}

impl DocSpec {
    /// Pairs a source path under `docs/` with its output filename.
    const fn new(src: &'static str, out: &'static str) -> Self {
        Self { src, out }
    }
}

/// Every markdown page that gets rendered for rustdoc, in render order.
const DOC_SPECS: &[DocSpec] = &[
    DocSpec::new("how-it-works.md", "lib_intro.md"),
    DocSpec::new("detectors/taint.md", "taint.md"),
    DocSpec::new("detectors/cfg.md", "cfg_analysis.md"),
    DocSpec::new("detectors/state.md", "state.md"),
    DocSpec::new("detectors/patterns.md", "patterns.md"),
    DocSpec::new("auth.md", "auth_analysis.md"),
];
|
||||
|
||||
fn render_docs_for_rustdoc() {
|
||||
let Ok(out_dir) = std::env::var("OUT_DIR") else {
|
||||
return;
|
||||
};
|
||||
let out_dir = PathBuf::from(out_dir);
|
||||
let docs_dir = Path::new("docs");
|
||||
|
||||
for spec in DOC_SPECS {
|
||||
let src_path = docs_dir.join(spec.src);
|
||||
println!("cargo:rerun-if-changed=docs/{}", spec.src);
|
||||
let out_path = out_dir.join(spec.out);
|
||||
let rendered = match std::fs::read_to_string(&src_path) {
|
||||
Ok(raw) => rewrite_doc_links(&raw, spec.src),
|
||||
Err(_) => format!(
|
||||
"See [`{base}/{src}`]({base}/{src}).\n",
|
||||
base = GH_DOCS_BASE,
|
||||
src = spec.src,
|
||||
),
|
||||
};
|
||||
if let Err(e) = std::fs::write(&out_path, rendered) {
|
||||
println!(
|
||||
"cargo:warning=failed to write rendered doc {}: {}",
|
||||
out_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Render markdown for embedding in rustdoc.
|
||||
///
|
||||
/// 1. Rewrites relative `.md` links to absolute github.com URLs:
|
||||
/// - inline links: `](path.md)` and `](path.md#anchor)`
|
||||
/// - reference defs: `[id]: path.md`
|
||||
/// 2. Labels unmarked fenced code blocks as `text` so rustdoc does not try
|
||||
/// to compile them as Rust (and choke on Unicode like `→`).
|
||||
/// 3. Annotates `rust` fences with `,ignore` so rustdoc doesn't try to
|
||||
/// compile or run prose-level snippets as doctests. GitHub still
|
||||
/// highlights them as Rust because it keys off the first token.
|
||||
///
|
||||
/// Skips link rewriting inside code fences. Skips link rewriting for URLs
|
||||
/// that are already absolute (have a scheme), pure anchors (`#section`),
|
||||
/// or non-`.md` paths.
|
||||
fn rewrite_doc_links(content: &str, source_rel: &str) -> String {
|
||||
let source_dir = Path::new(source_rel)
|
||||
.parent()
|
||||
.map(|p| p.to_string_lossy().into_owned())
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut out = String::with_capacity(content.len() + 256);
|
||||
let mut in_fence = false;
|
||||
|
||||
for line in content.split_inclusive('\n') {
|
||||
let body = line.strip_suffix('\n').unwrap_or(line);
|
||||
let trimmed = body.trim_start();
|
||||
if trimmed.starts_with("```") {
|
||||
let lang = trimmed.trim_start_matches('`').trim();
|
||||
if in_fence {
|
||||
in_fence = false;
|
||||
out.push_str(line);
|
||||
} else {
|
||||
in_fence = true;
|
||||
let indent_len = body.len() - trimmed.len();
|
||||
if lang.is_empty() {
|
||||
out.push_str(&body[..indent_len]);
|
||||
out.push_str("```text");
|
||||
if line.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
} else if is_rust_fence_needing_ignore(lang) {
|
||||
out.push_str(&body[..indent_len]);
|
||||
out.push_str("```rust,ignore");
|
||||
if line.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
} else {
|
||||
out.push_str(line);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if in_fence {
|
||||
out.push_str(line);
|
||||
} else {
|
||||
rewrite_links_in_line(body, &source_dir, &mut out);
|
||||
if line.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
fn rewrite_links_in_line(line: &str, source_dir: &str, out: &mut String) {
|
||||
let bytes = line.as_bytes();
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
// Inline link: `](URL)`, markdown URLs do not contain a raw `)`.
|
||||
if i + 1 < bytes.len() && bytes[i] == b']' && bytes[i + 1] == b'(' {
|
||||
out.push_str("](");
|
||||
i += 2;
|
||||
let url_start = i;
|
||||
while i < bytes.len() && bytes[i] != b')' {
|
||||
i += 1;
|
||||
}
|
||||
let url = &line[url_start..i];
|
||||
out.push_str(&maybe_rewrite_url(url, source_dir));
|
||||
}
|
||||
// Reference def: `]: URL`.
|
||||
else if i + 2 < bytes.len()
|
||||
&& bytes[i] == b']'
|
||||
&& bytes[i + 1] == b':'
|
||||
&& bytes[i + 2] == b' '
|
||||
{
|
||||
out.push_str("]: ");
|
||||
i += 3;
|
||||
let url_start = i;
|
||||
while i < bytes.len() && bytes[i] != b' ' {
|
||||
i += 1;
|
||||
}
|
||||
let url = &line[url_start..i];
|
||||
out.push_str(&maybe_rewrite_url(url, source_dir));
|
||||
} else {
|
||||
// `]` (0x5D) is ASCII; UTF-8 continuation bytes are 0x80-0xBF
|
||||
// and start bytes are 0xC0+, so byte-level scanning of `]` is
|
||||
// safe. For non-ASCII bytes, copy the full codepoint at once.
|
||||
let b = bytes[i];
|
||||
if b < 0x80 {
|
||||
out.push(b as char);
|
||||
i += 1;
|
||||
} else {
|
||||
let len = utf8_seq_len(b);
|
||||
let end = (i + len).min(bytes.len());
|
||||
out.push_str(&line[i..end]);
|
||||
i = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decides whether a fence info string names Rust code that rustdoc would
/// still try to build as a doctest.
///
/// Returns `true` when the first comma-separated tag is `rust` (any ASCII
/// case) and none of the remaining tags is `ignore`, `no_run`,
/// `compile_fail` or `should_panic`. Such fences are rewritten to
/// `rust,ignore` because the snippets in docs/ are illustrative, not
/// standalone-compilable.
fn is_rust_fence_needing_ignore(lang: &str) -> bool {
    const OPT_OUT_TAGS: [&str; 4] = ["ignore", "no_run", "compile_fail", "should_panic"];

    let mut tags = lang.split(',').map(str::trim);
    let starts_with_rust = matches!(tags.next(), Some(head) if head.eq_ignore_ascii_case("rust"));

    starts_with_rust
        && !tags.any(|tag| {
            OPT_OUT_TAGS
                .iter()
                .any(|opt_out| tag.eq_ignore_ascii_case(opt_out))
        })
}
|
||||
|
||||
/// Number of bytes in the UTF-8 sequence introduced by `lead`.
///
/// ASCII bytes and stray continuation bytes (anything below `0xC0`) count as
/// a single byte so a scanner using this always makes progress.
fn utf8_seq_len(lead: u8) -> usize {
    match lead {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
|
||||
|
||||
fn maybe_rewrite_url(url: &str, source_dir: &str) -> String {
|
||||
if url.is_empty() {
|
||||
return url.to_string();
|
||||
}
|
||||
// Already absolute (scheme://, mailto:, ssh://, etc.), leave alone.
|
||||
if has_scheme(url) {
|
||||
return url.to_string();
|
||||
}
|
||||
// Pure anchor, leave alone.
|
||||
if url.starts_with('#') {
|
||||
return url.to_string();
|
||||
}
|
||||
// Split off optional anchor.
|
||||
let (path, anchor) = match url.find('#') {
|
||||
Some(p) => (&url[..p], &url[p..]),
|
||||
None => (url, ""),
|
||||
};
|
||||
// Only rewrite if the path looks like a markdown file.
|
||||
if !path.ends_with(".md") {
|
||||
return url.to_string();
|
||||
}
|
||||
// Resolve relative to source_dir.
|
||||
let combined = if source_dir.is_empty() {
|
||||
path.to_string()
|
||||
} else {
|
||||
format!("{}/{}", source_dir, path)
|
||||
};
|
||||
let normalised = normalise_path(&combined);
|
||||
format!("{}/{}{}", GH_DOCS_BASE, normalised, anchor)
|
||||
}
|
||||
|
||||
/// Reports whether `url` begins with a URI scheme followed by `:`.
///
/// Follows RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":".
/// Everything before the first `:` must match that grammar; a string with no
/// `:` has no scheme.
fn has_scheme(url: &str) -> bool {
    let Some((candidate, _rest)) = url.split_once(':') else {
        return false;
    };

    // Non-ASCII bytes are >= 0x80 and fail both checks below, so scanning
    // bytes instead of chars accepts exactly the same strings.
    let mut bytes = candidate.bytes();
    match bytes.next() {
        Some(lead) if lead.is_ascii_alphabetic() => {
            bytes.all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
        }
        _ => false,
    }
}
|
||||
|
||||
/// Collapses `.`, empty, and `..` segments in a `/`-separated relative path.
///
/// - empty segments (from `//` or a leading/trailing `/`) and `.` are dropped
/// - `..` removes the preceding named segment
/// - a `..` with no named segment left to remove is kept, so a path that
///   climbs above its starting directory (`../README.md`) still points
///   above it instead of being silently re-rooted
///
/// Returns the remaining segments joined with `/`.
fn normalise_path(path: &str) -> String {
    let mut stack: Vec<&str> = Vec::new();
    for seg in path.split('/') {
        match seg {
            "" | "." => {}
            ".." => match stack.last() {
                // A named segment is cancelled by the `..`.
                Some(&top) if top != ".." => {
                    stack.pop();
                }
                // Nothing to cancel: the path escapes upwards, keep the `..`.
                _ => stack.push(".."),
            },
            other => stack.push(other),
        }
    }
    stack.join("/")
}
|
||||
|
||||
fn emit_placeholder_and_warn(dist_dir: &Path) {
|
||||
// Create minimal placeholder files so compilation succeeds
|
||||
std::fs::create_dir_all(dist_dir).ok();
|
||||
|
|
|
|||
|
|
@ -9,6 +9,16 @@ Nyx ships four independent detector families. They run together in `--mode full`
|
|||
| [State model](detectors/state.md) | `state-*` | Per-function state lattice | Use-after-close, double-close, leaks, unauthenticated access |
|
||||
| [AST patterns](detectors/patterns.md) | `<lang>.<cat>.<name>` | Tree-sitter structural match | Banned APIs, weak crypto, dangerous constructs |
|
||||
|
||||
The taint family is split into cap-specific rule classes when a sink callee carries multiple vulnerability classes:
|
||||
|
||||
| Rule id | Cap | Surface |
|
||||
|---|---|---|
|
||||
| `taint-unsanitised-flow` | every cap except `data_exfil` and `unauthorized_id` | Default taint flow class |
|
||||
| `taint-data-exfiltration` | `data_exfil` | Sensitive data flowing into the payload of an outbound network request (body / headers / json on `fetch`, body on `XMLHttpRequest.send`). Distinct from SSRF: the destination is fixed but attacker-influenced bytes leave the process. |
|
||||
| `rs.auth.missing_ownership_check.taint` | `unauthorized_id` | Rust auth subsystem fold-in; see [auth.md](auth.md). |
|
||||
|
||||
A single call site can fire several of these at once when it carries multiple gates — `fetch(taintedUrl, {body: tainted})` produces both an SSRF finding (URL flow) and a `taint-data-exfiltration` finding (body flow), each with its own cap mask rather than a conflated union.
|
||||
|
||||
For Rust auth-specific rules (`rs.auth.*`), see [auth.md](auth.md).
|
||||
|
||||
## How they combine
|
||||
|
|
|
|||
|
|
@ -134,7 +134,8 @@ Sources, sanitizers, and sinks are linked by named capabilities. A sanitizer onl
|
|||
| `fmt_string` | | | `printf(var)` |
|
||||
| `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` with concatenation |
|
||||
| `deserialize` | | | `pickle.loads`, `yaml.load`, `Marshal.load` |
|
||||
| `ssrf` | | URL-prefix locks | `requests.get`, `fetch`, `HttpClient.send` |
|
||||
| `ssrf` | | URL-prefix locks | `requests.get`, `fetch` URL arg, outbound HTTP destination |
|
||||
| `data_exfil` | | | `fetch` body / headers / json, `XMLHttpRequest.send` body |
|
||||
| `code_exec` | | | `eval`, `exec`, `Function` |
|
||||
| `crypto` | | | weak-algorithm constructors |
|
||||
| `unauthorized_id` | request-bound scoped IDs (Rust auth analysis) | ownership check | row-level write |
|
||||
|
|
|
|||
|
|
@ -112,12 +112,14 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
|
|||
| `go.crypto.md5` | Low | A | Medium |
|
||||
| `go.crypto.sha1` | Low | A | Medium |
|
||||
|
||||
### Java: 8 patterns
|
||||
### Java: 10 patterns
|
||||
|
||||
| Rule ID | Severity | Tier | Confidence |
|
||||
|---|---|---|---|
|
||||
| `java.cmdi.runtime_exec` | High | A | High |
|
||||
| `java.code_exec.text4shell_interpolator` | High | A | High |
|
||||
| `java.deser.readobject` | High | A | High |
|
||||
| `java.deser.snakeyaml_unsafe_constructor` | High | A | High |
|
||||
| `java.reflection.class_forname` | Medium | A | High |
|
||||
| `java.reflection.method_invoke` | Medium | A | High |
|
||||
| `java.sqli.execute_concat` | Medium | B | Medium |
|
||||
|
|
@ -168,7 +170,7 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
|
|||
| `php.crypto.rand` | Low | A | Medium |
|
||||
| `php.crypto.sha1` | Low | A | Medium |
|
||||
|
||||
### Python: 13 patterns
|
||||
### Python: 14 patterns
|
||||
|
||||
| Rule ID | Severity | Tier | Confidence |
|
||||
|---|---|---|---|
|
||||
|
|
@ -182,6 +184,7 @@ The tables below are generated from `src/patterns/<lang>.rs` by [`tools/docgen`]
|
|||
| `py.code_exec.compile` | Medium | A | High |
|
||||
| `py.deser.shelve_open` | Medium | A | High |
|
||||
| `py.sqli.execute_format` | Medium | B | Medium |
|
||||
| `py.sqli.text_format` | Medium | B | Medium |
|
||||
| `py.xss.jinja_from_string` | Medium | A | High |
|
||||
| `py.crypto.md5` | Low | A | Medium |
|
||||
| `py.crypto.sha1` | Low | A | Medium |
|
||||
|
|
|
|||
|
|
@ -19,8 +19,8 @@ use serde::{Deserialize, Serialize};
|
|||
|
||||
/// Bit-level abstract fact: known-zero and known-one masks.
|
||||
///
|
||||
/// - `top()` = `{known_zero: 0, known_one: 0}` — no bits known
|
||||
/// - `bottom()` = `{known_zero: MAX, known_one: MAX}` — contradictory
|
||||
/// - `top()` = `{known_zero: 0, known_one: 0}`, no bits known
|
||||
/// - `bottom()` = `{known_zero: MAX, known_one: MAX}`, contradictory
|
||||
/// - `from_const(n)` = all 64 bits known
|
||||
///
|
||||
/// Invariant: `known_zero & known_one == 0` for non-bottom values.
|
||||
|
|
@ -253,7 +253,7 @@ impl AbstractDomain for BitFact {
|
|||
}
|
||||
}
|
||||
|
||||
/// Widen: same as join (finite lattice height — 64 bits × 3 states).
|
||||
/// Widen: same as join (finite lattice height, 64 bits × 3 states).
|
||||
fn widen(&self, other: &Self) -> Self {
|
||||
self.join(other)
|
||||
}
|
||||
|
|
@ -511,7 +511,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn right_shift_unknown_sign() {
|
||||
// Sign bit unknown — high bits after shift should be unknown
|
||||
// Sign bit unknown, high bits after shift should be unknown
|
||||
let a = BitFact {
|
||||
known_zero: 0x0F,
|
||||
known_one: 0,
|
||||
|
|
@ -687,7 +687,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b` — meet is the greatest lower bound.
|
||||
/// `a ⊓ b ⊑ a` and `a ⊓ b ⊑ b`, meet is the greatest lower bound.
|
||||
#[test]
|
||||
fn meet_is_lower_bound_bit() {
|
||||
let xs = sample_bits();
|
||||
|
|
@ -700,7 +700,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b` — join is the least upper bound.
|
||||
/// `a ⊑ a ⊔ b` and `b ⊑ a ⊔ b`, join is the least upper bound.
|
||||
#[test]
|
||||
fn join_is_upper_bound_bit() {
|
||||
let xs = sample_bits();
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@ use serde::{Deserialize, Serialize};
|
|||
|
||||
/// Numeric interval: `[lo, hi]` inclusive bounds.
|
||||
///
|
||||
/// - `top()` = `[None, None]` — any integer
|
||||
/// - `bottom()` = `[1, 0]` — empty / unsatisfiable (lo > hi)
|
||||
/// - `exact(n)` = `[n, n]` — singleton
|
||||
/// - `top()` = `[None, None]`, any integer
|
||||
/// - `bottom()` = `[1, 0]`, empty / unsatisfiable (lo > hi)
|
||||
/// - `exact(n)` = `[n, n]`, singleton
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct IntervalFact {
|
||||
pub lo: Option<i64>,
|
||||
|
|
@ -278,7 +278,7 @@ impl IntervalFact {
|
|||
/// - One non-negative singleton mask `m`: `[0, m]` regardless of other
|
||||
/// operand's sign (two's complement AND with a non-negative mask always
|
||||
/// produces a non-negative result bounded by the mask).
|
||||
/// - Both non-negative: `[0, min(a.hi, b.hi)]` — AND can only clear bits.
|
||||
/// - Both non-negative: `[0, min(a.hi, b.hi)]`, AND can only clear bits.
|
||||
pub fn bit_and(&self, other: &Self) -> Self {
|
||||
if self.is_bottom() || other.is_bottom() {
|
||||
return Self::bottom();
|
||||
|
|
@ -330,7 +330,7 @@ impl IntervalFact {
|
|||
/// - Singletons: exact computation.
|
||||
/// - `x | 0` → `x`, `0 | x` → `x`.
|
||||
/// - Both non-negative with known upper bounds: `[max(a.lo, b.lo),
|
||||
/// next_pow2_minus1(max(a.hi, b.hi))]` — OR can set any bit below
|
||||
/// next_pow2_minus1(max(a.hi, b.hi))]`, OR can set any bit below
|
||||
/// the highest set bit of either operand.
|
||||
pub fn bit_or(&self, other: &Self) -> Self {
|
||||
if self.is_bottom() || other.is_bottom() {
|
||||
|
|
@ -1054,7 +1054,7 @@ mod tests {
|
|||
let a = IntervalFact::exact(i64::MIN);
|
||||
let b = IntervalFact::exact(-1);
|
||||
let r = a.div(&b);
|
||||
// Either bound becomes None (graceful) — exact representation
|
||||
// Either bound becomes None (graceful), exact representation
|
||||
// depends on the impl, but we mainly assert no panic occurred
|
||||
// and the result is a valid interval.
|
||||
assert!(
|
||||
|
|
@ -1078,7 +1078,7 @@ mod tests {
|
|||
assert_eq!(r.hi, Some(2));
|
||||
}
|
||||
|
||||
/// Modulo by an interval that *contains* zero must escape to Top —
|
||||
/// Modulo by an interval that *contains* zero must escape to Top;
|
||||
/// modulo-by-zero is undefined and we cannot precise-narrow it.
|
||||
#[test]
|
||||
fn modulo_divisor_spans_zero_is_top() {
|
||||
|
|
@ -1096,7 +1096,7 @@ mod tests {
|
|||
|
||||
/// `[i64::MIN, i64::MAX]` is the maximal interval. Any join with
|
||||
/// any other interval must remain `[i64::MIN, i64::MAX]` (or Top
|
||||
/// equivalent) — this guards against accidental narrowing on join.
|
||||
/// equivalent), this guards against accidental narrowing on join.
|
||||
#[test]
|
||||
fn full_range_is_join_absorbing() {
|
||||
let full = IntervalFact {
|
||||
|
|
@ -1347,7 +1347,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// Modulo with exact-zero divisor — must escape to Top.
|
||||
/// Modulo with exact-zero divisor, must escape to Top.
|
||||
#[test]
|
||||
fn modulo_by_exact_zero_is_top() {
|
||||
let a = IntervalFact {
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ pub fn is_enabled() -> bool {
|
|||
|
||||
/// Per-SSA-value abstract element: product of all subdomains.
|
||||
///
|
||||
/// Each subdomain is independent — join, meet, widen, and leq are applied
|
||||
/// Each subdomain is independent, join, meet, widen, and leq are applied
|
||||
/// component-wise. Adding a new subdomain requires adding a field here
|
||||
/// and updating the component-wise implementations.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
|
@ -182,15 +182,15 @@ pub const MAX_LITERAL_PREFIX_LEN: usize = 64;
|
|||
/// restricted so the summary size stays constant regardless of callee body
|
||||
/// complexity:
|
||||
///
|
||||
/// * [`IntervalTransfer::Top`] — no interval knowledge crosses (default).
|
||||
/// * [`IntervalTransfer::Identity`] — return = param (pass-through).
|
||||
/// * [`IntervalTransfer::Affine`] — return = param * `mul` + `add` with
|
||||
/// * [`IntervalTransfer::Top`], no interval knowledge crosses (default).
|
||||
/// * [`IntervalTransfer::Identity`], return = param (pass-through).
|
||||
/// * [`IntervalTransfer::Affine`], return = param * `mul` + `add` with
|
||||
/// `i64` constants; overflow defaults to Top at apply time.
|
||||
/// * [`IntervalTransfer::Clamped`] — return is always in `[lo, hi]` regardless
|
||||
/// * [`IntervalTransfer::Clamped`], return is always in `[lo, hi]` regardless
|
||||
/// of input. Captures callee-intrinsic bounds (e.g. `saturating` ops).
|
||||
///
|
||||
/// No unbounded expression trees, no nesting. A callee whose behaviour does
|
||||
/// not fit one of these forms falls back to `Top` — we never try to encode
|
||||
/// not fit one of these forms falls back to `Top`, we never try to encode
|
||||
/// richer algebra in the summary.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub enum IntervalTransfer {
|
||||
|
|
@ -247,9 +247,9 @@ impl IntervalTransfer {
|
|||
/// Mirrors [`IntervalTransfer`] for the string subdomain. Bounded by
|
||||
/// [`MAX_LITERAL_PREFIX_LEN`] to keep summary size constant.
|
||||
///
|
||||
/// * [`StringTransfer::Unknown`] — default.
|
||||
/// * [`StringTransfer::Identity`] — return = param.
|
||||
/// * [`StringTransfer::LiteralPrefix`] — return has this literal prefix
|
||||
/// * [`StringTransfer::Unknown`], default.
|
||||
/// * [`StringTransfer::Identity`], return = param.
|
||||
/// * [`StringTransfer::LiteralPrefix`], return has this literal prefix
|
||||
/// regardless of input (callee-intrinsic).
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub enum StringTransfer {
|
||||
|
|
@ -325,7 +325,7 @@ impl StringTransfer {
|
|||
/// caller's knowledge of each argument, without having to re-run the callee.
|
||||
///
|
||||
/// Composition rule: `apply(input) = (interval.apply, string.apply,
|
||||
/// bits=top)`. The bit domain is always Top — we do not track cross-file
|
||||
/// bits=top)`. The bit domain is always Top, we do not track cross-file
|
||||
/// bit transfers.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
|
||||
pub struct AbstractTransfer {
|
||||
|
|
@ -351,7 +351,7 @@ impl AbstractTransfer {
|
|||
Self::default()
|
||||
}
|
||||
|
||||
/// True when neither subdomain carries any information — equivalent to
|
||||
/// True when neither subdomain carries any information, equivalent to
|
||||
/// "omit this entry entirely".
|
||||
pub fn is_top(&self) -> bool {
|
||||
is_interval_top(&self.interval) && is_string_unknown(&self.string)
|
||||
|
|
@ -410,7 +410,7 @@ impl AbstractState {
|
|||
/// Set abstract value for an SSA value. Drops Top values to save space.
|
||||
pub fn set(&mut self, v: SsaValue, val: AbstractValue) {
|
||||
if val.is_top() {
|
||||
// Don't store Top — it's the default
|
||||
// Don't store Top, it's the default
|
||||
if let Ok(idx) = self.values.binary_search_by_key(&v, |(id, _)| *id) {
|
||||
self.values.remove(idx);
|
||||
}
|
||||
|
|
@ -422,7 +422,7 @@ impl AbstractState {
|
|||
if self.values.len() < MAX_ABSTRACT_VALUES {
|
||||
self.values.insert(idx, (v, val));
|
||||
}
|
||||
// Over budget: silently drop (conservative — defaults to Top)
|
||||
// Over budget: silently drop (conservative, defaults to Top)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
//! Each axis is a three-value lattice [`Tri::No`] / [`Tri::Yes`] / [`Tri::Maybe`]
|
||||
//! where `Maybe` is Top (unknown) and `No` / `Yes` are the two definite
|
||||
//! refinements. A value is path-safe for a FILE_IO sink iff
|
||||
//! `dotdot == No && absolute == No` — i.e. we have proof that *no* `..`
|
||||
//! `dotdot == No && absolute == No`, i.e. we have proof that *no* `..`
|
||||
//! component and *no* absolute root can leak through. `normalized == Yes`
|
||||
//! alone is not sufficient (canonicalising an absolute input still produces
|
||||
//! an absolute path); prefix_lock is used separately to certify containment
|
||||
|
|
@ -52,7 +52,7 @@ pub enum Tri {
|
|||
No,
|
||||
/// Proven present.
|
||||
Yes,
|
||||
/// Unknown — no transfer or guard has proved the axis yet.
|
||||
/// Unknown, no transfer or guard has proved the axis yet.
|
||||
Maybe,
|
||||
}
|
||||
|
||||
|
|
@ -367,12 +367,12 @@ impl AbstractDomain for PathFact {
|
|||
/// narrowed axis can be proved safe.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum PathRejection {
|
||||
/// `x.contains("..")` — false branch proves `dotdot = No` on the receiver.
|
||||
/// `x.contains("..")`, false branch proves `dotdot = No` on the receiver.
|
||||
DotDot,
|
||||
/// `x.starts_with("/")` / `x.starts_with('\\')` — false branch proves
|
||||
/// `x.starts_with("/")` / `x.starts_with('\\')`, false branch proves
|
||||
/// `absolute = No` on the receiver.
|
||||
AbsoluteSlash,
|
||||
/// `x.is_absolute()` / `Path::new(x).is_absolute()` — false branch proves
|
||||
/// `x.is_absolute()` / `Path::new(x).is_absolute()`, false branch proves
|
||||
/// `absolute = No` on the argument/receiver.
|
||||
IsAbsolute,
|
||||
/// Not a path-rejection idiom.
|
||||
|
|
@ -384,7 +384,7 @@ pub enum PathRejection {
|
|||
/// the listed axis is refined.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum PathAssertion {
|
||||
/// `x.starts_with("<literal_root>")` — true branch attaches
|
||||
/// `x.starts_with("<literal_root>")`, true branch attaches
|
||||
/// `prefix_lock = Some("<literal_root>")` to the receiver.
|
||||
PrefixLock(String),
|
||||
/// Not a path-assertion idiom.
|
||||
|
|
@ -426,7 +426,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec
|
|||
let clause = clause.trim();
|
||||
// Multi-axis special case: `!filepath.IsLocal(p)` (Go).
|
||||
// `filepath.IsLocal` returns true iff the path stays within the
|
||||
// current directory — no leading `/`, no `..` segments, no Windows
|
||||
// current directory, no leading `/`, no `..` segments, no Windows
|
||||
// drive root. Idiomatic Go path-traversal guard:
|
||||
// `if !filepath.IsLocal(p) { return }`
|
||||
// The TRUE branch terminates; the FALSE branch (where IsLocal is
|
||||
|
|
@ -449,7 +449,7 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec
|
|||
out
|
||||
}
|
||||
|
||||
/// Detect `!filepath.IsLocal(<expr>)` — Go's idiomatic path-traversal
|
||||
/// Detect `!filepath.IsLocal(<expr>)`, Go's idiomatic path-traversal
|
||||
/// guard. Whitespace-tolerant: `! filepath.IsLocal(`, `!filepath . IsLocal(`,
|
||||
/// etc. Used by [`classify_path_rejection_axes`] to inject both
|
||||
/// [`PathRejection::DotDot`] and [`PathRejection::IsAbsolute`] on the false
|
||||
|
|
@ -475,7 +475,7 @@ fn has_negated_filepath_is_local(clause: &str) -> bool {
|
|||
fn classify_path_rejection_atom(clause: &str) -> PathRejection {
|
||||
// `.contains("..")` (Rust, Java) / `.includes("..")` (JS/TS) /
|
||||
// `.include?("..")` (Ruby) / `strings.Contains(s, "..")` (Go) /
|
||||
// `strstr(s, "..")` (C/C++) — every form recognised by
|
||||
// `strstr(s, "..")` (C/C++), every form recognised by
|
||||
// `extract_contains_arg` returns `..` if the needle is the dotdot
|
||||
// segment.
|
||||
if let Some(needle) = extract_contains_arg(clause)
|
||||
|
|
@ -483,7 +483,7 @@ fn classify_path_rejection_atom(clause: &str) -> PathRejection {
|
|||
{
|
||||
return PathRejection::DotDot;
|
||||
}
|
||||
// Python `".." in s` — operator form. Look for `".." in <something>`
|
||||
// Python `".." in s`, operator form. Look for `".." in <something>`
|
||||
// anywhere in the clause text. Conservative: requires the literal
|
||||
// `".." in ` substring (whitespace-tolerant).
|
||||
if has_python_dotdot_in(clause) {
|
||||
|
|
@ -681,7 +681,7 @@ pub fn classify_path_assertion(text: &str) -> PathAssertion {
|
|||
/// * Must be non-empty.
|
||||
/// * The leaf segment must begin with an ASCII uppercase letter
|
||||
/// (Rust's variant / struct / type grammar).
|
||||
/// * The leaf segment must be ASCII alphanumeric / underscore — no
|
||||
/// * The leaf segment must be ASCII alphanumeric / underscore, no
|
||||
/// method call noise (parentheses, argument lists) survives here
|
||||
/// because callees arrive in their normalised scoped-identifier
|
||||
/// form.
|
||||
|
|
@ -700,7 +700,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool {
|
|||
// upper-camel-case names an enum variant or tuple struct (`Some`,
|
||||
// `Ok`, `MyResult`). A scoped identifier whose *penultimate*
|
||||
// segment is upper-camel-case names an associated constructor on
|
||||
// that type — `Box::new`, `Cell::from`, `PathBuf::with_capacity`,
|
||||
// that type, `Box::new`, `Cell::from`, `PathBuf::with_capacity`,
|
||||
// etc. The latter is the lower-leaf-case shape we want to admit
|
||||
// alongside the bare-variant shape.
|
||||
let segments: smallvec::SmallVec<[&str; 4]> =
|
||||
|
|
@ -731,7 +731,7 @@ pub fn is_structural_variant_ctor(callee: &str) -> bool {
|
|||
/// PathFact of the receiver/first argument (the value being sanitised);
|
||||
/// it is used as the baseline to which the call's effect is applied.
|
||||
///
|
||||
/// Returned [`None`] means the callee is not a recognised path primitive —
|
||||
/// Returned [`None`] means the callee is not a recognised path primitive;
|
||||
/// the caller should leave the result at its pre-existing PathFact (Top).
|
||||
///
|
||||
/// Backwards-compatible wrapper around [`classify_path_primitive_rust`].
|
||||
|
|
@ -743,7 +743,7 @@ pub fn classify_path_primitive(callee: &str, input_fact: &PathFact) -> Option<Pa
|
|||
|
||||
/// Per-language path-primitive dispatcher.
|
||||
///
|
||||
/// Routes to the language-specific classifier — Rust, Python, JS/TS, Go,
|
||||
/// Routes to the language-specific classifier: Rust, Python, JS/TS, Go,
|
||||
/// Java, Ruby, PHP, or C/C++. Returns [`None`] for languages without a
|
||||
/// classifier (or callees the language's classifier doesn't recognise).
|
||||
pub fn classify_path_primitive_for_lang(
|
||||
|
|
@ -784,7 +784,7 @@ pub fn is_structural_variant_ctor_for_lang(lang: crate::symbol::Lang, callee: &s
|
|||
}
|
||||
|
||||
/// Per-language predicate for "this callee is a zero-arg fresh-allocation
|
||||
/// constructor" — used by the variant-rejection-path classifier so that
|
||||
/// constructor", used by the variant-rejection-path classifier so that
|
||||
/// `String::new()` (Rust) / `''` (Python/JS/Java/...) is recognised as a
|
||||
/// no-attacker-content fresh value with cleared `dotdot`/`absolute` axes.
|
||||
///
|
||||
|
|
@ -803,7 +803,7 @@ pub fn is_zero_arg_allocator_for_lang(lang: crate::symbol::Lang, _callee: &str)
|
|||
false
|
||||
}
|
||||
|
||||
/// Rust path-primitive classifier — `fs::canonicalize`, `Path::new`,
|
||||
/// Rust path-primitive classifier, `fs::canonicalize`, `Path::new`,
|
||||
/// `PathBuf::from`, identity-string conversions.
|
||||
pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
// Accept both path-qualified (`std::fs::canonicalize`, `fs::canonicalize`)
|
||||
|
|
@ -826,7 +826,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
|
|||
// `Path::new(s)` / `PathBuf::from(s)`:
|
||||
// pass-through of the input's PathFact so downstream `starts_with`
|
||||
// checks against a Path/PathBuf value still see the underlying
|
||||
// string's narrowed axes. No axis is forced — wrapping does not
|
||||
// string's narrowed axes. No axis is forced, wrapping does not
|
||||
// sanitize on its own.
|
||||
"new" | "from" => {
|
||||
if callee_contains_segment(callee, "Path") || callee_contains_segment(callee, "PathBuf")
|
||||
|
|
@ -837,8 +837,8 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
|
|||
}
|
||||
}
|
||||
// Identity conversions on strings/paths. Each one re-binds the
|
||||
// same logical value — the converted String / PathBuf / OsString
|
||||
// still describes the exact same filesystem path — so the PathFact
|
||||
// same logical value, the converted String / PathBuf / OsString
|
||||
// still describes the exact same filesystem path, so the PathFact
|
||||
// flows through unchanged. Without this, a sanitised `s: &str`
|
||||
// would lose its narrowed axes the moment the helper returns
|
||||
// `s.to_string()` / `s.to_owned()` / `String::from(s)`.
|
||||
|
|
@ -849,7 +849,7 @@ pub fn classify_path_primitive_rust(callee: &str, input_fact: &PathFact) -> Opti
|
|||
}
|
||||
}
|
||||
|
||||
/// Python path-primitive classifier — `os.path.normpath`, `os.path.realpath`,
|
||||
/// Python path-primitive classifier, `os.path.normpath`, `os.path.realpath`,
|
||||
/// `pathlib.Path.resolve`, `os.path.abspath`.
|
||||
///
|
||||
/// Pattern conventions: tree-sitter-python emits dotted attribute access as
|
||||
|
|
@ -893,7 +893,7 @@ pub fn classify_path_primitive_python(callee: &str, input_fact: &PathFact) -> Op
|
|||
}
|
||||
}
|
||||
|
||||
/// JavaScript / TypeScript path-primitive classifier — Node's `path` module:
|
||||
/// JavaScript / TypeScript path-primitive classifier, Node's `path` module:
|
||||
/// `path.normalize`, `path.resolve`, `path.join`.
|
||||
pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
|
|
@ -920,7 +920,7 @@ pub fn classify_path_primitive_js(callee: &str, input_fact: &PathFact) -> Option
|
|||
}
|
||||
}
|
||||
|
||||
/// Go path-primitive classifier — `path/filepath` package:
|
||||
/// Go path-primitive classifier, `path/filepath` package:
|
||||
/// `filepath.Clean`, `filepath.Abs`.
|
||||
pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
|
|
@ -947,7 +947,7 @@ pub fn classify_path_primitive_go(callee: &str, input_fact: &PathFact) -> Option
|
|||
}
|
||||
}
|
||||
|
||||
/// Java path-primitive classifier — `java.nio.file.Path.normalize` /
|
||||
/// Java path-primitive classifier, `java.nio.file.Path.normalize` /
|
||||
/// `Paths.get(s).normalize().toAbsolutePath()`.
|
||||
pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
|
|
@ -980,7 +980,7 @@ pub fn classify_path_primitive_java(callee: &str, input_fact: &PathFact) -> Opti
|
|||
}
|
||||
}
|
||||
|
||||
/// Ruby path-primitive classifier — `File.expand_path` / `Pathname#cleanpath`.
|
||||
/// Ruby path-primitive classifier, `File.expand_path` / `Pathname#cleanpath`.
|
||||
pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
match leaf {
|
||||
|
|
@ -1005,13 +1005,13 @@ pub fn classify_path_primitive_ruby(callee: &str, input_fact: &PathFact) -> Opti
|
|||
}
|
||||
}
|
||||
|
||||
/// PHP path-primitive classifier — `realpath`, `basename`.
|
||||
/// PHP path-primitive classifier, `realpath`, `basename`.
|
||||
pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
match leaf {
|
||||
// `realpath($s)`:
|
||||
// Resolves symlinks and `..`, returns absolute path. Returns
|
||||
// `false` if the file doesn't exist — but on the success path
|
||||
// `false` if the file doesn't exist, but on the success path
|
||||
// (which is what reaches a sink), it produces a clean absolute path.
|
||||
"realpath" => {
|
||||
let mut f = input_fact.clone();
|
||||
|
|
@ -1021,7 +1021,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio
|
|||
Some(f)
|
||||
}
|
||||
// `basename($s)`:
|
||||
// Strips directory components — guaranteed to contain no `..`
|
||||
// Strips directory components, guaranteed to contain no `..`
|
||||
// (basename of `..` is `..`, but basename of any traversal-
|
||||
// prefixed path is just the leaf). Conservative: clear dotdot.
|
||||
"basename" => {
|
||||
|
|
@ -1034,7 +1034,7 @@ pub fn classify_path_primitive_php(callee: &str, input_fact: &PathFact) -> Optio
|
|||
}
|
||||
}
|
||||
|
||||
/// C / C++ path-primitive classifier — POSIX `realpath`,
|
||||
/// C / C++ path-primitive classifier, POSIX `realpath`,
|
||||
/// `std::filesystem::canonical`.
|
||||
pub fn classify_path_primitive_c_cpp(callee: &str, input_fact: &PathFact) -> Option<PathFact> {
|
||||
let leaf = rightmost_segment(callee);
|
||||
|
|
@ -1089,7 +1089,7 @@ fn extract_contains_arg(text: &str) -> Option<String> {
|
|||
"strstr(",
|
||||
] {
|
||||
if let Some(idx) = text.find(prefix) {
|
||||
// Skip past the first argument (receiver) — the literal needle
|
||||
// Skip past the first argument (receiver), the literal needle
|
||||
// is the second arg, separated by a comma. Find the comma at
|
||||
// top level inside this call.
|
||||
let inner = &text[idx + prefix.len()..];
|
||||
|
|
@ -1123,7 +1123,7 @@ fn extract_starts_with_arg(text: &str) -> Option<String> {
|
|||
return Some(s);
|
||||
}
|
||||
}
|
||||
// Go free-function form `strings.HasPrefix(r, "/")` — second arg.
|
||||
// Go free-function form `strings.HasPrefix(r, "/")`, second arg.
|
||||
if let Some(idx) = text.find("strings.HasPrefix(") {
|
||||
let inner = &text[idx + "strings.HasPrefix(".len()..];
|
||||
if let Some(comma_idx) = top_level_comma(inner) {
|
||||
|
|
@ -1762,7 +1762,7 @@ mod tests {
|
|||
assert!(is_structural_variant_ctor("Box::new"));
|
||||
assert!(is_structural_variant_ctor("std::option::Option::Some"));
|
||||
// User-defined upper-camel-case variant name participates the
|
||||
// same way — name list is not part of the contract.
|
||||
// same way, name list is not part of the contract.
|
||||
assert!(is_structural_variant_ctor("MyResult::Ok"));
|
||||
assert!(is_structural_variant_ctor("Wrapper"));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
//! String abstract domain for abstract interpretation.
|
||||
//!
|
||||
//! Tracks known prefix, suffix, and — when provably bounded — the finite set
|
||||
//! Tracks known prefix, suffix, and, when provably bounded, the finite set
|
||||
//! of possible concrete string values. Used for SSRF suppression (URL prefix
|
||||
//! proves host is locked), command-injection suppression (lookup result
|
||||
//! bounded to a safe set of literals), and general string analysis.
|
||||
|
|
@ -78,7 +78,7 @@ impl StringFact {
|
|||
/// the finite domain is `{s}`.
|
||||
///
|
||||
/// Empty prefix/suffix are normalised to `None` because "starts/ends with
|
||||
/// the empty string" carries no constraint — keeping `Some("")` would
|
||||
/// the empty string" carries no constraint, keeping `Some("")` would
|
||||
/// break join idempotence (`Some("")` ⊔ `Some("")` collapses to `None`).
|
||||
pub fn exact(s: &str) -> Self {
|
||||
let prefix = truncate_prefix(s);
|
||||
|
|
@ -134,7 +134,7 @@ impl StringFact {
|
|||
/// Inputs are sorted and deduped. If the cardinality exceeds
|
||||
/// [`MAX_DOMAIN_SIZE`] or the input is empty, the domain collapses to
|
||||
/// `None` (Top on this sub-field). The prefix/suffix sub-fields remain
|
||||
/// unset — callers can combine with [`Self::exact`] for single-element
|
||||
/// unset, callers can combine with [`Self::exact`] for single-element
|
||||
/// sets if tighter facts are desired.
|
||||
pub fn finite_set(values: Vec<String>) -> Self {
|
||||
let mut v = values;
|
||||
|
|
@ -411,7 +411,7 @@ fn truncate_suffix(s: &str) -> String {
|
|||
/// Longest common prefix of two strings, char-aligned.
|
||||
///
|
||||
/// Iterates by `char` rather than `byte` so multi-byte UTF-8 code points are
|
||||
/// either kept whole or dropped — a byte-wise comparison would slice into the
|
||||
/// either kept whole or dropped, a byte-wise comparison would slice into the
|
||||
/// middle of a code point and produce mojibake (`x as char` on a UTF-8
|
||||
/// continuation byte yields a garbage Latin-1 character).
|
||||
pub fn longest_common_prefix(a: &str, b: &str) -> String {
|
||||
|
|
@ -746,7 +746,7 @@ mod tests {
|
|||
let a = StringFact::from_prefix("https://api.example.com/");
|
||||
let b = StringFact::from_prefix("https://db.example.com/");
|
||||
let r = a.join(&b);
|
||||
// Common prefix is "https://" — anything past that diverges.
|
||||
// Common prefix is "https://", anything past that diverges.
|
||||
assert_eq!(
|
||||
r.prefix.as_deref(),
|
||||
Some("https://"),
|
||||
|
|
@ -781,7 +781,7 @@ mod tests {
|
|||
]
|
||||
}
|
||||
|
||||
/// `x ⊔ x = x` — join is idempotent across all sample shapes.
|
||||
/// `x ⊔ x = x`, join is idempotent across all sample shapes.
|
||||
#[test]
|
||||
fn join_idempotent_string() {
|
||||
for a in sample_strings() {
|
||||
|
|
@ -789,7 +789,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `x ⊔ y = y ⊔ x` — join is commutative.
|
||||
/// `x ⊔ y = y ⊔ x`, join is commutative.
|
||||
#[test]
|
||||
fn join_commutative_string() {
|
||||
let xs = sample_strings();
|
||||
|
|
@ -806,7 +806,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `x ⊓ x = x` — meet is idempotent.
|
||||
/// `x ⊓ x = x`, meet is idempotent.
|
||||
#[test]
|
||||
fn meet_idempotent_string() {
|
||||
for a in sample_strings() {
|
||||
|
|
@ -814,7 +814,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `x ⊓ y = y ⊓ x` — meet is commutative.
|
||||
/// `x ⊓ y = y ⊓ x`, meet is commutative.
|
||||
#[test]
|
||||
fn meet_commutative_string() {
|
||||
let xs = sample_strings();
|
||||
|
|
@ -844,7 +844,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `x ⊑ x` — leq is reflexive.
|
||||
/// `x ⊑ x`, leq is reflexive.
|
||||
#[test]
|
||||
fn leq_reflexive_string() {
|
||||
for a in sample_strings() {
|
||||
|
|
@ -852,7 +852,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// **Soundness**: `widen(a, b) ⊒ join(a, b)` — widening must
|
||||
/// **Soundness**: `widen(a, b) ⊒ join(a, b)`, widening must
|
||||
/// over-approximate join, otherwise dataflow loses information.
|
||||
#[test]
|
||||
fn widen_over_approximates_join_string() {
|
||||
|
|
@ -905,7 +905,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Empty-string exact value must distinguish from Top — it is a
|
||||
/// Empty-string exact value must distinguish from Top, it is a
|
||||
/// singleton (`{""}`), not unconstrained. After the empty-prefix
|
||||
/// normalisation, prefix/suffix are `None` (carry no extra info)
|
||||
/// but the `domain` field still pins the value to exactly `""`.
|
||||
|
|
|
|||
261
src/ast.rs
261
src/ast.rs
|
|
@ -127,12 +127,12 @@ use crate::utils::snippet::line_snippet as extract_line_snippet;
|
|||
/// [`normalize_namespace`] convention) back to the absolute path the
|
||||
/// diagnostic pipeline expects.
|
||||
///
|
||||
/// * Empty `file_rel` — single-file scans normalize every namespace to
|
||||
/// * Empty `file_rel`, single-file scans normalize every namespace to
|
||||
/// `""`; treat that as "the file under analysis" and return
|
||||
/// `fallback.to_string_lossy()`.
|
||||
/// * `scan_root` absent — we have no workspace root to resolve against;
|
||||
/// * `scan_root` absent, we have no workspace root to resolve against;
|
||||
/// return `file_rel` verbatim (it may already be absolute).
|
||||
/// * Otherwise — join `scan_root` with `file_rel`.
|
||||
/// * Otherwise, join `scan_root` with `file_rel`.
|
||||
fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String {
|
||||
if file_rel.is_empty() {
|
||||
return fallback.to_string_lossy().into_owned();
|
||||
|
|
@ -163,7 +163,7 @@ fn build_taint_diag(
|
|||
let source_info = cfg_graph.node_weight(finding.source);
|
||||
// The reconstructed flow path is the authoritative view of where the
|
||||
// taint started *in this body*. When present, prefer its first step's
|
||||
// CFG span over `finding.source_span` — which can be stale across
|
||||
// CFG span over `finding.source_span`, which can be stale across
|
||||
// multi-hop cross-body remaps (e.g. JS two-level solve where a
|
||||
// callee-interior source gets its span rewritten to the enclosing
|
||||
// body's entry node). Fall back to `source_span`, then to the source
|
||||
|
|
@ -183,7 +183,7 @@ fn build_taint_diag(
|
|||
|
||||
// Prefer the source CFG node's callee string when it's a call expression
|
||||
// (e.g. `os.getenv("X")`). For property-access sources like
|
||||
// `navigator.userAgent` there is no callee — fall back to the first flow
|
||||
// `navigator.userAgent` there is no callee, fall back to the first flow
|
||||
// step's `variable` (the SSA var name, e.g. "userAgent"), then to the
|
||||
// source node's `taint.defines` / first `taint.uses` entry, before
|
||||
// finally giving up and rendering "(unknown)".
|
||||
|
|
@ -289,7 +289,7 @@ fn build_taint_diag(
|
|||
|
||||
// Convert raw flow steps to display FlowSteps. When the finding has a
|
||||
// primary_location distinct from the call site, the last raw step is
|
||||
// really the Call — reclassify it and append a synthetic Sink step
|
||||
// really the Call, reclassify it and append a synthetic Sink step
|
||||
// pointing at the callee-internal dangerous instruction so analysts
|
||||
// see both the call site and the final sink in the trace.
|
||||
let mut flow_steps: Vec<FlowStep> = finding
|
||||
|
|
@ -348,7 +348,7 @@ fn build_taint_diag(
|
|||
.clone()
|
||||
.or_else(|| Some(short_call_site.clone()));
|
||||
|
||||
// Resolved sink capability bits — used by deduplication to distinguish
|
||||
// Resolved sink capability bits, used by deduplication to distinguish
|
||||
// sinks with different cap types on the same source line (e.g.
|
||||
// `sink_sql(x); sink_shell(x);`).
|
||||
let sink_caps_bits: u16 = cfg_graph[finding.sink]
|
||||
|
|
@ -361,13 +361,33 @@ fn build_taint_diag(
|
|||
})
|
||||
.fold(0u16, |acc, b| acc | b);
|
||||
|
||||
// Phase C: when the sink's required caps include UNAUTHORIZED_ID — and
|
||||
// the finding actually reached that sink via the taint engine — use a
|
||||
// dedicated auth rule id so the finding is namespaced alongside the
|
||||
// standalone `auth_analysis` subsystem's output instead of being folded
|
||||
// into the generic `taint-unsanitised-flow` bucket.
|
||||
let diag_id = if sink_caps_bits & crate::labels::Cap::UNAUTHORIZED_ID.bits() != 0 {
|
||||
// Cap-specific rule-id routing.
|
||||
//
|
||||
// 1. `UNAUTHORIZED_ID`: namespace alongside the standalone `auth_analysis`
|
||||
// subsystem's output so cross-tool aggregation lines up.
|
||||
// 2. `DATA_EXFIL`: route to `taint-data-exfiltration` so SARIF surfaces a
|
||||
// distinct rule id from SSRF, the two share callees (e.g. `fetch`)
|
||||
// but represent different vulnerability classes.
|
||||
//
|
||||
// Prefer the per-finding `effective_sink_caps` (set by the multi-gate
|
||||
// SSA dispatch) when populated; fall back to the union of all sink-label
|
||||
// caps on the CFG node so legacy paths that build findings without
|
||||
// setting `effective_sink_caps` still pick the right rule id.
|
||||
let effective_caps = if finding.effective_sink_caps.is_empty() {
|
||||
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
|
||||
} else {
|
||||
finding.effective_sink_caps
|
||||
};
|
||||
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
|
||||
"rs.auth.missing_ownership_check.taint".to_string()
|
||||
} else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
|
||||
&& !effective_caps.contains(crate::labels::Cap::SSRF)
|
||||
{
|
||||
format!(
|
||||
"taint-data-exfiltration (source {}:{})",
|
||||
source_point.row + 1,
|
||||
source_point.column + 1
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"taint-unsanitised-flow (source {}:{})",
|
||||
|
|
@ -452,7 +472,7 @@ fn build_taint_diag(
|
|||
|
||||
/// Resolve a file extension to a language slug (e.g. `"rust"`,
|
||||
/// `"javascript"`). Public façade over [`lang_for_path`] for callers
|
||||
/// that only need the slug — used by the debug API to look up
|
||||
/// that only need the slug, used by the debug API to look up
|
||||
/// per-language rule enablement without re-parsing the file.
|
||||
pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> {
|
||||
lang_for_path(path).map(|(_, slug)| slug)
|
||||
|
|
@ -467,7 +487,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
|
|||
// use `.cc` / `.cxx` / `.hpp` / `.hh` / `.h++` rather than the
|
||||
// `.cpp` synthetic-fixture extension. Without these mappings,
|
||||
// the scanner silently skipped them. Headers (`.h` is omitted
|
||||
// intentionally — it's also valid C and disambiguating without a
|
||||
// intentionally, it's also valid C and disambiguating without a
|
||||
// build system is brittle).
|
||||
Some("cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++") => {
|
||||
Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp"))
|
||||
|
|
@ -481,7 +501,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
|
|||
"typescript",
|
||||
)),
|
||||
// TSX grammar is a superset of TypeScript plus JSX element/attribute
|
||||
// nodes — all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
|
||||
// nodes, all TypeScript KINDS / RULES / PARAM_CONFIG entries apply,
|
||||
// and JSX-specific sinks (e.g. `dangerouslySetInnerHTML`) layer on top
|
||||
// via the same `typescript` slug.
|
||||
Some("tsx") => Some((
|
||||
|
|
@ -493,7 +513,7 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
|
|||
"javascript",
|
||||
)),
|
||||
// JSX uses the same JavaScript grammar (tree-sitter-javascript handles
|
||||
// JSX natively) — slug "javascript" so all JS rules apply.
|
||||
// JSX natively), slug "javascript" so all JS rules apply.
|
||||
Some("jsx") => Some((
|
||||
Language::from(tree_sitter_javascript::LANGUAGE),
|
||||
"javascript",
|
||||
|
|
@ -739,7 +759,7 @@ impl<'a> ParsedSource<'a> {
|
|||
continue;
|
||||
}
|
||||
// Layer C: PHP `unserialize($x, ['allowed_classes' => [...]])`
|
||||
// or `unserialize($x, ['allowed_classes' => false])` —
|
||||
// or `unserialize($x, ['allowed_classes' => false])`,
|
||||
// PHP 7+ structural mitigation against object injection.
|
||||
// When the call passes an `allowed_classes` option set to
|
||||
// either `false` (no class instantiation) or an array
|
||||
|
|
@ -762,7 +782,7 @@ impl<'a> ParsedSource<'a> {
|
|||
// format-string contributes attacker-controlled length.
|
||||
// When the source argument is a string literal (or a
|
||||
// ternary of two string literals), the contributed length
|
||||
// is statically bounded — there is no overflow vector
|
||||
// is statically bounded, there is no overflow vector
|
||||
// for an attacker even if the destination buffer is
|
||||
// mis-sized. Same principle for `sprintf` when the
|
||||
// format string is a literal containing no bare `%s`
|
||||
|
|
@ -818,7 +838,7 @@ impl<'a> ParsedSource<'a> {
|
|||
/// Sort, dedup, and optionally downgrade severity for non-production paths.
|
||||
///
|
||||
/// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id,
|
||||
/// line, col)` — severity is NOT part of the key. Two diags that agree
|
||||
/// line, col)`, severity is NOT part of the key. Two diags that agree
|
||||
/// on (line, col, id) but differ in severity (e.g. a pattern-rule finding
|
||||
/// plus a taint-pipeline finding on the same call) would otherwise survive
|
||||
/// dedup here and crash the indexer with a UNIQUE constraint violation.
|
||||
|
|
@ -854,7 +874,7 @@ impl<'a> ParsedFile<'a> {
|
|||
// project-level `FrameworkContext` misses frameworks the file
|
||||
// obviously imports. Augment the per-file rule set with any
|
||||
// framework-conditional rules keyed off in-file import specifiers
|
||||
// (e.g. `import fastify from 'fastify'`). Idempotent — skips
|
||||
// (e.g. `import fastify from 'fastify'`). Idempotent, skips
|
||||
// frameworks already active from the manifest pass.
|
||||
let in_file_fws =
|
||||
crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug);
|
||||
|
|
@ -931,13 +951,13 @@ impl<'a> ParsedFile<'a> {
|
|||
self.source.lang_slug,
|
||||
);
|
||||
|
||||
// Phase 6 (typed call-graph subtype awareness): every
|
||||
// Every
|
||||
// `FuncSummary` exported from this file carries a copy of the
|
||||
// file's `hierarchy_edges` so the inheritance / impl /
|
||||
// implements relationships persist through SQLite round-trips
|
||||
// and re-merge into `crate::callgraph::TypeHierarchyIndex` at
|
||||
// call-graph build time. Cheap (one clone per summary) and
|
||||
// strictly additive — `merge_summaries` deduplicates downstream.
|
||||
// strictly additive, `merge_summaries` deduplicates downstream.
|
||||
if !self.file_cfg.hierarchy_edges.is_empty() {
|
||||
let edges = self.file_cfg.hierarchy_edges.clone();
|
||||
for s in &mut out {
|
||||
|
|
@ -982,7 +1002,7 @@ impl<'a> ParsedFile<'a> {
|
|||
///
|
||||
/// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`].
|
||||
/// The `FuncKey` identity preserves `(lang, namespace, container, name,
|
||||
/// arity, disambig, kind)` — so two same-name definitions in this file
|
||||
/// arity, disambig, kind)`, so two same-name definitions in this file
|
||||
/// (e.g. a free `process` and a `Worker::process`, or overloads with
|
||||
/// different arities) land on distinct entries instead of the later one
|
||||
/// shadowing the earlier one.
|
||||
|
|
@ -1003,7 +1023,7 @@ impl<'a> ParsedFile<'a> {
|
|||
|
||||
// Use the FileCfg path (same one `analyse_file` uses at taint time) so
|
||||
// the SSA summaries stored cross-file match exactly what pass 2 will
|
||||
// resolve against — no NodeIndex-space or entry-detection drift.
|
||||
// resolve against, no NodeIndex-space or entry-detection drift.
|
||||
let locator = crate::summary::SinkSiteLocator {
|
||||
tree: &self.source.tree,
|
||||
bytes: self.source.bytes,
|
||||
|
|
@ -1024,7 +1044,7 @@ impl<'a> ParsedFile<'a> {
|
|||
/// Lower every function body in this file to SSA exactly once. Used by
|
||||
/// [`analyse_file_fused`] to share the result between the taint engine
|
||||
/// ([`run_cfg_analyses_with_lowered`]) and the SSA artifact filter
|
||||
/// ([`build_eligible_bodies_from_lowered`]) — the prior code path lowered
|
||||
/// ([`build_eligible_bodies_from_lowered`]), the prior code path lowered
|
||||
/// twice (once inside `analyse_file`, once inside
|
||||
/// `extract_ssa_artifacts_from_file_cfg`) and accounted for ~24% of the
|
||||
/// pass-2 wall-clock on the bench corpus.
|
||||
|
|
@ -1038,7 +1058,7 @@ impl<'a> ParsedFile<'a> {
|
|||
/// here populates `param_to_sink` with concrete coordinates that the
|
||||
/// emission path then promotes into `Finding.primary_location`,
|
||||
/// causing the same-file summary-resolved sink to be reported at the
|
||||
/// callee-internal sink line instead of the call site — which both
|
||||
/// callee-internal sink line instead of the call site, which both
|
||||
/// duplicates the intraprocedural finding the taint engine already
|
||||
/// emits at that exact line and re-attributes the flow finding away
|
||||
/// from the user-visible call site. Closure-capture, lambda, and
|
||||
|
|
@ -1263,13 +1283,11 @@ impl<'a> ParsedFile<'a> {
|
|||
state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang);
|
||||
let mut all_state_findings = Vec::new();
|
||||
for body in &self.file_cfg.bodies {
|
||||
// Phase 2 of the pointer-analysis rollout: when
|
||||
// `NYX_POINTER_ANALYSIS=1` is set, derive a `var_name →
|
||||
// PtrProxyHint` map from the body's points-to facts so
|
||||
// the proxy-acquire transfer can suppress SymbolId
|
||||
// attribution on field-aliased receivers (e.g. `m :=
|
||||
// c.mu; m.Lock()`). Strict-additive — `None` when the
|
||||
// env-var is unset and behaviour matches today exactly.
|
||||
// When `NYX_POINTER_ANALYSIS=1` is set, derive a
|
||||
// `var_name → PtrProxyHint` map from the body's
|
||||
// points-to facts so the proxy-acquire transfer can
|
||||
// suppress SymbolId attribution on field-aliased
|
||||
// receivers (e.g. `m := c.mu; m.Lock()`).
|
||||
let body_pointer_hints = cfg_analysis::build_body_const_facts(body, caller_lang)
|
||||
.as_ref()
|
||||
.and_then(|f| {
|
||||
|
|
@ -1379,15 +1397,11 @@ impl<'a> ParsedFile<'a> {
|
|||
)
|
||||
}
|
||||
|
||||
/// Build a per-file `var_name → TypeKind` map by running SSA + type
|
||||
/// facts on each body and copying type facts for SSA values whose
|
||||
/// definition recorded a source-level variable name. When the same
|
||||
/// name resolves to different non-`Unknown` types across bodies the
|
||||
/// entry is dropped — absence is safe because the auth analysis
|
||||
/// sink gate simply falls back to its syntactic heuristics. Returns
|
||||
/// `None` when no body produces any typed variable (non-Rust files
|
||||
/// currently emit few `LocalCollection` / security-typed facts, but
|
||||
/// this path is language-agnostic).
|
||||
/// Build a per-file `var_name → TypeKind` map from SSA + type facts.
|
||||
/// Conflicting non-`Unknown` types across bodies drop the entry;
|
||||
/// absence is safe because the auth sink gate falls back to
|
||||
/// syntactic heuristics. Returns `None` when no body produces a
|
||||
/// typed variable.
|
||||
fn collect_file_var_types(&self) -> Option<auth_analysis::VarTypes> {
|
||||
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
|
||||
let mut merged: std::collections::HashMap<String, crate::ssa::type_facts::TypeKind> =
|
||||
|
|
@ -1492,7 +1506,7 @@ pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult<Option<(FileCf
|
|||
|
||||
/// Parse a file and return its `AuthorizationModel` for debug inspection.
|
||||
///
|
||||
/// Runs only the auth-extraction pipeline — no taint, no CFG construction.
|
||||
/// Runs only the auth-extraction pipeline, no taint, no CFG construction.
|
||||
/// Returns `None` for binary files or unsupported languages. Used by the
|
||||
/// `/api/debug/auth` route to surface the structured authorization model
|
||||
/// (routes, units, sensitive operations, auth checks) in the debug UI.
|
||||
|
|
@ -1607,7 +1621,7 @@ pub fn perf_stage_breakdown_fused(
|
|||
/// Diagnostic stage-timing helper for the perf audit.
|
||||
///
|
||||
/// Times each stage of pass 2 internally and returns µs counts. Returns
|
||||
/// `None` for unsupported languages. Not used in production — just for
|
||||
/// `None` for unsupported languages. Not used in production, just for
|
||||
/// `tests/perf_breakdown.rs` to attribute time inside `run_rules_on_bytes`
|
||||
/// without touching the hot path.
|
||||
#[doc(hidden)]
|
||||
|
|
@ -1651,7 +1665,7 @@ pub fn perf_stage_breakdown(
|
|||
///
|
||||
/// This is the shared pass-1 pipeline for indexed scans: parses once, builds
|
||||
/// CFG once, and returns both summary types. Uses the same `ParsedFile`
|
||||
/// pipeline as `analyse_file_fused` — no divergent extraction path.
|
||||
/// pipeline as `analyse_file_fused`, no divergent extraction path.
|
||||
pub fn extract_all_summaries_from_bytes(
|
||||
bytes: &[u8],
|
||||
path: &Path,
|
||||
|
|
@ -1727,7 +1741,7 @@ fn is_call_all_args_literal(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
|||
}
|
||||
|
||||
// If the argument list is empty (no args), we conservatively do NOT
|
||||
// suppress — the danger may come from side effects, not arguments.
|
||||
// suppress, the danger may come from side effects, not arguments.
|
||||
has_any_arg
|
||||
}
|
||||
|
||||
|
|
@ -1745,7 +1759,7 @@ fn find_enclosing_call(mut node: tree_sitter::Node) -> Option<tree_sitter::Node>
|
|||
if kind == "function_call_expression" {
|
||||
return Some(node);
|
||||
}
|
||||
// Stop at scope/statement boundaries — don't cross into outer calls
|
||||
// Stop at scope/statement boundaries, don't cross into outer calls
|
||||
if kind.contains("block")
|
||||
|| kind.contains("body")
|
||||
|| kind == "program"
|
||||
|
|
@ -1780,13 +1794,20 @@ fn find_arg_list(call: tree_sitter::Node) -> Option<tree_sitter::Node> {
|
|||
fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||||
let kind = node.kind();
|
||||
match kind {
|
||||
// String literals (most languages)
|
||||
// String literals, but Python's `string` node also covers
|
||||
// f-strings, which carry `interpolation` children. An f-string
|
||||
// with interpolation is *not* a literal: it embeds arbitrary
|
||||
// expressions, so a sink call like `cursor.execute(f"…{x}")`
|
||||
// must not be suppressed under Layer A's "all-literal args"
|
||||
// shortcut. Same shape applies to any tree-sitter grammar
|
||||
// that nests an `interpolation` (or `string_interpolation`)
|
||||
// child inside a string node.
|
||||
"string"
|
||||
| "string_literal"
|
||||
| "interpreted_string_literal"
|
||||
| "raw_string_literal"
|
||||
| "string_content"
|
||||
| "string_fragment" => true,
|
||||
| "string_fragment" => !has_interpolation(node),
|
||||
|
||||
// Numeric literals
|
||||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
|
||||
|
|
@ -1901,7 +1922,7 @@ fn is_php_include_param_passthrough(include_node: tree_sitter::Node, bytes: &[u8
|
|||
}
|
||||
return true;
|
||||
}
|
||||
// Stop at class/program scope without a matching function — bare
|
||||
// Stop at class/program scope without a matching function, bare
|
||||
// top-level `include $var` does not benefit from this guard.
|
||||
"program" | "class_declaration" | "trait_declaration" | "interface_declaration" => {
|
||||
return false;
|
||||
|
|
@ -2011,7 +2032,7 @@ fn is_var_reassigned_before(
|
|||
/// PHP-only: returns `true` when the captured `function_call_expression`
|
||||
/// node is `unserialize($x, [..., 'allowed_classes' => <ARRAY|false>, ...])`.
|
||||
/// This is the canonical PHP 7+ structural mitigation against object
|
||||
/// injection — explicitly restricting which classes the deserialiser may
|
||||
/// injection, explicitly restricting which classes the deserialiser may
|
||||
/// instantiate. Only suppress when the option is either:
|
||||
///
|
||||
/// - `'allowed_classes' => false` (no class instantiation), or
|
||||
|
|
@ -2091,9 +2112,9 @@ fn is_php_unserialize_allowed_classes_restricted(
|
|||
// Accept structural mitigation forms. The intent signal is
|
||||
// "developer explicitly set allowed_classes to something other than
|
||||
// `true`":
|
||||
// - boolean `false` — no class instantiation at all
|
||||
// - array literal — explicit allow-list
|
||||
// - class-constant reference — `self::ALLOWED_CLASSES` /
|
||||
// - boolean `false`: no class instantiation at all
|
||||
// - array literal: explicit allow-list
|
||||
// - class-constant reference: `self::ALLOWED_CLASSES` /
|
||||
// `Foo::CONSTANTS` resolved to
|
||||
// a const array; engine cannot
|
||||
// statically inspect, but the
|
||||
|
|
@ -2126,7 +2147,7 @@ fn is_php_unserialize_allowed_classes_restricted(
|
|||
/// `cpp.memory.*` mirrors) when the source argument can carry
|
||||
/// attacker-controlled length. Calls whose source is a string literal
|
||||
/// have a compile-time bound and cannot overflow due to attacker input
|
||||
/// — a too-small destination is a fixed developer bug (caught by
|
||||
/// so a too-small destination is a fixed developer bug (caught by
|
||||
/// compiler warnings / `-fstack-protector` / clang-tidy / ASan), not an
|
||||
/// exploitable channel. Suppressing these literal-source calls is a
|
||||
/// deliberate noise / false-positive reduction aligned with Nyx's scope
|
||||
|
|
@ -2141,14 +2162,14 @@ fn is_php_unserialize_allowed_classes_restricted(
|
|||
/// - `tests/fixtures/real_world/c/state/malloc_lifecycle.expect.json`
|
||||
/// - `tests/fixtures/real_world/cpp/state/new_delete.expect.json`
|
||||
/// - `tests/fixtures/real_world/cpp/state/malloc_branches.expect.json`
|
||||
/// - Positive cases (suppression must NOT fire — source is a parameter
|
||||
/// - Positive cases (suppression must NOT fire, source is a parameter
|
||||
/// or other attacker-reachable value) live as hard expectations
|
||||
/// (`must_match: true`) in the taint fixtures:
|
||||
/// - `tests/fixtures/real_world/c/taint/buffer_overflow.c`
|
||||
/// - `tests/fixtures/real_world/cpp/taint/gets_strcpy.cpp`
|
||||
///
|
||||
/// Removing this function or weakening its predicate would be caught by
|
||||
/// neither — it would be caught by the unit tests below.
|
||||
/// neither; it would be caught by the unit tests below.
|
||||
///
|
||||
/// Pattern rules `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf`
|
||||
/// (and the `cpp.memory.*` mirrors) flag the call syntactically; their
|
||||
|
|
@ -2173,7 +2194,7 @@ fn is_php_unserialize_allowed_classes_restricted(
|
|||
/// - source / format is an identifier (could be tainted, e.g.
|
||||
/// `sprintf(buf, fmt, …)`) → keep firing
|
||||
/// - format is `concatenated_string` containing identifier macros (e.g.
|
||||
/// `"%" PRId64`) — we cannot statically expand the macro, so refuse
|
||||
/// `"%" PRId64`), we cannot statically expand the macro, so refuse
|
||||
/// - bare `%s` in format → keep firing (could read unbounded length)
|
||||
fn is_c_buffer_call_literal_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||||
let kind = match rule_id {
|
||||
|
|
@ -2226,7 +2247,7 @@ enum CBufferRule {
|
|||
/// True for: a C/C++ string literal, OR a `conditional_expression` whose
|
||||
/// consequence + alternative are both either string literals or ALL_CAPS
|
||||
/// identifiers (the canonical preprocessor-macro naming convention for
|
||||
/// string-constant `#define`s — `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
|
||||
/// string-constant `#define`s, `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
|
||||
/// pervasively in postgres' `formatting.c::DCH_a_m`). Parenthesised forms
|
||||
/// are unwrapped.
|
||||
///
|
||||
|
|
@ -2348,7 +2369,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
|
|||
}
|
||||
i += 1;
|
||||
if i >= bytes.len() {
|
||||
// trailing `%` — malformed, refuse to suppress
|
||||
// trailing `%`, malformed, refuse to suppress
|
||||
return false;
|
||||
}
|
||||
if bytes[i] == b'%' {
|
||||
|
|
@ -2391,7 +2412,7 @@ pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
|
|||
let conv = bytes[i];
|
||||
i += 1;
|
||||
match conv {
|
||||
// Numeric / char / pointer specifiers — bounded output for any input
|
||||
// Numeric / char / pointer specifiers, bounded output for any input
|
||||
b'd' | b'i' | b'u' | b'o' | b'x' | b'X' | b'c' | b'e' | b'E' | b'f' | b'F' | b'g'
|
||||
| b'G' | b'a' | b'A' | b'p' | b'n' => continue,
|
||||
// String specifier: only safe when precision-bounded
|
||||
|
|
@ -2494,7 +2515,7 @@ struct TaintSuppressionCtx {
|
|||
/// distinguish "taint proved safe" from "taint failed to track".
|
||||
taint_finding_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
|
||||
/// Functions where the SSA engine emitted at least one
|
||||
/// `all_validated` event — every tainted input to *some* sink in
|
||||
/// `all_validated` event, every tainted input to *some* sink in
|
||||
/// the function passed through a recognised validation/
|
||||
/// sanitisation predicate. Drained from
|
||||
/// `take_all_validated_spans`; positive evidence that the engine
|
||||
|
|
@ -2502,14 +2523,14 @@ struct TaintSuppressionCtx {
|
|||
/// `taint-unsanitised-flow` finding fired and no Sanitizer label
|
||||
/// is present. Covers validation, dominator-based pruning,
|
||||
/// early-return guards, type-check predicates, and interprocedural
|
||||
/// sanitiser wrappers — all of which legitimately clear taint via
|
||||
/// sanitiser wrappers, all of which legitimately clear taint via
|
||||
/// SSA branch-narrowing rather than a labelled sanitiser node.
|
||||
engine_validated_funcs: HashSet<Option<String>>,
|
||||
/// Functions where some Source's defining variable is later
|
||||
/// rebound to a literal RHS (carries `TaintMeta.const_text`) in
|
||||
/// the same scope, with no Source label on the rebinding node.
|
||||
/// Positive evidence that the engine's SSA renaming structurally
|
||||
/// kills the source's taint before any sink can read it — covers
|
||||
/// kills the source's taint before any sink can read it, covers
|
||||
/// `cmd = getenv(); cmd = "echo hello"; system(cmd)` patterns
|
||||
/// where the rebind is what makes the code safe but the engine
|
||||
/// has no `Sanitizer` label or `taint-unsanitised-flow` finding to
|
||||
|
|
@ -2520,7 +2541,7 @@ struct TaintSuppressionCtx {
|
|||
/// interprocedural analysis cleared the flow through a
|
||||
/// user-defined wrapper (e.g. `def sanitize(s): return
|
||||
/// shlex.quote(s)`). The current per-function `Sanitizer` check
|
||||
/// only sees direct sanitisers in the *caller's* scope — without
|
||||
/// only sees direct sanitisers in the *caller's* scope, without
|
||||
/// this signal, every helper-wrapped sanitiser fires as an
|
||||
/// AST-pattern FP because the engine cleared the value via Phase
|
||||
/// 11 inline analysis but the sink's enclosing scope has no
|
||||
|
|
@ -2687,7 +2708,7 @@ impl TaintSuppressionCtx {
|
|||
// an "interproc sanitiser caller" when its body invokes any
|
||||
// helper whose own body contains a labelled Sanitizer. This
|
||||
// handles wrappers like `def sanitize(s): return
|
||||
// shlex.quote(s)` — the engine clears taint via Phase 11
|
||||
// shlex.quote(s)`, the engine clears taint via
|
||||
// inline analysis, but the caller's scope has no labelled
|
||||
// Sanitizer of its own to satisfy Condition 4(b).
|
||||
let mut interproc_sanitizer_callers: HashSet<Option<String>> = HashSet::new();
|
||||
|
|
@ -2703,7 +2724,7 @@ impl TaintSuppressionCtx {
|
|||
// each to its enclosing function via `sink_func_at_line`, and
|
||||
// record the function as "engine-validated". The set was
|
||||
// populated by `ssa_events_to_findings` whenever the engine
|
||||
// emitted an `SsaTaintEvent { all_validated: true, .. }` —
|
||||
// emitted an `SsaTaintEvent { all_validated: true, .. }`,
|
||||
// i.e. the engine reached a sink and proved every tainted
|
||||
// input passed validation. This is the broadest form of
|
||||
// engine-success evidence, covering predicate validation
|
||||
|
|
@ -2762,7 +2783,7 @@ impl TaintSuppressionCtx {
|
|||
// sink, since taint couldn't have evaluated a flow that doesn't exist.
|
||||
let func = match self.sink_func_at_line.get(&line) {
|
||||
Some(f) => f,
|
||||
None => return false, // No CFG sink at this line — taint had no opportunity to evaluate
|
||||
None => return false, // No CFG sink at this line, taint had no opportunity to evaluate
|
||||
};
|
||||
match self.source_lines_by_func.get(func) {
|
||||
Some(source_lines) => {
|
||||
|
|
@ -2788,7 +2809,7 @@ impl TaintSuppressionCtx {
|
|||
// OR
|
||||
// (c) the SSA engine emitted at least one `all_validated`
|
||||
// event in this function (engine reached *some* sink and
|
||||
// proved every tainted input was validated — covers
|
||||
// proved every tainted input was validated, covers
|
||||
// predicate validation, dominator early-return,
|
||||
// type-check predicates, and interprocedural sanitiser
|
||||
// wrappers that don't carry an explicit Sanitizer
|
||||
|
|
@ -2796,18 +2817,18 @@ impl TaintSuppressionCtx {
|
|||
// OR
|
||||
// (d) the function rebinds a Source's defining variable to
|
||||
// a literal RHS at a later line (engine's SSA renaming
|
||||
// structurally kills taint before any sink reads it —
|
||||
// structurally kills taint before any sink reads it;
|
||||
// covers `cmd = getenv(); cmd = "echo"; system(cmd)`),
|
||||
// OR
|
||||
// (e) the function calls a same-file helper whose body
|
||||
// contains a labelled Sanitizer (interprocedural
|
||||
// sanitiser wrapper — covers `def sanitize(s): return
|
||||
// sanitiser wrapper, covers `def sanitize(s): return
|
||||
// shlex.quote(s)` patterns where the engine clears
|
||||
// taint via Phase 11 inline analysis but the caller's
|
||||
// taint via inline analysis but the caller's
|
||||
// scope has no Sanitizer label of its own).
|
||||
//
|
||||
// When none hold, we can't distinguish silent engine failure
|
||||
// from real safety — e.g. Go points-to limitation on `&local`
|
||||
// from real safety, e.g. Go points-to limitation on `&local`
|
||||
// Decode destinations leaves the chain writeback fired but the
|
||||
// field-cell propagation dead, suppressing legitimate
|
||||
// AST-pattern findings on every Go CRUD handler whose Decode
|
||||
|
|
@ -2854,7 +2875,7 @@ pub fn run_rules_on_bytes(
|
|||
maybe_inject_test_panic(path);
|
||||
|
||||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||||
// Not a recognized tree-sitter language — try text-based patterns,
|
||||
// Not a recognized tree-sitter language, try text-based patterns,
|
||||
// but first surface a parse-timeout synthetic diag if that's what
|
||||
// caused try_new to return None.
|
||||
let mut out = scan_text_based_patterns(bytes, path, cfg);
|
||||
|
|
@ -2964,7 +2985,7 @@ pub fn analyse_file_fused(
|
|||
maybe_inject_test_panic(path);
|
||||
|
||||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||||
// Not a recognized tree-sitter language — try text-based patterns,
|
||||
// Not a recognized tree-sitter language, try text-based patterns,
|
||||
// and surface a parse-timeout synthetic diag if that's what caused
|
||||
// try_new to return None.
|
||||
let mut diags = scan_text_based_patterns(bytes, path, cfg);
|
||||
|
|
@ -2995,7 +3016,7 @@ pub fn analyse_file_fused(
|
|||
let (ssa_summaries, ssa_bodies) = if needs_cfg {
|
||||
// Lower SSA exactly once and feed both the taint engine and the
|
||||
// SSA-artifact extractor. Pre-fix, both consumers re-lowered the
|
||||
// same `FileCfg` independently — `lower_all_functions_from_bodies`
|
||||
// same `FileCfg` independently, `lower_all_functions_from_bodies`
|
||||
// accounted for ~20% of `analyse_file_fused` wall-clock on the
|
||||
// bench corpus.
|
||||
//
|
||||
|
|
@ -3294,7 +3315,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
|
|||
"method param pass-through should be recognised"
|
||||
);
|
||||
|
||||
// Local variable assigned from concat — NOT a pass-through.
|
||||
// Local variable assigned from concat, NOT a pass-through.
|
||||
let code = b"<?php\nclass C { function f(string $base): void { $f = $base . '/x.php'; include $f; } }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3303,7 +3324,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
|
|||
"concat-built local should NOT be treated as pass-through"
|
||||
);
|
||||
|
||||
// Param reassigned before include — NOT a pass-through.
|
||||
// Param reassigned before include, NOT a pass-through.
|
||||
let code = b"<?php\nfunction f($file) { $file = $_GET['x']; include $file; }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3312,7 +3333,7 @@ fn php_include_param_passthrough_recognises_canonical_shapes() {
|
|||
"reassigned param should NOT be treated as pass-through"
|
||||
);
|
||||
|
||||
// Top-level (no enclosing function) — NOT a pass-through.
|
||||
// Top-level (no enclosing function), NOT a pass-through.
|
||||
let code = b"<?php\n$file = $_GET['x'];\ninclude $file;\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3357,7 +3378,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
|
|||
"allowed_classes => self::CONST should be recognised as safe"
|
||||
);
|
||||
|
||||
// allowed_classes => true — unsafe default, must NOT be suppressed
|
||||
// allowed_classes => true, unsafe default, must NOT be suppressed
|
||||
let code = b"<?php\n$x = unserialize($d, ['allowed_classes' => true]);\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3366,7 +3387,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
|
|||
"allowed_classes => true is the unsafe default, should NOT be suppressed"
|
||||
);
|
||||
|
||||
// No second arg — must NOT be suppressed
|
||||
// No second arg, must NOT be suppressed
|
||||
let code = b"<?php\n$x = unserialize($d);\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3375,7 +3396,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
|
|||
"single-arg unserialize should NOT be suppressed"
|
||||
);
|
||||
|
||||
// Dynamic options variable — must NOT be suppressed
|
||||
// Dynamic options variable, must NOT be suppressed
|
||||
let code = b"<?php\n$x = unserialize($d, $opts);\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_php_capture(&tree, code, q);
|
||||
|
|
@ -3387,7 +3408,7 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
|
|||
|
||||
#[test]
|
||||
fn sprintf_format_safety_classifier() {
|
||||
// Numeric / char / pointer specifiers — bounded by definition.
|
||||
// Numeric / char / pointer specifiers, bounded by definition.
|
||||
assert!(sprintf_format_is_safe(""));
|
||||
assert!(sprintf_format_is_safe("hello world"));
|
||||
assert!(sprintf_format_is_safe("%d"));
|
||||
|
|
@ -3396,11 +3417,11 @@ fn sprintf_format_safety_classifier() {
|
|||
assert!(sprintf_format_is_safe("%5d %x %llo"));
|
||||
assert!(sprintf_format_is_safe("%%literal-percent"));
|
||||
assert!(sprintf_format_is_safe("%p"));
|
||||
// Precision-bounded `%s` / `%.*s` — output capped at precision.
|
||||
// Precision-bounded `%s` / `%.*s`, output capped at precision.
|
||||
assert!(sprintf_format_is_safe(" %.*s"));
|
||||
assert!(sprintf_format_is_safe("%.5s"));
|
||||
assert!(sprintf_format_is_safe("[%-.10s]"));
|
||||
// Bare `%s` / width-only `%5s` — width is a *minimum*, length is
|
||||
// Bare `%s` / width-only `%5s`, width is a *minimum*, length is
|
||||
// unbounded. Must NOT be suppressed.
|
||||
assert!(!sprintf_format_is_safe("%s"));
|
||||
assert!(!sprintf_format_is_safe("hello %s world"));
|
||||
|
|
@ -3441,7 +3462,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
let q_strcat = r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#;
|
||||
let q_sprintf = r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#;
|
||||
|
||||
// strcpy(dst, "literal") — postgres autoprewarm shape.
|
||||
// strcpy(dst, "literal"), postgres autoprewarm shape.
|
||||
let code = b"void f(char *d) { strcpy(d, \"pg_prewarm\"); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_strcpy);
|
||||
|
|
@ -3450,7 +3471,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcpy with string-literal source must be suppressed"
|
||||
);
|
||||
|
||||
// strcpy(dst, cond ? "a" : "b") — string-literal ternary.
|
||||
// strcpy(dst, cond ? "a" : "b"), string-literal ternary.
|
||||
let code = b"void f(char *s, int h) { strcpy(s, (h >= 12) ? \"p.m.\" : \"a.m.\"); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_strcpy);
|
||||
|
|
@ -3459,7 +3480,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcpy with ternary-of-literals source must be suppressed"
|
||||
);
|
||||
|
||||
// strcpy(dst, cond ? P_M_STR : A_M_STR) — postgres formatting.c
|
||||
// strcpy(dst, cond ? P_M_STR : A_M_STR), postgres formatting.c
|
||||
// shape with #define'd ALL_CAPS string-constant macros.
|
||||
let code = b"#define P_M_STR \"p.m.\"\n#define A_M_STR \"a.m.\"\nvoid f(char *s, int h) { strcpy(s, (h >= 12) ? P_M_STR : A_M_STR); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
|
|
@ -3469,7 +3490,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcpy with ternary-of-ALL_CAPS-macros must be suppressed"
|
||||
);
|
||||
|
||||
// strcpy(dst, cond ? var_a : var_b) — lowercase variables, NOT a
|
||||
// strcpy(dst, cond ? var_a : var_b), lowercase variables, NOT a
|
||||
// recognisable preprocessor macro shape. Must NOT suppress.
|
||||
let code = b"void f(char *s, int h, char *a, char *b) { strcpy(s, (h >= 12) ? a : b); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
|
|
@ -3479,7 +3500,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcpy with ternary-of-lowercase-vars must NOT be suppressed"
|
||||
);
|
||||
|
||||
// strcat(dst, "literal") — same principle as strcpy.
|
||||
// strcat(dst, "literal"), same principle as strcpy.
|
||||
let code = b"void f(char *d) { strcat(d, \" (done)\"); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_strcat);
|
||||
|
|
@ -3488,7 +3509,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcat with string-literal source must be suppressed"
|
||||
);
|
||||
|
||||
// sprintf(dst, "%lld%c", ...) — numeric format string.
|
||||
// sprintf(dst, "%lld%c", ...), numeric format string.
|
||||
let code = b"void f(char *cp, long long v, char u) { sprintf(cp, \"%lld%c\", v, u); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_sprintf);
|
||||
|
|
@ -3497,7 +3518,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"sprintf with numeric-only format must be suppressed"
|
||||
);
|
||||
|
||||
// sprintf(str, " %.*s", N, x) — precision-bounded `%s`.
|
||||
// sprintf(str, " %.*s", N, x), precision-bounded `%s`.
|
||||
let code = b"void f(char *str, int n, const char *x) { sprintf(str, \" %.*s\", n, x); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_sprintf);
|
||||
|
|
@ -3506,7 +3527,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"sprintf with precision-bounded `%.*s` must be suppressed"
|
||||
);
|
||||
|
||||
// strcpy(dst, src) where src is a non-literal — must NOT suppress.
|
||||
// strcpy(dst, src) where src is a non-literal, must NOT suppress.
|
||||
let code = b"void f(char *d, char **a) { strcpy(d, a[1]); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_strcpy);
|
||||
|
|
@ -3515,7 +3536,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"strcpy with non-literal source must NOT be suppressed"
|
||||
);
|
||||
|
||||
// sprintf with bare `%s` — must NOT suppress.
|
||||
// sprintf with bare `%s`, must NOT suppress.
|
||||
let code = b"void f(char *b, const char *u) { sprintf(b, \"%s\", u); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_sprintf);
|
||||
|
|
@ -3525,7 +3546,7 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
);
|
||||
|
||||
// sprintf with non-literal format (concatenated_string with PRI* macro)
|
||||
// — must NOT suppress (engine cannot statically expand the macro).
|
||||
// must NOT suppress (engine cannot statically expand the macro).
|
||||
let code = b"void f(char *b, long long v) { sprintf(b, \"%\" PRId64, v); }\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let cap = first_c_capture(&tree, code, q_sprintf);
|
||||
|
|
@ -3543,3 +3564,51 @@ fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
|
|||
"Layer D should only fire for buffer-overflow rule ids"
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression: `is_literal_node` must NOT classify a Python f-string
|
||||
/// (a `string` node containing `interpolation` children) as literal.
|
||||
/// Layer A's "all-args-literal → suppress Security finding" shortcut
|
||||
/// otherwise hides every CVE that injects via `cursor.execute(f"…{x}…")`
|
||||
/// or `text(f"…{x}…")`. Motivated by CVE-2025-69662 (geopandas SQLi
|
||||
/// via `text(f"SELECT … '{geom_name}' …")`) and CVE-2025-24793
|
||||
/// (snowflake-connector-python f-string-built CREATE STAGE / DROP).
|
||||
#[test]
|
||||
fn is_literal_node_rejects_python_fstring_with_interpolation() {
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
let lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
|
||||
parser.set_language(&lang).unwrap();
|
||||
|
||||
// f-string with one interpolation segment, must be non-literal.
|
||||
let code = b"x = f\"SELECT * WHERE y = '{u}'\"\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let assignment = tree
|
||||
.root_node()
|
||||
.child(0)
|
||||
.and_then(|s| s.child(0))
|
||||
.expect("assignment node");
|
||||
let rhs = assignment
|
||||
.child_by_field_name("right")
|
||||
.expect("RHS of assignment");
|
||||
assert_eq!(rhs.kind(), "string");
|
||||
assert!(
|
||||
!is_literal_node(rhs, code),
|
||||
"f-string with interpolation must not be classified as literal"
|
||||
);
|
||||
|
||||
// Plain string literal, must remain literal.
|
||||
let code = b"x = \"plain literal\"\n";
|
||||
let tree = parser.parse(code, None).unwrap();
|
||||
let assignment = tree
|
||||
.root_node()
|
||||
.child(0)
|
||||
.and_then(|s| s.child(0))
|
||||
.expect("assignment node");
|
||||
let rhs = assignment
|
||||
.child_by_field_name("right")
|
||||
.expect("RHS of assignment");
|
||||
assert_eq!(rhs.kind(), "string");
|
||||
assert!(
|
||||
is_literal_node(rhs, code),
|
||||
"plain string literal must be classified as literal"
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -220,7 +220,7 @@ fn check_token_override_without_validation(
|
|||
let mut findings = Vec::new();
|
||||
|
||||
for unit in &model.units {
|
||||
// The rule reasons about "Token acceptance flow" — by
|
||||
// The rule reasons about "Token acceptance flow", by
|
||||
// construction, that is a user-facing handler that receives a
|
||||
// token from the client and writes through token-bound state.
|
||||
// Internal helpers, Celery / cron tasks, Django migrations,
|
||||
|
|
@ -335,15 +335,12 @@ fn has_prior_subject_auth(
|
|||
})
|
||||
}
|
||||
|
||||
/// Phase A4 row-fetch exemption.
|
||||
/// Row-fetch exemption.
|
||||
///
|
||||
/// Recognises the canonical "fetch-then-authorize" idiom in row-level
|
||||
/// authz code: a route handler fetches a row by id (`let community =
|
||||
/// Community::read(pool, data.community_id)?`), then calls a named
|
||||
/// authorization function on the fetched row (`check_community_user_action(
|
||||
/// &user, &community, ...)`). The authorization check appears
|
||||
/// textually after the fetch, so the existing `check.line <= op.line`
|
||||
/// rule cannot cover the fetch.
|
||||
/// Recognises the "fetch-then-authorize" idiom: a handler fetches a
|
||||
/// row by id then calls a named authorization function on it. The
|
||||
/// check appears textually after the fetch, so the
|
||||
/// `check.line <= op.line` rule cannot cover the fetch.
|
||||
///
|
||||
/// The exemption fires only when:
|
||||
/// 1. `op` is the row-fetch operation itself (line == row let-line).
|
||||
|
|
@ -353,7 +350,7 @@ fn has_prior_subject_auth(
|
|||
/// Coverage is intentionally narrow: only the row-fetch operation is
|
||||
/// exempted. Any sink that runs *between* the fetch and the check
|
||||
/// (e.g. `delete(community)` before `check_*`) still flags, because
|
||||
/// its subject is `community` itself — not a fetch arg — and we
|
||||
/// its subject is `community` itself, not a fetch arg, and we
|
||||
/// require the operation to be a row-fetch site to apply the
|
||||
/// exemption.
|
||||
fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool {
|
||||
|
|
@ -374,8 +371,8 @@ fn has_row_fetch_exemption(unit: &AnalysisUnit, op: &SensitiveOperation) -> bool
|
|||
|
||||
// Look for any non-login auth check whose subjects mention the row.
|
||||
// Match against the *root* of the subject's chain (`a.b.c` → `a`)
|
||||
// so an auth check on a row's nested field — e.g.
|
||||
// `is_mod_or_admin(pool, &user, comment_view.community.id)` —
|
||||
// so an auth check on a row's nested field, e.g.
|
||||
// `is_mod_or_admin(pool, &user, comment_view.community.id)`,
|
||||
// still names the row var.
|
||||
unit.auth_checks.iter().any(|check| {
|
||||
if matches!(
|
||||
|
|
@ -425,6 +422,32 @@ fn has_prior_collection_auth(
|
|||
}
|
||||
|
||||
fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
||||
// **Route-level guard short-circuit.**
|
||||
//
|
||||
// A check declared at the route boundary (Flask `@requires_role`,
|
||||
// FastAPI `dependencies=[Depends(requires_access_dag(method=
|
||||
// "POST", access_entity=DagAccessEntity.RUN))]`, Django
|
||||
// `@permission_required`, Spring `@PreAuthorize`, Rails
|
||||
// `before_action :authorize`, axum `RequireAuthorizationLayer`)
|
||||
// gates the entire handler. The decorator / dependency call is
|
||||
// opaque to the engine, the inner `requires_access_dag` carries
|
||||
// no per-arg `ValueRef` pointing back into the handler body, so
|
||||
// the per-name subject coverage walk below cannot match it. The
|
||||
// structural shape, however, is unambiguous: every value the
|
||||
// handler receives, every row it fetches, and every sink it
|
||||
// calls runs after the route-level check has decided
|
||||
// authorization.
|
||||
//
|
||||
// `has_prior_subject_auth` already filters out
|
||||
// `LoginGuard` / `TokenExpiry` / `TokenRecipient` kinds before
|
||||
// calling this helper (login alone proves identity, not
|
||||
// authorization), so by the time we land here the kind is
|
||||
// `Other` / `Membership` / `Ownership` / `AdminGuard`, i.e. an
|
||||
// authorization-bearing decorator-level check. Returning `true`
|
||||
// unconditionally for those is the correct semantics.
|
||||
if check.is_route_level {
|
||||
return true;
|
||||
}
|
||||
let subject_key = canonical_subject_name(subject);
|
||||
let subject_related_base = related_subject_base(subject);
|
||||
// A2 + B3: walk the row-binding chain from this subject so a
|
||||
|
|
@ -447,7 +470,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy
|
|||
// check authorizes the resulting row (e.g. `check_community_user_action(
|
||||
// &user, &community, ..)` after `let community = Community::read(
|
||||
// pool, data.community_id)`), the check materially covers
|
||||
// `data.community_id` too — it gated access to the row that was
|
||||
// `data.community_id` too, it gated access to the row that was
|
||||
// fetched using that id, so any subsequent operation re-using the
|
||||
// same id (read of a related view, mutation on the row itself) is
|
||||
// within the scope of that authorization.
|
||||
|
|
@ -527,7 +550,7 @@ fn auth_check_covers_subject(check: &AuthCheck, subject: &ValueRef, unit: &Analy
|
|||
/// to recover every ancestor row binding name. Cycle-safe via a
|
||||
/// visited set; depth-bounded at 16 hops to keep the worst case
|
||||
/// trivial. Returns a vec containing `start` followed by each
|
||||
/// ancestor — empty when `start` is empty.
|
||||
/// ancestor, empty when `start` is empty.
|
||||
fn row_binding_chain(unit: &AnalysisUnit, start: &str) -> Vec<String> {
|
||||
let mut chain: Vec<String> = Vec::new();
|
||||
if start.is_empty() {
|
||||
|
|
@ -583,7 +606,7 @@ fn is_relevant_target_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
|||
/// it to a literal constant (`id := "id"`, `let userId = 1`, etc.).
|
||||
/// Such bindings cannot be user-controlled and so must not be
|
||||
/// classified as scoped-identifier subjects. Only matches plain
|
||||
/// `Identifier`-kind subjects (no base/field) — member chains like
|
||||
/// `Identifier`-kind subjects (no base/field), member chains like
|
||||
/// `req.params.id` still pass through to the regular checks.
|
||||
fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
||||
if subject.base.is_some() || subject.field.is_some() {
|
||||
|
|
@ -594,22 +617,22 @@ fn is_const_bound_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
|||
|
||||
/// True iff `subject` is a plain identifier that resolves to a
|
||||
/// function parameter whose static type is a payload-incompatible
|
||||
/// scalar (numeric or boolean — see [`super::apply_typed_bounded_params`]).
|
||||
/// scalar (numeric or boolean, see [`super::apply_typed_bounded_params`]).
|
||||
/// Spring `@PathVariable Long userId`, Axum `Path<i64>`, NestJS
|
||||
/// `@Param('id') id: number`, and FastAPI `user_id: int` all qualify.
|
||||
///
|
||||
/// Phase 6: also matches member-access subjects like `dto.userId`
|
||||
/// Also matches member-access subjects like `dto.userId`
|
||||
/// when `dto` is a typed-extractor parameter recognised by a Phase
|
||||
/// 1-2 matcher AND the field's declared TypeKind is Int/Bool.
|
||||
fn is_typed_bounded_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
||||
if subject.base.is_none() && subject.field.is_none() {
|
||||
return unit.typed_bounded_vars.contains(&subject.name);
|
||||
}
|
||||
// Phase 6: member-access shape `base.field` whose `base` is a
|
||||
// member-access shape `base.field` whose `base` is a
|
||||
// typed-extractor parameter and whose field is declared as an
|
||||
// Int/Bool in the same-file DTO definition. Per Hard Rule 3,
|
||||
// only fires when the base param itself was recognised by a
|
||||
// Phase 1-2 matcher — bare `dto.age` without a framework gate
|
||||
// typed-extractor matcher; bare `dto.age` without a framework gate
|
||||
// never lifts.
|
||||
let Some(base) = subject.base.as_deref() else {
|
||||
return false;
|
||||
|
|
@ -645,7 +668,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
|||
// A3: `V.id`-shape subjects where `V` is bound from a login-guard /
|
||||
// auth-check call (or from a typed self-actor extractor parameter)
|
||||
// are the caller's own id. `V.group_id` / `V.workspace_id` stay
|
||||
// relevant — only self-identifier fields trip this branch, so
|
||||
// relevant; only self-identifier fields trip this branch, so
|
||||
// foreign scoped ids on the same actor binding still flag.
|
||||
if let Some(base) = subject.base.as_deref() {
|
||||
let root = base.split('.').next().unwrap_or(base);
|
||||
|
|
@ -657,7 +680,7 @@ fn is_actor_context_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool {
|
|||
}
|
||||
|
||||
// Transitive copy of `V.id`: `let uid = user.id; query(.., &[uid])`
|
||||
// — the subject `uid` is a plain identifier with no base/field, but
|
||||
// The subject `uid` is a plain identifier with no base/field, but
|
||||
// was recorded as a self-actor id copy at extract time. Treat it
|
||||
// as actor context.
|
||||
if unit.self_actor_id_vars.contains(&subject.name) {
|
||||
|
|
@ -810,15 +833,15 @@ fn is_id_like_name(name: &str) -> bool {
|
|||
}
|
||||
|
||||
/// True when the analysis unit shows positive evidence of receiving
|
||||
/// user-controlled input — the precondition for any auth rule that
|
||||
/// user-controlled input, the precondition for any auth rule that
|
||||
/// reasons about "scoped identifier" or "token-acceptance flow"
|
||||
/// shapes.
|
||||
///
|
||||
/// A unit qualifies if any of the following hold:
|
||||
/// * It is a recognised framework route handler (`RouteHandler` —
|
||||
/// * It is a recognised framework route handler (`RouteHandler`,
|
||||
/// the strongest signal: registered with a router).
|
||||
/// * It accesses a request-shaped value (`request.body`, `req.params`,
|
||||
/// `c.Query(..)`, etc.) — populated as `context_inputs`.
|
||||
/// `c.Query(..)`, etc.), populated as `context_inputs`.
|
||||
/// * It declares at least one parameter whose name signals an
|
||||
/// externally-supplied value (id-like, token-like, request-like).
|
||||
/// Internal helpers that take only typed objects
|
||||
|
|
@ -826,7 +849,7 @@ fn is_id_like_name(name: &str) -> bool {
|
|||
/// `items`) are excluded.
|
||||
///
|
||||
/// Migrations, Celery tasks, pytest fixtures, conftest hooks, and
|
||||
/// pure utility helpers fail all three conditions and are skipped —
|
||||
/// pure utility helpers fail all three conditions and are skipped;
|
||||
/// they cannot, by construction, be the entry point of an
|
||||
/// authentication-bearing flow.
|
||||
fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
|
||||
|
|
@ -843,7 +866,7 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
|
|||
/// as part of its calling contract? Captures three classes of name:
|
||||
/// * id-like (`*_id`, `*Id`, `id`, `*Ids`),
|
||||
/// * token-like (`token`, `*_token`, `accessToken`),
|
||||
/// * framework-request objects (`request`, `req`, `ctx` — the
|
||||
/// * framework-request objects (`request`, `req`, `ctx`: the
|
||||
/// standard names used by Express/Django/Flask/Gin/Axum/NestJS
|
||||
/// handlers as the parameter that carries the HTTP request).
|
||||
///
|
||||
|
|
@ -851,12 +874,26 @@ fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool {
|
|||
/// functions that, while not registered as route handlers, are
|
||||
/// clearly invoked with caller-supplied identifiers or request data.
|
||||
fn is_external_input_param_name(name: &str) -> bool {
|
||||
// Pytest / unittest.mock convention: parameters injected by
|
||||
// `@mock.patch(...)` decorators are universally named
|
||||
// `mock_<thing>` (`mock_project_id`, `mock_session`,
|
||||
// `mock_user_id`). Their values are MagicMock instances created
|
||||
// by the test framework, not user-supplied input, even when the
|
||||
// suffix carries an id-shaped tail. Refusing the entire `mock_`
|
||||
// prefix is structural (mirrors pytest's documented convention)
|
||||
// and closes the airflow `tests/unit/google/cloud/hooks/`
|
||||
// cluster where every test method takes
|
||||
// `(self, get_conn, mock_project_id)` and the suffix tripped the
|
||||
// id-like heuristic.
|
||||
if name.starts_with("mock_") || name.starts_with("mocked_") {
|
||||
return false;
|
||||
}
|
||||
if is_id_like_name(name) {
|
||||
return true;
|
||||
}
|
||||
let lower = name.to_ascii_lowercase();
|
||||
// Token-shaped: bare `token` or any `*_token` / `*Token` /
|
||||
// `accessToken` / `refreshToken`-style suffix. Conservative —
|
||||
// `accessToken` / `refreshToken`-style suffix. Conservative:
|
||||
// only fires on explicit token-naming, not on incidental
|
||||
// substrings.
|
||||
if lower == "token" || lower.ends_with("_token") || lower.ends_with("token") {
|
||||
|
|
@ -951,7 +988,7 @@ mod tests {
|
|||
assert!(is_actor_context_subject(&member("user", "uid"), &unit));
|
||||
|
||||
// Pitfall guard: `user.group_id` / `user.workspace_id` stay
|
||||
// relevant — only self-identifier fields trip the widening.
|
||||
// relevant, only self-identifier fields trip the widening.
|
||||
assert!(!is_actor_context_subject(
|
||||
&member("user", "group_id"),
|
||||
&unit
|
||||
|
|
@ -962,7 +999,7 @@ mod tests {
|
|||
));
|
||||
|
||||
// Variables not in self_actor_vars fall back to the existing
|
||||
// identity-key match — `target.id` still flags.
|
||||
// identity-key match, `target.id` still flags.
|
||||
assert!(!is_actor_context_subject(&member("target", "id"), &unit));
|
||||
}
|
||||
|
||||
|
|
@ -1036,7 +1073,7 @@ mod tests {
|
|||
assert!(!is_relevant_target_subject(&plain("id"), &unit));
|
||||
|
||||
// Plain `id` NOT in the const-bound set still flags as
|
||||
// relevant — regression guard for the user-controlled case.
|
||||
// relevant, regression guard for the user-controlled case.
|
||||
let unit2 = empty_unit();
|
||||
assert!(is_relevant_target_subject(&plain("id"), &unit2));
|
||||
|
||||
|
|
@ -1046,12 +1083,12 @@ mod tests {
|
|||
assert!(is_relevant_target_subject(&member("req", "id"), &unit));
|
||||
}
|
||||
|
||||
/// Phase 5 typed-bounded subject exclusion: a parameter whose
|
||||
/// Typed-bounded subject exclusion: a parameter whose
|
||||
/// static type was recovered as `Int`/`Bool` (Spring `Long userId`,
|
||||
/// Axum `Path<i64>`, FastAPI `user_id: int`) has its name added to
|
||||
/// `unit.typed_bounded_vars` by `apply_typed_bounded_params`. The
|
||||
/// subject `userId` then must not be classified as a scoped
|
||||
/// identifier — the framework guarantees the value is numeric and
|
||||
/// identifier; the framework guarantees the value is numeric and
|
||||
/// cannot drive ownership-bypass.
|
||||
#[test]
|
||||
fn typed_bounded_plain_subjects_are_not_relevant() {
|
||||
|
|
@ -1066,7 +1103,7 @@ mod tests {
|
|||
assert!(is_relevant_target_subject(&plain("user_id"), &unit2));
|
||||
|
||||
// Member access `req.user_id` is unaffected (only plain
|
||||
// identifiers are exempted — fields/base remain regular
|
||||
// identifiers are exempted, fields/base remain regular
|
||||
// subjects so DTO-shape leaks still flag).
|
||||
unit.typed_bounded_vars.insert("req".into());
|
||||
assert!(is_relevant_target_subject(&member("req", "user_id"), &unit));
|
||||
|
|
@ -1080,17 +1117,17 @@ mod tests {
|
|||
#[test]
|
||||
fn unit_user_input_evidence_recognises_external_inputs() {
|
||||
// Function with no params and no context_inputs (Celery task
|
||||
// shape) — must NOT count as user-input-bearing.
|
||||
// shape) must NOT count as user-input-bearing.
|
||||
let mut unit = empty_unit();
|
||||
assert!(!unit_has_user_input_evidence(&unit));
|
||||
|
||||
// Adding internal-typed params (apps, schema_editor — Django
|
||||
// Adding internal-typed params (apps, schema_editor: the Django
|
||||
// migration RunPython callback shape) keeps the gate closed.
|
||||
unit.params.push("apps".into());
|
||||
unit.params.push("schema_editor".into());
|
||||
assert!(!unit_has_user_input_evidence(&unit));
|
||||
|
||||
// pytest hook shape: (config, items) — gate stays closed.
|
||||
// pytest hook shape: (config, items), gate stays closed.
|
||||
let mut unit = empty_unit();
|
||||
unit.params.push("config".into());
|
||||
unit.params.push("items".into());
|
||||
|
|
@ -1161,14 +1198,22 @@ mod tests {
|
|||
assert!(!is_external_input_param_name("manager"));
|
||||
// `c` alone is too common as a local variable to count.
|
||||
assert!(!is_external_input_param_name("c"));
|
||||
// Pytest / unittest.mock fixture-injected mocks: `mock_<x>` /
|
||||
// `mocked_<x>` names are MagicMock instances, not user input,
|
||||
// even when the suffix (`mock_project_id`) is id-shaped.
|
||||
assert!(!is_external_input_param_name("mock_project_id"));
|
||||
assert!(!is_external_input_param_name("mock_session"));
|
||||
assert!(!is_external_input_param_name("mock_user_id"));
|
||||
assert!(!is_external_input_param_name("mocked_request"));
|
||||
assert!(!is_external_input_param_name("mocked_token"));
|
||||
}
|
||||
|
||||
/// Phase A4 row-fetch exemption.
|
||||
/// Row-fetch exemption.
|
||||
///
|
||||
/// Row var declared at line 10; auth check naming the row appears
|
||||
/// at line 20. An operation at line 10 (the fetch) is exempted
|
||||
/// because the auth check authorises the resulting row. Coverage
|
||||
/// is intentionally narrow — operations between fetch (10) and
|
||||
/// is intentionally narrow: operations between fetch (10) and
|
||||
/// check (20) that are NOT row-fetch sites must still flag.
|
||||
#[test]
|
||||
fn row_fetch_exemption_covers_fetch_when_check_names_row() {
|
||||
|
|
@ -1192,6 +1237,7 @@ mod tests {
|
|||
line: 20,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
});
|
||||
|
||||
let fetch_op = SensitiveOperation {
|
||||
|
|
@ -1206,7 +1252,7 @@ mod tests {
|
|||
assert!(has_row_fetch_exemption(&unit, &fetch_op));
|
||||
|
||||
// Operation at a different line (between fetch and check) is
|
||||
// NOT a row-fetch site — exemption does not apply.
|
||||
// NOT a row-fetch site, exemption does not apply.
|
||||
let mid_op = SensitiveOperation {
|
||||
kind: OperationKind::Mutation,
|
||||
sink_class: None,
|
||||
|
|
@ -1229,7 +1275,7 @@ mod tests {
|
|||
"community".to_string(),
|
||||
(10, vec![member("data", "community_id")]),
|
||||
);
|
||||
// No auth check pushed — exemption must NOT apply.
|
||||
// No auth check pushed, exemption must NOT apply.
|
||||
|
||||
let fetch_op = SensitiveOperation {
|
||||
kind: OperationKind::Read,
|
||||
|
|
@ -1256,7 +1302,7 @@ mod tests {
|
|||
(10, vec![member("data", "community_id")]),
|
||||
);
|
||||
// Login-only check on the row should NOT exempt the row-fetch
|
||||
// — login proves identity, not authorization.
|
||||
// (login proves identity, not authorization).
|
||||
unit.auth_checks.push(AuthCheck {
|
||||
kind: AuthCheckKind::LoginGuard,
|
||||
callee: "require_login".into(),
|
||||
|
|
@ -1265,6 +1311,7 @@ mod tests {
|
|||
line: 20,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
});
|
||||
|
||||
let fetch_op = SensitiveOperation {
|
||||
|
|
@ -1305,10 +1352,11 @@ mod tests {
|
|||
line: 20,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
};
|
||||
|
||||
// Direct member subject `data.community_id` (the original
|
||||
// request field) — covered via reverse-walk.
|
||||
// request field), covered via reverse-walk.
|
||||
assert!(auth_check_covers_subject(
|
||||
&check,
|
||||
&member("data", "community_id"),
|
||||
|
|
@ -1334,7 +1382,7 @@ mod tests {
|
|||
/// Subject as plain identifier copied from the request
|
||||
/// (`let community_id = data.community_id; let community =
|
||||
/// Community::read(pool, community_id);`) must also benefit from
|
||||
/// the reverse-walk — `row_population_data["community"]` then
|
||||
/// the reverse-walk, `row_population_data["community"]` then
|
||||
/// records `[community_id]` (a plain identifier, not the
|
||||
/// member-access shape).
|
||||
#[test]
|
||||
|
|
@ -1352,6 +1400,7 @@ mod tests {
|
|||
line: 20,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
};
|
||||
|
||||
assert!(auth_check_covers_subject(
|
||||
|
|
@ -1392,9 +1441,10 @@ mod tests {
|
|||
line: 20,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
};
|
||||
|
||||
// Sink subject is the bare alias — covered via the chain.
|
||||
// Sink subject is the bare alias, covered via the chain.
|
||||
assert!(auth_check_covers_subject(
|
||||
&check,
|
||||
&plain("community_id"),
|
||||
|
|
@ -1412,4 +1462,73 @@ mod tests {
|
|||
// Plain identifier with no alias entry must NOT be covered.
|
||||
assert!(!auth_check_covers_subject(&check, &plain("post_id"), &unit));
|
||||
}
|
||||
|
||||
/// Route-level guard short-circuit (FastAPI / Flask /
|
||||
/// Django / Spring / Rails / axum decorator-level auth).
|
||||
///
|
||||
/// The decorator-level `@requires_role` /
|
||||
/// `dependencies=[Depends(requires_access_dag(...))]` /
|
||||
/// `before_action :authorize` runs before the handler body and
|
||||
/// authorizes every value the handler receives. The check has
|
||||
/// no per-arg `ValueRef` pointing back into the body, so the
|
||||
/// per-name subject coverage walk cannot model the semantics.
|
||||
/// `auth_check_covers_subject` short-circuits `true` for any
|
||||
/// authorization-bearing route-level check (LoginGuard etc. are
|
||||
/// already filtered out by `has_prior_subject_auth`).
|
||||
#[test]
|
||||
fn auth_check_covers_subject_route_level_short_circuits() {
|
||||
use crate::auth_analysis::model::{AuthCheck, AuthCheckKind};
|
||||
|
||||
let unit = empty_unit();
|
||||
let route_check = AuthCheck {
|
||||
kind: AuthCheckKind::Other,
|
||||
callee: "requires_access_dag".into(),
|
||||
subjects: Vec::new(), // route-level checks carry no body subjects
|
||||
span: (0, 0),
|
||||
line: 0,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: true,
|
||||
};
|
||||
|
||||
// Any subject is covered when the check is route-level:
|
||||
// path param, request body field, row-fetch receiver, all of
|
||||
// them. The per-name walk would have rejected each.
|
||||
assert!(auth_check_covers_subject(
|
||||
&route_check,
|
||||
&plain("dag_id"),
|
||||
&unit
|
||||
));
|
||||
assert!(auth_check_covers_subject(
|
||||
&route_check,
|
||||
&member("req", "dag_run_id"),
|
||||
&unit
|
||||
));
|
||||
assert!(auth_check_covers_subject(
|
||||
&route_check,
|
||||
&plain("dag"),
|
||||
&unit
|
||||
));
|
||||
|
||||
// Sanity check: an in-body check with no subjects (the prior
|
||||
// shape) does NOT cover arbitrary subjects. Without the
|
||||
// route-level flag, the empty subjects vec means the
|
||||
// `check.subjects.iter().any(...)` walk fails for every
|
||||
// candidate.
|
||||
let in_body_check = AuthCheck {
|
||||
kind: AuthCheckKind::Other,
|
||||
callee: "requires_access_dag".into(),
|
||||
subjects: Vec::new(),
|
||||
span: (0, 0),
|
||||
line: 0,
|
||||
args: Vec::new(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
};
|
||||
assert!(!auth_check_covers_subject(
|
||||
&in_body_check,
|
||||
&plain("dag_id"),
|
||||
&unit
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -173,7 +173,7 @@ impl AuthAnalysisRules {
|
|||
/// Does the LAST segment of the callee match a configured non-sink
|
||||
/// method name (case-sensitive exact)? Used to recognise DOM-API
|
||||
/// methods like `addEventListener` / `appendChild` regardless of
|
||||
/// receiver — `someElement.addEventListener` is just as
|
||||
/// receiver, `someElement.addEventListener` is just as
|
||||
/// categorically client-side as `document.addEventListener`.
|
||||
pub fn callee_has_non_sink_method(&self, callee: &str) -> bool {
|
||||
let last = bare_method_name(callee);
|
||||
|
|
@ -200,19 +200,19 @@ impl AuthAnalysisRules {
|
|||
/// Classify a call into a [`SinkClass`].
|
||||
///
|
||||
/// Dispatch order (first match wins):
|
||||
/// 1. `InMemoryLocal` — receiver is a known non-sink collection
|
||||
/// 1. `InMemoryLocal`, receiver is a known non-sink collection
|
||||
/// (tracked in `non_sink_vars` or matches a configured
|
||||
/// non-sink prefix).
|
||||
/// 2. `RealtimePublish` — receiver first-segment matches a
|
||||
/// 2. `RealtimePublish`, receiver first-segment matches a
|
||||
/// configured realtime prefix (e.g. `realtime`, `pubsub`).
|
||||
/// 3. `OutboundNetwork` — receiver first-segment matches a
|
||||
/// 3. `OutboundNetwork`, receiver first-segment matches a
|
||||
/// configured outbound-network prefix (e.g. `http`, `reqwest`).
|
||||
/// 4. `CacheCrossTenant` — receiver first-segment matches a
|
||||
/// 4. `CacheCrossTenant`, receiver first-segment matches a
|
||||
/// configured cache prefix (e.g. `cache`, `redis`).
|
||||
/// 5. `DbMutation` — callee name matches `mutation_indicator_names`.
|
||||
/// 6. `DbCrossTenantRead` — callee name matches `read_indicator_names`.
|
||||
/// 5. `DbMutation`, callee name matches `mutation_indicator_names`.
|
||||
/// 6. `DbCrossTenantRead`, callee name matches `read_indicator_names`.
|
||||
///
|
||||
/// Returns `None` when the callee matches none of the above — the
|
||||
/// Returns `None` when the callee matches none of the above, the
|
||||
/// call site is ignored by ownership-gap checks.
|
||||
pub fn classify_sink_class(
|
||||
&self,
|
||||
|
|
@ -227,8 +227,8 @@ impl AuthAnalysisRules {
|
|||
// (`el.addEventListener`, `parent.appendChild`) are categorically
|
||||
// not data-layer auth-relevant operations. These shapes would
|
||||
// otherwise prefix-match read/mutation indicators (`get`, `add`,
|
||||
// `remove`) — `getElementById` canonicalises to `getelementbyid`
|
||||
// which `starts_with("get")` — and falsely classify as
|
||||
// `remove`) (`getElementById` canonicalises to `getelementbyid`,
|
||||
// which `starts_with("get")`) and falsely classify as
|
||||
// `DbCrossTenantRead` / `DbMutation`.
|
||||
if self.callee_has_non_sink_global_receiver(callee)
|
||||
|| self.callee_has_non_sink_method(callee)
|
||||
|
|
@ -251,7 +251,7 @@ impl AuthAnalysisRules {
|
|||
// receiver. When the receiver chain itself contains a call
|
||||
// expression (`w.Header().Get(..)`, `r.URL.Query().Get(..)`,
|
||||
// `db.Tx(..).Query(..)`), the receiver is the *return value of
|
||||
// another call* — its type is opaque to the auth analyser and
|
||||
// another call*; its type is opaque to the auth analyser and
|
||||
// the bare verb match is too speculative to assume a data-layer
|
||||
// sink. The realtime/outbound/cache prefix dispatches above
|
||||
// already match by the chain root; if none of them claimed the
|
||||
|
|
@ -501,6 +501,13 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
"user_passes_test".into(),
|
||||
"verify_access".into(),
|
||||
"authorize".into(),
|
||||
// FastAPI dependency-injection auth idiom: airflow uses
|
||||
// `Depends(requires_access_dag(method="GET"))`,
|
||||
// `requires_access_connection(...)`, etc. The unwrapped
|
||||
// inner call name is `requires_access_<resource>`; the
|
||||
// `requires_access` prefix matches all variants via
|
||||
// `matches_name`.
|
||||
"requires_access".into(),
|
||||
],
|
||||
mutation_indicator_names: vec![
|
||||
"update".into(),
|
||||
|
|
@ -615,7 +622,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
"verify_access!".into(),
|
||||
"can_access?".into(),
|
||||
"can?".into(),
|
||||
// Rails per-record permission predicates — the canonical
|
||||
// Rails per-record permission predicates, the canonical
|
||||
// "load by id, then check on the loaded record" idiom
|
||||
// (see redmine `app/controllers/issues_controller.rb`,
|
||||
// mastodon controllers, diaspora ApplicationController).
|
||||
|
|
@ -961,7 +968,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
"can_access".into(),
|
||||
"can_manage".into(),
|
||||
// Common project-specific helpers seen in real Axum/Rocket
|
||||
// codebases — kept as defaults so user code that names
|
||||
// codebases, kept as defaults so user code that names
|
||||
// its membership helper after the resource still gets
|
||||
// recognised. Users can extend via `nyx.toml`.
|
||||
"require_group_member".into(),
|
||||
|
|
@ -1045,7 +1052,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
"FxHashSet".into(),
|
||||
"DashMap".into(),
|
||||
"DashSet".into(),
|
||||
// `serde_json::Map` (last-segment `Map`) — common JSON
|
||||
// `serde_json::Map` (last-segment `Map`), common JSON
|
||||
// body builder where `m.insert("k", v)` is a string-key
|
||||
// assignment on an in-memory object, not a DB write.
|
||||
"Map".into(),
|
||||
|
|
@ -1161,7 +1168,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
],
|
||||
non_sink_receiver_types: Vec::new(),
|
||||
non_sink_receiver_name_prefixes: Vec::new(),
|
||||
// Browser/DOM globals — calls on these receivers are
|
||||
// Browser/DOM globals, calls on these receivers are
|
||||
// categorically client-side (no server-side authorization
|
||||
// semantics). Without this list, `document.getElementById`
|
||||
// would prefix-match the read-indicator `get`,
|
||||
|
|
@ -1196,7 +1203,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules {
|
|||
"WeakMap".into(),
|
||||
"WeakSet".into(),
|
||||
],
|
||||
// DOM-API methods — when the LAST segment of the callee
|
||||
// DOM-API methods, when the LAST segment of the callee
|
||||
// matches, the call is non-data-layer regardless of receiver
|
||||
// (`el.addEventListener`, `parent.appendChild`). These
|
||||
// methods would otherwise prefix-match `add`, `remove`,
|
||||
|
|
@ -1345,7 +1352,7 @@ pub fn first_receiver_segment(callee: &str) -> &str {
|
|||
callee.split('.').next().unwrap_or(callee)
|
||||
}
|
||||
|
||||
/// True when the callee's receiver chain contains a call expression —
|
||||
/// True when the callee's receiver chain contains a call expression,
|
||||
/// i.e. the LAST segment is being invoked on the *return value* of an
|
||||
/// earlier call (`w.Header().Get`, `r.URL.Query().Get`,
|
||||
/// `db.Tx(opts).Query`). Detected as: the substring before the last
|
||||
|
|
@ -1366,7 +1373,7 @@ pub fn receiver_is_chained_call(callee: &str) -> bool {
|
|||
/// (`member`, `owner`, `admin`, `access`, `permission`, `manager`,
|
||||
/// `editor`, `viewer`, `user`, `mod`). The resource segment is
|
||||
/// project-specific (`trip`, `doc`, `project`, `community`, …) and
|
||||
/// cannot be enumerated in the static defaults — but the
|
||||
/// cannot be enumerated in the static defaults, but the
|
||||
/// prefix+role pattern is unambiguous enough that recognising it as
|
||||
/// an authorization check is safe. Also accepts `is_<role>` /
|
||||
/// `is_<role>_(or|and)_<role>...` predicate forms (`is_admin`,
|
||||
|
|
@ -1398,7 +1405,7 @@ fn is_require_resource_role_call(name: &str) -> bool {
|
|||
}
|
||||
|
||||
// Pattern 2: `is_<role>` and `is_<role>_(or|and)_<role>...`.
|
||||
// Conservative role list — excludes `user` / `staff` to avoid
|
||||
// Conservative role list, excludes `user` / `staff` to avoid
|
||||
// matching ambiguous predicates like `is_user`.
|
||||
if let Some(rest) = lower.strip_prefix("is_")
|
||||
&& !rest.is_empty()
|
||||
|
|
@ -1682,7 +1689,7 @@ mod tests {
|
|||
assert!(receiver_is_chained_call("r.URL.Query().Get"));
|
||||
assert!(receiver_is_chained_call("db.Tx(opts).Query"));
|
||||
assert!(receiver_is_chained_call("client.WithToken(t).Get"));
|
||||
// Pure field/identifier chain — no `(` anywhere.
|
||||
// Pure field/identifier chain, no `(` anywhere.
|
||||
assert!(!receiver_is_chained_call("repo.Find"));
|
||||
assert!(!receiver_is_chained_call("c.Fs.Create"));
|
||||
assert!(!receiver_is_chained_call("globalBatchJobsMetrics.save"));
|
||||
|
|
@ -1701,7 +1708,7 @@ mod tests {
|
|||
let empty: HashSet<String> = HashSet::new();
|
||||
|
||||
// Chained-call receiver: verb-name fallback is suppressed.
|
||||
// The minio `w.Header().Get(constName)` cluster — `Get` would
|
||||
// The minio `w.Header().Get(constName)` cluster, `Get` would
|
||||
// match the `Get` read indicator on a bare receiver but the
|
||||
// chained-call shape masks the receiver type.
|
||||
assert_eq!(rules.classify_sink_class("w.Header().Get", &empty), None);
|
||||
|
|
@ -1742,7 +1749,7 @@ mod tests {
|
|||
let rules = build_auth_rules(&cfg, "javascript");
|
||||
let empty: HashSet<String> = HashSet::new();
|
||||
|
||||
// Globals — receiver-first-segment match.
|
||||
// Globals, receiver-first-segment match.
|
||||
assert_eq!(
|
||||
rules.classify_sink_class("document.getElementById", &empty),
|
||||
Some(SinkClass::InMemoryLocal)
|
||||
|
|
@ -1760,7 +1767,7 @@ mod tests {
|
|||
Some(SinkClass::InMemoryLocal)
|
||||
);
|
||||
|
||||
// Method allowlist — last-segment match regardless of receiver.
|
||||
// Method allowlist, last-segment match regardless of receiver.
|
||||
assert_eq!(
|
||||
rules.classify_sink_class("input.addEventListener", &empty),
|
||||
Some(SinkClass::InMemoryLocal)
|
||||
|
|
@ -1801,22 +1808,22 @@ mod tests {
|
|||
assert!(rules.is_authorization_check("authz::require_trip_member"));
|
||||
assert!(rules.is_authorization_check("self.require_album_editor"));
|
||||
|
||||
// Negatives — random `require_*` calls without a known role
|
||||
// Negatives, random `require_*` calls without a known role
|
||||
// suffix do NOT count as authorization.
|
||||
assert!(!rules.is_authorization_check("require_db"));
|
||||
assert!(!rules.is_authorization_check("require_user"));
|
||||
assert!(!rules.is_authorization_check("require_login"));
|
||||
// Bare `require_member` / `require_owner` (no resource segment)
|
||||
// aren't enough — the resource segment is what makes the helper
|
||||
// aren't enough, the resource segment is what makes the helper
|
||||
// unambiguous.
|
||||
assert!(!rules.is_authorization_check("require_member"));
|
||||
assert!(!rules.is_authorization_check("require_owner"));
|
||||
}
|
||||
|
||||
/// Phase A4 — broader verb / role / context-suffix shapes seen in
|
||||
/// real-world Rust apps. `check_<resource>_<role>_action` is the
|
||||
/// canonical lemmy idiom; verifying the `is_<role>` predicate
|
||||
/// recogniser closes `is_mod_or_admin` style checks.
|
||||
/// Broader verb / role / context-suffix shapes seen in real-world
|
||||
/// Rust apps. `check_<resource>_<role>_action` is the canonical
|
||||
/// lemmy idiom; the `is_<role>` predicate recogniser closes
|
||||
/// `is_mod_or_admin` style checks.
|
||||
#[test]
|
||||
fn is_authorization_check_recognises_check_action_and_predicate_shapes() {
|
||||
let cfg = Config::default();
|
||||
|
|
@ -1847,7 +1854,7 @@ mod tests {
|
|||
assert!(rules.is_authorization_check("is_admin_or_moderator"));
|
||||
assert!(rules.is_authorization_check("is_member_and_owner"));
|
||||
|
||||
// Negatives — predicates whose tokens are NOT known auth roles.
|
||||
// Negatives, predicates whose tokens are NOT known auth roles.
|
||||
assert!(!rules.is_authorization_check("is_user"));
|
||||
assert!(!rules.is_authorization_check("is_logged_in"));
|
||||
assert!(!rules.is_authorization_check("is_active"));
|
||||
|
|
|
|||
|
|
@ -384,8 +384,8 @@ fn classify_rocket_param(
|
|||
///
|
||||
/// **Looser than [`super::common::is_self_actor_type_text`] by
|
||||
/// design.** This recogniser runs only on the type of a route-bound
|
||||
/// parameter — appearing in a route handler signature is itself a
|
||||
/// strong signal — and a false positive here just over-credits the
|
||||
/// parameter (appearing in a route handler signature is itself a
|
||||
/// strong signal), and a false positive here just over-credits the
|
||||
/// route with a login guard, which is conservative w.r.t. flagging.
|
||||
/// `is_self_actor_type_text` runs on every parameter, including in
|
||||
/// non-route functions, and a false positive there suppresses
|
||||
|
|
@ -625,6 +625,11 @@ pub(crate) fn inject_guard_checks(
|
|||
line,
|
||||
args: call.args.clone(),
|
||||
condition_text: None,
|
||||
// Route-level guard injected from a tower / axum layer
|
||||
// (`RequireAuthorizationLayer`, `axum_login::login_required!`,
|
||||
// …). Tells `auth_check_covers_subject` to short-circuit
|
||||
// for any non-login-guard match.
|
||||
is_route_level: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -209,7 +209,12 @@ fn collect_class_based_routes(
|
|||
}
|
||||
let line = method_node.start_position().row + 1;
|
||||
for call in &middleware_calls {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Django class-based-view decorators (`@method_decorator(login_required)`,
|
||||
// `@permission_required(...)`) and DRF `permission_classes`
|
||||
// are declared at the route boundary; mark route-level
|
||||
// so coverage applies to the action body's operations.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
|
|
@ -443,7 +448,14 @@ fn inject_middleware_auth(
|
|||
return;
|
||||
};
|
||||
for call in middleware_calls {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Django decorators (`@login_required`, `@permission_required`,
|
||||
// `@user_passes_test`, etc.) and DRF `permission_classes` are
|
||||
// declared at the route boundary; mark route-level so
|
||||
// `auth_check_covers_subject` short-circuits `true` for any
|
||||
// non-login-guard match. See flask.rs / model.rs for the
|
||||
// full rationale.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,15 @@ fn maybe_collect_flask_route(
|
|||
for decorator in decorator_expressions(node) {
|
||||
if let Some(mut specs) = parse_flask_route_decorator(decorator, bytes) {
|
||||
route_specs.append(&mut specs);
|
||||
// FastAPI puts route-level dependencies (auth checks +
|
||||
// logging hooks) inside the route decorator's
|
||||
// `dependencies=[Depends(...)]` keyword argument, instead
|
||||
// of as separate `@decorator` lines like Flask. Walk the
|
||||
// route decorator's keyword args for that shape and lift
|
||||
// each `Depends(call(...))` element into the
|
||||
// middleware_calls list, so the same `inject_middleware_auth`
|
||||
// path that Flask uses also picks up FastAPI auth deps.
|
||||
middleware_calls.extend(extract_fastapi_dependencies(decorator, bytes));
|
||||
} else {
|
||||
middleware_calls.extend(expand_decorator_calls(decorator, bytes));
|
||||
}
|
||||
|
|
@ -220,6 +229,75 @@ fn expand_decorator_calls(node: Node<'_>, bytes: &[u8]) -> Vec<CallSite> {
|
|||
vec![call_site_from_node(node, bytes)]
|
||||
}
|
||||
|
||||
/// Walk the route-decorator call's keyword args looking for the FastAPI
|
||||
/// `dependencies=[Depends(call(...)), Depends(call), ...]` shape. For
|
||||
/// each `Depends(...)` list element, extract the inner callable as a
|
||||
/// `CallSite` so it can flow through `inject_middleware_auth` and be
|
||||
/// matched against the per-language authorization-check / login-guard
|
||||
/// name lists. Refuses non-call elements and `Depends(...)` without a
|
||||
/// recognised inner call shape.
|
||||
///
|
||||
/// The function is decoupled from Flask semantics (Flask routes never
|
||||
/// use `dependencies=`); the lookup is purely structural and matches
|
||||
/// FastAPI's documented dependency-injection convention. Lives in the
|
||||
/// flask module because Flask's route-decorator parser already targets
|
||||
/// the `@<router>.<method>(<path>, ...)` shape that FastAPI shares.
|
||||
fn extract_fastapi_dependencies(decorator_expr: Node<'_>, bytes: &[u8]) -> Vec<CallSite> {
|
||||
if decorator_expr.kind() != "call" {
|
||||
return Vec::new();
|
||||
}
|
||||
let Some(arguments) = decorator_expr.child_by_field_name("arguments") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let Some(value) = keyword_argument_value(arguments, bytes, "dependencies") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let mut out = Vec::new();
|
||||
for element in named_children(value) {
|
||||
if let Some(call) = unwrap_depends_call(element, bytes) {
|
||||
out.push(call);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Unwrap one `Depends(...)` list element from a FastAPI `dependencies`
|
||||
/// list and return the inner callable as a `CallSite`. Three shapes
|
||||
/// are accepted:
|
||||
/// * `Depends(callee(arg1, arg2))`, most common, the inner call is
|
||||
/// the callable factory invocation; record `callee` as the auth
|
||||
/// check.
|
||||
/// * `Depends(callee)`, bare reference; record `callee` itself.
|
||||
/// * `Depends()` / non-`Depends` items, skipped.
|
||||
fn unwrap_depends_call(node: Node<'_>, bytes: &[u8]) -> Option<CallSite> {
|
||||
if node.kind() != "call" {
|
||||
return None;
|
||||
}
|
||||
let function = node.child_by_field_name("function")?;
|
||||
let function_text = text(function, bytes);
|
||||
if !is_depends_callee(&function_text) {
|
||||
return None;
|
||||
}
|
||||
let arguments = node.child_by_field_name("arguments")?;
|
||||
let first = named_children(arguments).into_iter().next()?;
|
||||
match first.kind() {
|
||||
"call" => Some(call_site_from_node(first, bytes)),
|
||||
"identifier" | "attribute" | "scoped_identifier" => Some(call_site_from_node(first, bytes)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether `callee` spells the FastAPI `Depends` marker.
///
/// Surrounding whitespace is ignored. Only the three literal spellings
/// `Depends`, `fastapi.Depends` and `fastapi.params.Depends` are
/// accepted; no alias resolution or other canonicalisation is attempted.
fn is_depends_callee(callee: &str) -> bool {
    const DEPENDS_SPELLINGS: [&str; 3] =
        ["Depends", "fastapi.Depends", "fastapi.params.Depends"];

    DEPENDS_SPELLINGS.contains(&callee.trim())
}
|
||||
|
||||
fn inject_middleware_auth(
|
||||
model: &mut AuthorizationModel,
|
||||
unit_idx: usize,
|
||||
|
|
@ -231,8 +309,48 @@ fn inject_middleware_auth(
|
|||
return;
|
||||
};
|
||||
for call in middleware_calls {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Mark as route-level: the check is declared at the route
|
||||
// boundary (Flask `@requires_role(...)` decorator, FastAPI
|
||||
// `dependencies=[Depends(...)]`, or any custom-router
|
||||
// equivalent) and semantically authorizes every value the
|
||||
// handler receives, path param, body, query, downstream
|
||||
// row fetches, the lot. `auth_check_covers_subject` reads
|
||||
// `is_route_level` and short-circuits `true` for any
|
||||
// non-login-guard match, which is the correct shape for a
|
||||
// decorator-level guard whose inner call carries no
|
||||
// per-arg subject ref pointing back into the handler body.
|
||||
// LoginGuard / TokenExpiry / TokenRecipient kinds are
|
||||
// already excluded by `has_prior_subject_auth`'s filter
|
||||
// before they reach `auth_check_covers_subject`, so the
|
||||
// flag is safe to set unconditionally here, it has no
|
||||
// effect on those kinds.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod fastapi_dependencies_tests {
    use super::is_depends_callee;

    /// `is_depends_callee` accepts only the FastAPI `Depends` marker.
    /// Any other wrapper call inside `dependencies=[...]` must be
    /// ignored: extracting an inner callee from the wrong wrapper would
    /// misclassify logging hooks or filter callables as auth checks.
    #[test]
    fn is_depends_callee_recognises_canonical_forms() {
        // Canonical spellings, plus one padded form for whitespace tolerance.
        let accepted = [
            "Depends",
            "fastapi.Depends",
            "fastapi.params.Depends",
            "  Depends  ",
        ];
        for callee in accepted.iter() {
            assert!(is_depends_callee(callee), "expected match: {callee:?}");
        }

        // Negatives: unrelated markers, a member access on `Depends`, and
        // the empty string.
        let rejected = [
            "Annotated",
            "Body",
            "Depends.something",
            "RequiresAuth",
            "",
        ];
        for callee in rejected.iter() {
            assert!(!is_depends_callee(callee), "expected no match: {callee:?}");
        }
    }
}
|
||||
|
|
|
|||
|
|
@ -61,5 +61,104 @@ pub fn extract_authorization_model(
|
|||
}
|
||||
}
|
||||
|
||||
// **Dedup units by span across extractors.** Multiple extractors
|
||||
// (e.g. Flask + Django on a Python file) each call
|
||||
// `collect_top_level_units`, producing one unit per top-level
|
||||
// function. When one extractor also recognises a route on that
|
||||
// function and promotes its copy to `RouteHandler` (with injected
|
||||
// middleware auth checks), the *other* extractor's untouched
|
||||
// `Function` copy still runs through `check_ownership_gaps` and
|
||||
// emits the FP from a unit that never saw the middleware-derived
|
||||
// auth check.
|
||||
//
|
||||
// This step keeps a single canonical unit per source span,
|
||||
// preferring `RouteHandler` over `Function`, merging auth_checks
|
||||
// and keeping the winner's operation list as-is. Route registrations
|
||||
// are remapped to the surviving unit index.
|
||||
deduplicate_units_by_span(&mut model);
|
||||
|
||||
model
|
||||
}
|
||||
|
||||
fn deduplicate_units_by_span(model: &mut AuthorizationModel) {
|
||||
use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind};
|
||||
use std::collections::HashMap;
|
||||
|
||||
// First pass: choose a winner for each span, prefer the
|
||||
// first-seen `RouteHandler` over any `Function` copy.
|
||||
let mut winner_by_span: HashMap<(usize, usize), usize> = HashMap::new();
|
||||
for (idx, unit) in model.units.iter().enumerate() {
|
||||
let key = unit.span;
|
||||
match winner_by_span.get(&key) {
|
||||
None => {
|
||||
winner_by_span.insert(key, idx);
|
||||
}
|
||||
Some(&existing) => {
|
||||
let prev_kind = model.units[existing].kind;
|
||||
if prev_kind != AnalysisUnitKind::RouteHandler
|
||||
&& unit.kind == AnalysisUnitKind::RouteHandler
|
||||
{
|
||||
winner_by_span.insert(key, idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: drain auth_checks from losers so we can append them
|
||||
// to the winners after the layout collapses.
|
||||
let mut moved_checks: Vec<Vec<crate::auth_analysis::model::AuthCheck>> =
|
||||
Vec::with_capacity(model.units.len());
|
||||
for old_idx in 0..model.units.len() {
|
||||
let span = model.units[old_idx].span;
|
||||
let winner = *winner_by_span.get(&span).unwrap_or(&old_idx);
|
||||
if winner == old_idx {
|
||||
moved_checks.push(Vec::new());
|
||||
} else {
|
||||
moved_checks.push(std::mem::take(&mut model.units[old_idx].auth_checks));
|
||||
}
|
||||
}
|
||||
|
||||
// Third pass: emit surviving units (clone the winners) and build
|
||||
// the old-idx → new-idx remap.
|
||||
let mut new_idx_for_old: HashMap<usize, usize> = HashMap::new();
|
||||
let mut surviving: Vec<AnalysisUnit> = Vec::with_capacity(winner_by_span.len());
|
||||
for old_idx in 0..model.units.len() {
|
||||
let span = model.units[old_idx].span;
|
||||
let winner = *winner_by_span.get(&span).unwrap_or(&old_idx);
|
||||
if winner == old_idx {
|
||||
new_idx_for_old.insert(old_idx, surviving.len());
|
||||
surviving.push(model.units[old_idx].clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Fourth pass: drain loser auth_checks into their winners, deduping
|
||||
// by (span, callee). Operations are not merged: both extractor
|
||||
// passes recompute the same operation list from the AST, so the
|
||||
// winner already carries the canonical set.
|
||||
for (old_idx, checks) in moved_checks.iter_mut().enumerate() {
|
||||
let span = model.units[old_idx].span;
|
||||
let winner = *winner_by_span.get(&span).unwrap_or(&old_idx);
|
||||
if winner == old_idx {
|
||||
continue;
|
||||
}
|
||||
let Some(&new_winner_idx) = new_idx_for_old.get(&winner) else {
|
||||
continue;
|
||||
};
|
||||
for check in checks.drain(..) {
|
||||
let already_present = surviving[new_winner_idx]
|
||||
.auth_checks
|
||||
.iter()
|
||||
.any(|existing| existing.span == check.span && existing.callee == check.callee);
|
||||
if !already_present {
|
||||
surviving[new_winner_idx].auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
model.units = surviving;
|
||||
for route in &mut model.routes {
|
||||
if let Some(&new_idx) = new_idx_for_old.get(&route.unit_idx) {
|
||||
route.unit_idx = new_idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -137,7 +137,14 @@ fn maybe_collect_controller(
|
|||
let line = child.start_position().row + 1;
|
||||
let middleware_calls = applicable_filters(&filter_directives, &action_name);
|
||||
for call in &middleware_calls {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Rails `before_action :authorize_user`-style filter
|
||||
// callbacks run before the action and authorize the
|
||||
// entire request, same shape as FastAPI / Flask
|
||||
// `dependencies=[Depends(...)]`. Mark route-level so
|
||||
// `auth_check_covers_subject` covers the row-fetches
|
||||
// and downstream sinks the action body performs.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -114,7 +114,13 @@ fn maybe_collect_route(
|
|||
);
|
||||
let line = block.start_position().row + 1;
|
||||
for call in before_filters {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Sinatra `before` filters run before the route handler
|
||||
// body and authorize the request as a whole, same shape
|
||||
// as Rails `before_action`. Route-level so coverage
|
||||
// applies to the handler's row fetches and downstream
|
||||
// sinks.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,7 +111,15 @@ fn maybe_collect_controller(
|
|||
rules,
|
||||
);
|
||||
for call in &middleware_calls {
|
||||
if let Some(check) = auth_check_from_call_site(call, line, rules) {
|
||||
if let Some(mut check) = auth_check_from_call_site(call, line, rules) {
|
||||
// Spring `@PreAuthorize` / `@Secured` /
|
||||
// `@RolesAllowed` annotations are declared at the
|
||||
// method or class boundary and authorize the entire
|
||||
// request, same shape as FastAPI / Flask
|
||||
// `dependencies=[Depends(...)]`. Mark route-level
|
||||
// so `auth_check_covers_subject` covers row fetches
|
||||
// and downstream sinks in the handler body.
|
||||
check.is_route_level = true;
|
||||
unit.auth_checks.push(check);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
#![doc = include_str!(concat!(env!("OUT_DIR"), "/auth_analysis.md"))]
|
||||
|
||||
pub mod checks;
|
||||
pub mod config;
|
||||
pub mod extract;
|
||||
|
|
@ -26,7 +28,7 @@ fn byte_offset_to_point(tree: &Tree, byte: usize) -> tree_sitter::Point {
|
|||
/// source-level variable name. Built at `run_auth_analysis` call sites
|
||||
/// by merging type facts across all bodies in the file; a variable name
|
||||
/// with conflicting types in different bodies is dropped (absence is
|
||||
/// safe — the sink gate just falls back to name-based classification).
|
||||
/// safe, the sink gate just falls back to name-based classification).
|
||||
pub type VarTypes = HashMap<String, TypeKind>;
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
|
|
@ -87,7 +89,7 @@ pub fn run_auth_analysis(
|
|||
/// Used by pass 1 to persist per-file auth summaries for cross-file
|
||||
/// helper lifting. Only returns summaries for units whose body
|
||||
/// already proves at least one positional parameter under ownership /
|
||||
/// membership / admin / authorization check — i.e. the exact
|
||||
/// membership / admin / authorization check, i.e. the exact
|
||||
/// single-file lift set, so the cross-file variant does not widen what
|
||||
/// counts as a helper.
|
||||
pub fn extract_auth_summaries_by_key(
|
||||
|
|
@ -198,7 +200,7 @@ fn build_unit_summary(unit: &model::AnalysisUnit) -> Option<model::AuthCheckSumm
|
|||
|
||||
/// Walk every `SensitiveOperation` in the model and, when the call's
|
||||
/// receiver root variable has a known SSA type, override `sink_class`
|
||||
/// to the type-implied class. Strictly additive — only overrides
|
||||
/// to the type-implied class. Strictly additive, only overrides
|
||||
/// when the type map produces a definite class, otherwise leaves the
|
||||
/// name/prefix-derived classification intact.
|
||||
fn apply_var_types_to_model(
|
||||
|
|
@ -229,11 +231,11 @@ fn apply_var_types_to_model(
|
|||
/// reassignment from user input (`let id = req.params.id`) never gets
|
||||
/// suppressed by accident.
|
||||
///
|
||||
/// Phase 6: when a parameter's type is a [`TypeKind::Dto`], lift each
|
||||
/// when a parameter's type is a [`TypeKind::Dto`], lift each
|
||||
/// of its `Int`/`Bool` fields as `typed_bounded_dto_fields[<param>]`
|
||||
/// so member-access subjects like `dto.age` are recognised as
|
||||
/// payload-incompatible. Only fires when the base param itself was
|
||||
/// recognised as a typed extractor by a Phase 1-2 matcher — bare
|
||||
/// recognised as a typed extractor by a typed-extractor matcher, bare
|
||||
/// parameters with no framework gate never lift their fields.
|
||||
fn apply_typed_bounded_params(model: &mut model::AuthorizationModel, var_types: &VarTypes) {
|
||||
for unit in &mut model.units {
|
||||
|
|
@ -310,7 +312,7 @@ fn sink_class_for_type(
|
|||
///
|
||||
/// When `global_summaries` is `Some`, cross-file helpers are looked up
|
||||
/// via [`GlobalSummaries::get_auth`] after the same-file summary
|
||||
/// gather — this recovers the handler-in-file-A calling
|
||||
/// gather, this recovers the handler-in-file-A calling
|
||||
/// `require_owner`-in-file-B case that single-file lifting cannot see.
|
||||
fn apply_helper_lifting(
|
||||
model: &mut model::AuthorizationModel,
|
||||
|
|
@ -408,7 +410,7 @@ fn build_helper_summaries(
|
|||
let mut summary = AuthCheckSummary::default();
|
||||
for check in &unit.auth_checks {
|
||||
// We only lift checks that actively prove ownership /
|
||||
// membership / admin-rights / authorize-helper — login
|
||||
// membership / admin-rights / authorize-helper, login
|
||||
// and token-validity checks don't justify foreign-id
|
||||
// mutations and we want to keep parity with
|
||||
// `has_prior_subject_auth`'s filter.
|
||||
|
|
@ -435,7 +437,7 @@ fn build_helper_summaries(
|
|||
}
|
||||
}
|
||||
if !summary.param_auth_kinds.is_empty() {
|
||||
// Deduplicate by last segment of the function name — the
|
||||
// Deduplicate by last segment of the function name, the
|
||||
// lifting site matches the call's last segment too.
|
||||
let last = name.rsplit('.').next().unwrap_or(name).to_string();
|
||||
summaries
|
||||
|
|
@ -492,7 +494,7 @@ fn stronger_check_kind(a: model::AuthCheckKind, b: model::AuthCheckKind) -> mode
|
|||
/// For one unit, synthesise an `AuthCheck` at every call site that
|
||||
/// targets a helper with a non-trivial summary. Subjects are taken
|
||||
/// from `call_site.args_value_refs[K]` for each auth-checked param
|
||||
/// position K — these are the caller's concrete subjects passed at
|
||||
/// position K, these are the caller's concrete subjects passed at
|
||||
/// that arg slot, exactly what `auth_check_covers_subject` needs.
|
||||
fn synthesise_checks_for_unit(
|
||||
unit: &model::AnalysisUnit,
|
||||
|
|
@ -501,7 +503,7 @@ fn synthesise_checks_for_unit(
|
|||
let line_of = |span: (usize, usize)| -> usize {
|
||||
// Span is byte offsets; we don't have direct access to a Tree
|
||||
// here. Caller assigns line via `line` field on call_site
|
||||
// through CallSite metadata absence — fall back to the unit's
|
||||
// through CallSite metadata absence, fall back to the unit's
|
||||
// line since covers_subject uses `check.line <= op.line` and
|
||||
// helper calls are typically near the unit start.
|
||||
let _ = span;
|
||||
|
|
@ -541,6 +543,7 @@ fn synthesise_checks_for_unit(
|
|||
line,
|
||||
args: call.args.clone(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
});
|
||||
}
|
||||
out
|
||||
|
|
@ -563,7 +566,7 @@ fn call_site_line(unit: &model::AnalysisUnit, call: &model::CallSite) -> Option<
|
|||
None
|
||||
}
|
||||
|
||||
/// Cross-file variant of [`synthesise_checks_for_unit`] — for each
|
||||
/// Cross-file variant of [`synthesise_checks_for_unit`], for each
|
||||
/// call site in `unit`, resolve the callee against `GlobalSummaries`
|
||||
/// and look up an `AuthCheckSummary` that was persisted by some other
|
||||
/// file's pass-1 extraction. Skips call sites already handled by the
|
||||
|
|
@ -589,7 +592,7 @@ fn synthesise_cross_file_checks_for_unit(
|
|||
if unit.name.as_deref() == Some(last) {
|
||||
continue;
|
||||
}
|
||||
// Skip if the single-file map already handled this callee —
|
||||
// Skip if the single-file map already handled this callee;
|
||||
// that path has richer same-file context (existing
|
||||
// summaries from sibling units in this model) and its
|
||||
// synthesised check is strictly more precise.
|
||||
|
|
@ -636,6 +639,7 @@ fn synthesise_cross_file_checks_for_unit(
|
|||
line,
|
||||
args: call.args.clone(),
|
||||
condition_text: None,
|
||||
is_route_level: false,
|
||||
});
|
||||
}
|
||||
out
|
||||
|
|
@ -767,7 +771,7 @@ mod tests {
|
|||
Some(SinkClass::DbCrossTenantRead)
|
||||
);
|
||||
// DatabaseConnection: unrecognized verb (`execute`) → DbMutation
|
||||
// (conservative default — treat as write-shaped).
|
||||
// (conservative default, treat as write-shaped).
|
||||
assert_eq!(
|
||||
sink_class_for_type(&TypeKind::DatabaseConnection, "conn.execute", &rules),
|
||||
Some(SinkClass::DbMutation)
|
||||
|
|
@ -819,7 +823,7 @@ mod tests {
|
|||
)));
|
||||
let var_types: VarTypes = HashMap::new();
|
||||
apply_var_types_to_model(&mut model, &rules, &var_types);
|
||||
// Unchanged — no entry in var_types for `db`.
|
||||
// Unchanged, no entry in var_types for `db`.
|
||||
assert_eq!(
|
||||
model.units[0].operations[0].sink_class,
|
||||
Some(SinkClass::DbMutation)
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ pub enum OperationKind {
|
|||
}
|
||||
|
||||
/// Classification of a sensitive operation by the resource it targets.
|
||||
/// `check_ownership_gaps` only fires on the first five classes —
|
||||
/// `check_ownership_gaps` only fires on the first five classes;
|
||||
/// `InMemoryLocal` is never authorization-relevant.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum SinkClass {
|
||||
|
|
@ -76,7 +76,7 @@ pub enum SinkClass {
|
|||
/// (Redis / memcache / distributed cache client).
|
||||
CacheCrossTenant,
|
||||
/// A method call against a local, in-memory collection (HashMap,
|
||||
/// HashSet, Vec, …) — never authorization-relevant.
|
||||
/// HashSet, Vec, …), never authorization-relevant.
|
||||
InMemoryLocal,
|
||||
}
|
||||
|
||||
|
|
@ -133,6 +133,33 @@ pub struct AuthCheck {
|
|||
pub line: usize,
|
||||
pub args: Vec<String>,
|
||||
pub condition_text: Option<String>,
|
||||
/// True when the check was declared at the route boundary
|
||||
/// (decorator / middleware / dependency-injection list) rather
|
||||
/// than as a per-call check inside the handler body.
|
||||
///
|
||||
/// Route-level non-login-guard checks authorize the *entire*
|
||||
/// handler, they gate every value the handler receives, every
|
||||
/// row the handler fetches, and every operation downstream. An
|
||||
/// in-body `auth_check_covers_subject` walk that requires a
|
||||
/// per-name subject match cannot model that semantics: a
|
||||
/// FastAPI `dependencies=[Depends(requires_access_dag(method=
|
||||
/// "POST", access_entity=DagAccessEntity.RUN))]` is opaque to
|
||||
/// the engine, the inner `requires_access_dag` call carries no
|
||||
/// per-arg subject ref pointing to `dag_id` or `dag.id`. The
|
||||
/// flag tells `auth_check_covers_subject` to short-circuit
|
||||
/// `true` for any non-login-guard route-level check, leaving
|
||||
/// only the LoginGuard / TokenExpiry / TokenRecipient kinds
|
||||
/// (already excluded upstream by `has_prior_subject_auth`'s
|
||||
/// filter) to be ignored.
|
||||
///
|
||||
/// Set by the framework extractors (Django, Flask / FastAPI, Rails, Sinatra, Spring) at
|
||||
/// the route-decorator entry point. Default `false` for
|
||||
/// in-body checks (`require_membership(user, group_id)`,
|
||||
/// `is_admin(user)`, etc.), those still flow through the
|
||||
/// per-subject coverage logic so a check on
|
||||
/// `community.creator_id` doesn't blanket-suppress every other
|
||||
/// subject in the unit.
|
||||
pub is_route_level: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
|
|
@ -140,7 +167,7 @@ pub struct SensitiveOperation {
|
|||
pub kind: OperationKind,
|
||||
/// Sink classification. `None` means the operation was recorded
|
||||
/// for taxonomy completeness but does not match any known resource
|
||||
/// class — defensive, and currently unused.
|
||||
/// class, defensive, and currently unused.
|
||||
pub sink_class: Option<SinkClass>,
|
||||
pub callee: String,
|
||||
pub subjects: Vec<ValueRef>,
|
||||
|
|
@ -183,7 +210,7 @@ pub struct AnalysisUnit {
|
|||
/// "fetch-then-authorize" exemption in `checks.rs`: if a row-fetch
|
||||
/// operation produces variable `V` and SOME auth check elsewhere
|
||||
/// in the unit names `V`, the row-fetch operation is considered
|
||||
/// authorized — even though the check appears textually after the
|
||||
/// authorized, even though the check appears textually after the
|
||||
/// fetch. This is the standard idiom in row-level authz code:
|
||||
/// fetch the row first to extract the resource id, then call
|
||||
/// `check_<resource>_<role>(&user, &row, ...)` to authorize it.
|
||||
|
|
@ -199,7 +226,7 @@ pub struct AnalysisUnit {
|
|||
/// copies of `V.id` / `V.user_id` / `V.uid` / `V.userId` for some
|
||||
/// `V ∈ self_actor_vars`). Populated when the extractor sees
|
||||
/// `let X = V.id` or `let X = (V.id as ..).into()` / `V.id.into()`
|
||||
/// shapes — anywhere a route-handler reduces the authenticated
|
||||
/// shapes, anywhere a route-handler reduces the authenticated
|
||||
/// principal to a scalar id and reuses it as a SQL parameter.
|
||||
/// Consulted by `is_actor_context_subject` so subjects whose `name`
|
||||
/// is in this set count as actor context, not foreign scoped IDs.
|
||||
|
|
@ -217,7 +244,7 @@ pub struct AnalysisUnit {
|
|||
/// one of these names.
|
||||
pub authorized_sql_vars: HashSet<String>,
|
||||
/// Local variables bound (by `let`, `:=`, `var`, `const`) to a
|
||||
/// pure literal — string, integer, float, or boolean. These are
|
||||
/// pure literal, string, integer, float, or boolean. These are
|
||||
/// developer-chosen constants and cannot be user-controlled, so
|
||||
/// they must never trip `<lang>.auth.missing_ownership_check`
|
||||
/// even when the variable name passes `is_id_like`. Closes the
|
||||
|
|
@ -231,22 +258,21 @@ pub struct AnalysisUnit {
|
|||
/// `is_typed_bounded_subject` so parameters like Spring `Long
|
||||
/// userId`, Axum `Path<i64>`, or FastAPI `user_id: int` are not
|
||||
/// classified as scoped-identifier subjects even when their name
|
||||
/// passes `is_id_like` — the framework guarantees the value is a
|
||||
/// passes `is_id_like`, the framework guarantees the value is a
|
||||
/// number that cannot carry a SQL/file/shell payload.
|
||||
pub typed_bounded_vars: HashSet<String>,
|
||||
/// Phase 6: per-DTO-extractor parameter, the field names whose
|
||||
/// per-DTO-extractor parameter, the field names whose
|
||||
/// declared type is a payload-incompatible scalar. Map key is the
|
||||
/// parameter name (e.g. `dto`), value is the list of field names
|
||||
/// (e.g. `["age", "count"]`). Populated by
|
||||
/// [`super::apply_typed_bounded_params`] only when the parameter
|
||||
/// itself was recognised as a typed extractor by a Phase 1-2
|
||||
/// matcher — bare parameters with no framework gate never lift
|
||||
/// their fields.
|
||||
/// itself was recognised as a typed extractor, bare parameters
|
||||
/// with no framework gate never lift their fields.
|
||||
pub typed_bounded_dto_fields: HashMap<String, Vec<String>>,
|
||||
/// Per-unit dynamic session-base text set, supplementing the
|
||||
/// hard-coded list in `is_self_scoped_session_base`. Populated by
|
||||
/// the extractor when a parameter's static type signals a known
|
||||
/// auth-context shape — e.g. TRPC's `Options { ctx: { user:
|
||||
/// auth-context shape, e.g. TRPC's `Options { ctx: { user:
|
||||
/// NonNullable<TrpcSessionUser> } }` adds `<localCtx>.user` so
|
||||
/// downstream `ctx.user.id` accesses count as actor context. Each
|
||||
/// entry is the dotted base text (e.g. `"ctx.user"`,
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@
|
|||
pub enum SqlAuthClassification {
|
||||
/// Query is auth-gated. The JOIN (or direct WHERE) pins returned
|
||||
/// rows to the bound user. We don't track *which* bind position
|
||||
/// here — the caller treats whichever bind value flows into the
|
||||
/// here, the caller treats whichever bind value flows into the
|
||||
/// query as the user-id witness; that's safe because the caller
|
||||
/// already requires the row binding to come from a `let X = …`
|
||||
/// site we can name.
|
||||
|
|
@ -37,12 +37,12 @@ pub enum SqlAuthClassification {
|
|||
|
||||
/// Classify `sql` as auth-gated under the configured ACL tables.
|
||||
/// Returns `Some(Authorized)` when one of the recognized patterns
|
||||
/// holds, `None` otherwise (conservative — unknown shapes are treated
|
||||
/// holds, `None` otherwise (conservative, unknown shapes are treated
|
||||
/// as unauthorized).
|
||||
pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option<SqlAuthClassification> {
|
||||
let normalized = normalize_sql(sql);
|
||||
if !normalized.trim_start().starts_with("select") {
|
||||
// For B3 we only authorize SELECT queries — INSERT/UPDATE/DELETE
|
||||
// For B3 we only authorize SELECT queries, INSERT/UPDATE/DELETE
|
||||
// need their own analysis and aren't in scope. (A literal
|
||||
// `DELETE … WHERE user_id = ?N` could be safely authorized,
|
||||
// but the call sites we care about for FP suppression are
|
||||
|
|
@ -60,7 +60,7 @@ pub fn classify_sql_query(sql: &str, acl_tables: &[String]) -> Option<SqlAuthCla
|
|||
}
|
||||
|
||||
/// `SELECT … FROM <T> [AS] <ALIAS>? JOIN <ACL> [AS] <GA>? ON … WHERE
|
||||
/// <GA?>.user_id = ?N` — verifies that an ACL table appears in a JOIN
|
||||
/// <GA?>.user_id = ?N`, verifies that an ACL table appears in a JOIN
|
||||
/// clause and that the WHERE clause contains a `<…>.user_id = ?` (or
|
||||
/// bare `user_id = ?`) predicate. Order of the WHERE predicates
|
||||
/// doesn't matter; AND/OR connectors are ignored.
|
||||
|
|
@ -87,14 +87,14 @@ fn matches_join_through_acl(sql: &str, acl_tables: &[String]) -> bool {
|
|||
where_clause_contains_user_id_bind(where_clause)
|
||||
}
|
||||
|
||||
/// Direct ownership: `SELECT … FROM <T> WHERE … user_id = ?N` — no
|
||||
/// Direct ownership: `SELECT … FROM <T> WHERE … user_id = ?N`, no
|
||||
/// JOIN. Covers single-table reads where the row already carries the
|
||||
/// owning user id (`SELECT … FROM docs WHERE user_id = ?1`). We do
|
||||
/// NOT require `id = ?M` to also be present; the `user_id = ?N`
|
||||
/// predicate alone is sufficient, since any row returned must be
|
||||
/// owned by the bound user.
|
||||
///
|
||||
/// Refuses to fire when a JOIN is present — the JOIN target may not
|
||||
/// Refuses to fire when a JOIN is present, the JOIN target may not
|
||||
/// be in the ACL list, so the WHERE predicate (which may apply to
|
||||
/// the joined table, e.g. `WHERE al.user_id = ?N` against an
|
||||
/// `audit_log` JOIN) doesn't actually pin the primary rows to the
|
||||
|
|
@ -125,7 +125,7 @@ fn where_clause_contains_user_id_bind(where_clause: &str) -> bool {
|
|||
for (idx, _) in where_only.match_indices(needle) {
|
||||
// Make sure this is a column boundary on the left side
|
||||
// (avoid matching `posted_user_id` or `target_user_id`
|
||||
// — those don't pin to the actor).
|
||||
// because those don't pin to the actor).
|
||||
let before = where_only[..idx].chars().last();
|
||||
if !is_column_boundary_left(before) {
|
||||
continue;
|
||||
|
|
@ -158,11 +158,11 @@ fn looks_like_bind_param(after_eq: &str) -> bool {
|
|||
return false;
|
||||
}
|
||||
match bytes[0] {
|
||||
// ?N (sqlite/sqlx anonymous) — accept ?, ?1, ?2…
|
||||
// ?N (sqlite/sqlx anonymous), accept ?, ?1, ?2…
|
||||
b'?' => true,
|
||||
// $N (postgres style) — require a digit after.
|
||||
// $N (postgres style), require a digit after.
|
||||
b'$' => bytes.get(1).is_some_and(|b| b.is_ascii_digit()),
|
||||
// :name (named bind) — require an identifier char after.
|
||||
// :name (named bind), require an identifier char after.
|
||||
b':' => bytes
|
||||
.get(1)
|
||||
.is_some_and(|b| b.is_ascii_alphabetic() || *b == b'_'),
|
||||
|
|
@ -277,7 +277,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn join_against_non_acl_table_is_not_authorized() {
|
||||
// `audit_log` is not in the configured ACL list — JOIN doesn't
|
||||
// `audit_log` is not in the configured ACL list, JOIN doesn't
|
||||
// pin rows to the bound user, so the query is unauthorized.
|
||||
let sql = "SELECT d.* FROM docs d \
|
||||
JOIN audit_log al ON al.doc_id = d.id \
|
||||
|
|
@ -301,7 +301,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn similar_column_names_do_not_trip_user_id_match() {
|
||||
// `posted_user_id` shouldn't satisfy the `user_id = ?` check —
|
||||
// `posted_user_id` shouldn't satisfy the `user_id = ?` check;
|
||||
// that column doesn't pin to the actor.
|
||||
let sql = "SELECT * FROM posts WHERE posted_user_id = ?1";
|
||||
assert_eq!(classify_sql_query(sql, &acl()), None);
|
||||
|
|
|
|||
203
src/callgraph.rs
203
src/callgraph.rs
|
|
@ -16,7 +16,7 @@ use std::path::{Path, PathBuf};
|
|||
#[derive(Debug, Clone)]
|
||||
pub struct CallEdge {
|
||||
/// The raw callee string as it appeared in source (e.g. `"env::var"`).
|
||||
/// Preserved for diagnostics — **not** the normalized form used for resolution.
|
||||
/// Preserved for diagnostics, **not** the normalized form used for resolution.
|
||||
#[allow(dead_code)] // used for future diagnostics and path display
|
||||
pub call_site: String,
|
||||
}
|
||||
|
|
@ -28,7 +28,7 @@ pub struct UnresolvedCallee {
|
|||
pub callee_name: String,
|
||||
}
|
||||
|
||||
/// A callee that matched multiple function definitions — ambiguous.
|
||||
/// A callee that matched multiple function definitions (ambiguous).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AmbiguousCallee {
|
||||
pub caller: FuncKey,
|
||||
|
|
@ -168,14 +168,14 @@ pub(crate) fn callee_container_hint(raw: &str) -> &str {
|
|||
///
|
||||
/// Key design notes:
|
||||
///
|
||||
/// * Keys are **language-scoped** — a Java `findById` and a Python
|
||||
/// * Keys are **language-scoped**, a Java `findById` and a Python
|
||||
/// `findById` never alias. Every other index in this module is also
|
||||
/// language-scoped (`by_lang_name`, `by_lang_qualified`); keeping the
|
||||
/// same partition here means devirtualisation's "subset of today's
|
||||
/// targets" invariant is structurally preserved.
|
||||
/// * The container key carries the [`FuncKey::container`] verbatim
|
||||
/// (e.g. `"Repository"` or nested `"Outer::Inner"`). Empty containers
|
||||
/// are not indexed in `by_container` — free top-level functions live
|
||||
/// are not indexed in `by_container`, free top-level functions live
|
||||
/// only in `by_name` and are looked up via the `None` container path.
|
||||
/// * `SmallVec` inline capacity is sized for the common case (≤ 2 same-
|
||||
/// container overloads, ≤ 4 same-name candidates across containers);
|
||||
|
|
@ -199,7 +199,7 @@ impl ClassMethodIndex {
|
|||
/// Iteration is over every `FuncKey` in the map; each key is
|
||||
/// inserted into `by_name` and (when its container is non-empty)
|
||||
/// into `by_container`. No ordering guarantees on the candidate
|
||||
/// vectors — call sites that need determinism should sort downstream.
|
||||
/// vectors, call sites that need determinism should sort downstream.
|
||||
pub fn build(summaries: &GlobalSummaries) -> Self {
|
||||
let mut by_container: HashMap<(Lang, String, String), SmallVec<[FuncKey; 2]>> =
|
||||
HashMap::new();
|
||||
|
|
@ -223,11 +223,11 @@ impl ClassMethodIndex {
|
|||
|
||||
/// Resolve `(container, method)` to its candidate target set.
|
||||
///
|
||||
/// * `container = Some(c)` — return only candidates whose defining
|
||||
/// * `container = Some(c)`, return only candidates whose defining
|
||||
/// container equals `c`. Empty slice when no such target exists,
|
||||
/// even if a same-name function lives in another container.
|
||||
/// This is the **devirtualised** path: a hard subset of `by_name`.
|
||||
/// * `container = None` — return every same-name candidate in the
|
||||
/// * `container = None`, return every same-name candidate in the
|
||||
/// language. This is the **fallback** path used when the receiver
|
||||
/// type is unknown; matches today's name-only behaviour.
|
||||
///
|
||||
|
|
@ -264,48 +264,19 @@ impl ClassMethodIndex {
|
|||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Type hierarchy index — Phase 6 (subtype awareness)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// ── Type hierarchy index ────────────────────────────────────────────────
|
||||
|
||||
/// Per-language `(super_type) → SmallVec<[sub_type]>` index built once
|
||||
/// per call-graph construction from every merged
|
||||
/// [`crate::summary::FuncSummary::hierarchy_edges`]. When a method
|
||||
/// call's receiver is statically typed as a super-class / trait /
|
||||
/// interface, the call-graph wedge fans out the edge to every concrete
|
||||
/// implementer's matching method — recovering the dispatch precision
|
||||
/// that would otherwise be lost to today's name-only resolution.
|
||||
/// Per-language `(super_type) → sub-types` index built from every merged
|
||||
/// [`crate::summary::FuncSummary::hierarchy_edges`]. Lets virtual
|
||||
/// dispatch fan out to every concrete implementer's matching method.
|
||||
///
|
||||
/// Subtype semantics covered:
|
||||
/// * Java `class X extends Y` / `class X implements I` / `interface
|
||||
/// I extends J`
|
||||
/// * Rust `impl Trait for Type`
|
||||
/// * TypeScript `class X extends Y implements I` /
|
||||
/// `interface I extends J`
|
||||
/// * Python `class X(Base)` (excludes `object`)
|
||||
/// * PHP, Ruby, C++ — see [`crate::cfg::hierarchy`] for the
|
||||
/// per-language extraction rules.
|
||||
/// Covers Java `extends`/`implements`, Rust `impl Trait for Type`, TS
|
||||
/// `extends`/`implements`, Python `class X(Base)`, plus PHP/Ruby/C++
|
||||
/// (see [`crate::cfg::hierarchy`]). Go's structural interfaces are
|
||||
/// intentionally omitted; name-only resolution is used instead.
|
||||
///
|
||||
/// Go's structural / implicit interface satisfaction is intractable to
|
||||
/// enumerate from per-file information and is **deliberately omitted**
|
||||
/// — Go callers fall back to today's name-only resolution, so
|
||||
/// precision is unchanged from the pre-Phase-6 baseline.
|
||||
///
|
||||
/// Key design notes
|
||||
/// ────────────────
|
||||
///
|
||||
/// * **Language-scoped.** Mirrors [`ClassMethodIndex`]: a Java
|
||||
/// `Repository` and a Python `Repository` never alias.
|
||||
/// * **Bare container names.** No namespace qualification. When
|
||||
/// container names alias across unrelated namespaces (rare in
|
||||
/// practice, common in mono-repos) the resolver may over-fan-out;
|
||||
/// that is conservative for *correctness* (a subset of dispatch
|
||||
/// targets is unsafe — virtual dispatch may genuinely reach any
|
||||
/// implementer) and may need namespace-qualified keying as a
|
||||
/// Phase 6.5 follow-up if benchmark precision regresses.
|
||||
/// * **`SmallVec` inline capacity.** 4 implementers per super-type
|
||||
/// covers most real-world hierarchies without spillover; spillover
|
||||
/// allocates but keeps lookups O(1) amortised.
|
||||
/// Container names are bare (no namespace), so cross-namespace aliases
|
||||
/// may over-fan-out. That is conservative for correctness.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct TypeHierarchyIndex {
|
||||
/// `(lang, super_type)` → distinct sub-type / impl container names.
|
||||
|
|
@ -438,15 +409,11 @@ impl TypeHierarchyIndex {
|
|||
/// 3. On ambiguity: use two-segment qualified name to narrow candidates
|
||||
/// 4. Interop edges (explicit cross-language bridges)
|
||||
///
|
||||
/// **Phase 3 (typed call-graph devirtualisation):** when an SSA
|
||||
/// summary on the caller carries a `(call_ordinal, container_name)`
|
||||
/// entry in [`crate::summary::ssa_summary::SsaFuncSummary::typed_call_receivers`],
|
||||
/// the matching call site is first resolved via [`ClassMethodIndex`]
|
||||
/// restricted to the receiver-typed container. An exact match (after
|
||||
/// arity filter) becomes the edge; a multi-candidate hit is fed back
|
||||
/// into the standard resolver via `CalleeQuery.receiver_type`; a
|
||||
/// zero-candidate hit falls through to today's name-only resolution
|
||||
/// so receiver-type misclassifications never silently drop edges.
|
||||
/// Typed-call devirtualisation: when the caller's SSA summary carries
|
||||
/// a typed container for a call ordinal, that site is first resolved
|
||||
/// via [`ClassMethodIndex`] restricted to the receiver type. Exact
|
||||
/// match → edge; multi-candidate → fed back through
|
||||
/// `CalleeQuery.receiver_type`; zero match → name-only fallback.
|
||||
///
|
||||
/// Unresolved and ambiguous callees are recorded for diagnostics but
|
||||
/// do **not** create edges.
|
||||
|
|
@ -460,7 +427,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
index.insert(key.clone(), idx);
|
||||
}
|
||||
|
||||
// Phase 3: build a single `(lang, container, name) → candidates`
|
||||
// Build a single `(lang, container, name) → candidates`
|
||||
// index from the merged summaries. Used below to devirtualise
|
||||
// every method-call edge whose receiver has a recoverable type
|
||||
// fact. Cost is one allocation per FuncKey across the program;
|
||||
|
|
@ -468,7 +435,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
// win on codebases with many same-name methods.
|
||||
let method_index = ClassMethodIndex::build(summaries);
|
||||
|
||||
// Phase 6: build a sibling `(lang, super_type) → sub_types` index
|
||||
// Build a sibling `(lang, super_type) → sub_types` index
|
||||
// from every merged summary's `hierarchy_edges`. Consumed below
|
||||
// to fan out method-call edges to all known concrete
|
||||
// implementers when a receiver's static type is a super-class /
|
||||
|
|
@ -497,7 +464,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
None
|
||||
};
|
||||
|
||||
// Phase 3: per-caller `(call_ordinal → container_name)` map
|
||||
// Per-caller `(call_ordinal → container_name)` map
|
||||
// pulled from the caller's SSA summary, when one exists.
|
||||
// Empty when the caller has no SSA summary (zero-param trivial
|
||||
// bodies skip extraction unless they had typed receivers) or
|
||||
|
|
@ -520,23 +487,15 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
let leaf = callee_leaf_name(raw_callee);
|
||||
// Two-segment form for diagnostics / fallback disambiguation.
|
||||
let qualified = normalize_callee_name(raw_callee);
|
||||
// Structured arity carried per call site — used to disambiguate
|
||||
// Structured arity carried per call site, used to disambiguate
|
||||
// same-name/different-arity overloads during resolution.
|
||||
let arity_hint: Option<usize> = site.arity;
|
||||
|
||||
// Phase 3 devirtualisation entry point. Only fires for
|
||||
// method calls (sites carrying a structured receiver) when
|
||||
// the caller's SSA summary recorded a typed container for
|
||||
// this ordinal. When `Some(container)` resolves to a
|
||||
// single arity-matching target, we add the edge and skip
|
||||
// the standard resolver. When it resolves to multiple,
|
||||
// we fall through with the container hinted as
|
||||
// `receiver_type` so `resolve_callee`'s authoritative
|
||||
// step-1 picks the right one. When it resolves to zero,
|
||||
// we fall through entirely so today's name-only path can
|
||||
// still find the edge — preserving the
|
||||
// "subset of today's targets, never a superset" rule
|
||||
// even under type-fact misclassification.
|
||||
// Devirtualisation: for method calls whose SSA summary
|
||||
// recorded a typed container, resolve via ClassMethodIndex
|
||||
// first. Single match → direct edge; multi → fall through
|
||||
// with `receiver_type` set; zero → name-only fallback so
|
||||
// misclassified receivers never silently drop edges.
|
||||
let typed_container: Option<&str> = if site.receiver.is_some() {
|
||||
typed_receivers.get(&site.ordinal).copied()
|
||||
} else {
|
||||
|
|
@ -544,12 +503,10 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
};
|
||||
|
||||
if let Some(container) = typed_container {
|
||||
// Phase 6: resolve the typed container *plus* every
|
||||
// known sub-type / impl in the hierarchy index, so a
|
||||
// receiver typed as a super-class / trait / interface
|
||||
// fans out to every concrete implementer. When the
|
||||
// hierarchy has no matching super-type entry, this
|
||||
// collapses to the Phase 3 direct-container lookup.
|
||||
// Resolve the typed container plus every known
|
||||
// sub-type / impl, so a super-class / trait / interface
|
||||
// receiver fans out to every concrete implementer.
|
||||
// No hierarchy entry → direct-container lookup.
|
||||
let widened: Vec<FuncKey> = hierarchy.resolve_with_hierarchy(
|
||||
&method_index,
|
||||
caller_key.lang,
|
||||
|
|
@ -575,8 +532,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
}
|
||||
continue;
|
||||
}
|
||||
// Phase 6: multiple arity-filtered candidates means
|
||||
// genuine virtual dispatch through a super-type — fan
|
||||
// multiple arity-filtered candidates means
|
||||
// genuine virtual dispatch through a super-type, fan
|
||||
// out to *every* implementer. This widens edges
|
||||
// (correctly: the call genuinely may target any
|
||||
// implementer at runtime) so SCC sizes may grow on
|
||||
|
|
@ -614,7 +571,7 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
continue;
|
||||
}
|
||||
// Either zero matches (fall through to legacy path) or
|
||||
// multiple matches on the direct container — let
|
||||
// multiple matches on the direct container, let
|
||||
// `resolve_callee` apply its authoritative
|
||||
// receiver_type filter + tie-breakers.
|
||||
if !arity_filtered.is_empty() {
|
||||
|
|
@ -652,8 +609,8 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
|
||||
// Rust callers with a module-qualified call (no receiver) go
|
||||
// through the `use`-map aware resolver first. When the call has
|
||||
// a structured receiver it is a method call — the qualifier is
|
||||
// an impl/trait name, not a module path — so we fall back to the
|
||||
// a structured receiver it is a method call, the qualifier is
|
||||
// an impl/trait name, not a module path, so we fall back to the
|
||||
// structured resolver. All other languages skip the use-map
|
||||
// branch entirely.
|
||||
let use_rust_path = caller_key.lang == Lang::Rust && site.receiver.is_none();
|
||||
|
|
@ -671,11 +628,11 @@ pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdg
|
|||
// categorize each hint so the resolver can apply the right
|
||||
// policy:
|
||||
//
|
||||
// * `namespace_qualifier` — structured module/namespace
|
||||
// * `namespace_qualifier`, structured module/namespace
|
||||
// prefix (`env` in `env::var`, `http` in `http.Get`).
|
||||
// * `receiver_var` — syntactic receiver variable (e.g.
|
||||
// * `receiver_var`, syntactic receiver variable (e.g.
|
||||
// `obj` in `obj.method`); used only as a last tie-break.
|
||||
// * `caller_container` — caller's own class/impl, so bare
|
||||
// * `caller_container`, caller's own class/impl, so bare
|
||||
// `foo()` inside a method resolves to the same class.
|
||||
//
|
||||
// The raw text-parsed container (legacy
|
||||
|
|
@ -815,7 +772,7 @@ fn resolve_via_interop(
|
|||
/// Compute SCC decomposition and topological ordering of the call graph.
|
||||
///
|
||||
/// `petgraph::algo::tarjan_scc` returns SCCs in *reverse* topological order
|
||||
/// of the condensation DAG — i.e. leaf SCCs (no outgoing cross-SCC edges)
|
||||
/// of the condensation DAG, i.e. leaf SCCs (no outgoing cross-SCC edges)
|
||||
/// come **first**. That is exactly the **callee-first** order suitable for
|
||||
/// bottom-up taint propagation.
|
||||
pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
|
||||
|
|
@ -850,7 +807,7 @@ pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
|
|||
/// [`crate::commands::scan::run_topo_batches`]. `cross_file` is a tighter
|
||||
/// signal used by joint fixed-point convergence: it implies the
|
||||
/// recursion involves at least one cross-file call edge, so the inline
|
||||
/// cache and per-iteration findings need joint convergence — not just
|
||||
/// cache and per-iteration findings need joint convergence, not just
|
||||
/// summary convergence.
|
||||
pub struct FileBatch<'a> {
|
||||
pub files: Vec<&'a PathBuf>,
|
||||
|
|
@ -901,7 +858,7 @@ pub fn callers_of(cg: &CallGraph, callee: &FuncKey) -> Vec<FuncKey> {
|
|||
/// result is a `HashSet<String>` suitable for membership checks while
|
||||
/// filtering the batch's file list.
|
||||
///
|
||||
/// A changed callee's *own* namespace is also included — if the
|
||||
/// A changed callee's *own* namespace is also included, if the
|
||||
/// callee's summary was refined, the file it lives in may itself
|
||||
/// have been a caller (intra-file recursion) or may carry sibling
|
||||
/// functions whose analysis should be re-run alongside the callee
|
||||
|
|
@ -958,7 +915,7 @@ pub fn scc_file_batches_with_metadata<'a>(
|
|||
|
||||
// 2. Build file relative-path → (min topo index, has_mutual_recursion, cross_file).
|
||||
// `cross_file` is set whenever the file participates in an SCC whose
|
||||
// nodes span more than one namespace — the cross-file signal.
|
||||
// nodes span more than one namespace, the cross-file signal.
|
||||
let mut file_topo: HashMap<&str, (usize, bool, bool)> = HashMap::new();
|
||||
for (topo_pos, &scc_idx) in analysis.topo_scc_callee_first.iter().enumerate() {
|
||||
let scc_recursive = analysis.sccs[scc_idx].len() > 1;
|
||||
|
|
@ -1015,7 +972,7 @@ pub fn scc_file_batches_with_metadata<'a>(
|
|||
/// of its functions appear. This ensures leaf callees are available as early
|
||||
/// as possible for files that depend on them. Caller functions in the same
|
||||
/// file that happen to be in a later SCC are no worse off than the current
|
||||
/// fully-parallel approach — they simply don't yet benefit from ordering,
|
||||
/// fully-parallel approach, they simply don't yet benefit from ordering,
|
||||
/// but nothing is lost.
|
||||
///
|
||||
/// Returns `(ordered_batches, orphan_files)` where orphan_files are paths
|
||||
|
|
@ -1188,7 +1145,7 @@ mod tests {
|
|||
fn same_name_python_and_rust() {
|
||||
let py_foo = make_summary("foo", "handler.py", "python", 0, vec![]);
|
||||
let rs_foo = make_summary("foo", "handler.rs", "rust", 0, vec![]);
|
||||
// Python caller calls "foo" — should only see the Python one
|
||||
// Python caller calls "foo", should only see the Python one
|
||||
let py_caller = make_summary("main", "app.py", "python", 0, vec!["foo"]);
|
||||
|
||||
let gs = merge_summaries(vec![py_foo, rs_foo, py_caller], None);
|
||||
|
|
@ -1315,7 +1272,7 @@ mod tests {
|
|||
let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
|
||||
let cg = build_call_graph(&gs, &[]);
|
||||
|
||||
assert_eq!(cg.graph.edge_count(), 0); // no edge — ambiguous
|
||||
assert_eq!(cg.graph.edge_count(), 0); // no edge, ambiguous
|
||||
assert!(cg.unresolved_not_found.is_empty());
|
||||
assert_eq!(cg.unresolved_ambiguous.len(), 1);
|
||||
assert_eq!(cg.unresolved_ambiguous[0].callee_name, "helper");
|
||||
|
|
@ -1728,7 +1685,7 @@ mod tests {
|
|||
// Two "send" functions in different namespaces.
|
||||
let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]);
|
||||
let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]);
|
||||
// Caller is in a third namespace, calling "http::send" — leaf "send"
|
||||
// Caller is in a third namespace, calling "http::send", leaf "send"
|
||||
// is ambiguous, but "http" qualifier should match "src/http.rs".
|
||||
let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["http::send"]);
|
||||
|
||||
|
|
@ -1766,7 +1723,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn unqualified_callee_stays_ambiguous() {
|
||||
// Same setup but caller uses unqualified "send" — no disambiguation
|
||||
// Same setup but caller uses unqualified "send", no disambiguation
|
||||
let send_http = make_summary("send", "src/http.rs", "rust", 0, vec![]);
|
||||
let send_mail = make_summary("send", "src/mail.rs", "rust", 0, vec![]);
|
||||
let caller = make_summary("caller", "src/main.rs", "rust", 0, vec!["send"]);
|
||||
|
|
@ -1806,7 +1763,7 @@ mod tests {
|
|||
// ── structured-metadata disambiguation (callee metadata) ─────────────
|
||||
|
||||
/// Helper: build a summary whose callees carry structured CalleeSite
|
||||
/// metadata — used by the tests below to exercise arity / receiver /
|
||||
/// metadata, used by the tests below to exercise arity / receiver /
|
||||
/// qualifier propagation into resolution.
|
||||
fn summary_with_sites(
|
||||
name: &str,
|
||||
|
|
@ -1840,7 +1797,7 @@ mod tests {
|
|||
// Two `encode` functions in the same file, different arities.
|
||||
let encode1 = make_summary("encode", "src/codec.rs", "rust", 1, vec![]);
|
||||
let encode2 = make_summary("encode", "src/codec.rs", "rust", 2, vec![]);
|
||||
// Caller lives in *another* file so namespace does not disambiguate —
|
||||
// Caller lives in *another* file so namespace does not disambiguate;
|
||||
// the only signal is the per-call-site arity.
|
||||
let caller = summary_with_sites(
|
||||
"driver",
|
||||
|
|
@ -2007,7 +1964,7 @@ mod tests {
|
|||
#[test]
|
||||
fn legacy_string_callees_still_resolve() {
|
||||
let helper = make_summary("helper", "src/lib.rs", "rust", 0, vec![]);
|
||||
// make_summary already returns CalleeSite::bare entries — i.e. the
|
||||
// make_summary already returns CalleeSite::bare entries, i.e. the
|
||||
// "lifted legacy" form with no arity or receiver metadata.
|
||||
let caller = make_summary("main", "src/lib.rs", "rust", 0, vec!["helper"]);
|
||||
let gs = merge_summaries(vec![helper, caller], None);
|
||||
|
|
@ -2017,7 +1974,7 @@ mod tests {
|
|||
assert!(cg.unresolved_ambiguous.is_empty());
|
||||
}
|
||||
|
||||
// ── ClassMethodIndex (Phase 1: structural index, no behaviour wiring) ──
|
||||
// ── ClassMethodIndex ────────────────────────────────────────────────
|
||||
|
||||
/// Helper: `(name, container)` pairs in the same file. Builds two
|
||||
/// summaries with the same leaf name on different containers so the
|
||||
|
|
@ -2058,7 +2015,7 @@ mod tests {
|
|||
assert_eq!(cache_hits.len(), 1);
|
||||
assert_eq!(cache_hits[0].container, "Cache");
|
||||
|
||||
// Bare-name lookup keeps both candidates — fallback behaviour.
|
||||
// Bare-name lookup keeps both candidates, fallback behaviour.
|
||||
let bare_hits = idx.resolve(Lang::Rust, None, "findById");
|
||||
assert_eq!(
|
||||
bare_hits.len(),
|
||||
|
|
@ -2070,7 +2027,7 @@ mod tests {
|
|||
#[test]
|
||||
fn class_method_index_falls_back_to_name_when_container_unknown() {
|
||||
// `None` container or empty-string container both route to
|
||||
// the bare-name index — equivalent to today's name-only edge
|
||||
// the bare-name index, equivalent to today's name-only edge
|
||||
// insertion.
|
||||
let svc = make_method_summary("process", "OrderService", "src/svc.rs", "rust", 1);
|
||||
let helper = make_summary("process", "src/util.rs", "rust", 1, vec![]);
|
||||
|
|
@ -2082,7 +2039,7 @@ mod tests {
|
|||
let none_hits = idx.resolve(Lang::Rust, None, "process");
|
||||
assert_eq!(none_hits.len(), 2);
|
||||
|
||||
// Empty string container behaves identically to None — it is
|
||||
// Empty string container behaves identically to None, it is
|
||||
// not stored under any container key.
|
||||
let empty_hits = idx.resolve(Lang::Rust, Some(""), "process");
|
||||
assert_eq!(empty_hits.len(), 2);
|
||||
|
|
@ -2107,7 +2064,7 @@ mod tests {
|
|||
.is_empty()
|
||||
);
|
||||
// Right method, wrong container → empty (no fallback to bare-name
|
||||
// when a container is supplied — that's the whole devirtualisation
|
||||
// when a container is supplied, that's the whole devirtualisation
|
||||
// promise).
|
||||
assert!(
|
||||
idx.resolve(Lang::Rust, Some("OtherClass"), "findById")
|
||||
|
|
@ -2140,7 +2097,7 @@ mod tests {
|
|||
#[test]
|
||||
fn class_method_index_handles_arity_overloads() {
|
||||
// Two arity overloads on the same container are both kept under
|
||||
// the same `(container, name)` key — arity narrowing is the
|
||||
// the same `(container, name)` key, arity narrowing is the
|
||||
// caller's responsibility (today's resolver also does this).
|
||||
let one = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 1);
|
||||
let two = make_method_summary("encode", "Codec", "src/codec.rs", "rust", 2);
|
||||
|
|
@ -2156,7 +2113,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
// ── Phase 3: devirtualised edge insertion via typed_call_receivers ──
|
||||
// ── devirtualised edge insertion via typed_call_receivers ──
|
||||
|
||||
/// Two `findById` definitions live on different containers in
|
||||
/// different files. A caller whose SSA summary records the
|
||||
|
|
@ -2241,7 +2198,7 @@ mod tests {
|
|||
use crate::summary::ssa_summary::SsaFuncSummary;
|
||||
|
||||
// Single `process` on `Worker`. No `process` exists on
|
||||
// `Other` — that's the receiver type the caller's SSA
|
||||
// `Other`, that's the receiver type the caller's SSA
|
||||
// summary will (incorrectly) record.
|
||||
let worker = make_method_summary("process", "Worker", "src/worker.rs", "rust", 1);
|
||||
let caller = summary_with_sites(
|
||||
|
|
@ -2270,7 +2227,7 @@ mod tests {
|
|||
gs.insert_ssa(
|
||||
caller_key.clone(),
|
||||
SsaFuncSummary {
|
||||
// Wrong receiver type — `Other::process` does not exist.
|
||||
// Wrong receiver type, `Other::process` does not exist.
|
||||
typed_call_receivers: vec![(0, "Other".to_string())],
|
||||
..Default::default()
|
||||
},
|
||||
|
|
@ -2292,7 +2249,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
// ── Phase 6: TypeHierarchyIndex ───────────────────────────────────
|
||||
// ── TypeHierarchyIndex ───────────────────────────────────
|
||||
|
||||
/// Helper: build a hierarchy index from a list of
|
||||
/// `(lang, sub, super)` edges by injecting them onto a single
|
||||
|
|
@ -2334,7 +2291,7 @@ mod tests {
|
|||
TypeHierarchyIndex::build(&gs)
|
||||
}
|
||||
|
||||
/// B-1: Round-trip — a hierarchy built from a small set of edges
|
||||
/// B-1: Round-trip, a hierarchy built from a small set of edges
|
||||
/// answers `subs_of` correctly and `super_keys_len` matches the
|
||||
/// distinct super count.
|
||||
#[test]
|
||||
|
|
@ -2356,7 +2313,7 @@ mod tests {
|
|||
assert_eq!(h.super_keys_len(), 2);
|
||||
}
|
||||
|
||||
/// B-2: Java interface dispatch — `Repository r; r.findById(...)`
|
||||
/// B-2: Java interface dispatch, `Repository r; r.findById(...)`
|
||||
/// fans out to every concrete implementer's `findById`.
|
||||
#[test]
|
||||
fn b2_java_interface_dispatch_fans_out_to_all_impls() {
|
||||
|
|
@ -2421,7 +2378,7 @@ mod tests {
|
|||
assert_eq!(targets.len(), 2, "B-2: exactly two fan-out edges expected");
|
||||
}
|
||||
|
||||
/// B-3: Java extends — `Base b; b.foo()` reaches Base AND Derived
|
||||
/// B-3: Java extends, `Base b; b.foo()` reaches Base AND Derived
|
||||
/// when Derived extends Base. Pins inheritance fan-out separately
|
||||
/// from interface implements.
|
||||
#[test]
|
||||
|
|
@ -2479,7 +2436,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// B-4: Rust trait dispatch — `Box<dyn Repo>; r.find(...)` reaches
|
||||
/// B-4: Rust trait dispatch, `Box<dyn Repo>; r.find(...)` reaches
|
||||
/// every `impl Repo for X` `find`.
|
||||
#[test]
|
||||
fn b4_rust_trait_dispatch_fans_out_to_impls() {
|
||||
|
|
@ -2536,10 +2493,9 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// B-7: Empty hierarchy — when the typed container has no recorded
|
||||
/// B-7: Empty hierarchy, when the typed container has no recorded
|
||||
/// sub-types, `resolve_with_hierarchy` collapses to the direct
|
||||
/// `ClassMethodIndex::resolve` lookup. Pin: Phase 6 is a no-op
|
||||
/// when no inheritance was extracted.
|
||||
/// `ClassMethodIndex::resolve` lookup.
|
||||
#[test]
|
||||
fn b7_empty_hierarchy_falls_back_to_single_container() {
|
||||
use crate::summary::ssa_summary::SsaFuncSummary;
|
||||
|
|
@ -2561,7 +2517,7 @@ mod tests {
|
|||
);
|
||||
|
||||
let mut gs = merge_summaries(vec![repo, cache, caller], None);
|
||||
// No hierarchy_edges set anywhere — Repository has no
|
||||
// No hierarchy_edges set anywhere, Repository has no
|
||||
// sub-types, so devirtualisation collapses to direct match.
|
||||
let caller_key = FuncKey {
|
||||
lang: Lang::Rust,
|
||||
|
|
@ -2589,10 +2545,9 @@ mod tests {
|
|||
assert_eq!(targets[0].container, "Repository");
|
||||
}
|
||||
|
||||
/// B-8: Concrete sub-type — when the receiver is typed as the
|
||||
/// B-8: Concrete sub-type, when the receiver is typed as the
|
||||
/// concrete sub-class (not the super-type), no hierarchy
|
||||
/// expansion fires. Pin: Phase 6 narrows on concrete types
|
||||
/// exactly like Phase 3.
|
||||
/// expansion fires.
|
||||
#[test]
|
||||
fn b8_concrete_subtype_does_not_widen() {
|
||||
use crate::summary::ssa_summary::SsaFuncSummary;
|
||||
|
|
@ -2654,7 +2609,7 @@ mod tests {
|
|||
assert_eq!(targets[0].container, "UserRepo");
|
||||
}
|
||||
|
||||
/// B-9: Diamond — multiple impls sharing a super-type, dedup
|
||||
/// B-9: Diamond, multiple impls sharing a super-type, dedup
|
||||
/// applied per call site so each FuncKey is edged at most once.
|
||||
#[test]
|
||||
fn b9_diamond_dedup_one_edge_per_funckey() {
|
||||
|
|
@ -2662,7 +2617,7 @@ mod tests {
|
|||
|
||||
let a = make_method_summary("doIt", "A", "src/A.java", "java", 0);
|
||||
let b = make_method_summary("doIt", "B", "src/B.java", "java", 0);
|
||||
// A and B both extend Iface in two separate file emissions —
|
||||
// A and B both extend Iface in two separate file emissions;
|
||||
// hierarchy_edges duplicates across files; dedup expected.
|
||||
let mut h1 = make_method_summary("__h", "Iface", "src/I1.java", "java", 0);
|
||||
h1.hierarchy_edges = vec![
|
||||
|
|
@ -2722,7 +2677,7 @@ mod tests {
|
|||
assert!(containers.contains("A") && containers.contains("B"));
|
||||
}
|
||||
|
||||
/// B-13: Stale hierarchy edge — sub-type referenced by an edge
|
||||
/// B-13: Stale hierarchy edge, sub-type referenced by an edge
|
||||
/// no longer has a matching FuncKey. Resolver must not panic
|
||||
/// and must still resolve to whatever IS present.
|
||||
#[test]
|
||||
|
|
@ -2730,7 +2685,7 @@ mod tests {
|
|||
use crate::summary::ssa_summary::SsaFuncSummary;
|
||||
|
||||
// `Base` exists; `Derived` referenced by hierarchy_edges but
|
||||
// its `foo` is never defined. Phase 6 must not panic and
|
||||
// its `foo` is never defined. Resolver must not panic and
|
||||
// must still emit the Base::foo edge.
|
||||
let base = make_method_summary("foo", "Base", "src/Base.java", "java", 0);
|
||||
let mut h = make_method_summary("__h", "X", "src/X.java", "java", 0);
|
||||
|
|
@ -2815,7 +2770,7 @@ mod tests {
|
|||
arity: Some(0),
|
||||
..Default::default()
|
||||
};
|
||||
// A typed_call_receivers entry with ordinal=0 — but since the
|
||||
// A typed_call_receivers entry with ordinal=0, but since the
|
||||
// site has receiver=None, this MUST be ignored.
|
||||
gs.insert_ssa(
|
||||
caller_key.clone(),
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ use tree_sitter::Node;
|
|||
/// at the *case-level* shape `build_switch` sees here. Rust `match`, Go
|
||||
/// `switch`, and Java arrow-switches qualify; classic Java/C/C++/JS switches
|
||||
/// with fall-through do not. The check is per-language because Java mixes
|
||||
/// arrow and classic shapes — that's handled by inspecting the case kind in
|
||||
/// arrow and classic shapes, that's handled by inspecting the case kind in
|
||||
/// [`extract_case_literal_text`].
|
||||
fn lang_has_exclusive_cases(lang: &str) -> bool {
|
||||
matches!(lang, "rust" | "go")
|
||||
|
|
@ -19,7 +19,7 @@ fn lang_has_exclusive_cases(lang: &str) -> bool {
|
|||
/// Extract the scrutinee subtree from a switch-like AST node.
|
||||
///
|
||||
/// Returns the AST node referenced by the language's scrutinee field. Only
|
||||
/// fires for Rust `match`, Go `switch`, and Java `switch` statements — other
|
||||
/// fires for Rust `match`, Go `switch`, and Java `switch` statements, other
|
||||
/// languages return `None` so [`build_switch`] keeps its legacy behavior.
|
||||
fn extract_scrutinee_node<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
||||
let field = match lang {
|
||||
|
|
@ -39,7 +39,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
|
|||
let kind = case.kind();
|
||||
match (lang, kind) {
|
||||
("rust", "match_arm") => {
|
||||
// Reject guarded arms — `match x { y if cond => ... }`.
|
||||
// Reject guarded arms, `match x { y if cond => ... }`.
|
||||
if case.child_by_field_name("guard").is_some() {
|
||||
return None;
|
||||
}
|
||||
|
|
@ -71,7 +71,7 @@ fn extract_case_literal_text<'a>(case: Node<'a>, lang: &str, code: &'a [u8]) ->
|
|||
text_of(inner, code)
|
||||
}
|
||||
("go", "expression_case") => {
|
||||
// Go case `case v1, v2: ...` — only handle exactly one expression.
|
||||
// Go case `case v1, v2: ...`, only handle exactly one expression.
|
||||
let value = case.child_by_field_name("value")?;
|
||||
let mut named_children: Vec<Node> = Vec::new();
|
||||
let mut cursor = value.walk();
|
||||
|
|
@ -195,7 +195,7 @@ pub(super) fn extract_catch_param_name<'a>(
|
|||
// -------------------------------------------------------------------------
|
||||
|
||||
/// Builds CFG for Ruby's `begin`/`rescue`/`ensure` blocks (and `body_statement`
|
||||
/// with inline rescue). Ruby's `begin` has no `body` field — the try-body
|
||||
/// with inline rescue). Ruby's `begin` has no `body` field, the try-body
|
||||
/// statements are direct children before `rescue`/`else`/`ensure` nodes.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn build_begin_rescue<'a>(
|
||||
|
|
@ -305,7 +305,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
|
||||
vec![synth]
|
||||
} else {
|
||||
// No param name — will wire exception edges to first rescue body node
|
||||
// No param name, will wire exception edges to first rescue body node
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
|
|
@ -333,7 +333,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
current_body_id,
|
||||
)
|
||||
} else {
|
||||
// No body field — build rescue node itself as a block.
|
||||
// No body field, build rescue node itself as a block.
|
||||
// Filter out meta-children (exceptions, exception_variable) by
|
||||
// iterating and building only statement children.
|
||||
let mut rescue_cursor = rescue_node.walk();
|
||||
|
|
@ -407,7 +407,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
try_exits
|
||||
};
|
||||
|
||||
// 6. Build ensure clause (Ruby's finally — always runs)
|
||||
// 6. Build ensure clause (Ruby's finally, always runs)
|
||||
if let Some(ensure_node) = ensure_clause {
|
||||
let mut ensure_preds: Vec<NodeIndex> = Vec::new();
|
||||
ensure_preds.extend(&normal_exits);
|
||||
|
|
@ -443,7 +443,7 @@ pub(super) fn build_begin_rescue<'a>(
|
|||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// switch handler — multi-way dispatch with fallthrough
|
||||
// switch handler, multi-way dispatch with fallthrough
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/// True for AST kinds that wrap a single switch case body.
|
||||
|
|
@ -490,7 +490,7 @@ pub(super) fn case_has_default_label(case: Node<'_>) -> bool {
|
|||
/// Build CFG for a switch statement.
|
||||
///
|
||||
/// The dispatch is decomposed into a chain of binary `StmtKind::If` headers
|
||||
/// — one per non-default case — because the SSA terminator only models 0/1/2
|
||||
/// (one per non-default case), because the SSA terminator only models 0/1/2
|
||||
/// successors. A monolithic N-way header would otherwise be collapsed to
|
||||
/// `Goto(first)` and silently drop every other case. Each header's True edge
|
||||
/// reaches its case body; the False edge falls through to the next header (or
|
||||
|
|
@ -544,7 +544,7 @@ pub(super) fn build_switch<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Grammar didn't expose recognisable case nodes — fall back to a single
|
||||
// Grammar didn't expose recognisable case nodes, fall back to a single
|
||||
// header + Block-style walk so nodes still get linked.
|
||||
if cases.is_empty() {
|
||||
let header = push_node(
|
||||
|
|
@ -603,7 +603,7 @@ pub(super) fn build_switch<'a>(
|
|||
// arrow-switch), pre-extract the scrutinee text + idents so the synthetic
|
||||
// dispatch headers can carry a `<scrutinee> == <case_literal>` condition.
|
||||
// Falls back to `None` when the scrutinee is structurally complex (calls,
|
||||
// member chains, parenthesized expressions in Go) — the existing first-
|
||||
// member chains, parenthesized expressions in Go), the existing first-
|
||||
// reachable behavior remains correct in that case.
|
||||
let supports_exclusive_cases = lang_has_exclusive_cases(lang) || lang == "java";
|
||||
let (scrutinee_text, scrutinee_idents) = if supports_exclusive_cases {
|
||||
|
|
@ -647,7 +647,7 @@ pub(super) fn build_switch<'a>(
|
|||
for (idx, (case, is_default)) in cases.iter().copied().enumerate() {
|
||||
let is_last = idx + 1 == cases.len();
|
||||
|
||||
// Default at the chain tail doesn't get its own dispatch If — the
|
||||
// Default at the chain tail doesn't get its own dispatch If, the
|
||||
// previous header's False edge already targets it directly.
|
||||
let case_first_preds: Vec<NodeIndex> = if is_default && is_last {
|
||||
// First node of the default body becomes the False target of the
|
||||
|
|
@ -675,12 +675,13 @@ pub(super) fn build_switch<'a>(
|
|||
);
|
||||
// The dispatch header is purely structural (it stands in for the
|
||||
// discriminant comparison). It must not inherit Sink/Source labels
|
||||
// from the case body's text — push_node uses `text_of(ast)` for
|
||||
// from the case body's text, push_node uses `text_of(ast)` for
|
||||
// non-call kinds, which would let the body text drive classification.
|
||||
g[header].taint.labels.clear();
|
||||
g[header].call.callee = None;
|
||||
g[header].call.sink_payload_args = None;
|
||||
g[header].call.destination_uses = None;
|
||||
g[header].call.gate_filters.clear();
|
||||
// For mutually-exclusive switch shapes with a single-ident
|
||||
// scrutinee, synthesize a `<scrutinee> == <case_literal>`
|
||||
// structured condition on the dispatch header so SSA lowering
|
||||
|
|
@ -958,7 +959,7 @@ pub(super) fn build_try<'a>(
|
|||
|
||||
vec![synth]
|
||||
} else {
|
||||
// No param name — wire exception edges directly to first catch body node
|
||||
// No param name, wire exception edges directly to first catch body node
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ fn js_try_catch_has_exception_edges() {
|
|||
|
||||
/// When a classifiable call (here `eval`, a built-in JS sink) is nested
|
||||
/// inside a multi-line statement, the CFG node's `classification_span()`
|
||||
/// should point at the inner call, not at the outer statement's start —
|
||||
/// should point at the inner call, not at the outer statement's start,
|
||||
/// so finding display reports the line the dangerous call actually lives
|
||||
/// on. `ast.span` must still cover the whole outer statement for
|
||||
/// structural passes that need the statement grain.
|
||||
|
|
@ -86,7 +86,7 @@ fn inner_call_override_narrows_classification_span() {
|
|||
}
|
||||
|
||||
/// `classification_span()` must fall back to `ast.span` when no narrower
|
||||
/// sub-expression was recorded — so existing structural code paths keep
|
||||
/// sub-expression was recorded, so existing structural code paths keep
|
||||
/// working unchanged for nodes whose classification applies to the whole
|
||||
/// outer node.
|
||||
#[test]
|
||||
|
|
@ -125,7 +125,7 @@ fn callee_span_unset_when_no_narrowing_is_possible() {
|
|||
// A bare `eval(x);` on one line: `first_call_ident` finds the
|
||||
// call_expression whose span is nearly the whole expression_statement
|
||||
// (different by the trailing `;`). `classification_span` still
|
||||
// returns a sensible line — but the exact trimming is an
|
||||
// returns a sensible line, but the exact trimming is an
|
||||
// implementation detail. What we assert here is the invariant:
|
||||
// if callee_span *is* set, it must be contained in ast.span.
|
||||
let src = b"function f() { eval(x); }";
|
||||
|
|
@ -708,7 +708,7 @@ fn python_if_and() {
|
|||
|
||||
#[test]
|
||||
fn ruby_unless_and() {
|
||||
// `unless a && b` — chain built, branches swapped
|
||||
// `unless a && b`, chain built, branches swapped
|
||||
// Body should run when condition is false
|
||||
let src = b"def f\n unless a && b\n x\n end\nend\n";
|
||||
let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE);
|
||||
|
|
@ -848,7 +848,7 @@ fn parse_tree(src: &[u8], ts_lang: Language) -> tree_sitter::Tree {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_lambda_body() {
|
||||
// `process(lambda: eval(dangerous))` — Python-style.
|
||||
// `process(lambda: eval(dangerous))`, Python-style.
|
||||
// first_call_ident should return "process", not "eval".
|
||||
let src = b"process(lambda: eval(dangerous))";
|
||||
let ts_lang = Language::from(tree_sitter_python::LANGUAGE);
|
||||
|
|
@ -860,7 +860,7 @@ fn first_call_ident_skips_lambda_body() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_arrow_function_body() {
|
||||
// `process(() => eval(dangerous))` — JS arrow function in argument.
|
||||
// `process(() => eval(dangerous))`, JS arrow function in argument.
|
||||
let src = b"process(() => eval(dangerous))";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -871,7 +871,7 @@ fn first_call_ident_skips_arrow_function_body() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_skips_named_function_in_arg() {
|
||||
// `process(function inner() { eval(dangerous); })` — named function expression in arg.
|
||||
// `process(function inner() { eval(dangerous); })`, named function expression in arg.
|
||||
let src = b"process(function inner() { eval(dangerous); })";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -882,7 +882,7 @@ fn first_call_ident_skips_named_function_in_arg() {
|
|||
|
||||
#[test]
|
||||
fn first_call_ident_normal_nested_call() {
|
||||
// `outer(inner(x))` — inner is NOT behind a function boundary, should be reachable.
|
||||
// `outer(inner(x))`, inner is NOT behind a function boundary, should be reachable.
|
||||
let src = b"outer(inner(x))";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -895,7 +895,7 @@ fn first_call_ident_normal_nested_call() {
|
|||
#[test]
|
||||
fn first_call_ident_finds_call_not_blocked_by_function() {
|
||||
// Ensure a call at the same level as a function literal is still found.
|
||||
// `[function() {}, actual_call()]` — array with function and call.
|
||||
// `[function() {}, actual_call()]`, array with function and call.
|
||||
let src = b"[function() {}, actual_call()]";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let tree = parse_tree(src, ts_lang);
|
||||
|
|
@ -908,7 +908,7 @@ fn first_call_ident_finds_call_not_blocked_by_function() {
|
|||
|
||||
#[test]
|
||||
fn callee_not_resolved_from_nested_function_arg() {
|
||||
// `safe_wrapper(function() { eval(user_input); })` — the CFG for the
|
||||
// `safe_wrapper(function() { eval(user_input); })`, the CFG for the
|
||||
// outer call should resolve the callee as "safe_wrapper", never "eval".
|
||||
let src = b"function f() { safe_wrapper(function() { eval(user_input); }); }";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
|
|
@ -923,7 +923,7 @@ fn callee_not_resolved_from_nested_function_arg() {
|
|||
assert!(has_safe, "expected a node with callee 'safe_wrapper'");
|
||||
|
||||
// The outer body should NOT have a node with callee "eval" attributed
|
||||
// to the outer expression — eval lives inside the nested function body.
|
||||
// to the outer expression, eval lives inside the nested function body.
|
||||
let outer_eval = body.graph.node_weights().any(|info| {
|
||||
info.call.callee.as_deref() == Some("eval") && info.ast.enclosing_func.is_none()
|
||||
});
|
||||
|
|
@ -1117,6 +1117,7 @@ fn clone_preserves_all_sub_structs() {
|
|||
kwargs: vec![("shell".into(), vec!["True".into()])],
|
||||
arg_string_literals: vec![Some("lit".into())],
|
||||
destination_uses: None,
|
||||
gate_filters: Vec::new(),
|
||||
},
|
||||
taint: TaintMeta {
|
||||
labels: {
|
||||
|
|
@ -1399,7 +1400,7 @@ fn js_promisify_ignored_for_non_js_langs() {
|
|||
|
||||
#[test]
|
||||
fn js_promisify_non_call_value_ignored() {
|
||||
// RHS is not a promisify call — no binding should be captured.
|
||||
// RHS is not a promisify call, no binding should be captured.
|
||||
let src = b"const execAsync = child_process.exec;";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang);
|
||||
|
|
@ -1471,7 +1472,7 @@ fn cpp_function_extracts_param_names() {
|
|||
// ── callee-site metadata extraction ──────────────────────────────────
|
||||
|
||||
/// Callees collected into `LocalFuncSummary` should now carry structured
|
||||
/// arity, receiver, and qualifier fields — not just a bare name.
|
||||
/// arity, receiver, and qualifier fields, not just a bare name.
|
||||
#[test]
|
||||
fn local_summary_callees_carry_arity_and_receiver() {
|
||||
// Two calls: one is a plain function call with 2 args, the other is
|
||||
|
|
@ -1703,7 +1704,7 @@ fn local_summary_callees_have_distinct_ordinals() {
|
|||
.find(|(k, _)| k.name == "outer")
|
||||
.unwrap();
|
||||
|
||||
// Dedup key is (name, arity, receiver, qualifier, ordinal) — the two
|
||||
// Dedup key is (name, arity, receiver, qualifier, ordinal), the two
|
||||
// `a()` sites have different ordinals, so both must appear.
|
||||
let a_sites: Vec<_> = outer.callees.iter().filter(|c| c.name == "a").collect();
|
||||
assert_eq!(
|
||||
|
|
@ -1825,7 +1826,7 @@ fn anon_fn_named_from_short_var_decl_go() {
|
|||
|
||||
#[test]
|
||||
fn iife_callee_resolves_to_anon_body_js() {
|
||||
// `(function(arg){eval(arg);})(q)` — the CallFn arm must produce
|
||||
// `(function(arg){eval(arg);})(q)`, the CallFn arm must produce
|
||||
// a synthetic anon callee name so that taint can match the
|
||||
// inline body's FuncKey.
|
||||
let src = b"(function(arg){ eval(arg); })(q);";
|
||||
|
|
@ -1898,7 +1899,7 @@ fn strip_tags(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_unrecognised_literals() {
|
||||
// `.replace("foo", "bar")` contains no dangerous pattern — must NOT be
|
||||
// `.replace("foo", "bar")` contains no dangerous pattern, must NOT be
|
||||
// credited as a sanitizer. Preserves the FP→TN guard: replace calls
|
||||
// that don't strip anything dangerous must stay transparent to taint.
|
||||
let src = br#"
|
||||
|
|
@ -1916,7 +1917,7 @@ fn rewrite(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_when_replacement_reintroduces_pattern() {
|
||||
// `.replace("x", "..")` strips `x` but *reintroduces* `..` — be
|
||||
// `.replace("x", "..")` strips `x` but *reintroduces* `..`, be
|
||||
// maximally conservative and abandon all credit for this chain.
|
||||
let src = br#"
|
||||
fn evil(s: &str) -> String {
|
||||
|
|
@ -1933,7 +1934,7 @@ fn evil(s: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_dynamic_arg() {
|
||||
// `.replace(var, "")` — search is not a literal; pattern analysis can
|
||||
// `.replace(var, "")`, search is not a literal; pattern analysis can
|
||||
// say nothing about what was stripped. Must not earn credit.
|
||||
let src = br#"
|
||||
fn dynamic(s: &str, needle: &str) -> String {
|
||||
|
|
@ -1950,7 +1951,7 @@ fn dynamic(s: &str, needle: &str) -> String {
|
|||
|
||||
#[test]
|
||||
fn replace_chain_rejects_non_identifier_base() {
|
||||
// `get_s().replace("..", "")` — innermost receiver is a call, not a
|
||||
// `get_s().replace("..", "")`, innermost receiver is a call, not a
|
||||
// parameter. We have no reason to believe `get_s()` returns a value
|
||||
// that benefits the caller; refuse credit.
|
||||
let src = br#"
|
||||
|
|
@ -1976,7 +1977,7 @@ fn find_node_defining<'a>(cfg: &'a Cfg, var: &str) -> Option<&'a NodeInfo> {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_detected_on_js_property_read() {
|
||||
// `var count = items.length` — property access on a member expression
|
||||
// `var count = items.length`, property access on a member expression
|
||||
// should mark the CFG node as a numeric-length access so the
|
||||
// type-fact analysis infers TypeKind::Int for `count`.
|
||||
let src = br#"function f(items) {
|
||||
|
|
@ -1994,7 +1995,7 @@ fn numeric_length_access_detected_on_js_property_read() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_detected_on_js_zero_arg_method_call() {
|
||||
// `var n = str.length()` — zero-arg method call form (uncommon in JS
|
||||
// `var n = str.length()`, zero-arg method call form (uncommon in JS
|
||||
// but present in other languages). Detector should unwrap a
|
||||
// zero-arg call around a member expression.
|
||||
let src = br#"function f(list) {
|
||||
|
|
@ -2012,7 +2013,7 @@ fn numeric_length_access_detected_on_js_zero_arg_method_call() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_ignores_unrelated_properties() {
|
||||
// `var v = arr.foo` — arbitrary property reads must not be flagged.
|
||||
// `var v = arr.foo`, arbitrary property reads must not be flagged.
|
||||
let src = br#"function f(arr) {
|
||||
var v = arr.foo;
|
||||
return v;
|
||||
|
|
@ -2028,7 +2029,7 @@ fn numeric_length_access_ignores_unrelated_properties() {
|
|||
|
||||
#[test]
|
||||
fn numeric_length_access_ignores_method_calls_with_args() {
|
||||
// `var r = s.indexOf('x')` — the detector must reject any call with
|
||||
// `var r = s.indexOf('x')`, the detector must reject any call with
|
||||
// positional arguments because those aren't pure length reads.
|
||||
let src = br#"function f(s) {
|
||||
var r = s.indexOf('x');
|
||||
|
|
@ -2043,7 +2044,7 @@ fn numeric_length_access_ignores_method_calls_with_args() {
|
|||
);
|
||||
}
|
||||
|
||||
// ── Pointer-Phase 6 / W5: subscript lowering tests ────────────────────────
|
||||
// ── subscript lowering tests ────────────────────────
|
||||
|
||||
/// Scope for tests that flip `NYX_POINTER_ANALYSIS=1` so the CFG-side
|
||||
/// subscript synthesis activates. The env-var is restored afterwards
|
||||
|
|
@ -2290,7 +2291,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
);
|
||||
}
|
||||
|
||||
/// JS switch fall-through (`case 1: a(); case 2: b();`) — case 1's
|
||||
/// JS switch fall-through (`case 1: a(); case 2: b();`), case 1's
|
||||
/// exit should flow into case 2's body so taint from `first()`
|
||||
/// reaches `second()`'s sinks.
|
||||
///
|
||||
|
|
@ -2301,7 +2302,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
/// structural shape.
|
||||
/// (b) `first()` has a non-Back forward out-edge that lands inside
|
||||
/// the case-2 sub-graph (the actual fall-through wire), so we
|
||||
/// prove there *is* a fall-through edge — not just an
|
||||
/// prove there *is* a fall-through edge, not just an
|
||||
/// Entry→…→Exit path that happens to walk through both calls
|
||||
/// via the dispatch chain.
|
||||
///
|
||||
|
|
@ -2309,7 +2310,7 @@ fn js_switch_default_in_middle_reorders_to_tail() {
|
|||
/// Seq passthrough nodes (one per surrounding scope), so the
|
||||
/// fall-through edge from `first()` lands on the *first wrapper
|
||||
/// Seq node* of case 2, not on `second()` itself. Asserting that
|
||||
/// `second()` has ≥2 in-edges would therefore be wrong — the True
|
||||
/// `second()` has ≥2 in-edges would therefore be wrong, the True
|
||||
/// edge from the case-2 dispatch If targets the wrapper node, and
|
||||
/// only a single Seq chain leads from there to `second()`.
|
||||
#[test]
|
||||
|
|
@ -2800,7 +2801,7 @@ fn nested_loops_two_headers_two_back_edges() {
|
|||
|
||||
#[test]
|
||||
fn loop_with_break_no_back_edge_from_break() {
|
||||
// A `break` short-circuits the loop body — its edge must NOT be a
|
||||
// A `break` short-circuits the loop body, its edge must NOT be a
|
||||
// back edge to the header (it leaves the loop entirely).
|
||||
let src = b"function f() { while (cond()) { if (done()) break; body(); } }";
|
||||
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
|
||||
|
|
@ -2879,7 +2880,7 @@ fn chained_method_call_rebinds_to_inner_gated_sink() {
|
|||
// no longer be the recorded callee for this node.
|
||||
if callee.ends_with("https.get") {
|
||||
// The inner-gate path must have populated sink_payload_args
|
||||
// (the gate's payload arg is position 0 — the URL string).
|
||||
// (the gate's payload arg is position 0, the URL string).
|
||||
assert!(
|
||||
info.call.sink_payload_args.is_some(),
|
||||
"expected sink_payload_args to be populated for chained \
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ use super::{
|
|||
member_expr_text, push_node, text_of,
|
||||
};
|
||||
use crate::labels::{DataLabel, LangAnalysisRules, classify};
|
||||
use crate::utils::snippet::truncate_at_char_boundary;
|
||||
use petgraph::graph::NodeIndex;
|
||||
use smallvec::SmallVec;
|
||||
use tree_sitter::Node;
|
||||
|
|
@ -72,20 +73,15 @@ pub(super) fn push_condition_node<'a>(
|
|||
code: &'a [u8],
|
||||
enclosing_func: Option<&str>,
|
||||
) -> NodeIndex {
|
||||
// Pass cond_ast as both args — sub-conditions are never `unless` nodes
|
||||
// Pass cond_ast as both args, sub-conditions are never `unless` nodes
|
||||
let (inner, negated) = detect_negation(cond_ast, cond_ast, lang);
|
||||
let mut vars = Vec::new();
|
||||
collect_idents(inner, code, &mut vars);
|
||||
vars.sort();
|
||||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
let text = text_of(cond_ast, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
let text = text_of(cond_ast, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
let span = (cond_ast.start_byte(), cond_ast.end_byte());
|
||||
g.add_node(NodeInfo {
|
||||
kind: StmtKind::If,
|
||||
|
|
@ -140,7 +136,7 @@ pub(super) fn detect_rust_let_match_guard<'a>(
|
|||
/// Synthesize a `StmtKind::If` CFG node carrying a Rust match-arm guard's
|
||||
/// condition text and vars. The let-binding name is added to `condition_vars`
|
||||
/// so `apply_branch_predicates` narrows validation to that specific variable
|
||||
/// — the variable that receives the arm's value and flows to downstream sinks.
|
||||
/// (the variable that receives the arm's value and flows to downstream sinks).
|
||||
pub(super) fn emit_rust_match_guard_if<'a>(
|
||||
g: &mut Cfg,
|
||||
guard: Node<'a>,
|
||||
|
|
@ -154,13 +150,8 @@ pub(super) fn emit_rust_match_guard_if<'a>(
|
|||
vars.sort();
|
||||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
let text = text_of(guard, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
let text = text_of(guard, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
let span = (guard.start_byte(), guard.end_byte());
|
||||
g.add_node(NodeInfo {
|
||||
kind: StmtKind::If,
|
||||
|
|
@ -181,7 +172,7 @@ pub(super) fn emit_rust_match_guard_if<'a>(
|
|||
/// `lhs_text` is then synthesised by SSA lowering at the join.
|
||||
///
|
||||
/// The condition's identifiers live on the If node's `condition_vars`, **not**
|
||||
/// on the branch `uses`. This is the whole point of the split — cond is control
|
||||
/// on the branch `uses`. This is the whole point of the split, cond is control
|
||||
/// flow, branches are data flow.
|
||||
///
|
||||
/// Returns the exit frontier for downstream statement chaining (a single-element
|
||||
|
|
@ -219,7 +210,7 @@ pub(super) fn build_ternary_diamond<'a>(
|
|||
g[cond_if].is_eq_with_const = detect_eq_with_const(cond_ast, lang);
|
||||
connect_all(g, preds, cond_if, pred_edge);
|
||||
|
||||
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node) —
|
||||
// 2. Branches. Each branch produces its own exit frontier (≥ 1 node);
|
||||
// a nested ternary recurses and returns its own join node.
|
||||
let true_exits = lower_ternary_branch(
|
||||
cons_ast,
|
||||
|
|
@ -332,7 +323,7 @@ pub(super) fn lower_ternary_branch<'a>(
|
|||
analysis_rules,
|
||||
);
|
||||
|
||||
// The branch expression's own `defines` (if any — typically None for a
|
||||
// The branch expression's own `defines` (if any, typically None for a
|
||||
// pure value expression) is replaced with the outer LHS so that both
|
||||
// branches agree on the target, driving phi insertion at the join.
|
||||
g[node].taint.defines = Some(lhs_text.to_string());
|
||||
|
|
@ -410,7 +401,7 @@ pub(super) fn classify_ternary_lhs(
|
|||
.unwrap_or_default();
|
||||
|
||||
// Try the full dotted path first (e.g. "document.cookie"), then fall back
|
||||
// to the property alone (e.g. "innerHTML") — mirrors the LHS classification
|
||||
// to the property alone (e.g. "innerHTML"), mirrors the LHS classification
|
||||
// already performed in `push_node` for non-split assignments.
|
||||
if let Some(l) = classify(lang, &lhs_text, extra) {
|
||||
labels.push(l);
|
||||
|
|
@ -429,7 +420,7 @@ pub(super) fn classify_ternary_lhs(
|
|||
/// Recursively decompose a boolean condition into a chain of `StmtKind::If` nodes
|
||||
/// with short-circuit edges.
|
||||
///
|
||||
/// Returns `(true_exits, false_exits)` — the sets of nodes from which True/False
|
||||
/// Returns `(true_exits, false_exits)`, the sets of nodes from which True/False
|
||||
/// edges should connect to the then/else branches.
|
||||
pub(super) fn build_condition_chain<'a>(
|
||||
cond_ast: Node<'a>,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use tree_sitter::Node;
|
|||
///
|
||||
/// Used by decorator extraction to reduce `login_required`, `permission_required(...)`,
|
||||
/// `flask_login.login_required`, `hasRole('ADMIN')` to their first identifier
|
||||
/// name — the matcher target.
|
||||
/// name, the matcher target.
|
||||
fn leading_ident_text(node: Node<'_>, code: &[u8]) -> Option<String> {
|
||||
let mut cur = node;
|
||||
loop {
|
||||
|
|
@ -56,7 +56,7 @@ fn normalize_decorator_name(raw: &str) -> String {
|
|||
let trimmed = raw.trim();
|
||||
let trimmed = trimmed.trim_start_matches(':').trim_start_matches('@');
|
||||
// If a call syntax leaked through (e.g. `UseGuards(AuthGuard)`), keep only
|
||||
// the head — callers that want the arg handle it separately.
|
||||
// the head, callers that want the arg handle it separately.
|
||||
let head = trimmed
|
||||
.split(['(', ' ', '\t', '\n'])
|
||||
.next()
|
||||
|
|
@ -115,7 +115,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
|
|||
/// are `decorator` nodes containing an `identifier` or `call` expression.
|
||||
/// - **JS/TS**: decorators attach to `method_definition` children or appear
|
||||
/// as siblings inside `class_body`; stage-3 decorators use `decorator` nodes.
|
||||
/// `@UseGuards(AuthGuard)` — we include the call args too.
|
||||
/// `@UseGuards(AuthGuard)`, we include the call args too.
|
||||
/// - **Java**: annotations live in the `modifiers` child of `method_declaration`;
|
||||
/// kinds are `marker_annotation` / `annotation`.
|
||||
/// - **Rust**: `function_item` has `attribute_item` siblings (outer `#[..]`).
|
||||
|
|
@ -127,7 +127,7 @@ fn decorator_arg_names(decorator_ast: Node<'_>, code: &[u8]) -> Vec<String> {
|
|||
/// at class body scope applies to every method in the class. `only:` /
|
||||
/// `except:` hash args scope the filter to the listed action names; the
|
||||
/// filter is only recorded for the current method when the scope matches.
|
||||
/// Conditional filters (`if:` / `unless:`) are not honored — those require
|
||||
/// Conditional filters (`if:` / `unless:`) are not honored, those require
|
||||
/// predicate evaluation and are deferred.
|
||||
pub(super) fn extract_auth_decorators<'a>(
|
||||
func_node: Node<'a>,
|
||||
|
|
@ -379,12 +379,12 @@ pub(super) fn extract_auth_decorators<'a>(
|
|||
}
|
||||
|
||||
/// If a Ruby statement is `before_action :name` (or `before_filter :name`),
|
||||
/// push the normalized filter name into `out` — honoring any `only:` / `except:`
|
||||
/// push the normalized filter name into `out`, honoring any `only:` / `except:`
|
||||
/// hash arguments against `method_name`.
|
||||
///
|
||||
/// Positional symbol args (`before_action :a, :b, only: [:x]`) all share the
|
||||
/// single trailing scope. Conditional filters (`if:` / `unless:`) are not
|
||||
/// honored here — those require predicate evaluation and are deferred.
|
||||
/// honored here, those require predicate evaluation and are deferred.
|
||||
fn collect_ruby_before_action(
|
||||
node: Node<'_>,
|
||||
code: &[u8],
|
||||
|
|
@ -499,7 +499,7 @@ fn collect_ruby_before_action(
|
|||
|
||||
/// Parse a single `only:` / `except:` hash pair and append the symbol list into
|
||||
/// the corresponding out-vec. Sets the `*_present` flag when the key is seen,
|
||||
/// regardless of whether the value parses into any symbols — treating
|
||||
/// regardless of whether the value parses into any symbols, treating
|
||||
/// `only: []` as "no actions match" is safer than ignoring the scope.
|
||||
fn collect_ruby_filter_pair(
|
||||
pair_node: Node<'_>,
|
||||
|
|
|
|||
|
|
@ -1,26 +1,28 @@
|
|||
//! Phase 6.1: per-language DTO definition collectors.
|
||||
//! Per-language DTO definition collectors.
|
||||
//!
|
||||
//! Walks a parsed file's AST and emits `(class_name, DtoFields)` pairs
|
||||
//! for class / interface / struct / Pydantic-model declarations whose
|
||||
//! field types resolve to a recognised [`TypeKind`].
|
||||
//!
|
||||
//! Strictly additive: classes whose fields cannot be classified produce
|
||||
//! a `DtoFields` with an empty `fields` map — the caller must decide
|
||||
//! a `DtoFields` with an empty `fields` map, the caller must decide
|
||||
//! whether to use that as a "Dto with no inferred fields" or fall back
|
||||
//! to the pre-Phase-6 Object/Unknown classification.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use tree_sitter::Node;
|
||||
|
||||
use super::helpers::text_of;
|
||||
use super::params::{java_type_to_kind, python_primitive_to_kind, ts_type_to_kind};
|
||||
use super::params::{
|
||||
java_type_to_kind, python_primitive_to_kind, ts_type_to_kind, ts_type_to_local_collection,
|
||||
};
|
||||
use crate::ssa::type_facts::{DtoFields, TypeKind};
|
||||
|
||||
/// Collect all DTO-shaped class definitions in a parsed file.
|
||||
///
|
||||
/// Dispatches per-language; returns an empty map for languages without
|
||||
/// a Phase 6 collector (Go, Ruby, PHP, C/C++ — DTOs in those ecosystems
|
||||
/// a collector (Go, Ruby, PHP, C/C++; DTOs in those ecosystems
|
||||
/// either don't follow framework conventions Nyx tracks today, or are
|
||||
/// already covered by other type-inference paths).
|
||||
pub(super) fn collect_dto_classes(
|
||||
|
|
@ -39,6 +41,55 @@ pub(super) fn collect_dto_classes(
|
|||
out
|
||||
}
|
||||
|
||||
/// Collect same-file `type X = Map<...>` / `Set<...>` / `T[]`
|
||||
/// aliases for TS / JS so the param classifier can resolve a
|
||||
/// parameter typed `m: ElementsMap` (where
|
||||
/// `type ElementsMap = Map<K, V>`) to
|
||||
/// [`TypeKind::LocalCollection`].
|
||||
///
|
||||
/// Empty for non-JS/TS languages. Cross-file aliases are not
|
||||
/// resolved here, that requires the multi-file type-resolution
|
||||
/// pipeline that doesn't yet exist for TS. Excalidraw's
|
||||
/// `type ElementsMap = Map<...>` is in
|
||||
/// `packages/element/src/types.ts`; users that import the alias
|
||||
/// without a same-file copy still see the original FP. Most
|
||||
/// real-repo aliases the FP cluster touched were declared in the
|
||||
/// same file as their consumers (see fixture).
|
||||
pub(super) fn collect_type_alias_local_collections(
|
||||
root: Node<'_>,
|
||||
lang: &str,
|
||||
code: &[u8],
|
||||
) -> HashSet<String> {
|
||||
let mut out: HashSet<String> = HashSet::new();
|
||||
if matches!(lang, "typescript" | "ts" | "javascript" | "js") {
|
||||
collect_ts_type_alias_local_collections(root, code, &mut out);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn collect_ts_type_alias_local_collections(root: Node<'_>, code: &[u8], out: &mut HashSet<String>) {
|
||||
walk(root, &mut |node| {
|
||||
if node.kind() != "type_alias_declaration" {
|
||||
return;
|
||||
}
|
||||
let Some(name_node) = node.child_by_field_name("name") else {
|
||||
return;
|
||||
};
|
||||
let Some(alias_name) = text_of(name_node, code) else {
|
||||
return;
|
||||
};
|
||||
let Some(value_node) = node.child_by_field_name("value") else {
|
||||
return;
|
||||
};
|
||||
let Some(value_text) = text_of(value_node, code) else {
|
||||
return;
|
||||
};
|
||||
if ts_type_to_local_collection(value_text.trim()).is_some() {
|
||||
out.insert(alias_name);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Java
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -163,7 +214,7 @@ fn extract_ts_property<'a>(node: Node<'a>, code: &'a [u8]) -> Option<(String, Ty
|
|||
let name_node = node.child_by_field_name("name")?;
|
||||
let field_name = text_of(name_node, code)?;
|
||||
let type_anno = node.child_by_field_name("type")?;
|
||||
// type_annotation node text is `: T` — walk to the inner type.
|
||||
// type_annotation node text is `: T`, walk to the inner type.
|
||||
let type_text = type_anno
|
||||
.named_child(0)
|
||||
.and_then(|t| text_of(t, code))
|
||||
|
|
@ -193,7 +244,7 @@ fn collect_rust(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFields
|
|||
return;
|
||||
};
|
||||
if body.kind() != "field_declaration_list" {
|
||||
// Tuple struct or unit struct — no named fields.
|
||||
// Tuple struct or unit struct, no named fields.
|
||||
return;
|
||||
}
|
||||
let mut fields = DtoFields::new(class_name.clone());
|
||||
|
|
@ -291,7 +342,7 @@ fn collect_python(root: Node<'_>, code: &[u8], out: &mut HashMap<String, DtoFiel
|
|||
/// Conservative supertype scan: returns true when the class definition
|
||||
/// has a superclass list whose text mentions `BaseModel` (covers both
|
||||
/// `BaseModel` and `pydantic.BaseModel`). No false positives on
|
||||
/// non-Pydantic classes named `BaseModel`-something — match is on the
|
||||
/// non-Pydantic classes named `BaseModel`-something, match is on the
|
||||
/// full token, not a substring.
|
||||
fn python_inherits_basemodel<'a>(class_node: Node<'a>, code: &'a [u8]) -> bool {
|
||||
let Some(supers) = class_node.child_by_field_name("superclasses") else {
|
||||
|
|
@ -418,7 +469,7 @@ mod tests {
|
|||
"#;
|
||||
let dtos = collect("rust", src);
|
||||
// Tuple structs have no named fields and must NOT produce a
|
||||
// DtoFields entry — Phase 6 only handles named-field DTOs.
|
||||
// DtoFields entry; this collector only handles named-field DTOs.
|
||||
assert!(!dtos.contains_key("Wrap"));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,11 +19,11 @@ pub(crate) fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option<String> {
|
|||
///
|
||||
/// For `Runtime.getRuntime().exec(cmd)`, the receiver of `exec` is the call
|
||||
/// `Runtime.getRuntime()`. This function drills through that to return
|
||||
/// `"Runtime"` — the outermost non-call object. This lets labels like
|
||||
/// `"Runtime"`, the outermost non-call object. This lets labels like
|
||||
/// `"Runtime.exec"` match correctly.
|
||||
pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<String> {
|
||||
match lookup(lang, n.kind()) {
|
||||
// The receiver is itself a call — drill into ITS receiver.
|
||||
// The receiver is itself a call, drill into ITS receiver.
|
||||
// e.g. for `Runtime.getRuntime()`, the object is `Runtime`.
|
||||
Kind::CallFn | Kind::CallMethod => {
|
||||
let inner = n
|
||||
|
|
@ -53,7 +53,7 @@ pub(crate) fn root_receiver_text(n: Node, lang: &str, code: &[u8]) -> Option<Str
|
|||
/// identifier (e.g. call expressions, subscripts, `this`/`self`, etc.).
|
||||
pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
|
||||
let mut cur = n;
|
||||
// Bounded walk — tree-sitter can nest deeply but we only need a handful
|
||||
// Bounded walk, tree-sitter can nest deeply but we only need a handful
|
||||
// of hops for real code.
|
||||
for _ in 0..16 {
|
||||
match cur.kind() {
|
||||
|
|
@ -68,7 +68,7 @@ pub(crate) fn root_member_receiver(n: Node, code: &[u8]) -> Option<String> {
|
|||
cur = cur.child_by_field_name("value")?;
|
||||
}
|
||||
// Drill through nested calls / method chains to find the base
|
||||
// identifier. E.g. `Connection::open(p).unwrap().execute(...)` —
|
||||
// identifier. E.g. in `Connection::open(p).unwrap().execute(...)`,
|
||||
// the receiver of `.execute` is the `.unwrap()` call whose
|
||||
// object is `Connection::open(p)`; we want the leftmost plain
|
||||
// identifier the chain resolves to (for SSA var_stacks lookup).
|
||||
|
|
@ -212,7 +212,7 @@ pub(crate) fn first_call_ident_with_span<'a>(
|
|||
return ident.map(|s| (s, span));
|
||||
}
|
||||
Kind::Function => {
|
||||
// Do not descend into nested function/lambda bodies —
|
||||
// Do not descend into nested function/lambda bodies;
|
||||
// they are separate scopes and should not contribute
|
||||
// callee identifiers to the parent expression.
|
||||
continue;
|
||||
|
|
@ -240,7 +240,7 @@ pub(crate) fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> O
|
|||
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
|
||||
///
|
||||
/// Returns `(callee_text, label, span)` where `span` is the byte range of the
|
||||
/// inner call node itself — used to populate `CallMeta.callee_span` so that
|
||||
/// inner call node itself, used to populate `CallMeta.callee_span` so that
|
||||
/// display sites can report the actual call location rather than the enclosing
|
||||
/// statement's span.
|
||||
pub(crate) fn find_classifiable_inner_call<'a>(
|
||||
|
|
@ -251,7 +251,7 @@ pub(crate) fn find_classifiable_inner_call<'a>(
|
|||
) -> Option<(String, DataLabel, (usize, usize))> {
|
||||
let mut cursor = n.walk();
|
||||
for c in n.children(&mut cursor) {
|
||||
// Do not descend into Kind::Function nodes — they will be extracted
|
||||
// Do not descend into Kind::Function nodes, they will be extracted
|
||||
// as separate BodyCfg entries and should not contribute inner callees
|
||||
// to the parent expression.
|
||||
if lookup(lang, c.kind()) == Kind::Function {
|
||||
|
|
@ -329,7 +329,7 @@ pub(crate) fn member_expr_text_inner(n: Node, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"member_expression" | "attribute" | "selector_expression" => {
|
||||
// Tree-sitter exposes the receiver under `object` (JS/TS, Python),
|
||||
// `value` (Rust field_expression — handled in the matching arm
|
||||
// `value` (Rust field_expression, handled in the matching arm
|
||||
// above), or `operand` (Go selector_expression). Without the
|
||||
// `operand` fallback, Go member access like `r.Body` collapsed to
|
||||
// just the trailing field (`Body`), so source rules keyed on the
|
||||
|
|
@ -442,7 +442,7 @@ pub(crate) fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
|
|||
/// This finds anonymous functions / arrow functions / closures that are
|
||||
/// passed as arguments to a call and should be analysed as separate
|
||||
/// function scopes. Only direct function-argument children are collected
|
||||
/// (not functions nested inside other functions — those get handled when
|
||||
/// (not functions nested inside other functions, those get handled when
|
||||
/// the outer function is recursed into).
|
||||
pub(crate) fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
|
||||
let mut funcs = Vec::new();
|
||||
|
|
@ -558,7 +558,7 @@ pub(crate) fn derive_anon_fn_name_from_context<'a>(
|
|||
}
|
||||
|
||||
// Python: `h = lambda: ...` parents as `assignment`, handled above.
|
||||
// Python `default_parameter` assigning `def foo(x=lambda: 0)` — ambiguous, skip.
|
||||
// Python `default_parameter` assigning `def foo(x=lambda: 0)`, ambiguous, skip.
|
||||
_ => {
|
||||
// Some grammars wrap the RHS in an `expression`, `expression_list`,
|
||||
// or similar node between the binding site and the function literal.
|
||||
|
|
@ -709,7 +709,7 @@ pub(crate) fn collect_idents(n: Node, code: &[u8], out: &mut Vec<String>) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: AST kind names for subscript / index expressions
|
||||
/// AST kind names for subscript / index expressions
|
||||
/// across the languages whose container-element flow we model.
|
||||
///
|
||||
/// JS/TS use `subscript_expression`; Python uses `subscript`; Go uses
|
||||
|
|
@ -724,7 +724,7 @@ pub(crate) fn is_subscript_kind(kind: &str) -> bool {
|
|||
)
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: when the LHS of an assignment statement is a
|
||||
/// when the LHS of an assignment statement is a
|
||||
/// subscript / index expression (or a single-element wrapper around
|
||||
/// one), return that node. Returns `None` for multi-target Go
|
||||
/// `expression_list`s, identifier LHSs, member-expression LHSs, etc.
|
||||
|
|
@ -745,10 +745,10 @@ pub(crate) fn subscript_lhs_node<'a>(lhs: Node<'a>, lang: &str) -> Option<Node<'
|
|||
None
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: extract `(array_text, index_text)` from a
|
||||
/// extract `(array_text, index_text)` from a
|
||||
/// subscript / index AST node.
|
||||
///
|
||||
/// Returns `None` when the array operand is not a plain identifier — we
|
||||
/// Returns `None` when the array operand is not a plain identifier, we
|
||||
/// only synthesise `__index_get__` / `__index_set__` calls when the
|
||||
/// receiver resolves cleanly to a SSA-renamed local, since the W2/W4
|
||||
/// container hooks need a stable receiver var_name to drive
|
||||
|
|
@ -771,7 +771,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
|
|||
n.named_children(&mut cur).nth(1)
|
||||
})?;
|
||||
let arr_kind = arr.kind();
|
||||
// Only proceed when the array is a plain identifier — otherwise
|
||||
// Only proceed when the array is a plain identifier, otherwise
|
||||
// we can't bind a stable receiver name for the synth Call.
|
||||
if !matches!(
|
||||
arr_kind,
|
||||
|
|
@ -780,7 +780,7 @@ pub(crate) fn subscript_components<'a>(n: Node<'a>, code: &'a [u8]) -> Option<(S
|
|||
return None;
|
||||
}
|
||||
let arr_text = text_of(arr, code)?;
|
||||
// PHP-style `$x` strip not needed here — Go/JS/Python don't use it.
|
||||
// PHP-style `$x` strip not needed here, Go/JS/Python don't use it.
|
||||
let idx_text = text_of(idx, code)?;
|
||||
Some((arr_text, idx_text))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
//! Phase 6: per-language class / trait / interface hierarchy extraction.
|
||||
//! per-language class / trait / interface hierarchy extraction.
|
||||
//!
|
||||
//! Walks a parsed file's AST and emits `(sub_container, super_container)`
|
||||
//! pairs for every declared inheritance / impl / implements relationship.
|
||||
|
|
@ -47,7 +47,7 @@ pub(crate) fn collect_hierarchy_edges(
|
|||
"php" => collect_php(root, code, &mut push),
|
||||
"cpp" | "c++" => collect_cpp(root, code, &mut push),
|
||||
// Go: structural / implicit interface satisfaction is intractable
|
||||
// per-file; Phase 6 deliberately skips it.
|
||||
// per-file; deliberately skipped.
|
||||
// C: no inheritance.
|
||||
_ => {}
|
||||
}
|
||||
|
|
@ -70,7 +70,7 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
|
|||
let Some(sub) = text_of(name_node, code) else {
|
||||
return;
|
||||
};
|
||||
// `superclass` field on class_declaration — singular `extends Y`.
|
||||
// `superclass` field on class_declaration, singular `extends Y`.
|
||||
if let Some(superclass) = node.child_by_field_name("superclass") {
|
||||
let mut cursor = superclass.walk();
|
||||
for c in superclass.named_children(&mut cursor) {
|
||||
|
|
@ -79,13 +79,13 @@ fn collect_java<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mu
|
|||
}
|
||||
}
|
||||
}
|
||||
// `interfaces` field on class_declaration — `implements I, J`
|
||||
// `interfaces` field on class_declaration, `implements I, J`
|
||||
// wraps a `super_interfaces` → `type_list`.
|
||||
if let Some(ifaces) = node.child_by_field_name("interfaces") {
|
||||
collect_java_type_list(ifaces, code, &sub, push);
|
||||
}
|
||||
// `extends_interfaces` is an unnamed child on
|
||||
// interface_declaration — `extends Foo, Bar` for an
|
||||
// interface_declaration, `extends Foo, Bar` for an
|
||||
// interface. Walk children directly since it's not a field.
|
||||
let mut cursor = node.walk();
|
||||
for c in node.named_children(&mut cursor) {
|
||||
|
|
@ -123,7 +123,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"type_identifier" | "identifier" => text_of(n, code),
|
||||
"generic_type" => {
|
||||
// `Foo<T>` — the leading child is the bare type identifier.
|
||||
// `Foo<T>`, the leading child is the bare type identifier.
|
||||
let mut cursor = n.walk();
|
||||
for c in n.named_children(&mut cursor) {
|
||||
if matches!(
|
||||
|
|
@ -136,7 +136,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
None
|
||||
}
|
||||
"scoped_type_identifier" => {
|
||||
// `pkg.Foo` — return last segment.
|
||||
// `pkg.Foo`, return last segment.
|
||||
text_of(n, code).map(|s| {
|
||||
let last = s.rsplit('.').next().unwrap_or(&s);
|
||||
last.to_string()
|
||||
|
|
@ -152,7 +152,7 @@ fn type_identifier_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
|
||||
/// Walk for `impl_item` nodes and emit edges from the concrete type to
|
||||
/// the trait being implemented. Inherent impls (`impl Foo {}`) emit
|
||||
/// no edge — there is no super-trait relationship to record.
|
||||
/// no edge, there is no super-trait relationship to record.
|
||||
fn collect_rust<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &mut F) {
|
||||
walk(root, &mut |node| {
|
||||
if node.kind() != "impl_item" {
|
||||
|
|
@ -179,7 +179,7 @@ fn rust_path_leaf(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"type_identifier" | "identifier" => text_of(n, code),
|
||||
"scoped_type_identifier" | "scoped_identifier" => {
|
||||
// `crate::foo::Bar` — last segment.
|
||||
// `crate::foo::Bar`, last segment.
|
||||
let s = text_of(n, code)?;
|
||||
Some(s.rsplit("::").next().unwrap_or(&s).to_string())
|
||||
}
|
||||
|
|
@ -286,12 +286,12 @@ fn collect_python<F: FnMut(String, String)>(root: Node<'_>, code: &[u8], push: &
|
|||
let Some(superclasses) = node.child_by_field_name("superclasses") else {
|
||||
return; // no parents
|
||||
};
|
||||
// `superclasses` is an `argument_list` — each non-keyword
|
||||
// `superclasses` is an `argument_list`, each non-keyword
|
||||
// argument is a base class.
|
||||
let mut cursor = superclasses.walk();
|
||||
for arg in superclasses.named_children(&mut cursor) {
|
||||
if let Some(t) = python_base_text(arg, code) {
|
||||
// Skip Python `object` — not informative.
|
||||
// Skip Python `object`, not informative.
|
||||
if t != "object" {
|
||||
push(sub.clone(), t);
|
||||
}
|
||||
|
|
@ -304,7 +304,7 @@ fn python_base_text(n: Node<'_>, code: &[u8]) -> Option<String> {
|
|||
match n.kind() {
|
||||
"identifier" => text_of(n, code),
|
||||
"attribute" => {
|
||||
// `pkg.Base` — last segment.
|
||||
// `pkg.Base`, last segment.
|
||||
let s = text_of(n, code)?;
|
||||
Some(s.rsplit('.').next().unwrap_or(&s).to_string())
|
||||
}
|
||||
|
|
@ -474,7 +474,7 @@ mod tests {
|
|||
let src = "interface Mine extends Foo, Bar {}";
|
||||
let edges = collect("java", src);
|
||||
// tree-sitter-java models `extends` on interface as `extends_interfaces`
|
||||
// rooted at the same node — at least one of the parents should land.
|
||||
// rooted at the same node, at least one of the parents should land.
|
||||
assert!(
|
||||
edges.iter().any(|(s, _)| s == "Mine"),
|
||||
"interface extends should emit at least one edge; got {edges:?}"
|
||||
|
|
@ -516,8 +516,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn python_class_object_base_skipped() {
|
||||
// Inheriting from `object` is not informative — Python's
|
||||
// implicit root. Phase 6 omits these edges to keep the
|
||||
// Inheriting from `object` is not informative, Python's
|
||||
// implicit root. We omit these edges to keep the
|
||||
// hierarchy index focused on user-defined relationships.
|
||||
let src = "class Plain(object):\n pass\n";
|
||||
let edges = collect("python", src);
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ use tree_sitter::{Node, Tree};
|
|||
/// - ES6: `import { A as B } from 'mod'` → B → ImportBinding { original: A, module: mod }
|
||||
/// - CommonJS: `const { A: B } = require('mod')` → B → ImportBinding { original: A, module: mod }
|
||||
///
|
||||
/// Only aliased (renamed) bindings are recorded — same-name imports (e.g.
|
||||
/// Only aliased (renamed) bindings are recorded, same-name imports (e.g.
|
||||
/// `import { exec }`) are already resolvable by their original name.
|
||||
pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBindings {
|
||||
let mut bindings = ImportBindings::new();
|
||||
|
|
@ -149,7 +149,7 @@ pub(super) fn extract_import_bindings(tree: &Tree, code: &[u8]) -> ImportBinding
|
|||
continue;
|
||||
}
|
||||
// The alias is accessed via the "alias" field (a `name` node).
|
||||
// The qualified name has no field — find it by kind.
|
||||
// The qualified name has no field, find it by kind.
|
||||
let alias_node = clause.child_by_field_name("alias");
|
||||
let mut c2 = clause.walk();
|
||||
let qname_node = clause
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
|||
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
|
||||
/// identifiers lifted from pair values whose key matches any entry in
|
||||
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
|
||||
/// pairs are present, returns `Some(vec![])` — the sink is effectively
|
||||
/// pairs are present, returns `Some(vec![])`, the sink is effectively
|
||||
/// silenced because no destination identifier exists.
|
||||
/// * `None` if the arg is absent, is not an object literal (plain string
|
||||
/// / ident / expression), or has splat/spread children that break static
|
||||
|
|
@ -77,7 +77,7 @@ pub(super) fn extract_destination_field_idents(
|
|||
match child.kind() {
|
||||
// `spread_element` (JS/TS) / `dictionary_splat` (Python): we can't
|
||||
// statically attribute spread contents to specific fields, so
|
||||
// bail out — caller falls back to the whole-arg filter, matching
|
||||
// bail out, caller falls back to the whole-arg filter, matching
|
||||
// the conservative posture used by arg_uses for splats.
|
||||
"spread_element" | "dictionary_splat" => {
|
||||
return None;
|
||||
|
|
@ -107,7 +107,7 @@ pub(super) fn extract_destination_field_idents(
|
|||
}
|
||||
}),
|
||||
// Computed keys like `[someVar]` can't be statically
|
||||
// resolved — skip (conservative: not a destination field).
|
||||
// resolved, skip (conservative: not a destination field).
|
||||
"computed_property_name" => continue,
|
||||
_ => text_of(key_node, code),
|
||||
};
|
||||
|
|
@ -200,7 +200,7 @@ pub(super) fn extract_const_keyword_arg(
|
|||
continue;
|
||||
}
|
||||
let value_node = child.child_by_field_name("value")?;
|
||||
// Only return a literal — identifiers / calls / complex exprs are
|
||||
// Only return a literal, identifiers / calls / complex exprs are
|
||||
// "dynamic" and must be reported as `None` so the gate can
|
||||
// distinguish literal-safe from dynamic.
|
||||
return match value_node.kind() {
|
||||
|
|
@ -252,7 +252,7 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8])
|
|||
/// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as
|
||||
/// `arg0`). Returns `None` when the call has no arguments.
|
||||
///
|
||||
/// Used by per-language shape-aware sink suppression — for example, Ruby
|
||||
/// Used by per-language shape-aware sink suppression, for example, Ruby
|
||||
/// ActiveRecord query methods (`where`, `order`, `pluck`, …) are intrinsically
|
||||
/// parameterised when arg 0 is a hash/symbol/array/non-interpolated string,
|
||||
/// regardless of taint reaching that argument.
|
||||
|
|
@ -268,7 +268,7 @@ pub(super) fn arg0_kind_and_interpolation(call_node: Node) -> Option<(String, bo
|
|||
|
||||
/// Walk a Java method-chain receiver looking for an inner `method_invocation`
|
||||
/// whose method name matches one of `target_methods` (e.g. `createQuery`,
|
||||
/// `prepareStatement`). Returns the kind of that inner call's arg 0 — used
|
||||
/// `prepareStatement`). Returns the kind of that inner call's arg 0, used
|
||||
/// to verify the SQL-bearing call up-chain was given a string literal rather
|
||||
/// than a concatenation / method call.
|
||||
///
|
||||
|
|
@ -307,7 +307,7 @@ pub(super) fn java_chain_arg0_kind_for_method(
|
|||
/// method identifier matches one of `target_methods`, then return that
|
||||
/// inner call's [`arg0_kind_and_interpolation`]. Used when the CFG node
|
||||
/// represents a chained expression like `Model.where(...).preload(...).to_a`
|
||||
/// — the outermost call (`to_a`) has no arguments, so the shape suppressor
|
||||
/// where the outermost call (`to_a`) has no arguments, so the shape suppressor
|
||||
/// must reach down the chain to inspect `where`'s arg 0.
|
||||
///
|
||||
/// Conservative: returns `None` if the chain doesn't contain a matching
|
||||
|
|
@ -353,6 +353,116 @@ fn subtree_has_interpolation(n: Node) -> bool {
|
|||
n.named_children(&mut cursor).any(subtree_has_interpolation)
|
||||
}
|
||||
|
||||
/// Walk a JS/TS method-chain receiver-side to find an inner `call_expression`
|
||||
/// whose member-property name matches one of `target_methods` (e.g. `query`,
|
||||
/// `execute`). Returns the `(kind, has_interp)` of that inner call's arg 0.
|
||||
///
|
||||
/// Used to recognise ORM-accessor chains where a labelled SQL sink sits on
|
||||
/// the receiver side of a parameterised execute method:
|
||||
/// `strapi.db.query('admin::api-token').findOne({...})`. The outer call
|
||||
/// (`findOne`) is the CFG node; the inner labelled `db.query` call carries
|
||||
/// the literal model UID that proves the chain is parameterised.
|
||||
///
|
||||
/// Conservative: returns `None` when no matching inner call is found, so
|
||||
/// callers fall through to the no-suppression path.
|
||||
pub(super) fn js_chain_arg0_kind_for_method(
|
||||
expr: Node,
|
||||
target_methods: &[&str],
|
||||
code: &[u8],
|
||||
) -> Option<(String, bool)> {
|
||||
let n = unwrap_parens(expr);
|
||||
// tree-sitter-typescript / -javascript: call_expression with fields
|
||||
// `function` (member_expression / identifier) and `arguments`.
|
||||
if n.kind() == "call_expression" {
|
||||
// Check this call's callee: if its property name (or full text) ends
|
||||
// with one of `target_methods`, this is the inner labelled call.
|
||||
if let Some(function) = n.child_by_field_name("function") {
|
||||
// Property of a member_expression; falls back to the function
|
||||
// text itself for bare-identifier calls.
|
||||
let prop_text = function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code));
|
||||
let full_text = text_of(function, code);
|
||||
let leaf_text = full_text
|
||||
.as_ref()
|
||||
.map(|s| s.rsplit('.').next().unwrap_or(s).to_string());
|
||||
let matched = target_methods.iter().any(|m| {
|
||||
prop_text.as_deref() == Some(*m)
|
||||
|| leaf_text.as_deref() == Some(*m)
|
||||
|| full_text.as_deref() == Some(*m)
|
||||
|| full_text
|
||||
.as_deref()
|
||||
.is_some_and(|s| s.ends_with(&format!(".{m}")))
|
||||
});
|
||||
if matched {
|
||||
return arg0_kind_and_interpolation(n);
|
||||
}
|
||||
// Drill down the receiver spine: function.object is the prior
|
||||
// call in the chain.
|
||||
if let Some(object) = function.child_by_field_name("object")
|
||||
&& let Some(found) = js_chain_arg0_kind_for_method(object, target_methods, code)
|
||||
{
|
||||
return Some(found);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Walk the receiver chain of a JS/TS call to count *non-execute* method
|
||||
/// calls between the outer call and an inner labelled call to
|
||||
/// `target_inner` (e.g. `query`, `execute`). Returns the immediate outer
|
||||
/// chain method name (e.g. `findOne`) when an inner-call to `target_inner`
|
||||
/// exists somewhere on the receiver spine, otherwise `None`.
|
||||
///
|
||||
/// Used alongside [`js_chain_arg0_kind_for_method`] to verify the chain
|
||||
/// shape `<inner>.query(LITERAL).<orm_method>(...)`, bare
|
||||
/// `connection.query("SELECT ...")` returns `None` because there is no
|
||||
/// outer chain method.
|
||||
pub(super) fn js_chain_outer_method_for_inner<'a>(
|
||||
outer: Node<'a>,
|
||||
target_inner: &[&str],
|
||||
code: &'a [u8],
|
||||
) -> Option<String> {
|
||||
let n = unwrap_parens(outer);
|
||||
if n.kind() != "call_expression" {
|
||||
return None;
|
||||
}
|
||||
let function = n.child_by_field_name("function")?;
|
||||
let object = function.child_by_field_name("object")?;
|
||||
// If `object` itself is a call_expression whose property matches
|
||||
// `target_inner`, the immediate outer is `function.property`.
|
||||
if object.kind() == "call_expression" {
|
||||
let inner_function = object.child_by_field_name("function");
|
||||
if let Some(inner_function) = inner_function {
|
||||
let prop_text = inner_function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code));
|
||||
let full_text = text_of(inner_function, code);
|
||||
let leaf_text = full_text
|
||||
.as_ref()
|
||||
.map(|s| s.rsplit('.').next().unwrap_or(s).to_string());
|
||||
let inner_matched = target_inner.iter().any(|m| {
|
||||
prop_text.as_deref() == Some(*m)
|
||||
|| leaf_text.as_deref() == Some(*m)
|
||||
|| full_text.as_deref() == Some(*m)
|
||||
|| full_text
|
||||
.as_deref()
|
||||
.is_some_and(|s| s.ends_with(&format!(".{m}")))
|
||||
});
|
||||
if inner_matched {
|
||||
return function
|
||||
.child_by_field_name("property")
|
||||
.and_then(|p| text_of(p, code).map(|s| s.to_string()));
|
||||
}
|
||||
}
|
||||
// Recurse: outer chain may have more depth (`a.b().c().d()` ,
|
||||
// d is outermost, c is next, target may be at b or further in).
|
||||
return js_chain_outer_method_for_inner(object, target_inner, code);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// For a chained method call (`a.b().c().d()`), walk down the receiver
|
||||
/// chain (`function.object`) and return the innermost call_expression
|
||||
/// alongside its callee text (e.g. `"http.get"`).
|
||||
|
|
@ -385,7 +495,7 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
return None;
|
||||
}
|
||||
// Recurse: the inner call may itself be chained
|
||||
// (`axios.get(u).then(h).catch(h)` — innermost is `axios.get`).
|
||||
// (`axios.get(u).then(h).catch(h)`, innermost is `axios.get`).
|
||||
if let Some(inner) = find_chained_inner_call(object, lang, code) {
|
||||
return Some(inner);
|
||||
}
|
||||
|
|
@ -398,7 +508,7 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
.or_else(|| object.child_by_field_name("name"))?;
|
||||
// Multi-line dotted member expressions (`http\n .get`) include
|
||||
// formatting whitespace in the source-text slice. The labels map
|
||||
// keys are literal `"http.get"` etc. — strip whitespace so the
|
||||
// keys are literal `"http.get"` etc., strip whitespace so the
|
||||
// chained-call inner-gate rebinding fires for both single-line and
|
||||
// multi-line chain styles. Also strips `\r` for CRLF sources.
|
||||
// Motivated by upstream Parse Server CVE-2025-64430 which uses the
|
||||
|
|
@ -410,18 +520,18 @@ pub(super) fn find_chained_inner_call<'a>(
|
|||
|
||||
/// Recursively walk the receiver chain of `outer` (a CallFn / CallMethod
|
||||
/// node) and yield each *named argument* of every inner call along the
|
||||
/// way. Outer's own arguments are NOT included — the caller already
|
||||
/// way. Outer's own arguments are NOT included, the caller already
|
||||
/// handles those via the standard `pre_emit_arg_source_nodes` pass over
|
||||
/// `outer.arguments`.
|
||||
///
|
||||
/// For `json.NewDecoder(r.Body).Decode(emoji)`:
|
||||
/// outer = `.Decode(emoji)` — caller iterates `emoji`
|
||||
/// inner = `json.NewDecoder(r.Body)` — yielded arg: `r.Body`
|
||||
/// outer = `.Decode(emoji)` , caller iterates `emoji`
|
||||
/// inner = `json.NewDecoder(r.Body)` , yielded arg: `r.Body`
|
||||
///
|
||||
/// We only pull from each inner call's `arguments` field, never from its
|
||||
/// `function`/`method`/receiver expressions. That distinction matters
|
||||
/// because chained source-receivers like `r.URL.Query()` expose a
|
||||
/// member-text path that classifies as a Source — but it's the OUTER
|
||||
/// member-text path that classifies as a Source, but it's the OUTER
|
||||
/// chain text (`r.URL.Query.Get`) that already classifies, so emitting
|
||||
/// a synth source for the inner-call's own callee would double-count.
|
||||
///
|
||||
|
|
@ -498,7 +608,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
return false;
|
||||
}
|
||||
let first_arg = named[0];
|
||||
// Extract the raw text of arg 0 — must be a string literal or
|
||||
// Extract the raw text of arg 0, must be a string literal or
|
||||
// template string without interpolation.
|
||||
let query_text = match first_arg.kind() {
|
||||
"string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => {
|
||||
|
|
@ -511,7 +621,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
.named_children(&mut c)
|
||||
.any(|ch| ch.kind() == "template_substitution")
|
||||
{
|
||||
return false; // dynamic — not safe
|
||||
return false; // dynamic, not safe
|
||||
}
|
||||
text_of(first_arg, code)
|
||||
}
|
||||
|
|
@ -534,7 +644,7 @@ pub(super) fn is_parameterized_query_call(call_node: Node, code: &[u8]) -> bool
|
|||
/// - `$1`, `$2`, …, `$N` (PostgreSQL positional)
|
||||
/// - `?` (MySQL / SQLite positional)
|
||||
/// - `%s` (Python DB-API / psycopg2)
|
||||
/// - `:identifier` (Oracle / named parameters) — requires the colon to be
|
||||
/// - `:identifier` (Oracle / named parameters), requires the colon to be
|
||||
/// preceded by a space or `=` (to avoid matching JS ternary / object
|
||||
/// literals).
|
||||
pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
||||
|
|
@ -559,7 +669,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
|||
&& i + 1 < len
|
||||
&& bytes[i + 1].is_ascii_alphabetic() =>
|
||||
{
|
||||
// :identifier — must be preceded by whitespace/= to avoid
|
||||
// :identifier, must be preceded by whitespace/= to avoid
|
||||
// false positives on object literals or ternary operators.
|
||||
return true;
|
||||
}
|
||||
|
|
@ -581,7 +691,7 @@ pub(super) fn has_sql_placeholders(s: &str) -> bool {
|
|||
#[allow(clippy::only_used_in_recursion)]
|
||||
pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
||||
match node.kind() {
|
||||
// Scalar strings — but reject if they contain interpolation
|
||||
// Scalar strings, but reject if they contain interpolation
|
||||
// (e.g. Ruby `"hello #{name}"`, Python f-strings).
|
||||
"string"
|
||||
| "string_literal"
|
||||
|
|
@ -602,7 +712,7 @@ pub(super) fn is_syntactic_literal(node: Node, code: &[u8]) -> bool {
|
|||
// PHP encapsed_string: safe only if no variable interpolation
|
||||
"encapsed_string" => !has_interpolation_cfg(node),
|
||||
|
||||
// Wrapper: PHP/Go wrap each arg in an `argument` node — unwrap
|
||||
// Wrapper: PHP/Go wrap each arg in an `argument` node, unwrap
|
||||
"argument" => {
|
||||
node.named_child_count() == 1
|
||||
&& node
|
||||
|
|
@ -765,7 +875,7 @@ pub(super) fn has_only_literal_args(call_node: Node, code: &[u8]) -> bool {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
// Zero-arg calls are not "all literal" — taint can still flow via a
|
||||
// Zero-arg calls are not "all literal": taint can still flow via a
|
||||
// non-literal receiver (e.g. `tainted.readObject()`), and the sink-
|
||||
// suppression gate (`info.all_args_literal`) must not skip these.
|
||||
if !any_arg {
|
||||
|
|
@ -781,7 +891,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
|
|||
let mut cursor = node.walk();
|
||||
for child in node.children(&mut cursor) {
|
||||
let kind = child.kind();
|
||||
// Skip argument lists — those are checked by the caller.
|
||||
// Skip argument lists; those are checked by the caller.
|
||||
if kind == "arguments" || kind == "argument_list" || kind == "actual_parameters" {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -804,7 +914,7 @@ pub(super) fn check_inner_call_args(node: Node, code: &[u8]) -> bool {
|
|||
/// Returns one `Vec<String>` per argument (in parameter-position order).
|
||||
/// Returns empty if argument list can't be found or contains spread/keyword args.
|
||||
pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>> {
|
||||
// Ruby `subshell` (backticks) has no `arguments` field — its children are
|
||||
// Ruby `subshell` (backticks) has no `arguments` field, its children are
|
||||
// string fragments and `interpolation` nodes. Lift each interpolation's
|
||||
// identifiers into a positional arg so taint flows from `#{var}` into the
|
||||
// synthetic "subshell" sink.
|
||||
|
|
@ -834,7 +944,7 @@ pub(super) fn extract_arg_uses(call_node: Node, code: &[u8]) -> Vec<Vec<String>>
|
|||
for child in args_node.named_children(&mut cursor) {
|
||||
let kind = child.kind();
|
||||
// Named / keyword arguments are tracked separately in `CallMeta.kwargs`
|
||||
// and do not participate in positional indexing — skip them here so
|
||||
// and do not participate in positional indexing, skip them here so
|
||||
// `arg_uses` remains strictly positional. Splats (spread/dict splat)
|
||||
// still invalidate positional mapping; bail out in that case.
|
||||
if kind == "spread_element"
|
||||
|
|
@ -1058,13 +1168,13 @@ pub(super) fn detect_rust_replace_chain_sanitizer(call_ast: Node, code: &[u8]) -
|
|||
/// Mirrors [`detect_rust_replace_chain_sanitizer`] but for the single-call
|
||||
/// (non-method-chain) Go shape. The caller wires the resulting cap into
|
||||
/// the call's [`crate::labels::DataLabel::Sanitizer`] label, which the
|
||||
/// taint engine consumes via the standard sanitizer pathway — taint flows
|
||||
/// taint engine consumes via the standard sanitizer pathway, taint flows
|
||||
/// in on `s`, the matching cap is stripped from the result.
|
||||
pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> Option<Cap> {
|
||||
if call_ast.kind() != "call_expression" {
|
||||
return None;
|
||||
}
|
||||
// The call's `function` field is a `selector_expression` — `operand`
|
||||
// The call's `function` field is a `selector_expression`, `operand`
|
||||
// is the package ident (`strings`), `field` is the method ident.
|
||||
let func = call_ast.child_by_field_name("function")?;
|
||||
if func.kind() != "selector_expression" {
|
||||
|
|
@ -1085,7 +1195,7 @@ pub(super) fn detect_go_replace_call_sanitizer(call_ast: Node, code: &[u8]) -> O
|
|||
let new_lit = extract_const_string_arg(call_ast, 2, code)?;
|
||||
|
||||
// If the replacement itself reintroduces a dangerous sequence, don't
|
||||
// credit the strip — matches the Rust chain detector's policy.
|
||||
// credit the strip, matches the Rust chain detector's policy.
|
||||
if !caps_stripped_by_literal_pattern(&new_lit).is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
|
@ -1106,7 +1216,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
|
|||
}
|
||||
match lookup(lang, n.kind()) {
|
||||
Kind::Function => {
|
||||
// Function/closure expression passed as argument — return the same
|
||||
// Function/closure expression passed as argument, return the same
|
||||
// synthetic anon name used by build_sub so callback_bindings and
|
||||
// source_to_callback can match it to the extracted BodyCfg.
|
||||
n.child_by_field_name("name")
|
||||
|
|
@ -1155,7 +1265,7 @@ pub(super) fn call_ident_of<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Opti
|
|||
/// returned vector is parallel to [`extract_arg_uses`] / [`extract_arg_callees`].
|
||||
///
|
||||
/// Bails on splats so that a variadic call (`f(*args)`, `f(...xs)`) produces
|
||||
/// an empty vector — positional indices past the splat are meaningless and
|
||||
/// an empty vector, positional indices past the splat are meaningless and
|
||||
/// downstream passes already treat an empty vector as "no info".
|
||||
pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<Option<String>> {
|
||||
let Some(args_node) = call_node.child_by_field_name("arguments") else {
|
||||
|
|
@ -1175,7 +1285,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
return Vec::new();
|
||||
}
|
||||
// Named / keyword arguments are tracked separately in `kwargs` and
|
||||
// don't participate in positional indexing — skip them here so this
|
||||
// don't participate in positional indexing, skip them here so this
|
||||
// vector stays aligned with `arg_uses`.
|
||||
if kind == "keyword_argument" || kind == "named_argument" {
|
||||
continue;
|
||||
|
|
@ -1198,7 +1308,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
| "raw_string_literal"
|
||||
// PHP's double-quoted form (single-quoted maps to `string`).
|
||||
// Only safe to lift when there is no `encapsed_string` /
|
||||
// `embedded_expression` interpolation child — checked below.
|
||||
// `embedded_expression` interpolation child, checked below.
|
||||
| "encapsed_string" => {
|
||||
let raw = text_of(target, code);
|
||||
raw.and_then(|s| strip_literal_quotes(&s, target, code))
|
||||
|
|
@ -1212,7 +1322,7 @@ pub(super) fn extract_arg_string_literals(call_node: Node, code: &[u8]) -> Vec<O
|
|||
|
||||
/// Strip surrounding quotes from a syntactic string literal, resolving the
|
||||
/// `string_content` child for Rust-style two-level string nodes. Returns the
|
||||
/// raw inner text (no escape-sequence processing) — sufficient for whitelist
|
||||
/// raw inner text (no escape-sequence processing), sufficient for whitelist
|
||||
/// matching against shell-metachar sets.
|
||||
pub(super) fn strip_literal_quotes(raw: &str, node: Node, code: &[u8]) -> Option<String> {
|
||||
// Rust/tree-sitter-rust: `string_literal` wraps a `string_content` child.
|
||||
|
|
@ -1320,7 +1430,7 @@ pub(super) fn def_use(
|
|||
// Python/Ruby `expression_statement` → `assignment`)
|
||||
let mut cursor = ast.walk();
|
||||
for child in ast.children(&mut cursor) {
|
||||
// Only use left/right fields for actual assignment nodes — binary
|
||||
// Only use left/right fields for actual assignment nodes, binary
|
||||
// expressions also have left/right but are not definitions.
|
||||
let is_assign = matches!(lookup(lang, child.kind()), Kind::Assignment);
|
||||
let child_name = child
|
||||
|
|
@ -1403,7 +1513,7 @@ pub(super) fn def_use(
|
|||
(defs, uses, vec![])
|
||||
}
|
||||
|
||||
// if‑let / while‑let — the `let_condition` binds a variable from
|
||||
// if‑let / while‑let, the `let_condition` binds a variable from
|
||||
// the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
|
||||
// defines `cmd` and uses `env`, `var`, `CMD`.
|
||||
Kind::If | Kind::While => {
|
||||
|
|
@ -1418,7 +1528,7 @@ pub(super) fn def_use(
|
|||
let mut tmp = Vec::<String>::new();
|
||||
collect_idents(pat, code, &mut tmp);
|
||||
// The first plain identifier in the pattern is the binding.
|
||||
// Skip type identifiers (e.g. "Ok" in Ok(cmd)) — take the
|
||||
// Skip type identifiers (e.g. "Ok" in Ok(cmd)), take the
|
||||
// last ident which is the inner binding name.
|
||||
defs = tmp.into_iter().last();
|
||||
}
|
||||
|
|
|
|||
443
src/cfg/mod.rs
443
src/cfg/mod.rs
|
|
@ -14,6 +14,7 @@ use crate::labels::{
|
|||
};
|
||||
use crate::summary::FuncSummary;
|
||||
use crate::symbol::{FuncKey, Lang};
|
||||
use crate::utils::snippet::truncate_at_char_boundary;
|
||||
use smallvec::SmallVec;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
|
@ -54,8 +55,8 @@ use literals::{
|
|||
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
|
||||
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
|
||||
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
|
||||
is_parameterized_query_call, java_chain_arg0_kind_for_method, ruby_chain_arg0_for_method,
|
||||
walk_chain_inner_call_args,
|
||||
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
|
||||
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
|
||||
};
|
||||
use params::{
|
||||
compute_container_and_kind, extract_param_meta, inject_framework_param_sources,
|
||||
|
|
@ -74,7 +75,7 @@ pub fn extract_param_meta_for_test<'a>(
|
|||
}
|
||||
|
||||
/// Test-only helper to populate the per-file DTO class map without
|
||||
/// running `build_cfg`. Used by the Phase 6 audit harness in
|
||||
/// running `build_cfg`. Used by the DTO audit harness in
|
||||
/// `tests/typed_extractors_audit.rs` to verify that
|
||||
/// `classify_param_type_*` resolves a same-file DTO via the
|
||||
/// thread-local map.
|
||||
|
|
@ -91,30 +92,26 @@ pub fn clear_dto_classes_for_test() {
|
|||
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Structural DFS index for function bodies
|
||||
// -------------------------------------------------------------------------
|
||||
//
|
||||
// Per-file map of function-node start_byte → depth-first preorder index.
|
||||
// Populated at the start of `build_cfg`, consumed by every site that
|
||||
// previously formatted `<anon@{start_byte}>` or stored `start_byte` as
|
||||
// the disambig. The DFS index is stable against edits elsewhere in the
|
||||
// file (inserting a line above a function does not change its index).
|
||||
//
|
||||
// Thread-local is safe because `build_cfg` is not re-entrant within a
|
||||
// single rayon worker: each file is parsed and CFG-built to completion
|
||||
// before the next one starts.
|
||||
// Per-file map of function-node start_byte → DFS preorder index. Stable
|
||||
// against unrelated edits (inserting a line above a function doesn't
|
||||
// change its index). Thread-local is safe because `build_cfg` is not
|
||||
// re-entrant within a single rayon worker.
|
||||
thread_local! {
|
||||
static FN_DFS_INDICES: RefCell<HashMap<usize, u32>> = RefCell::new(HashMap::new());
|
||||
/// Phase 6: per-file DTO class definitions. Populated at the top
|
||||
/// of [`build_cfg`] by [`dto::collect_dto_classes`] so per-parameter
|
||||
/// classifiers can resolve `@RequestBody T dto` /
|
||||
/// `Json<CreateUser>` / `Annotated[CreateUser, Body()]` to a
|
||||
/// [`crate::ssa::type_facts::TypeKind::Dto`] when the DTO type is
|
||||
/// declared in the same file. Cleared at the end of `build_cfg`
|
||||
/// so thread-local state never leaks between files.
|
||||
/// Per-file DTO class definitions, populated at the top of
|
||||
/// [`build_cfg`] so per-parameter classifiers can resolve typed
|
||||
/// extractors against same-file DTOs.
|
||||
pub(crate) static DTO_CLASSES: RefCell<HashMap<String, crate::ssa::type_facts::DtoFields>>
|
||||
= RefCell::new(HashMap::new());
|
||||
/// Per-file set of TS / JS `type X = Map<...>` (or `Set<...>` /
|
||||
/// `Array<...>` / `T[]`) aliases, populated at the top of
|
||||
/// [`build_cfg`]. Lets `classify_param_type_ts` resolve a
|
||||
/// parameter typed `m: ElementsMap` to
|
||||
/// [`crate::ssa::type_facts::TypeKind::LocalCollection`] via
|
||||
/// same-file alias lookup. Cross-file aliases are not yet
|
||||
/// resolved.
|
||||
pub(crate) static TYPE_ALIAS_LC: RefCell<std::collections::HashSet<String>>
|
||||
= RefCell::new(std::collections::HashSet::new());
|
||||
}
|
||||
|
||||
/// Populate the per-file DFS-index map from a preorder walk of the
|
||||
|
|
@ -148,11 +145,8 @@ fn fn_dfs_index(start_byte: usize) -> Option<u32> {
|
|||
FN_DFS_INDICES.with(|cell| cell.borrow().get(&start_byte).copied())
|
||||
}
|
||||
|
||||
/// Synthetic name for an anonymous function. Uses the DFS index when
|
||||
/// available (`<anon#N>`), falls back to the byte offset when the map
|
||||
/// is empty (e.g. during tests that bypass `build_cfg`). The `#`
|
||||
/// sigil is intentionally different from `@` so the two formats are
|
||||
/// distinguishable by downstream consumers.
|
||||
/// Synthetic name for an anonymous function: `<anon#N>` from the DFS
|
||||
/// index when available, `<anon@OFFSET>` as fallback.
|
||||
pub(crate) fn anon_fn_name(start_byte: usize) -> String {
|
||||
match fn_dfs_index(start_byte) {
|
||||
Some(idx) => format!("<anon#{idx}>"),
|
||||
|
|
@ -160,9 +154,7 @@ pub(crate) fn anon_fn_name(start_byte: usize) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// Prefix check that accepts both the new `<anon#...>` and legacy
|
||||
/// `<anon@...>` formats. Used by code paths that classify whether a
|
||||
/// function name came from anonymous synthesis.
|
||||
/// True for any anonymous-function synthesis prefix.
|
||||
pub(crate) fn is_anon_fn_name(name: &str) -> bool {
|
||||
name.starts_with("<anon#") || name.starts_with("<anon@")
|
||||
}
|
||||
|
|
@ -235,9 +227,9 @@ pub struct CallMeta {
|
|||
///
|
||||
/// CFG construction does NOT populate this field today (callee already
|
||||
/// carries the full path). It is the canonical place to read the original
|
||||
/// textual callee for **debug/display only** — analysis code should walk
|
||||
/// SSA `FieldProj` receivers (Phase 4) or use the
|
||||
/// [`crate::labels::bare_method_name`] textual fallback (Phase 5).
|
||||
/// textual callee for **debug/display only**; analysis code should walk
|
||||
/// SSA `FieldProj` receivers or use the
|
||||
/// [`crate::labels::bare_method_name`] textual fallback.
|
||||
#[doc(hidden)]
|
||||
#[serde(default)]
|
||||
pub callee_text: Option<String>,
|
||||
|
|
@ -248,14 +240,14 @@ pub struct CallMeta {
|
|||
pub outer_callee: Option<String>,
|
||||
/// Byte span of the inner call that supplied the classification, when
|
||||
/// `find_classifiable_inner_call` overrode the outer callee. `None` when
|
||||
/// the classification came from the outer AST node directly — in that
|
||||
/// the classification came from the outer AST node directly, in that
|
||||
/// case `AstMeta.span` already points at the classified expression.
|
||||
///
|
||||
/// Consumers that want the location of the *labeled* call (sink/source/
|
||||
/// sanitizer display, flow-step rendering, taint origin attribution)
|
||||
/// should use [`NodeInfo::classification_span`] rather than reading this
|
||||
/// field directly. `AstMeta.span` remains the authoritative "whole
|
||||
/// statement" span — used by structural passes (unreachability,
|
||||
/// statement" span, used by structural passes (unreachability,
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup).
|
||||
#[serde(default)]
|
||||
pub callee_span: Option<(usize, usize)>,
|
||||
|
|
@ -283,7 +275,7 @@ pub struct CallMeta {
|
|||
/// only positional arguments.
|
||||
pub kwargs: Vec<(String, Vec<String>)>,
|
||||
/// String-literal value at each positional argument of this call, parallel
|
||||
/// to `arg_uses` — `Some(s)` when the argument is a syntactic string
|
||||
/// to `arg_uses`, `Some(s)` when the argument is a syntactic string
|
||||
/// literal, `None` otherwise. Empty for non-call nodes or when positional
|
||||
/// boundaries can't be determined. Consumed by the static-map abstract
|
||||
/// analysis (and future literal-aware passes) so they don't need the
|
||||
|
|
@ -302,10 +294,41 @@ pub struct CallMeta {
|
|||
///
|
||||
/// Takes priority over `sink_payload_args` in the SSA sink scan: when a
|
||||
/// call has an object-literal destination arg, only idents under the
|
||||
/// listed fields may contribute sink findings — not every ident in the
|
||||
/// listed fields may contribute sink findings, not every ident in the
|
||||
/// positional slot.
|
||||
///
|
||||
/// Legacy single-gate path: populated only when this call site matched
|
||||
/// exactly one gate. When a callee carries multiple gates (e.g. `fetch`
|
||||
/// is both an SSRF and a `DATA_EXFIL` gate), per-gate filters live in
|
||||
/// [`Self::gate_filters`] and this field is left `None`.
|
||||
#[serde(default)]
|
||||
pub destination_uses: Option<Vec<String>>,
|
||||
/// Per-gate filters for callees that carry multiple gated-sink rules.
|
||||
///
|
||||
/// Each entry preserves one matching gate's `(label_caps, payload_args,
|
||||
/// destination_uses)` so the SSA sink scan can attribute findings
|
||||
/// per-cap. Empty when the call site matches zero or exactly one gate
|
||||
/// (the single-gate case continues to use [`Self::sink_payload_args`] +
|
||||
/// [`Self::destination_uses`]).
|
||||
#[serde(default)]
|
||||
pub gate_filters: Vec<GateFilter>,
|
||||
}
|
||||
|
||||
/// One gate's contribution at a call site whose callee matches multiple
|
||||
/// gates. The SSA taint engine processes each filter independently so a
|
||||
/// `fetch({url: tainted}, {body: tainted})` flow surfaces as one SSRF
|
||||
/// finding (URL filter) plus one `DATA_EXFIL` finding (body filter), each
|
||||
/// carrying its own cap mask rather than a conflated union.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct GateFilter {
|
||||
/// Sink caps emitted by this gate (e.g. `Cap::SSRF`, `Cap::DATA_EXFIL`).
|
||||
pub label_caps: crate::labels::Cap,
|
||||
/// Argument positions that carry the tainted payload for this gate.
|
||||
pub payload_args: Vec<usize>,
|
||||
/// Destination-aware filter: when `Some(names)`, the sink check only
|
||||
/// considers SSA values whose `var_name` matches one of `names` (object-
|
||||
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
|
||||
pub destination_uses: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
/// Taint-classification and variable-flow metadata.
|
||||
|
|
@ -349,7 +372,7 @@ pub struct NodeInfo {
|
|||
///
|
||||
/// This flag is scoped to taint-style sink suppression: it indicates
|
||||
/// that no attacker-controlled data enters through the immediate
|
||||
/// arguments. It does NOT mean the call is "safe" in general — other
|
||||
/// arguments. It does NOT mean the call is "safe" in general; other
|
||||
/// detectors (resource lifecycle, structural analysis) may still
|
||||
/// legitimately flag these calls.
|
||||
pub all_args_literal: bool,
|
||||
|
|
@ -411,7 +434,7 @@ pub struct NodeInfo {
|
|||
pub is_eq_with_const: bool,
|
||||
/// True when this node reads a numeric-length property on a container:
|
||||
/// `arr.length`, `map.size`, `buf.byteLength`, `items.count`, `vec.len()`
|
||||
/// — either as a pure property access or as a zero-arg method call.
|
||||
/// (either as a pure property access or as a zero-arg method call).
|
||||
/// Populated by inspecting the AST in `push_node` across JS/TS, Python,
|
||||
/// Ruby, Java, Rust, PHP, and C/C++ idioms where these accessors return
|
||||
/// an integer. Consumed by the type-fact analysis (`ssa::type_facts`)
|
||||
|
|
@ -419,12 +442,12 @@ pub struct NodeInfo {
|
|||
/// FILE_IO / SHELL_ESCAPE sink suppression for provably numeric
|
||||
/// payloads.
|
||||
pub is_numeric_length_access: bool,
|
||||
/// Phase 6.3: the field name read on the RHS of an assignment whose
|
||||
/// The field name read on the RHS of an assignment whose
|
||||
/// RHS is a single member-access expression (e.g. `let x = dto.email`).
|
||||
/// Set to `Some("email")` for that shape; left `None` otherwise.
|
||||
/// Consumed by the type-fact analysis (`ssa::type_facts`) so reads
|
||||
/// against a [`crate::ssa::type_facts::TypeKind::Dto`] receiver pick
|
||||
/// up the field's declared `TypeKind`. Strictly additive — when
|
||||
/// up the field's declared `TypeKind`. Strictly additive, when
|
||||
/// `None`, the legacy copy-prop semantics apply.
|
||||
pub member_field: Option<String>,
|
||||
}
|
||||
|
|
@ -442,7 +465,7 @@ impl NodeInfo {
|
|||
/// lines, flow-step rendering, symbolic witness extraction, debug views.
|
||||
///
|
||||
/// Use `ast.span` directly for **structural grain**: unreachability,
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup — anywhere
|
||||
/// resource lifecycle, guard byte scans, CFG/taint span dedup, anywhere
|
||||
/// the enclosing statement is the meaningful unit.
|
||||
#[inline]
|
||||
pub fn classification_span(&self) -> (usize, usize) {
|
||||
|
|
@ -514,7 +537,7 @@ pub struct BodyMeta {
|
|||
/// Per-parameter [`crate::ssa::type_facts::TypeKind`] inferred from
|
||||
/// decorators / annotations / static type text at CFG construction
|
||||
/// time. Same length as `params`; positions with no recoverable
|
||||
/// type info are `None`. Strictly additive — when every entry is
|
||||
/// type info are `None`. Strictly additive, when every entry is
|
||||
/// `None`, downstream behaviour is identical to the pre-Phase-1
|
||||
/// engine.
|
||||
pub param_types: Vec<Option<crate::ssa::type_facts::TypeKind>>,
|
||||
|
|
@ -528,7 +551,7 @@ pub struct BodyMeta {
|
|||
/// `LocalFuncSummary`. `None` for the synthetic top-level body.
|
||||
///
|
||||
/// All intra-file maps keyed on function identity (SSA summaries, callee
|
||||
/// bodies, inline cache, callback bindings) use this key — never the bare
|
||||
/// bodies, inline cache, callback bindings) use this key, never the bare
|
||||
/// leaf `name`, which is collision-prone across (container, arity,
|
||||
/// disambig, kind).
|
||||
pub func_key: Option<FuncKey>,
|
||||
|
|
@ -589,7 +612,7 @@ pub struct FileCfg {
|
|||
/// Promisify wrapper aliases: local name → wrapped callee name.
|
||||
/// Only populated for JS/TS files.
|
||||
pub promisify_aliases: PromisifyAliases,
|
||||
/// Phase 6: per-file class / trait / interface hierarchy edges.
|
||||
/// Per-file class / trait / interface hierarchy edges.
|
||||
/// Each entry is `(sub_container, super_container)` after
|
||||
/// language-specific normalisation. See
|
||||
/// [`crate::cfg::hierarchy`] for the per-language extraction
|
||||
|
|
@ -711,14 +734,10 @@ fn extract_condition_raw<'a>(
|
|||
vars.dedup();
|
||||
vars.truncate(MAX_COND_VARS);
|
||||
|
||||
// 4. Extract text, truncated.
|
||||
let text = text_of(cond, code).map(|t| {
|
||||
if t.len() > MAX_CONDITION_TEXT_LEN {
|
||||
t[..MAX_CONDITION_TEXT_LEN].to_string()
|
||||
} else {
|
||||
t
|
||||
}
|
||||
});
|
||||
// 4. Extract text, truncated. UTF-8-safe: gogs (Gurmukhi) /
|
||||
// discourse (Cyrillic) trip raw byte slices on regex literals.
|
||||
let text = text_of(cond, code)
|
||||
.map(|t| truncate_at_char_boundary(&t, MAX_CONDITION_TEXT_LEN).to_string());
|
||||
|
||||
(text, vars, negated)
|
||||
}
|
||||
|
|
@ -739,7 +758,7 @@ pub(super) fn detect_negation<'a>(
|
|||
_if_ast: Node<'a>,
|
||||
_lang: &str,
|
||||
) -> (Node<'a>, bool) {
|
||||
// Unwrap parenthesized_expression — JS/Java/PHP wrap if-conditions in parens.
|
||||
// Unwrap parenthesized_expression, JS/Java/PHP wrap if-conditions in parens.
|
||||
// This lets us detect negation inside: `if (!expr)` → cond is `(!expr)`.
|
||||
let cond = if cond.kind() == "parenthesized_expression" {
|
||||
cond.child_by_field_name("expression")
|
||||
|
|
@ -811,7 +830,7 @@ fn extract_bin_op(ast: Node, lang: &str) -> Option<BinOp> {
|
|||
"*" => Some(BinOp::Mul),
|
||||
"/" => Some(BinOp::Div),
|
||||
"%" => Some(BinOp::Mod),
|
||||
// Bitwise (single-char tokens — no conflict with && / ||)
|
||||
// Bitwise (single-char tokens, no conflict with && / ||)
|
||||
"&" => Some(BinOp::BitAnd),
|
||||
"|" => Some(BinOp::BitOr),
|
||||
"^" => Some(BinOp::BitXor),
|
||||
|
|
@ -909,7 +928,7 @@ fn extract_template_prefix(ast: Node, lang: &str, code: &[u8]) -> Option<String>
|
|||
/// `extract_template_prefix` for both assignment RHS and call arguments.
|
||||
///
|
||||
/// Also descends through `await` / `yield` wrappers and into the first
|
||||
/// argument of a call expression — this covers the common sink shape
|
||||
/// argument of a call expression, this covers the common sink shape
|
||||
/// `await axios.get(\`https://host/…${x}\`)` where the template literal lives
|
||||
/// inside a call inside an `await` wrapper.
|
||||
fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
||||
|
|
@ -930,7 +949,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
}
|
||||
"call_expression" | "call" | "new_expression" => {
|
||||
// Descend into the first positional argument (e.g.
|
||||
// `axios.get(\`https://…${x}\`)` — the URL we want to lock
|
||||
// `axios.get(\`https://…${x}\`)`, the URL we want to lock
|
||||
// is the template-literal first argument of the call).
|
||||
let args = cur
|
||||
.child_by_field_name("arguments")
|
||||
|
|
@ -942,7 +961,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
}
|
||||
}
|
||||
|
||||
// Case 1: template literal — `\`scheme://host/…${x}…\``.
|
||||
// Case 1: template literal, `\`scheme://host/…${x}…\``.
|
||||
if cur.kind() == "template_string" {
|
||||
let mut w = cur.walk();
|
||||
let first_child = cur.named_children(&mut w).next()?;
|
||||
|
|
@ -957,7 +976,7 @@ fn prefix_of_expression(node: Node, code: &[u8]) -> Option<String> {
|
|||
return None;
|
||||
}
|
||||
|
||||
// Case 2: `"scheme://host/" + x` — LHS is a string literal.
|
||||
// Case 2: `"scheme://host/" + x`, LHS is a string literal.
|
||||
if cur.kind() == "binary_expression" {
|
||||
let mut w2 = cur.walk();
|
||||
let mut ops = cur.children(&mut w2).filter(|c| !c.is_named());
|
||||
|
|
@ -1028,7 +1047,7 @@ fn extract_bin_op_const(ast: Node, lang: &str, code: &[u8]) -> Option<i64> {
|
|||
}
|
||||
}
|
||||
|
||||
// Try left, then right — one of them should be a literal
|
||||
// Try left, then right; one of them should be a literal
|
||||
try_parse_number(left, code).or_else(|| try_parse_number(right, code))
|
||||
}
|
||||
|
||||
|
|
@ -1067,7 +1086,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
|
|||
.named_child(0)
|
||||
.is_some_and(|c| is_boolean_eq_const_tree(c, lang)),
|
||||
"unary_expression" | "not_operator" => {
|
||||
// `!` / `not` — operator is an anonymous child; operand is the
|
||||
// `!` / `not`, operator is an anonymous child; operand is the
|
||||
// single named child.
|
||||
let mut w = node.walk();
|
||||
let mut op_is_not = false;
|
||||
|
|
@ -1084,7 +1103,7 @@ fn is_boolean_eq_const_tree(node: Node, lang: &str) -> bool {
|
|||
.is_some_and(|c| is_boolean_eq_const_tree(c, lang))
|
||||
}
|
||||
"boolean_operator" => {
|
||||
// Python `and`/`or` — operands are named children.
|
||||
// Python `and`/`or`, operands are named children.
|
||||
let l = node.named_child(0);
|
||||
let r = node.named_child(1);
|
||||
l.is_some_and(|n| is_boolean_eq_const_tree(n, lang))
|
||||
|
|
@ -1137,9 +1156,9 @@ fn binary_operator_token(node: Node) -> Option<String> {
|
|||
/// Property names whose value is provably an integer across the supported
|
||||
/// languages: JS/TS `arr.length` (Array/String/TypedArray), `map.size`
|
||||
/// (Map/Set), `buffer.byteLength` (ArrayBuffer/TypedArray); Python `.count`
|
||||
/// (`str.count`, `list.count`, `tuple.count` — all return int); Ruby `.length`
|
||||
/// (`str.count`, `list.count`, `tuple.count`, all return int); Ruby `.length`
|
||||
/// / `.size` / `.count`; Java `.size()` / `.length()`; Rust `.len()`. This
|
||||
/// list is intentionally narrow — only properties whose semantics across every
|
||||
/// list is intentionally narrow, only properties whose semantics across every
|
||||
/// host we scan return an integer, so the `TypeKind::Int` fact is sound.
|
||||
fn is_numeric_length_property(name: &str) -> bool {
|
||||
matches!(name, "length" | "size" | "byteLength" | "count" | "len")
|
||||
|
|
@ -1157,7 +1176,7 @@ fn is_numeric_length_property(name: &str) -> bool {
|
|||
/// Consumed by the type-fact analysis (`ssa::type_facts::analyze_types`) to
|
||||
/// infer `TypeKind::Int` on the defined value so sink-cap suppression can
|
||||
/// treat `"row " + arr.length` as a non-injectable payload.
|
||||
/// Phase 6.3: when the RHS of an assignment / declaration is a single
|
||||
/// When the RHS of an assignment / declaration is a single
|
||||
/// member-access expression (`let x = dto.email`, `x = obj.field`,
|
||||
/// `let x = obj["field"]`), return the property name. The CFG type-fact
|
||||
/// analysis uses the recovered name to look up the field's declared
|
||||
|
|
@ -1321,7 +1340,7 @@ fn find_single_binary_expr<'a>(ast: Node<'a>, lang: &str) -> Option<Node<'a>> {
|
|||
|
||||
// Check if ast itself is a binary expression
|
||||
if is_binary_expr_kind(ast_kind, lang) {
|
||||
// Verify it has exactly 2 named children (left, right) — no nesting
|
||||
// Verify it has exactly 2 named children (left, right), no nesting
|
||||
let named_count = ast.named_child_count();
|
||||
if named_count == 2 {
|
||||
// Ensure neither child is itself a binary expression (that would
|
||||
|
|
@ -1435,7 +1454,7 @@ pub(super) fn push_node<'a>(
|
|||
// (e.g. PHP `object_creation_expression` has positional children).
|
||||
.or_else(|| find_constructor_type_child(ast))
|
||||
.and_then(|n| {
|
||||
// IIFE: `(function(x){...})(arg)` — the called expression is a
|
||||
// IIFE: `(function(x){...})(arg)`, the called expression is a
|
||||
// function literal with no identifier. Bind the call to the
|
||||
// anonymous body's synthetic name so resolve_callee can find
|
||||
// the extracted BodyCfg/summary. Without this, text_of() would
|
||||
|
|
@ -1512,7 +1531,7 @@ pub(super) fn push_node<'a>(
|
|||
// If this is a declaration/expression wrapper or an assignment that
|
||||
// *contains* a call, prefer the first inner call identifier instead of
|
||||
// the whole line. Track the inner call's byte span so we can populate
|
||||
// `CallMeta.callee_span` once the labels settle — enabling narrow
|
||||
// `CallMeta.callee_span` once the labels settle, enabling narrow
|
||||
// source-location reporting when the classified call lives several lines
|
||||
// below the enclosing statement (e.g. call inside a multi-line template
|
||||
// literal).
|
||||
|
|
@ -1546,9 +1565,9 @@ pub(super) fn push_node<'a>(
|
|||
let mut labels = classify_all(lang, &text, extra);
|
||||
|
||||
// If the outermost call didn't classify, try inner/nested calls.
|
||||
// E.g. `str(eval(expr))` — `str` is not a sink, but `eval` is.
|
||||
// E.g. `str(eval(expr))`: `str` is not a sink, but `eval` is.
|
||||
// When the callee is overridden, save the original for container ops
|
||||
// (e.g. `parts.add(req.getParameter(...))` — callee becomes
|
||||
// (e.g. `parts.add(req.getParameter(...))`: callee becomes
|
||||
// "req.getParameter" but outer_callee preserves "parts.add").
|
||||
let mut outer_callee: Option<String> = None;
|
||||
let mut inner_callee_span: Option<(usize, usize)> = None;
|
||||
|
|
@ -1568,7 +1587,7 @@ pub(super) fn push_node<'a>(
|
|||
|
||||
// For assignments like `element.innerHTML = value`, the inner-call heuristic
|
||||
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
|
||||
// If that didn't produce a label, check the LHS property name — it may be a
|
||||
// If that didn't produce a label, check the LHS property name, it may be a
|
||||
// sink like `innerHTML`.
|
||||
//
|
||||
// This covers both direct `Kind::Assignment` nodes and `Kind::CallWrapper`
|
||||
|
|
@ -1588,7 +1607,7 @@ pub(super) fn push_node<'a>(
|
|||
if let Some(assign) = assign_node
|
||||
&& let Some(lhs) = assign.child_by_field_name("left")
|
||||
{
|
||||
// Try full member expression first (e.g. "location.href") — more
|
||||
// Try full member expression first (e.g. "location.href"), more
|
||||
// specific and avoids false positives on `a.href`.
|
||||
if let Some(full) = member_expr_text(lhs, code) {
|
||||
if let Some(l) = classify(lang, &full, extra) {
|
||||
|
|
@ -1612,7 +1631,7 @@ pub(super) fn push_node<'a>(
|
|||
// try to classify the member expression text as a source.
|
||||
// This handles `var x = process.env.CMD` (JS), `os.environ["KEY"]` (Python),
|
||||
// and similar property-access-based source patterns.
|
||||
// Skip when the assignment's RHS is itself a function/lambda literal —
|
||||
// Skip when the assignment's RHS is itself a function/lambda literal;
|
||||
// labels found by `first_member_label` would come from inside the
|
||||
// closure body and shouldn't tag the outer wrapper (e.g. Go's
|
||||
// `run := func() { exec.Command(...) }` would otherwise inherit
|
||||
|
|
@ -1687,7 +1706,7 @@ pub(super) fn push_node<'a>(
|
|||
if labels.is_empty()
|
||||
&& let Some(outer) = call_ast
|
||||
&& let Some((inner, inner_callee_text)) = find_chained_inner_call(outer, lang, code)
|
||||
&& classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_some()
|
||||
&& !classify_gated_sink(lang, &inner_callee_text, |_| None, |_| None, |_| false).is_empty()
|
||||
{
|
||||
call_ast = Some(inner);
|
||||
outer_callee = Some(text.clone());
|
||||
|
|
@ -1707,13 +1726,14 @@ pub(super) fn push_node<'a>(
|
|||
// the outer statement `text`, so gate matcher names like `"fetch"` hit.
|
||||
let mut sink_payload_args: Option<Vec<usize>> = None;
|
||||
let mut destination_uses: Option<Vec<String>> = None;
|
||||
let mut gate_filters: Vec<GateFilter> = Vec::new();
|
||||
if labels.is_empty() {
|
||||
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
|
||||
if let Some(cn) = gate_call {
|
||||
let gate_callee_text = if call_ast.is_some() {
|
||||
text.clone()
|
||||
} else {
|
||||
// Inner call reached via wrapper — use the call-expression's
|
||||
// Inner call reached via wrapper, use the call-expression's
|
||||
// function name directly. Falls back to `text` so non-call-
|
||||
// expression kinds (method calls, Ruby `call` nodes, macros)
|
||||
// still have a usable callee string.
|
||||
|
|
@ -1723,51 +1743,84 @@ pub(super) fn push_node<'a>(
|
|||
.and_then(|f| text_of(f, code))
|
||||
.unwrap_or_else(|| text.clone())
|
||||
};
|
||||
if let Some(gm) = classify_gated_sink(
|
||||
let matches = classify_gated_sink(
|
||||
lang,
|
||||
&gate_callee_text,
|
||||
|idx| extract_const_string_arg(cn, idx, code),
|
||||
|kw| extract_const_keyword_arg(cn, kw, code),
|
||||
|kw| has_keyword_arg(cn, kw, code),
|
||||
) {
|
||||
labels.push(gm.label);
|
||||
let payload = gm.payload_args;
|
||||
if payload == crate::labels::ALL_ARGS_PAYLOAD {
|
||||
// Dynamic-activation sentinel: every positional arg is
|
||||
// conservatively a payload. Expand using the actual call
|
||||
// arity so `collect_tainted_sink_values` checks each one.
|
||||
let arity = extract_arg_uses(cn, code).len();
|
||||
if arity > 0 {
|
||||
sink_payload_args = Some((0..arity).collect());
|
||||
}
|
||||
} else if !payload.is_empty() {
|
||||
sink_payload_args = Some(payload.to_vec());
|
||||
}
|
||||
);
|
||||
|
||||
// Destination-aware gates (outbound HTTP clients): when the
|
||||
// gate declares destination-bearing object fields and the
|
||||
// positional destination arg at call time is an object
|
||||
// literal, narrow sink-taint checks to identifiers under
|
||||
// those fields. Non-object arg forms (string / ident /
|
||||
// expression) return `None` from the extractor and fall
|
||||
// through to whole-arg positional filtering.
|
||||
//
|
||||
// We only populate destination_uses for the FIRST payload
|
||||
// position that is an object literal. For outbound HTTP
|
||||
// gates `payload_args` is always a single position (arg 0)
|
||||
// so this is exact.
|
||||
if !gm.object_destination_fields.is_empty() {
|
||||
for &pos in gm.payload_args {
|
||||
if let Some(names) = extract_destination_field_idents(
|
||||
cn,
|
||||
pos,
|
||||
gm.object_destination_fields,
|
||||
code,
|
||||
) {
|
||||
destination_uses = Some(names);
|
||||
break;
|
||||
if !matches.is_empty() {
|
||||
// Per-gate filter accumulation. Each match contributes:
|
||||
// * its label (added to `labels` so `resolve_sink_caps`
|
||||
// downstream sees the union),
|
||||
// * a `GateFilter` carrying that gate's specific
|
||||
// `(label_caps, payload_args, destination_uses)` so
|
||||
// the SSA sink scan can attribute taint per-cap.
|
||||
let mut union_payload: Vec<usize> = Vec::new();
|
||||
for gm in &matches {
|
||||
labels.push(gm.label);
|
||||
|
||||
let payload_vec: Vec<usize> =
|
||||
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
|
||||
// Dynamic-activation sentinel: every positional arg is
|
||||
// conservatively a payload. Expand using the actual
|
||||
// call arity so `collect_tainted_sink_values` checks
|
||||
// each one.
|
||||
let arity = extract_arg_uses(cn, code).len();
|
||||
(0..arity).collect()
|
||||
} else {
|
||||
gm.payload_args.to_vec()
|
||||
};
|
||||
|
||||
// Destination-aware gates: when the gate declares
|
||||
// destination-bearing object fields and a payload-position
|
||||
// arg is an object literal at call time, narrow sink-taint
|
||||
// checks to identifiers under those fields. Non-object
|
||||
// arg forms return `None` from the extractor and the gate
|
||||
// falls back to whole-arg positional filtering.
|
||||
let mut dest_uses: Option<Vec<String>> = None;
|
||||
if !gm.object_destination_fields.is_empty() {
|
||||
for &pos in gm.payload_args {
|
||||
if let Some(names) = extract_destination_field_idents(
|
||||
cn,
|
||||
pos,
|
||||
gm.object_destination_fields,
|
||||
code,
|
||||
) {
|
||||
dest_uses = Some(names);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let label_caps = match gm.label {
|
||||
crate::labels::DataLabel::Sink(c) => c,
|
||||
_ => crate::labels::Cap::empty(),
|
||||
};
|
||||
|
||||
for &p in &payload_vec {
|
||||
if !union_payload.contains(&p) {
|
||||
union_payload.push(p);
|
||||
}
|
||||
}
|
||||
gate_filters.push(GateFilter {
|
||||
label_caps,
|
||||
payload_args: payload_vec,
|
||||
destination_uses: dest_uses,
|
||||
});
|
||||
}
|
||||
if !union_payload.is_empty() {
|
||||
sink_payload_args = Some(union_payload);
|
||||
}
|
||||
// Legacy single-gate path keeps `destination_uses` populated so
|
||||
// the SSA fast-path (one filter) continues to work without
|
||||
// consulting `gate_filters`. When multiple gates match,
|
||||
// per-position filters live in `gate_filters` and the legacy
|
||||
// field is intentionally left `None`.
|
||||
if gate_filters.len() == 1 {
|
||||
destination_uses = gate_filters[0].destination_uses.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1778,7 +1831,7 @@ pub(super) fn push_node<'a>(
|
|||
// path-traversal or HTML metacharacters. The CFG collapses the whole
|
||||
// chain into a single call node, so detection must inspect the AST of
|
||||
// that node directly. Only fires when no Sanitizer label already
|
||||
// classifies this node — existing label rules win.
|
||||
// classifies this node, existing label rules win.
|
||||
if lang == "rust" && !labels.iter().any(|l| matches!(l, DataLabel::Sanitizer(_))) {
|
||||
if let Some(cn) = call_ast {
|
||||
if cn.kind() == "call_expression" || cn.kind() == "method_call_expression" {
|
||||
|
|
@ -1815,7 +1868,7 @@ pub(super) fn push_node<'a>(
|
|||
// `having` / `joins` as `Sink(SQL_QUERY)` because their string-interpolation
|
||||
// form (`Model.where("id = #{x}")`) is a real SQLi vector. But the same
|
||||
// methods are intrinsically parameterised when arg 0 is a hash, symbol,
|
||||
// array, or non-interpolated string — Rails escapes the values. Rather
|
||||
// array, or non-interpolated string, Rails escapes the values. Rather
|
||||
// than dropping the sink (which would lose the genuine TPs), synthesise
|
||||
// a same-node `Sanitizer(SQL_QUERY)` for the safe shapes; this clears
|
||||
// SQL taint at the call and reflexively dominates the sink, suppressing
|
||||
|
|
@ -1825,7 +1878,7 @@ pub(super) fn push_node<'a>(
|
|||
// Chained calls (`Model.where(...).preload(...).to_a`) collapse into a
|
||||
// single CFG node whose outer `call_ast` may be `to_a` (no args). The
|
||||
// shape inspection has to walk the receiver chain to reach the AR query
|
||||
// call itself — `ruby_chain_arg0_for_method` does that walk.
|
||||
// call itself, `ruby_chain_arg0_for_method` does that walk.
|
||||
if (lang == "ruby" || lang == "rb")
|
||||
&& labels
|
||||
.iter()
|
||||
|
|
@ -1859,7 +1912,7 @@ pub(super) fn push_node<'a>(
|
|||
// and `Statement.executeQuery(String)` overloads are real injection
|
||||
// sinks when given a concatenated SQL string. But the same method
|
||||
// names on JPA `javax.persistence.Query` and JDBC `PreparedStatement`
|
||||
// are zero-arg — they execute SQL that was bound upstream by
|
||||
// are zero-arg, they execute SQL that was bound upstream by
|
||||
// `entityManager.createQuery(LITERAL)` / `connection.prepareStatement(LITERAL)`,
|
||||
// and any bind values went through `setParameter` / `setString`
|
||||
// (which the JDBC/JPA driver escapes). Walk the receiver chain to
|
||||
|
|
@ -1894,7 +1947,7 @@ pub(super) fn push_node<'a>(
|
|||
// (`createQuery` / `createNativeQuery` / `prepareStatement`)
|
||||
// and require its arg 0 to be a string literal. Anything
|
||||
// else (binary concat, identifier, method call) leaves
|
||||
// the sink in place — we cannot prove the SQL is
|
||||
// the sink in place, we cannot prove the SQL is
|
||||
// parameterised, so the structural finding stands.
|
||||
const JPA_BIND_METHODS: &[&str] = &[
|
||||
"createQuery",
|
||||
|
|
@ -1914,6 +1967,89 @@ pub(super) fn push_node<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Shape-based sanitizer synthesis for JS/TS ORM-accessor chains.
|
||||
// The static label table marks `db.query` / `connection.query` /
|
||||
// `pool.query` / `client.query` / `db.execute` as `Sink(SQL_QUERY)`
|
||||
// because the bare `connection.query("SELECT ..." + name)` form is a
|
||||
// real SQLi sink. But the same `db.query` method on Strapi-style ORMs
|
||||
// takes a model UID literal and returns a chainable model accessor:
|
||||
// `strapi.db.query('admin::api-token').findOne({ where: whereParams })`.
|
||||
// The trailing `.findOne({...})` / `.findMany({...})` / `.create(...)`
|
||||
// calls are intrinsically parameterised, the actual SQL is generated
|
||||
// by the ORM, and the per-call values arrive through field-keyed object
|
||||
// literals that the ORM driver escapes.
|
||||
//
|
||||
// Recognition rule: when the CFG node's classified text reaches a sink
|
||||
// with `SQL_QUERY` cap, walk the receiver chain looking for an inner
|
||||
// `*.query(...)` / `*.execute(...)` whose arg 0 is a string literal
|
||||
// and whose result has at least one chained method call appended whose
|
||||
// name is in the ORM-accessor whitelist. If both hold, synthesise a
|
||||
// same-node `Sanitizer(SQL_QUERY)` mirroring the Java JPA fix. Bare
|
||||
// `connection.query("SELECT ...")` (no chained method) and
|
||||
// `db.query("UPDATE x SET y=" + name)` (non-literal arg 0) leave the
|
||||
// sink in place, both are genuine SQLi shapes.
|
||||
if (lang == "javascript"
|
||||
|| lang == "js"
|
||||
|| lang == "typescript"
|
||||
|| lang == "ts"
|
||||
|| lang == "tsx")
|
||||
&& labels
|
||||
.iter()
|
||||
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SQL_QUERY)))
|
||||
&& !labels
|
||||
.iter()
|
||||
.any(|l| matches!(l, DataLabel::Sanitizer(c) if c.contains(Cap::SQL_QUERY)))
|
||||
{
|
||||
const QUERY_TARGETS: &[&str] = &["query", "execute"];
|
||||
// ORM-accessor methods that take object-literal args and return
|
||||
// promises of rows / row counts. Promise methods (`then`, `catch`,
|
||||
// `finally`) deliberately excluded, they don't prove ORM shape.
|
||||
const ORM_CHAIN_METHODS: &[&str] = &[
|
||||
"findOne",
|
||||
"findMany",
|
||||
"findFirst",
|
||||
"findUnique",
|
||||
"findById",
|
||||
"find",
|
||||
"create",
|
||||
"createMany",
|
||||
"update",
|
||||
"updateMany",
|
||||
"upsert",
|
||||
"delete",
|
||||
"deleteMany",
|
||||
"count",
|
||||
"aggregate",
|
||||
"distinct",
|
||||
"save",
|
||||
];
|
||||
// Fall back to a deeper walk (up to 4 levels) for await/return-
|
||||
// wrapped calls (e.g. `const x = await db.query(...).findOne(...)`, where the
|
||||
// call sits at depth 3 inside lexical_declaration > variable_declarator
|
||||
// > await_expression > call_expression).
|
||||
let chain_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
|
||||
if let Some(call_node) = chain_call {
|
||||
// Outer method must be in the ORM whitelist *and* the chain must
|
||||
// have a deeper inner call to a `query`/`execute` whose arg 0 is
|
||||
// a string literal. Both checks gate the synthesis.
|
||||
let outer_method = js_chain_outer_method_for_inner(call_node, QUERY_TARGETS, code);
|
||||
let outer_is_orm = outer_method
|
||||
.as_deref()
|
||||
.is_some_and(|m| ORM_CHAIN_METHODS.contains(&m));
|
||||
if outer_is_orm
|
||||
&& let Some((arg0_kind, has_interp)) =
|
||||
js_chain_arg0_kind_for_method(call_node, QUERY_TARGETS, code)
|
||||
&& !has_interp
|
||||
&& matches!(
|
||||
arg0_kind.as_str(),
|
||||
"string" | "string_fragment" | "template_string"
|
||||
)
|
||||
{
|
||||
labels.push(DataLabel::Sanitizer(Cap::SQL_QUERY));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let span = (ast.start_byte(), ast.end_byte());
|
||||
|
||||
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
|
||||
|
|
@ -2036,7 +2172,7 @@ pub(super) fn push_node<'a>(
|
|||
// (SSA `SsaOp::Call.receiver`, summary `receiver_to_return`/`receiver_to_sink`).
|
||||
//
|
||||
// Two cases:
|
||||
// 1. Kind::CallMethod — native method call AST (Java method_invocation,
|
||||
// 1. Kind::CallMethod, native method call AST (Java method_invocation,
|
||||
// Rust method_call_expression, Ruby call, PHP member_call_expression).
|
||||
// Receiver is exposed via "object"/"receiver"/"scope" field on the call.
|
||||
// 2. Kind::CallFn whose function child is a member_expression (JS/TS) or
|
||||
|
|
@ -2065,7 +2201,7 @@ pub(super) fn push_node<'a>(
|
|||
// value, which is what type-qualified resolution
|
||||
// anchors on. Falls back to `root_receiver_text` (which
|
||||
// returns raw text like "conn.execute") only if drilling
|
||||
// fails — preserving prior behavior for types we can't
|
||||
// fails, preserving prior behavior for types we can't
|
||||
// structurally reduce.
|
||||
root_member_receiver(rn, code).or_else(|| root_receiver_text(cn, lang, code))
|
||||
} else {
|
||||
|
|
@ -2076,7 +2212,7 @@ pub(super) fn push_node<'a>(
|
|||
// JS/TS `obj.method(x)`: call_expression.function = member_expression.
|
||||
// Python `obj.method(x)`: call.function = attribute.
|
||||
// Rust `obj.method(x)`: call_expression.function = field_expression
|
||||
// (field on `value`, not `object` — value can be another call
|
||||
// (field on `value`, not `object`, value can be another call
|
||||
// for chained forms like `Connection::open(p).unwrap().execute(...)`).
|
||||
// Pull the receiver from the object/attribute-owner field.
|
||||
let func_child = cn.child_by_field_name("function");
|
||||
|
|
@ -2139,7 +2275,7 @@ pub(super) fn push_node<'a>(
|
|||
// Python `with` and Java try-with-resources.
|
||||
let is_raii_managed = is_raii_factory(lang, &text);
|
||||
|
||||
// Ruby block form auto-close: `File.open(path) { |f| f.read }` —
|
||||
// Ruby block form auto-close: `File.open(path) { |f| f.read }`;
|
||||
// the block parameter receives the resource and Ruby guarantees close
|
||||
// at block exit. If assigned (`f = File.open(p) { ... }`), the
|
||||
// variable holds the block's return value, not an open resource.
|
||||
|
|
@ -2156,7 +2292,7 @@ pub(super) fn push_node<'a>(
|
|||
// Prefer the span of the call found by `find_classifiable_inner_call`
|
||||
// (deeper, classification-driven) over the one from `first_call_ident`
|
||||
// (shallower, text-override-driven). Only record `callee_span` when it
|
||||
// actually narrows against `ast.span` — storing a redundant copy would
|
||||
// actually narrows against `ast.span`, storing a redundant copy would
|
||||
// just bloat every labeled Call node.
|
||||
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
|
||||
|
||||
|
|
@ -2174,6 +2310,7 @@ pub(super) fn push_node<'a>(
|
|||
kwargs,
|
||||
arg_string_literals,
|
||||
destination_uses,
|
||||
gate_filters,
|
||||
},
|
||||
taint: TaintMeta {
|
||||
labels,
|
||||
|
|
@ -2228,7 +2365,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
|
|||
/// Pre-emit dedicated Source CFG nodes for call arguments that contain source
|
||||
/// member expressions.
|
||||
///
|
||||
/// **Two-step API** — Source nodes must be created *before* the Call node so
|
||||
/// **Two-step API**, Source nodes must be created *before* the Call node so
|
||||
/// they receive lower graph indices. This is critical because the If handler
|
||||
/// uses `NodeIndex::new(g.node_count())` to capture the first node built in a
|
||||
/// branch and wires a True/False edge to it. If the Source node has a lower
|
||||
|
|
@ -2239,7 +2376,7 @@ pub(super) fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind:
|
|||
/// the branch body.
|
||||
///
|
||||
/// True when `ast` is an assignment / declaration whose RHS is a
|
||||
/// function or lambda literal — i.e. shapes like
|
||||
/// function or lambda literal, i.e. shapes like
|
||||
/// * Go `run := func() { ... }`
|
||||
/// * JS/TS `var run = function() { ... }` / `const run = () => ...`
|
||||
/// * Python `run = lambda x: ...`
|
||||
|
|
@ -2311,7 +2448,7 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
|
|||
false
|
||||
}
|
||||
|
||||
/// Pointer-Phase 6 / W5: when `ast` is (or wraps) an assignment whose
|
||||
/// When `ast` is (or wraps) an assignment whose
|
||||
/// LHS is a single subscript / index expression with a plain-identifier
|
||||
/// receiver, emit a synthetic `__index_set__` Call node and return its
|
||||
/// `NodeIndex`. Returns `None` for non-subscript LHSs, multi-target
|
||||
|
|
@ -2328,7 +2465,7 @@ fn try_lower_subscript_write(
|
|||
enclosing_func: Option<&str>,
|
||||
call_ordinal: &mut u32,
|
||||
) -> Option<NodeIndex> {
|
||||
// Locate the assignment node — `ast` may be the assignment itself
|
||||
// Locate the assignment node, `ast` may be the assignment itself
|
||||
// (Go `assignment_statement`) or a wrapper (`expression_statement`
|
||||
// containing JS `assignment_expression` / Python `assignment`).
|
||||
let assign_ast = if matches!(lookup(lang, ast.kind()), Kind::Assignment) {
|
||||
|
|
@ -2383,7 +2520,7 @@ fn try_lower_subscript_write(
|
|||
/// `synth_bindings` carry `(arg_pos, synth_name)` pairs that should be
|
||||
/// appended to both the call's `arg_uses[arg_pos]` and its `taint.uses`.
|
||||
/// `uses_only_synth_names` carry synth names that should *only* be
|
||||
/// appended to `taint.uses` — used for chain-inner-arg sources where the
|
||||
/// appended to `taint.uses`, used for chain-inner-arg sources where the
|
||||
/// synth value is not a positional argument of the OUTER call but still
|
||||
/// participates in the call's implicit dependency chain (e.g. `r.Body`
|
||||
/// inside `json.NewDecoder(r.Body).Decode(emoji)`'s receiver).
|
||||
|
|
@ -2446,7 +2583,7 @@ fn pre_emit_arg_source_nodes(
|
|||
for (pos, child) in children.iter().enumerate() {
|
||||
let src_label = first_member_label(*child, lang, code, extra);
|
||||
if let Some(DataLabel::Source(caps)) = src_label {
|
||||
// Use the *current* node count as a unique token — it equals the
|
||||
// Use the *current* node count as a unique token, it equals the
|
||||
// index the new Source node will receive.
|
||||
let synth_name = format!("__nyx_src_{}_{}", g.node_count(), pos);
|
||||
let member_text = first_member_text(*child, code);
|
||||
|
|
@ -2481,7 +2618,7 @@ fn pre_emit_arg_source_nodes(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: pre-emit `__index_get__` Call nodes for
|
||||
// Pre-emit `__index_get__` Call nodes for
|
||||
// subscript / index-expression args when pointer analysis is
|
||||
// enabled. This lets the W2/W4 container ELEM read hook fire
|
||||
// on the synth call, propagating must/may/caps from the cell
|
||||
|
|
@ -2489,7 +2626,7 @@ fn pre_emit_arg_source_nodes(
|
|||
//
|
||||
// Gated on `pointer::is_enabled()` so the env-var=0 path keeps
|
||||
// CFG shapes bit-identical to today's output. Only fires when
|
||||
// the array operand resolves to a plain identifier — see
|
||||
// the array operand resolves to a plain identifier, see
|
||||
// `subscript_components` for the bail conditions.
|
||||
if pointer_on
|
||||
&& is_subscript_kind(child.kind())
|
||||
|
|
@ -2539,7 +2676,7 @@ fn pre_emit_arg_source_nodes(
|
|||
// Gated to Go and to writeback-shaped outer callees (`Decode` /
|
||||
// `Unmarshal`) because the synth-source emission is only useful when
|
||||
// a downstream writeback consumer reads from the chain's tainted
|
||||
// receiver — broader gating risks emitting synth sources whose taint
|
||||
// receiver, broader gating risks emitting synth sources whose taint
|
||||
// never propagates and whose presence trips Layer B AST-pattern
|
||||
// suppression on unrelated sinks (see
|
||||
// `tests/fixtures/real_world/go/taint/func_literal_capture.go`).
|
||||
|
|
@ -2613,7 +2750,7 @@ fn pre_emit_arg_source_nodes(
|
|||
|
||||
/// Step 2: wire synthetic variable names from pre-emitted Source nodes into
|
||||
/// the Call node's `arg_uses` and `uses`. `uses_only` synth names are
|
||||
/// appended only to `taint.uses` — used for chain-inner-arg sources whose
|
||||
/// appended only to `taint.uses`, used for chain-inner-arg sources whose
|
||||
/// synth value is not a positional outer-call argument.
|
||||
fn apply_arg_source_bindings(
|
||||
g: &mut Cfg,
|
||||
|
|
@ -2724,7 +2861,7 @@ pub(super) fn build_sub<'a>(
|
|||
.unwrap_or(false);
|
||||
|
||||
// Check for negation wrapping the entire condition (e.g. `!(a && b)`)
|
||||
// — if present, skip short-circuit decomposition (De Morgan out of scope).
|
||||
// and, if present, skip short-circuit decomposition (De Morgan out of scope).
|
||||
let has_short_circuit = has_short_circuit
|
||||
&& cond_subtree.map_or(false, |c| {
|
||||
let unwrapped = unwrap_parens(c);
|
||||
|
|
@ -3424,7 +3561,7 @@ pub(super) fn build_sub<'a>(
|
|||
// When the grammar-level name is anonymous, try to derive a binding
|
||||
// name from the surrounding declaration or assignment. This lets
|
||||
// `var h = function(x){...}` / `this.run = () => {...}` participate
|
||||
// in callback resolution — callers referencing `h` or `run` can
|
||||
// in callback resolution, callers referencing `h` or `run` can
|
||||
// find the body via `resolve_local_func_key` and intra-file calls
|
||||
// like `h()` can resolve to the anonymous body's summary. Without
|
||||
// this, the body is keyed with the synthetic anon name and there
|
||||
|
|
@ -3731,7 +3868,7 @@ pub(super) fn build_sub<'a>(
|
|||
// would lower the return as a plain `StmtKind::Call`, losing
|
||||
// the return semantics and letting fall-through Seq edges
|
||||
// survive into the SSA terminator (the OR-chain rejection-arm
|
||||
// defect — see `or_chain_rejection_block_terminates_with_return`).
|
||||
// defect, see `or_chain_rejection_block_terminates_with_return`).
|
||||
if let Some(inner) = ast.children(&mut cursor).find(|c| {
|
||||
matches!(
|
||||
lookup(lang, c.kind()),
|
||||
|
|
@ -3788,7 +3925,7 @@ pub(super) fn build_sub<'a>(
|
|||
);
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: subscript-write lowering when the
|
||||
// Subscript-write lowering when the
|
||||
// CallWrapper's inner expression is `arr[i] = v` (JS/TS,
|
||||
// Python). See `try_lower_subscript_write` for shape +
|
||||
// bail matrix.
|
||||
|
|
@ -3824,7 +3961,7 @@ pub(super) fn build_sub<'a>(
|
|||
// Pre-emit Source nodes for call arguments containing source
|
||||
// member expressions (e.g. `req.body.returnTo` inside
|
||||
// `res.redirect(req.body.returnTo)`). Created BEFORE the Call
|
||||
// node so they get lower indices — see doc comment on
|
||||
// node so they get lower indices, see doc comment on
|
||||
// `pre_emit_arg_source_nodes` for why this ordering matters.
|
||||
let (effective_preds, src_bindings, src_uses_only) = if kind == StmtKind::Call {
|
||||
pre_emit_arg_source_nodes(g, ast, lang, code, enclosing_func, analysis_rules, preds)
|
||||
|
|
@ -3984,7 +4121,7 @@ pub(super) fn build_sub<'a>(
|
|||
|
||||
// Assignment that may contain a call (Python `x = os.getenv(...)`, Ruby `x = gets()`)
|
||||
Kind::Assignment => {
|
||||
// JS/TS ternary-RHS split — same rationale as the CallWrapper branch.
|
||||
// JS/TS ternary-RHS split, same rationale as the CallWrapper branch.
|
||||
if matches!(lang, "javascript" | "typescript" | "tsx")
|
||||
&& let (Some(left), Some(right)) = (
|
||||
ast.child_by_field_name("left"),
|
||||
|
|
@ -4011,7 +4148,7 @@ pub(super) fn build_sub<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
// Pointer-Phase 6 / W5: subscript-write lowering. See
|
||||
// Subscript-write lowering. See
|
||||
// `try_lower_subscript_write` for the per-language shape
|
||||
// matrix and bail conditions.
|
||||
if crate::pointer::is_enabled()
|
||||
|
|
@ -4099,12 +4236,19 @@ pub(crate) fn build_cfg<'a>(
|
|||
// function so thread-local state never leaks between files.
|
||||
populate_fn_dfs_indices(tree, lang);
|
||||
|
||||
// Phase 6: harvest DTO class definitions before any param classifier
|
||||
// runs. Empty for languages without a Phase 6 collector. Cleared
|
||||
// Harvest DTO class definitions before any param classifier
|
||||
// runs. Empty for languages without a collector. Cleared
|
||||
// alongside the DFS map at end-of-build_cfg.
|
||||
DTO_CLASSES.with(|cell| {
|
||||
*cell.borrow_mut() = dto::collect_dto_classes(tree.root_node(), lang, code);
|
||||
});
|
||||
// Harvest same-file `type X = Map<...>` / `Set<...>` / `T[]`
|
||||
// aliases so JS/TS param classifiers resolve `m: ElementsMap`
|
||||
// to `LocalCollection`. Empty for non-JS/TS languages.
|
||||
TYPE_ALIAS_LC.with(|cell| {
|
||||
*cell.borrow_mut() =
|
||||
dto::collect_type_alias_local_collections(tree.root_node(), lang, code);
|
||||
});
|
||||
|
||||
// Create the top-level body graph (BodyId(0)).
|
||||
let (mut g, entry, exit) = create_body_graph(0, code.len(), None);
|
||||
|
|
@ -4143,7 +4287,7 @@ pub(crate) fn build_cfg<'a>(
|
|||
connect_all(&mut g, &[e], exit, EdgeKind::Seq);
|
||||
}
|
||||
|
||||
debug!(target: "cfg", "CFG DONE — top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
|
||||
debug!(target: "cfg", "CFG DONE, top-level nodes: {}, bodies: {}", g.node_count(), bodies.len() + 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
for idx in g.node_indices() {
|
||||
|
|
@ -4231,10 +4375,11 @@ pub(crate) fn build_cfg<'a>(
|
|||
// Clear the per-file DFS-index map so it does not leak to the next
|
||||
// file built on this thread.
|
||||
clear_fn_dfs_indices();
|
||||
// Phase 6: same hygiene for the DTO map.
|
||||
// Same hygiene for the DTO map.
|
||||
DTO_CLASSES.with(|cell| cell.borrow_mut().clear());
|
||||
TYPE_ALIAS_LC.with(|cell| cell.borrow_mut().clear());
|
||||
|
||||
// Phase 6 (typed call-graph subtype awareness): collect every
|
||||
// Collect every
|
||||
// declared inheritance / impl / implements relationship in the
|
||||
// file. Per-language extractor in `cfg::hierarchy`; empty for
|
||||
// Go and C. Each `(sub, super)` pair gets duplicated onto every
|
||||
|
|
@ -4289,14 +4434,14 @@ fn apply_promisify_labels(
|
|||
/// Build a `CalleeSite` carrying the richer per-call-site metadata for a
|
||||
/// CFG node.
|
||||
///
|
||||
/// * `arity` — positional argument count. `None` when `extract_arg_uses`
|
||||
/// * `arity`, positional argument count. `None` when `extract_arg_uses`
|
||||
/// bailed out on splats/keyword-args (length 0 does not distinguish
|
||||
/// zero-arg calls from unknown; we treat 0 as a concrete zero). The
|
||||
/// receiver is a separate channel via `CallMeta.receiver` and is not
|
||||
/// represented in `arg_uses`, so `arity == arg_uses.len()` for calls.
|
||||
/// * `receiver` — forwarded verbatim from `CallMeta.receiver` (already
|
||||
/// * `receiver`, forwarded verbatim from `CallMeta.receiver` (already
|
||||
/// normalized to the root identifier).
|
||||
/// * `qualifier` — the segment(s) before the leaf identifier of the callee.
|
||||
/// * `qualifier`, the segment(s) before the leaf identifier of the callee.
|
||||
/// For **Rust** specifically, this is the *full* `::`-joined prefix (e.g.
|
||||
/// `"crate::auth::token"` for `crate::auth::token::validate`) so that
|
||||
/// cross-file `use`-map resolution in `callgraph.rs` has everything it
|
||||
|
|
@ -4380,7 +4525,7 @@ pub(crate) fn export_summaries(
|
|||
module_path: None,
|
||||
rust_use_map: None,
|
||||
rust_wildcards: None,
|
||||
// Phase 6 hierarchy edges live on `FileCfg`, not on the
|
||||
// Hierarchy edges live on `FileCfg`, not on the
|
||||
// graph-local `FuncSummaries`. `ParsedFile::export_summaries_with_root`
|
||||
// attaches them after this transform returns.
|
||||
hierarchy_edges: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use petgraph::graph::NodeIndex;
|
|||
use smallvec::smallvec;
|
||||
use tree_sitter::Node;
|
||||
|
||||
/// Phase 6.2 — resolve a syntactic class / struct / interface / model
|
||||
/// Resolve a syntactic class / struct / interface / model
|
||||
/// name against the per-file [`DTO_CLASSES`] map populated at the top
|
||||
/// of `build_cfg`. Returns the [`TypeKind::Dto`] carrying the
|
||||
/// per-field type map when the class is declared in the same file;
|
||||
|
|
@ -21,7 +21,7 @@ fn lookup_dto_class(class_name: &str) -> Option<TypeKind> {
|
|||
/// Extract parameter names + per-position [`TypeKind`] from a function
|
||||
/// AST node. Each entry's second slot is `Some(TypeKind)` when the
|
||||
/// parameter's decorator, attribute, or static type annotation maps to
|
||||
/// a known kind, and `None` otherwise. Strictly additive — when no
|
||||
/// a known kind, and `None` otherwise. Strictly additive, when no
|
||||
/// type info is recoverable, behaviour is identical to the names-only
|
||||
/// path.
|
||||
pub(super) fn extract_param_meta<'a>(
|
||||
|
|
@ -109,7 +109,7 @@ pub(super) fn extract_param_meta<'a>(
|
|||
// Python `typed_parameter`, `default_parameter`,
|
||||
// `typed_default_parameter`): the wrapper node has no `name`
|
||||
// field but contains the identifier as a child. Pick the
|
||||
// *first* identifier — that is the parameter name; subsequent
|
||||
// *first* identifier, that is the parameter name; subsequent
|
||||
// identifiers are part of the type annotation or default
|
||||
// expression.
|
||||
if !found {
|
||||
|
|
@ -123,7 +123,7 @@ pub(super) fn extract_param_meta<'a>(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Bare identifier children — e.g. Rust untyped closure params `|cmd|`
|
||||
// Bare identifier children, e.g. Rust untyped closure params `|cmd|`
|
||||
// where the child is an `identifier` node, not a `parameter` wrapper.
|
||||
if child.kind() == "identifier" {
|
||||
if let Some(txt) = text_of(child, code) {
|
||||
|
|
@ -137,8 +137,8 @@ pub(super) fn extract_param_meta<'a>(
|
|||
/// Walk up from a function definition node and build a container path.
|
||||
///
|
||||
/// Records the names of enclosing classes / impls / modules / namespaces /
|
||||
/// structs — and, for anonymous / nested functions, the name of an enclosing
|
||||
/// named function — joined with `::`. Also returns a `FuncKind` guess
|
||||
/// structs, and, for anonymous / nested functions, the name of an enclosing
|
||||
/// named function, joined with `::`. Also returns a `FuncKind` guess
|
||||
/// reflecting the structural role.
|
||||
///
|
||||
/// Returns `(container, kind)`.
|
||||
|
|
@ -185,7 +185,7 @@ pub(super) fn compute_container_and_kind(
|
|||
| "enum_item"
|
||||
| "struct_specifier"
|
||||
| "struct_item" => Some("name"),
|
||||
// Rust impl blocks — pick the type name, not the trait name.
|
||||
// Rust impl blocks, pick the type name, not the trait name.
|
||||
"impl_item" => Some("type"),
|
||||
// Go / C++ / PHP namespaces and modules.
|
||||
"namespace_definition" | "namespace_declaration" | "module_declaration" | "module" => {
|
||||
|
|
@ -223,7 +223,7 @@ pub(super) fn compute_container_and_kind(
|
|||
|| pk == "lambda_expression"
|
||||
|| pk == "function_expression"
|
||||
{
|
||||
// Nested definition — record the outer function's name and
|
||||
// Nested definition, record the outer function's name and
|
||||
// classify self as Closure even if we got a real name.
|
||||
if let Some(name_node) = parent.child_by_field_name("name") {
|
||||
if let Some(text) = text_of(name_node, code) {
|
||||
|
|
@ -428,15 +428,15 @@ pub(super) fn inject_framework_param_sources(
|
|||
/// no recognised pattern matches, returns `None` and the engine
|
||||
/// behaves exactly as before.
|
||||
///
|
||||
/// Recognised patterns (Phase 2):
|
||||
/// * Java (Spring) — `@PathVariable`/`@RequestParam Long X` →
|
||||
/// Recognised patterns:
|
||||
/// * Java (Spring), `@PathVariable`/`@RequestParam Long X` →
|
||||
/// [`TypeKind::Int`]; `@RequestBody T` → object (no kind today).
|
||||
/// * TypeScript (NestJS) — `@Param('id') id: number` →
|
||||
/// * TypeScript (NestJS), `@Param('id') id: number` →
|
||||
/// [`TypeKind::Int`]; `@Body() dto: T` / `@Query('q') q: string`.
|
||||
/// * Rust (Axum / Rocket / Actix) — `Path<i64>` / `Path<u32>` /
|
||||
/// * Rust (Axum / Rocket / Actix), `Path<i64>` / `Path<u32>` /
|
||||
/// `web::Path<i64>` → [`TypeKind::Int`]; `Path<String>` →
|
||||
/// [`TypeKind::String`].
|
||||
/// * Python (FastAPI) — `def h(x: int)` → [`TypeKind::Int`];
|
||||
/// * Python (FastAPI), `def h(x: int)` → [`TypeKind::Int`];
|
||||
/// `Annotated[int, Path()]` → [`TypeKind::Int`].
|
||||
pub(super) fn classify_param_type<'a>(
|
||||
param: Node<'a>,
|
||||
|
|
@ -453,9 +453,9 @@ pub(super) fn classify_param_type<'a>(
|
|||
}
|
||||
}
|
||||
|
||||
/// Java (Spring) — recognise typed-extractor parameters via the
|
||||
/// Java (Spring), recognise typed-extractor parameters via the
|
||||
/// surrounding annotation. Per Hard Rule 3, plain `Long X` without a
|
||||
/// known framework annotation is **not** treated as a typed extractor —
|
||||
/// known framework annotation is **not** treated as a typed extractor;
|
||||
/// the parameter could be a regular function argument that the
|
||||
/// framework never validates. Recognised annotations:
|
||||
/// `@PathVariable`, `@RequestParam`, `@RequestBody`, `@RequestHeader`,
|
||||
|
|
@ -473,7 +473,7 @@ fn classify_param_type_java<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
|
|||
if let Some(k) = java_type_to_kind(&type_text) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: when the static type is a class name we don't classify
|
||||
// When the static type is a class name we don't classify
|
||||
// as a primitive (e.g. `@RequestBody CreateUser dto`), look up the
|
||||
// class in the same-file DTO map. Strip any generics for the
|
||||
// leading type so `Foo<Bar>` still resolves on `Foo`.
|
||||
|
|
@ -527,7 +527,7 @@ fn has_java_framework_annotation(param: Node<'_>, code: &[u8]) -> bool {
|
|||
}
|
||||
|
||||
/// Map a Java type-text fragment to a [`TypeKind`]. Public to the
|
||||
/// `cfg` module so the Phase 6 DTO collector can reuse the same
|
||||
/// `cfg` module so the DTO collector can reuse the same
|
||||
/// classifier for class fields.
|
||||
pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let bare = t.trim().trim_start_matches('@').trim();
|
||||
|
|
@ -546,7 +546,7 @@ pub(super) fn java_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
|
||||
/// Map a TypeScript type-text fragment (already stripped of leading
|
||||
/// `:` / whitespace) to a primitive [`TypeKind`]. Used by both the
|
||||
/// per-parameter classifier and the Phase 6 DTO collector.
|
||||
/// per-parameter classifier and the DTO collector.
|
||||
pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let head = t.split('<').next().unwrap_or(t).trim();
|
||||
match head {
|
||||
|
|
@ -557,13 +557,35 @@ pub(super) fn ts_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
}
|
||||
|
||||
/// TypeScript (NestJS) — recognise typed-extractor parameters via a
|
||||
/// TypeScript (NestJS), recognise typed-extractor parameters via a
|
||||
/// known NestJS decorator (`@Param`, `@Body`, `@Query`, `@Headers`,
|
||||
/// `@Req`, `@Res`). Per Hard Rule 3, a bare `function h(id: number)`
|
||||
/// is not a framework extractor — without a NestJS decorator no
|
||||
/// is not a framework extractor, without a NestJS decorator no
|
||||
/// runtime gate is implied. Pipe coercions (`ParseIntPipe` /
|
||||
/// `ParseBoolPipe`) override the static type.
|
||||
///
|
||||
/// Exception: parameters annotated as a known JS built-in collection
|
||||
/// type (`Map<...>`, `Set<...>`, `WeakMap<...>`, `WeakSet<...>`,
|
||||
/// `Array<...>` / `T[]` / `ReadonlyArray<...>`) resolve to
|
||||
/// [`TypeKind::LocalCollection`] regardless of decorator presence.
|
||||
/// `LocalCollection` is a *receiver-shape* claim, not a
|
||||
/// framework-validated-input claim, it tells the auth analyser that
|
||||
/// `param.get(k)` / `param.set(k, v)` / `param.find(p)` is a
|
||||
/// container operation rather than a data-layer read/mutation. This
|
||||
/// closes the Excalidraw FP cluster (`elementsMap: ElementsMap`,
|
||||
/// `groupIdMapForOperation: Map<string, string>`) without affecting
|
||||
/// any input-validation reasoning.
|
||||
fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
let type_text = param
|
||||
.child_by_field_name("type")
|
||||
.and_then(|n| inner_ts_type_text(n, code));
|
||||
|
||||
if let Some(t) = type_text.as_deref()
|
||||
&& let Some(k) = ts_type_to_local_collection(t.trim().trim_start_matches(':').trim())
|
||||
{
|
||||
return Some(k);
|
||||
}
|
||||
|
||||
if !has_ts_decorator_argument(
|
||||
param,
|
||||
code,
|
||||
|
|
@ -586,14 +608,12 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
|
|||
if has_ts_decorator_argument(param, code, &["ParseBoolPipe"]) {
|
||||
return Some(TypeKind::Bool);
|
||||
}
|
||||
let t = param
|
||||
.child_by_field_name("type")
|
||||
.and_then(|n| inner_ts_type_text(n, code))?;
|
||||
let t = type_text?;
|
||||
let stripped = t.trim().trim_start_matches(':').trim();
|
||||
if let Some(k) = ts_type_to_kind(stripped) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: NestJS `@Body() dto: CreateUser` — when the static
|
||||
// NestJS `@Body() dto: CreateUser`, when the static
|
||||
// type is a class / interface name declared in the same file,
|
||||
// resolve via the DTO map. Generic args dropped for the leading
|
||||
// type so `Foo<Bar>` matches on `Foo`.
|
||||
|
|
@ -601,8 +621,41 @@ fn classify_param_type_ts<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKin
|
|||
lookup_dto_class(head)
|
||||
}
|
||||
|
||||
/// Map a TypeScript / JavaScript type-text fragment to
|
||||
/// [`TypeKind::LocalCollection`] when the head is a JS built-in
|
||||
/// container type. Recognises:
|
||||
///
|
||||
/// * `Map<K, V>`, `Set<T>`, `WeakMap<K, V>`, `WeakSet<T>`, the four
|
||||
/// built-in keyed/unkeyed collection types.
|
||||
/// * `Array<T>`, `ReadonlyArray<T>`, the named array generics.
|
||||
/// * `T[]`, `readonly T[]`, the array shorthand syntax.
|
||||
/// * Same-file `type X = Map<...>` aliases (resolved via the
|
||||
/// per-file `TYPE_ALIAS_LC` map populated at the top of
|
||||
/// [`build_cfg`]).
|
||||
///
|
||||
/// Same-file user types named `Map` / `Set` / etc. (which would
|
||||
/// shadow the built-ins) are vanishingly rare in TS codebases that
|
||||
/// also define the methods (`get`, `set`, `has`, `find`); the
|
||||
/// classifier accepts the head match.
|
||||
pub(super) fn ts_type_to_local_collection(t: &str) -> Option<TypeKind> {
|
||||
let head_text = t.trim().trim_start_matches("readonly ").trim();
|
||||
// Array shorthand: `T[]` or `readonly T[]`.
|
||||
if head_text.ends_with("[]") {
|
||||
return Some(TypeKind::LocalCollection);
|
||||
}
|
||||
let head = head_text.split('<').next().unwrap_or(head_text).trim();
|
||||
match head {
|
||||
"Map" | "Set" | "WeakMap" | "WeakSet" | "Array" | "ReadonlyArray" => {
|
||||
Some(TypeKind::LocalCollection)
|
||||
}
|
||||
_ => super::TYPE_ALIAS_LC
|
||||
.with(|cell| cell.borrow().contains(head))
|
||||
.then_some(TypeKind::LocalCollection),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner_ts_type_text<'a>(type_anno: Node<'a>, code: &'a [u8]) -> Option<String> {
|
||||
// type_annotation node text is `: T` — unwrap to T.
|
||||
// type_annotation node text is `: T`, unwrap to T.
|
||||
if let Some(child) = type_anno.named_child(0) {
|
||||
return text_of(child, code);
|
||||
}
|
||||
|
|
@ -643,10 +696,10 @@ fn has_ts_decorator_argument(param: Node<'_>, code: &[u8], wanted: &[&str]) -> b
|
|||
false
|
||||
}
|
||||
|
||||
/// Rust (Axum / Rocket / Actix) — read the parameter's type text and
|
||||
/// Rust (Axum / Rocket / Actix), read the parameter's type text and
|
||||
/// look for `Path<i64>` / `Json<T>` / `Form<T>` / `Query<T>` shapes.
|
||||
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)` without an
|
||||
/// extractor wrapper) are **not** treated as typed extractors — only
|
||||
/// extractor wrapper) are **not** treated as typed extractors, only
|
||||
/// framework-wrapped types qualify.
|
||||
fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
if param.kind() != "parameter" {
|
||||
|
|
@ -654,9 +707,121 @@ fn classify_param_type_rust<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeK
|
|||
}
|
||||
let type_node = param.child_by_field_name("type")?;
|
||||
let type_text = text_of(type_node, code)?;
|
||||
|
||||
// LocalCollection is a *receiver-shape* claim, not a
|
||||
// framework-validated-input claim, Hard Rule 3's "bare primitives
|
||||
// don't count" gate doesn't apply (mirrors `classify_param_type_ts`
|
||||
// for the same reason). Captures `unsharded: RoaringBitmap`,
|
||||
// `docids: &mut RoaringBitmap`, `params: HashMap<String, String>`,
|
||||
// `new_shard_docids: &'a mut hashbrown::HashMap<...>` shapes from
|
||||
// meilisearch/index-scheduler's bitmap bookkeeping where the
|
||||
// verb-name dispatch (`is_mutation: insert/remove`) would otherwise
|
||||
// classify these as DB writes.
|
||||
if let Some(k) = rust_type_to_local_collection(&type_text) {
|
||||
return Some(k);
|
||||
}
|
||||
|
||||
rust_type_to_kind(&type_text)
|
||||
}
|
||||
|
||||
/// Peel leading borrow syntax off a Rust type-text fragment so the
/// underlying type name is exposed for matching.
///
/// Repeatedly removes, from the front of the trimmed input:
///
/// * a `&`, followed by an optional lifetime label (`'a`, `'static`,
///   `'_`) and an optional `mut ` keyword;
/// * a bare leading `mut ` keyword.
///
/// This handles `&T`, `&mut T`, `&'a T`, `&'a mut T`, and stacked
/// prefixes such as `&&mut T`. The returned slice borrows from `t`.
fn strip_rust_ref_markers(t: &str) -> &str {
    let mut remaining = t.trim();
    loop {
        remaining = match remaining.strip_prefix('&') {
            Some(after_amp) => {
                let after_amp = after_amp.trim_start();
                // Optional lifetime label: skip the `'` and the
                // identifier characters that follow it.
                let after_lifetime = match after_amp.strip_prefix('\'') {
                    Some(label) => label
                        .trim_start_matches(|c: char| c.is_alphanumeric() || c == '_')
                        .trim_start(),
                    None => after_amp,
                };
                // Optional `mut` keyword after the reference marker.
                after_lifetime
                    .strip_prefix("mut ")
                    .unwrap_or(after_lifetime)
                    .trim_start()
            }
            None => match remaining.strip_prefix("mut ") {
                Some(after_mut) => after_mut.trim_start(),
                // Neither marker is present: nothing left to strip.
                None => return remaining,
            },
        };
    }
}
|
||||
|
||||
/// Map a Rust parameter / variable type-text to
|
||||
/// [`TypeKind::LocalCollection`] when the head names a known
|
||||
/// in-memory container. Strips reference / lifetime / `mut` markers,
|
||||
/// drops module-path prefixes (`std::collections::`, `hashbrown::`,
|
||||
/// `roaring::`), then matches the head against std and ecosystem
|
||||
/// collection types.
|
||||
///
|
||||
/// Recognises:
|
||||
/// * Std: `Vec`, `HashMap`, `HashSet`, `BTreeMap`, `BTreeSet`,
|
||||
/// `VecDeque`, `BinaryHeap`, `LinkedList`.
|
||||
/// * Ecosystem: `IndexMap`, `IndexSet` (indexmap), `SmallVec`
|
||||
/// (smallvec), `DashMap`, `DashSet` (dashmap), `FxHashMap`,
|
||||
/// `FxHashSet` (rustc-hash / fxhash), `RoaringBitmap`,
|
||||
/// `RoaringTreemap` (roaring).
|
||||
/// * Array / slice shorthand: `[T; N]`, `[T]` (covered by the
|
||||
/// leading-`[` check after ref-stripping).
|
||||
///
|
||||
/// Returns `None` for `Database<...>` (heed/sled, persistent KV
|
||||
/// store, NOT a local collection, keeping this `None` preserves
|
||||
/// real IDOR detection on persistent-store calls), `Mutex<...>` /
|
||||
/// `RwLock<...>` (synchronisation wrappers, not sink-shape claims),
|
||||
/// and bare primitives.
|
||||
pub(super) fn rust_type_to_local_collection(t: &str) -> Option<TypeKind> {
|
||||
let stripped = strip_rust_ref_markers(t);
|
||||
|
||||
// Array / slice shorthand: `[T; N]` or `[T]` (the `&` was
|
||||
// already stripped).
|
||||
if stripped.starts_with('[') {
|
||||
return Some(TypeKind::LocalCollection);
|
||||
}
|
||||
|
||||
// Drop module-path prefix: keep only the last segment before `<`
|
||||
// or end (`std::collections::HashMap<K, V>` → `HashMap`).
|
||||
let head_with_generics = stripped.rsplit("::").next().unwrap_or(stripped);
|
||||
let head = head_with_generics
|
||||
.split('<')
|
||||
.next()
|
||||
.unwrap_or(head_with_generics)
|
||||
.trim();
|
||||
|
||||
const TYPES: &[&str] = &[
|
||||
"Vec",
|
||||
"VecDeque",
|
||||
"BinaryHeap",
|
||||
"LinkedList",
|
||||
"HashMap",
|
||||
"HashSet",
|
||||
"BTreeMap",
|
||||
"BTreeSet",
|
||||
"IndexMap",
|
||||
"IndexSet",
|
||||
"SmallVec",
|
||||
"DashMap",
|
||||
"DashSet",
|
||||
"FxHashMap",
|
||||
"FxHashSet",
|
||||
"RoaringBitmap",
|
||||
"RoaringTreemap",
|
||||
];
|
||||
if TYPES.contains(&head) {
|
||||
Some(TypeKind::LocalCollection)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let stripped = t.trim();
|
||||
// Reject reference / mutability noise so `&Path<i64>` still matches
|
||||
|
|
@ -666,7 +831,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
.trim_start_matches('&')
|
||||
.trim_start_matches("mut ")
|
||||
.trim();
|
||||
// Only framework wrapper extractors qualify — bare primitives like
|
||||
// Only framework wrapper extractors qualify, bare primitives like
|
||||
// `i64` could be regular function parameters with no framework
|
||||
// validation gate.
|
||||
for wrap in [
|
||||
|
|
@ -684,7 +849,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(rest) = stripped.strip_prefix(&prefix) {
|
||||
if let Some(inner) = rest.strip_suffix('>') {
|
||||
let inner = inner.trim();
|
||||
// Tuple extractor `Path<(i64, String)>` — first element wins.
|
||||
// Tuple extractor `Path<(i64, String)>`, first element wins.
|
||||
if inner.starts_with('(') {
|
||||
let inside = inner.trim_start_matches('(').trim_end_matches(')');
|
||||
let first = inside.split(',').next().unwrap_or("").trim();
|
||||
|
|
@ -696,16 +861,16 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(k) = rust_primitive_to_kind(inner) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: `Json<T>` / `Form<T>` / `Query<T>` /
|
||||
// `Path<T>` with a same-file struct type — resolve via
|
||||
// `Json<T>` / `Form<T>` / `Query<T>` /
|
||||
// `Path<T>` with a same-file struct type, resolve via
|
||||
// the DTO map. Strip nested generics so `Json<Foo<i64>>`
|
||||
// matches on `Foo`.
|
||||
let head = inner.split('<').next().unwrap_or(inner).trim();
|
||||
if let Some(k) = lookup_dto_class(head) {
|
||||
return Some(k);
|
||||
}
|
||||
// Custom struct outside the same file — leave None
|
||||
// (cross-file resolution is Phase 6.4).
|
||||
// Custom struct outside the same file, leave None
|
||||
// (cross-file resolution is a follow-up).
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
|
@ -714,7 +879,7 @@ fn rust_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
|
||||
/// Map a Rust primitive / `String` / `&str` to a [`TypeKind`]. Public
|
||||
/// to the `cfg` module so the Phase 6 DTO collector can reuse it for
|
||||
/// to the `cfg` module so the DTO collector can reuse it for
|
||||
/// `struct` field types.
|
||||
pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let t = t.trim();
|
||||
|
|
@ -728,10 +893,10 @@ pub(super) fn rust_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Python (FastAPI) — recognise typed-extractor parameters via the
|
||||
/// Python (FastAPI), recognise typed-extractor parameters via the
|
||||
/// `Annotated[X, Path()/Query()/Body()/Header()/Cookie()]` shape. Per
|
||||
/// Hard Rule 3, a bare `def h(id: int)` is **not** a framework
|
||||
/// extractor — the function may be a plain Python function and the
|
||||
/// extractor, the function may be a plain Python function and the
|
||||
/// type annotation provides no runtime gate.
|
||||
fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<TypeKind> {
|
||||
let type_node = param.child_by_field_name("type")?;
|
||||
|
|
@ -741,7 +906,7 @@ fn classify_param_type_python<'a>(param: Node<'a>, code: &'a [u8]) -> Option<Typ
|
|||
|
||||
fn python_type_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let stripped = t.trim();
|
||||
// `Annotated[int, Path()]` — only matches when one of the generic
|
||||
// `Annotated[int, Path()]`, only matches when one of the generic
|
||||
// args names a recognised FastAPI binding marker. Otherwise no
|
||||
// framework gate is implied.
|
||||
if let Some(inner) = stripped
|
||||
|
|
@ -756,8 +921,8 @@ fn python_type_to_kind(t: &str) -> Option<TypeKind> {
|
|||
if let Some(k) = python_primitive_to_kind(first) {
|
||||
return Some(k);
|
||||
}
|
||||
// Phase 6.2: `Annotated[CreateUser, Body()]` with a same-file
|
||||
// Pydantic model — resolve via the DTO map. Generic args are
|
||||
// `Annotated[CreateUser, Body()]` with a same-file
|
||||
// Pydantic model, resolve via the DTO map. Generic args are
|
||||
// dropped via the same head-split as `python_primitive_to_kind`.
|
||||
let head = first.split('[').next().unwrap_or(first).trim();
|
||||
return lookup_dto_class(head);
|
||||
|
|
@ -773,7 +938,7 @@ fn contains_fastapi_marker(s: &str) -> bool {
|
|||
}
|
||||
|
||||
/// Map a Python type expression to a primitive [`TypeKind`]. Used by
|
||||
/// both the per-parameter classifier and the Phase 6 Pydantic-model
|
||||
/// both the per-parameter classifier and the DTO Pydantic-model
|
||||
/// field collector.
|
||||
pub(super) fn python_primitive_to_kind(t: &str) -> Option<TypeKind> {
|
||||
let head = t.trim().split('[').next().unwrap_or(t).trim();
|
||||
|
|
@ -806,10 +971,70 @@ pub(super) fn is_configured_terminator(
|
|||
mod typed_extractor_tests {
|
||||
use super::{
|
||||
contains_fastapi_marker, java_type_to_kind, python_primitive_to_kind, python_type_to_kind,
|
||||
rust_primitive_to_kind, rust_type_to_kind,
|
||||
rust_primitive_to_kind, rust_type_to_kind, rust_type_to_local_collection,
|
||||
ts_type_to_local_collection,
|
||||
};
|
||||
use crate::ssa::type_facts::TypeKind;
|
||||
|
||||
// ── TypeScript / JavaScript local-collection types ───────────────────
|
||||
|
||||
#[test]
fn ts_built_in_collections_map_to_local_collection() {
    // Every entry must classify as a local collection.
    const COLLECTION_TYPES: &[&str] = &[
        // The four keyed/unkeyed built-in container generics.
        "Map<string, number>",
        "Set<string>",
        "WeakMap<object, string>",
        "WeakSet<object>",
        // Array forms.
        "Array<string>",
        "ReadonlyArray<string>",
        "string[]",
        "readonly string[]",
        // Excalidraw-style keyed map with index-type generic args.
        "Map<ExcalidrawElement[\"id\"], ExcalidrawElement>",
    ];
    for &ty in COLLECTION_TYPES {
        assert_eq!(
            ts_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn ts_non_collection_types_return_none() {
    // None of these may be classified as a local collection.
    const NON_COLLECTION_TYPES: &[&str] = &[
        // Plain primitives.
        "string",
        "number",
        "boolean",
        // Promise / Iterator / etc. are not LocalCollections.
        "Promise<string>",
        "Iterator<number>",
        // User types.
        "CreateUserDto",
        "ElementsMap",
    ];
    for &ty in NON_COLLECTION_TYPES {
        assert_eq!(
            ts_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
|
||||
|
||||
// ── Java (Spring) ────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
|
@ -841,7 +1066,7 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn java_request_body_dto_returns_none_until_phase_six() {
|
||||
// @RequestBody CreateUserDto dto — no kind today; Phase 6 will
|
||||
// @RequestBody CreateUserDto dto, no kind today; future passes will
|
||||
// return DtoObject(name) once cross-file class resolution lands.
|
||||
assert_eq!(java_type_to_kind("CreateUserDto"), None);
|
||||
assert_eq!(java_type_to_kind("List<String>"), None);
|
||||
|
|
@ -860,7 +1085,7 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn rust_path_tuple_first_element_wins() {
|
||||
// Path<(i64, String)> — first slot is the int extractor that
|
||||
// Path<(i64, String)>, first slot is the int extractor that
|
||||
// matters for sink suppression.
|
||||
assert_eq!(
|
||||
rust_type_to_kind("Path<(i64, String)>"),
|
||||
|
|
@ -876,15 +1101,15 @@ mod typed_extractor_tests {
|
|||
|
||||
#[test]
|
||||
fn rust_json_dto_returns_none_until_phase_six() {
|
||||
// Json<T> / Form<T> / Query<T> with a custom struct type — no
|
||||
// primitive resolution available; Phase 6 lifts to DTO.
|
||||
// Json<T> / Form<T> / Query<T> with a custom struct type, no
|
||||
// primitive resolution available; future passes will lift to DTO.
|
||||
assert_eq!(rust_type_to_kind("Json<CreateUserDto>"), None);
|
||||
assert_eq!(rust_type_to_kind("Form<CreateUserDto>"), None);
|
||||
assert_eq!(rust_type_to_kind("Query<Filters>"), None);
|
||||
}
|
||||
|
||||
/// Per Hard Rule 3, bare primitives (`fn h(id: i64)`) are NOT
|
||||
/// framework extractors — only wrapper types (`Path<i64>` etc.)
|
||||
/// framework extractors, only wrapper types (`Path<i64>` etc.)
|
||||
/// imply a framework runtime gate. Bare i64 must return None.
|
||||
#[test]
|
||||
fn rust_bare_primitives_are_not_framework_extractors() {
|
||||
|
|
@ -903,7 +1128,7 @@ mod typed_extractor_tests {
|
|||
#[test]
|
||||
fn python_bare_primitives_are_not_framework_extractors() {
|
||||
// Per Hard Rule 3: bare `def h(id: int)` is NOT a typed
|
||||
// extractor — without an `Annotated[..., Path()/Query()/Body()]`
|
||||
// extractor, without an `Annotated[..., Path()/Query()/Body()]`
|
||||
// wrapper, no FastAPI gate is implied.
|
||||
assert_eq!(python_type_to_kind("int"), None);
|
||||
assert_eq!(python_type_to_kind("float"), None);
|
||||
|
|
@ -936,7 +1161,7 @@ mod typed_extractor_tests {
|
|||
#[test]
|
||||
fn python_annotated_without_marker_returns_none() {
|
||||
// Annotated without a FastAPI binding marker is a generic
|
||||
// type-system tag — not a framework extractor.
|
||||
// type-system tag, not a framework extractor.
|
||||
assert_eq!(python_type_to_kind("Annotated[int, str]"), None);
|
||||
assert_eq!(python_type_to_kind("Annotated[int, MyMeta]"), None);
|
||||
}
|
||||
|
|
@ -954,4 +1179,128 @@ mod typed_extractor_tests {
|
|||
assert!(contains_fastapi_marker("bytes, File()"));
|
||||
assert!(!contains_fastapi_marker("int, str"));
|
||||
}
|
||||
|
||||
// ── Rust local-collection types ──────────────────────────────────────
|
||||
|
||||
#[test]
fn rust_std_collections_map_to_local_collection() {
    // One representative instantiation per std collection type.
    const STD_COLLECTIONS: &[&str] = &[
        "Vec<u32>",
        "HashMap<String, u32>",
        "HashSet<u64>",
        "BTreeMap<u32, String>",
        "BTreeSet<u32>",
        "VecDeque<u8>",
        "BinaryHeap<u32>",
        "LinkedList<i32>",
    ];
    for &ty in STD_COLLECTIONS {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_ecosystem_collections_map_to_local_collection() {
    // Third-party container types recognised by the classifier.
    const ECOSYSTEM_COLLECTIONS: &[&str] = &[
        "IndexMap<String, u32>",
        "IndexSet<u64>",
        "SmallVec<[u32; 4]>",
        "DashMap<String, u32>",
        "DashSet<u64>",
        "FxHashMap<String, u32>",
        "FxHashSet<u64>",
        "RoaringBitmap",
        "RoaringTreemap",
    ];
    for &ty in ECOSYSTEM_COLLECTIONS {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_module_qualified_collections_map_to_local_collection() {
    // Module-path prefixes: keep only the last segment for matching.
    const QUALIFIED_TYPES: &[&str] = &[
        "std::collections::HashMap<K, V>",
        "hashbrown::HashMap<String, RoaringBitmap>",
        "roaring::RoaringBitmap",
    ];
    for &ty in QUALIFIED_TYPES {
        assert_eq!(
            rust_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_reference_and_lifetime_markers_stripped() {
    // `&T`, `&mut T`, `&'a T`, `&'a mut T`, `&'static T`, and
    // repeated `&` prefixes must all reach the underlying type head.
    const BORROWED_TYPES: &[&str] = &[
        "&RoaringBitmap",
        "&mut RoaringBitmap",
        "&'a RoaringBitmap",
        "&'a mut RoaringBitmap",
        "&'static RoaringBitmap",
        "&&mut RoaringBitmap",
        "&'a mut hashbrown::HashMap<String, RoaringBitmap>",
    ];
    for &ty in BORROWED_TYPES {
        let kind = rust_type_to_local_collection(ty);
        assert_eq!(
            kind,
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection after ref stripping"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_array_and_slice_shorthand_map_to_local_collection() {
    // `[T; N]` arrays and `[T]` slices are local containers, with or
    // without a leading borrow.
    const ARRAY_AND_SLICE_TYPES: &[&str] = &["[u32; 4]", "[u8]", "&[u32]", "&mut [u32]"];
    for &ty in ARRAY_AND_SLICE_TYPES {
        assert_eq!(
            rust_type_to_local_collection(ty),
            Some(TypeKind::LocalCollection),
            "{ty} should map to LocalCollection"
        );
    }
}
|
||||
|
||||
#[test]
fn rust_persistent_db_and_sync_wrappers_return_none() {
    const NON_LOCAL_TYPES: &[&str] = &[
        // heed / sled / rocksdb persistent-store handles are NOT local
        // collections; this preserves IDOR detection on real DB calls.
        "Database<BEU32, SerdeJson<Task>>",
        "heed::Database<K, V>",
        "sled::Db",
        // Sync wrappers don't claim a sink shape.
        "Mutex<HashMap<K, V>>",
        "RwLock<Vec<u32>>",
        // Bare primitives.
        "u32",
        "&str",
        "String",
        // Unrelated user types.
        "MyDao<User>",
        "Connection",
    ];
    for &ty in NON_LOCAL_TYPES {
        assert_eq!(
            rust_type_to_local_collection(ty),
            None,
            "{ty} should not map to LocalCollection"
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -107,11 +107,11 @@ fn has_web_handler_params(ctx: &AnalysisContext, func_name: &str) -> bool {
|
|||
/// Determine if a function qualifies as a web entrypoint (not just any entrypoint).
|
||||
///
|
||||
/// A web entrypoint must:
|
||||
/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.) — but NOT bare `main`
|
||||
/// 1. Match entrypoint naming rules (handle_*, route_*, api_*, etc.), but NOT bare `main`
|
||||
/// unless it has web-like parameters
|
||||
/// 2. Have parameters resembling HTTP handler signatures
|
||||
fn is_web_entrypoint(ctx: &AnalysisContext, func_name: &str) -> bool {
|
||||
// "main" without web params is a CLI entrypoint — skip
|
||||
// "main" without web params is a CLI entrypoint, skip
|
||||
if func_name == "main" {
|
||||
return has_web_handler_params(ctx, func_name);
|
||||
}
|
||||
|
|
@ -163,7 +163,7 @@ impl CfgAnalysis for AuthGap {
|
|||
|
||||
fn run(&self, ctx: &AnalysisContext) -> Vec<CfgFinding> {
|
||||
// Decorator/annotation/attribute auth on the body declaration
|
||||
// already gates every sink in the body — skip the
|
||||
// already gates every sink in the body, skip the
|
||||
// structural-call dominance check entirely when the framework
|
||||
// enforces auth at the declaration level. Mirrors the
|
||||
// `classify_auth_decorators` lookup the state engine uses to
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ use petgraph::visit::EdgeRef;
|
|||
/// Returns true if the identifier is exactly `err` / `error` or a
|
||||
/// snake-case error name (`err_x`, `error_x`, `x_err`, `x_error`).
|
||||
/// CamelCase names (`isErrorEnabled`, `getError`, `errorMsg`) are
|
||||
/// rejected — the cost is occasional FNs on Java-style error fields,
|
||||
/// rejected, the cost is occasional FNs on Java-style error fields,
|
||||
/// which is acceptable for a precision fix.
|
||||
fn is_error_var_ident(name: &str) -> bool {
|
||||
let lower = name.to_ascii_lowercase();
|
||||
|
|
@ -36,7 +36,7 @@ fn is_error_var_ident(name: &str) -> bool {
|
|||
/// Used by the error-fallthrough rule to skip happy-path checks
|
||||
/// like `if (!data.error && Array.isArray(results))` whose TRUE branch
|
||||
/// is the success path and is not expected to return. The original
|
||||
/// rule fires on `if (err) { warn(); } sink_after()` — a positive
|
||||
/// rule fires on `if (err) { warn(); } sink_after()`, a positive
|
||||
/// error check whose body forgets to early-return.
|
||||
fn contains_negated_err_identifier(text: &str) -> bool {
|
||||
let bytes = text.as_bytes();
|
||||
|
|
@ -46,7 +46,7 @@ fn contains_negated_err_identifier(text: &str) -> bool {
|
|||
i += 1;
|
||||
continue;
|
||||
}
|
||||
// Skip the `!=` / `!==` operators — those are comparisons, not
|
||||
// Skip the `!=` / `!==` operators, those are comparisons, not
|
||||
// logical-not. Only treat a `!` followed by whitespace or an
|
||||
// identifier-leading char as logical negation.
|
||||
if i + 1 < bytes.len() && bytes[i + 1] == b'=' {
|
||||
|
|
@ -57,7 +57,7 @@ fn contains_negated_err_identifier(text: &str) -> bool {
|
|||
while j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
|
||||
j += 1;
|
||||
}
|
||||
// Allow a leading `(` for `!(expr)` shapes — peek past one open
|
||||
// Allow a leading `(` for `!(expr)` shapes, peek past one open
|
||||
// paren and continue capturing the identifier chain.
|
||||
if j < bytes.len() && bytes[j] == b'(' {
|
||||
j += 1;
|
||||
|
|
@ -118,7 +118,95 @@ fn branch_terminates(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> bool {
|
|||
false
|
||||
}
|
||||
|
||||
/// Check if all paths from `node` reach a Return/Break/Continue before exiting scope.
|
||||
/// Recognise calls that never return on the success path.
|
||||
///
|
||||
/// `cfg-error-fallthrough` looks for `if err != nil { … }` whose body
|
||||
/// fails to terminate. A `return`/`break`/`continue`/`throw` is the
|
||||
/// canonical terminator and already produces a `StmtKind::Return` /
|
||||
/// `Throw` / `Break` / `Continue` node. But a large class of real
|
||||
/// terminators arrives as a *call* whose callee is documented to abort
|
||||
/// the goroutine, process, or test:
|
||||
///
|
||||
/// * Go testing, `t.Fatal`, `t.Fatalf`, `t.Fatalln`, `b.Fatal*`,
|
||||
/// `*Helper()` chains ending in `Fatal*`, also third-party
|
||||
/// `require.NoError(t, …)` (asserts and aborts on err) which the
|
||||
/// common `c.Fatalf("...")` pattern in minio's table tests reduces
|
||||
/// to. All `Fatal*` methods on a `testing.T`/`B`/`F` call
|
||||
/// `runtime.Goexit()` which is documented as never returning to the
|
||||
/// caller.
|
||||
/// * Go std-library, `os.Exit`, `syscall.Exit`, `runtime.Goexit`,
|
||||
/// `log.Fatal`, `log.Fatalf`, `log.Fatalln`, `log.Panic*`.
|
||||
/// * Go builtin, bare `panic(…)`.
|
||||
/// * Rust, `panic!`, `unreachable!`, `unimplemented!`, `todo!`,
|
||||
/// `process::exit`, `std::process::exit`, `process::abort`,
|
||||
/// `std::process::abort` (the macros currently lower to
|
||||
/// `StmtKind::Throw` via tree-sitter's macro arm; the function
|
||||
/// forms need explicit recognition).
|
||||
/// * Python, `sys.exit`, `os._exit`, `os.abort`.
|
||||
///
|
||||
/// The recogniser looks at the bare method name (last segment after
|
||||
/// `.` or `::`) and, where the receiver is a closed token, the
|
||||
/// receiver's first segment. Bare `panic` / `exit` callees are
|
||||
/// recognised only when the namespace context matches (callee equals
|
||||
/// the literal string, no other receiver). This keeps the recogniser
|
||||
/// from claiming arbitrary user-defined `Exit(...)` / `Panic(...)` as
|
||||
/// terminators.
|
||||
///
|
||||
/// Closes the minio test-file cluster (49 in `xl-storage_test.go`
|
||||
/// alone, 176 across the repo) where every `if err != nil { c.Fatalf(...) }`
|
||||
/// fired `cfg-error-fallthrough`: the `Fatalf` aborts the goroutine
|
||||
/// and the post-if code never executes, but the rule classified it as
|
||||
/// fall-through. Conservative: only adds new terminators; never
|
||||
/// removes the existing `Return`/`Throw`/`Break`/`Continue` recognition.
|
||||
fn call_never_returns(info: &crate::cfg::NodeInfo) -> bool {
|
||||
if info.kind != StmtKind::Call {
|
||||
return false;
|
||||
}
|
||||
let Some(callee) = info.call.callee.as_deref() else {
|
||||
return false;
|
||||
};
|
||||
let last = callee.rsplit(['.', ':']).next().unwrap_or(callee);
|
||||
|
||||
// Method names that always terminate when called on any receiver
|
||||
// that's a testing handle (`*testing.T`, `*testing.B`, `*testing.F`)
|
||||
// or a logger. Receiver type is unknown to this rule; the names
|
||||
// are sufficiently distinctive that arbitrary user-defined methods
|
||||
// sharing the name are vanishingly rare.
|
||||
if matches!(
|
||||
last,
|
||||
// Go testing
|
||||
"Fatal" | "Fatalf" | "Fatalln" | "FailNow" |
|
||||
// Go log/slog terminating handlers
|
||||
"Panic" | "Panicf" | "Panicln" |
|
||||
// Rust process / never-return std fns
|
||||
"abort" | "unreachable_unchecked"
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Bare callees (no receiver) that are language builtins or
|
||||
// unambiguous std-library terminators.
|
||||
match callee {
|
||||
// Go builtin
|
||||
"panic" => return true,
|
||||
// Go std
|
||||
"os.Exit" | "syscall.Exit" | "runtime.Goexit" | "log.Fatal" | "log.Fatalf"
|
||||
| "log.Fatalln" | "log.Panic" | "log.Panicf" | "log.Panicln" | "slog.Fatal"
|
||||
| "klog.Fatal" | "klog.Fatalf" | "klog.Exit" | "klog.Exitf" => return true,
|
||||
// Rust std
|
||||
"process::exit" | "process::abort" | "std::process::exit" | "std::process::abort" => {
|
||||
return true;
|
||||
}
|
||||
// Python std
|
||||
"sys.exit" | "os._exit" | "os.abort" => return true,
|
||||
_ => {}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if all paths from `node` reach a Return/Break/Continue (or a
|
||||
/// known never-returning call) before exiting scope.
|
||||
fn terminates_on_all_paths(
|
||||
cfg: &crate::cfg::Cfg,
|
||||
node: NodeIndex,
|
||||
|
|
@ -142,10 +230,15 @@ fn terminates_on_all_paths(
|
|||
}
|
||||
_ => {}
|
||||
}
|
||||
if call_never_returns(info) {
|
||||
// Documented never-returning call (`t.Fatalf`, `os.Exit`,
|
||||
// `panic`, `runtime.Goexit`, …), this path terminates.
|
||||
continue;
|
||||
}
|
||||
|
||||
let successors: Vec<_> = cfg.neighbors(current).collect();
|
||||
if successors.is_empty() {
|
||||
// Reached a dead end without terminating — path does not terminate
|
||||
// Reached a dead end without terminating, path does not terminate
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -181,7 +274,7 @@ fn find_post_if_sinks(cfg: &crate::cfg::Cfg, if_node: NodeIndex) -> Vec<NodeInde
|
|||
|
||||
// Seed from the False edge only. If the if has no explicit False
|
||||
// edge (some CFG shapes omit it for one-branch ifs), fall back to
|
||||
// Seq edges from the if node — but never follow True edges, which
|
||||
// Seq edges from the if node, but never follow True edges, which
|
||||
// lead into the body.
|
||||
let mut stack: Vec<NodeIndex> = cfg
|
||||
.edges(if_node)
|
||||
|
|
@ -225,9 +318,9 @@ impl CfgAnalysis for IncompleteErrorHandling {
|
|||
|
||||
// Look for If nodes whose CONDITION involves "err" or "error".
|
||||
// `info.taint.uses` for an If node contains identifiers from the
|
||||
// whole if statement (condition + body) — see
|
||||
// whole if statement (condition + body), see
|
||||
// `cfg::literals::extract_defs_uses_extra_defs` Kind::If branch
|
||||
// — so checking it would misfire on `if (!res.ok) { ... const
|
||||
// for details. Checking it would therefore misfire on `if (!res.ok) { ... const
|
||||
// err = await … ; return … }` shapes whose body happens to
|
||||
// mention `err` even though the condition doesn't. Use
|
||||
// `info.condition_vars`, which is populated strictly from the
|
||||
|
|
@ -244,7 +337,7 @@ impl CfgAnalysis for IncompleteErrorHandling {
|
|||
|
||||
// Polarity gate: only fire when the condition POSITIVELY
|
||||
// checks for an error. `if (!data.error && other)` is a
|
||||
// happy-path check — the TRUE branch is the success branch
|
||||
// happy-path check, the TRUE branch is the success branch
|
||||
// and is not expected to terminate. Detect by scanning the
|
||||
// condition text for any `!` (logical-not, distinct from
|
||||
// `!=`) preceding an identifier whose name contains "err".
|
||||
|
|
@ -354,7 +447,7 @@ mod err_ident_tests {
|
|||
fn rejects_camelcase_method_names() {
|
||||
// Spring `logger.isErrorEnabled()` lifts `isErrorEnabled` into
|
||||
// `condition_vars`; under the old `lower.contains("err")` check
|
||||
// this fired the rule. The new strict check rejects it — the
|
||||
// this fired the rule. The new strict check rejects it, the
|
||||
// condition is asking "is logging enabled", not "is there an
|
||||
// error".
|
||||
assert!(!is_error_var_ident("isErrorEnabled"));
|
||||
|
|
@ -371,3 +464,103 @@ mod err_ident_tests {
|
|||
assert!(!is_error_var_ident("perform"));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod terminator_call_tests {
    use super::call_never_returns;
    use crate::cfg::{CallMeta, NodeInfo, StmtKind};

    /// Build a `StmtKind::Call` node whose callee text is `callee`; every
    /// other field keeps its default value.
    fn call_node(callee: &str) -> NodeInfo {
        let call = CallMeta {
            callee: Some(callee.to_string()),
            ..Default::default()
        };
        NodeInfo {
            kind: StmtKind::Call,
            call,
            ..Default::default()
        }
    }

    /// Assert that every callee in `callees` is recognised as never returning.
    fn assert_terminates(callees: &[&str]) {
        for &callee in callees {
            assert!(
                call_never_returns(&call_node(callee)),
                "`{callee}` should be recognised as a never-returning call"
            );
        }
    }

    /// Assert that no callee in `callees` is recognised as never returning.
    fn assert_returns(callees: &[&str]) {
        for &callee in callees {
            assert!(
                !call_never_returns(&call_node(callee)),
                "`{callee}` must not be treated as a never-returning call"
            );
        }
    }

    #[test]
    fn recognises_go_testing_fatal_methods() {
        assert_terminates(&[
            // Method name on an arbitrary receiver: the canonical minio
            // test shape `c.Fatalf("bucket creat error: %v", err)`.
            "c.Fatalf",
            "t.Fatal",
            "t.Fatalf",
            "t.Fatalln",
            "b.Fatal",
            "t.FailNow",
            // Logger panics (handler-style fatal).
            "logger.Panic",
            "logger.Panicf",
        ]);
    }

    #[test]
    fn recognises_go_std_terminators() {
        assert_terminates(&[
            "os.Exit",
            "syscall.Exit",
            "runtime.Goexit",
            "log.Fatal",
            "log.Fatalf",
            "log.Fatalln",
            "log.Panic",
            "klog.Exit",
            // Bare builtin
            "panic",
        ]);
    }

    #[test]
    fn recognises_rust_and_python_std_terminators() {
        assert_terminates(&[
            "std::process::exit",
            "std::process::abort",
            "process::exit",
            "sys.exit",
            "os._exit",
        ]);
    }

    #[test]
    fn does_not_claim_user_defined_lookalikes() {
        assert_returns(&[
            // `Exit` / `exit` on a custom receiver (or bare `Exit`) is an
            // ordinary method, not the process-level terminator: only the
            // exact std-library callee forms are matched.
            "server.Exit",
            "Exit",
            "session.exit",
            // Bare `panic` is the Go builtin; a *method* named `panic` is
            // not. Matching is on the full callee text, so `widget.panic`
            // stays out.
            "widget.panic",
            // Common helpers that *don't* terminate.
            "log.Print",
            "log.Println",
            "t.Errorf",
            "t.Logf",
            "c.Skip",
        ]);
    }

    #[test]
    fn requires_call_kind() {
        // Only `StmtKind::Call` nodes are inspected. A Seq or If node
        // carrying the same callee text must be rejected by the kind gate.
        for kind in [StmtKind::Seq, StmtKind::If] {
            let node = NodeInfo {
                kind,
                ..call_node("t.Fatal")
            };
            assert!(!call_never_returns(&node));
        }
    }

    #[test]
    fn missing_callee_does_not_panic() {
        // A call node with no recorded callee is simply "not a terminator".
        let mut node = call_node("placeholder");
        node.call.callee = None;
        assert!(!call_never_returns(&node));
    }
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ pub struct UnguardedSink;
|
|||
/// receiver recorded as a compound identifier rather than a named binding).
|
||||
fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
||||
// Fast path: syntactic literal detection from CFG construction.
|
||||
// Strictly weaker than the one-hop trace below — serves as an
|
||||
// Strictly weaker than the one-hop trace below, serves as an
|
||||
// optimization for the common case of inline literal arguments.
|
||||
if ctx.cfg[sink].all_args_literal {
|
||||
return true;
|
||||
|
|
@ -127,17 +127,17 @@ fn ssa_all_sink_operands_constant(
|
|||
/// SSA-backed reassign-aware safety probe: every operand of the sink
|
||||
/// resolves to a constant, callee fragment, OR a function parameter that
|
||||
/// is not itself a Source. Used at the cfg-unguarded-sink site under
|
||||
/// `!has_taint` — the taint engine has already proved no source-tainted
|
||||
/// `!has_taint`, the taint engine has already proved no source-tainted
|
||||
/// data reaches the sink, so a non-source Param at operand position is
|
||||
/// inert payload-wise (e.g. HTTP writer in `Fprintf(w, "<h1>", "Guest")`).
|
||||
///
|
||||
/// Gated on the function body actually exhibiting the reassign-to-constant
|
||||
/// signature — at least one named SSA def whose RHS is a literal Const
|
||||
/// signature, at least one named SSA def whose RHS is a literal Const
|
||||
/// (`name = "Guest"`). In a thin wrapper without a same-block named
|
||||
/// const assignment (`fn wrap(p) { sink(p) }`, or C `popen(buf, "r")` where
|
||||
/// `buf` is filled in-place by `sprintf` with no Const Assign on `buf`),
|
||||
/// the bare Param at operand position IS the payload and the suppression's
|
||||
/// rationale does not apply — `cfg-unguarded-sink` must still fire.
|
||||
/// rationale does not apply, `cfg-unguarded-sink` must still fire.
|
||||
fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
||||
let Some(facts) = ctx.body_const_facts else {
|
||||
return false;
|
||||
|
|
@ -165,13 +165,13 @@ fn ssa_all_sink_operands_const_or_param(ctx: &AnalysisContext, sink: NodeIndex)
|
|||
}
|
||||
|
||||
/// Return true if the SSA body contains a *named* variable whose definition
|
||||
/// is a constant — the SSA signature of an explicit `name = "literal"`
|
||||
/// is a constant, the SSA signature of an explicit `name = "literal"`
|
||||
/// reassignment. Used as the gate for the broader operand-Param suppression:
|
||||
/// the suppression's purpose is the reassign-to-constant idiom, which by
|
||||
/// definition has at least one named const assignment. In a thin wrapper
|
||||
/// (`fn wrap(p) { sink(p) }` or `popen(buf, "r")` where `buf` is filled by
|
||||
/// `sprintf`), no such named const assignment exists and the suppression's
|
||||
/// rationale doesn't apply — so the bare-Param structural finding fires.
|
||||
/// rationale doesn't apply, so the bare-Param structural finding fires.
|
||||
fn func_body_has_named_const_assign(facts: &BodyConstFacts) -> bool {
|
||||
for block in &facts.ssa.blocks {
|
||||
for inst in &block.body {
|
||||
|
|
@ -228,7 +228,7 @@ fn ssa_operand_const_or_param(
|
|||
// CFG-node-level Source label: when an SSA `Call` corresponds to a
|
||||
// Source-labeled CFG node (e.g. `env::var(...)` whose callee
|
||||
// matches a `LabelRule` Source matcher), the call's result is
|
||||
// tainted user input — refuse, regardless of how the SSA
|
||||
// tainted user input, refuse, regardless of how the SSA
|
||||
// happened to lower. Catches the `SsaOp::Call` lowering of
|
||||
// labeled Source functions, which the `SsaOp::Source` arm only
|
||||
// sees for callee-less pure sources like PHP `$_GET`.
|
||||
|
|
@ -266,7 +266,7 @@ fn ssa_operand_const_or_param(
|
|||
}
|
||||
SsaOp::Source => return false,
|
||||
SsaOp::Nop | SsaOp::Undef => {}
|
||||
// FieldProj: walk the receiver — `obj.f` is constant iff `obj`
|
||||
// FieldProj: walk the receiver, `obj.f` is constant iff `obj`
|
||||
// is constant under the same definition. The field name itself
|
||||
// is structural and adds no runtime value.
|
||||
SsaOp::FieldProj { receiver, .. } => stack.push(*receiver),
|
||||
|
|
@ -321,7 +321,7 @@ fn ssa_operand_constant(
|
|||
}
|
||||
SsaOp::Param { .. } | SsaOp::SelfParam | SsaOp::CatchParam | SsaOp::Source => {
|
||||
// Only acceptable when the param's `var_name` is a callee
|
||||
// fragment — i.e. an identifier that only appears because
|
||||
// fragment, i.e. an identifier that only appears because
|
||||
// the CFG recorded name components of the dotted/chained
|
||||
// callee as uses. Real parameters and sources are dynamic.
|
||||
let name = inst.var_name.as_deref().unwrap_or("");
|
||||
|
|
@ -333,7 +333,7 @@ fn ssa_operand_constant(
|
|||
}
|
||||
}
|
||||
SsaOp::Nop => {}
|
||||
// Undef is a non-user, non-dynamic sentinel — treat like Const
|
||||
// Undef is a non-user, non-dynamic sentinel, treat like Const
|
||||
// (no additional operands to trace).
|
||||
SsaOp::Undef => {}
|
||||
// FieldProj: structural field read; constness reduces to the
|
||||
|
|
@ -440,7 +440,7 @@ fn sink_args_typed_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap)
|
|||
!is_callee_fragment(name, callee_desc, &callee_parts, &outer_parts)
|
||||
}
|
||||
// Constant string literals used as inline args (e.g. `"listener"`,
|
||||
// `"-c"`) are not user-controlled — treat as non-real for the
|
||||
// `"-c"`) are not user-controlled, treat as non-real for the
|
||||
// "all int-typed" test so they don't block suppression.
|
||||
SsaOp::Const(_) => false,
|
||||
_ => true,
|
||||
|
|
@ -477,7 +477,7 @@ fn type_facts_suppress(values: &[SsaValue], sink_caps: Cap, type_facts: &TypeFac
|
|||
/// lookup idiom (e.g. `map.get(x).unwrap_or("safe")` over literal inserts)
|
||||
/// should clear a command-injection sink.
|
||||
///
|
||||
/// Only fires for `Cap::SHELL_ESCAPE` — SQL / path suppression from this
|
||||
/// Only fires for `Cap::SHELL_ESCAPE`, SQL / path suppression from this
|
||||
/// domain would require stronger reasoning (literal keys can still carry
|
||||
/// SQL tokens if the inserts themselves contain them).
|
||||
fn sink_args_static_map_safe(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
||||
|
|
@ -595,6 +595,71 @@ fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option<Ca
|
|||
None
|
||||
}
|
||||
|
||||
/// Resolve the `if (X)` / `if (!X)` indirect-validator pattern: the
|
||||
/// condition has exactly one bare-identifier variable whose defining
|
||||
/// CFG node is a [`StmtKind::Call`] whose `defines` is the same name
|
||||
/// and whose `callee` is recognised by
|
||||
/// [`crate::ssa::type_facts::classify_input_validator_callee`].
|
||||
///
|
||||
/// Returns the validator callee name when the pattern matches, `None`
|
||||
/// otherwise. Conservative: bails when the condition has zero or more
|
||||
/// than one variable, when no defining call is found, or when the
|
||||
/// callee doesn't match a validator pattern. Mirrors the SSA
|
||||
/// branch-narrowing layer
|
||||
/// ([`crate::taint::ssa_transfer::apply_input_validator_branch_narrowing`])
|
||||
/// so the structural `cfg-unguarded-sink` suppression matches the
|
||||
/// taint engine's validator recognition.
|
||||
///
|
||||
/// Driven off CFG `TaintMeta.defines` rather than the per-body SSA
|
||||
/// value-defs because nested arrow-function bodies are sometimes
|
||||
/// lowered with empty SSA in the cfg-analysis context, but the CFG
|
||||
/// nodes themselves carry `defines` in every body.
|
||||
fn cond_indirect_validator_callee(
|
||||
info: &crate::cfg::NodeInfo,
|
||||
ctx: &AnalysisContext,
|
||||
) -> Option<String> {
|
||||
if info.condition_vars.len() != 1 {
|
||||
return None;
|
||||
}
|
||||
let var_name = info.condition_vars[0].as_str();
|
||||
let cond_func = info.ast.enclosing_func.as_deref();
|
||||
let cond_span_start = info.ast.span.0;
|
||||
|
||||
// Walk the CFG for any node that DEFINES `var_name` via a Call
|
||||
// expression. Same-function only, and only consider definitions
|
||||
// textually before the condition: a reassignment after the `if`
|
||||
// cannot be the def reaching it. Among the eligible defs, take
|
||||
// the textually-last one (highest span start), a conservative
|
||||
// latest-def proxy without paying for full dominator analysis.
|
||||
let mut best: Option<(usize, &str)> = None;
|
||||
for nidx in ctx.cfg.node_indices() {
|
||||
let n = &ctx.cfg[nidx];
|
||||
if n.kind != crate::cfg::StmtKind::Call {
|
||||
continue;
|
||||
}
|
||||
if n.taint.defines.as_deref() != Some(var_name) {
|
||||
continue;
|
||||
}
|
||||
if n.ast.enclosing_func.as_deref() != cond_func {
|
||||
continue;
|
||||
}
|
||||
let span_start = n.ast.span.0;
|
||||
if span_start >= cond_span_start {
|
||||
continue;
|
||||
}
|
||||
let Some(callee) = n.call.callee.as_deref() else {
|
||||
continue;
|
||||
};
|
||||
match best {
|
||||
Some((s, _)) if s >= span_start => {}
|
||||
_ => best = Some((span_start, callee)),
|
||||
}
|
||||
}
|
||||
let (_, callee) = best?;
|
||||
|
||||
crate::ssa::type_facts::classify_input_validator_callee(callee).map(|_| callee.to_string())
|
||||
}
|
||||
|
||||
/// Find all nodes in the CFG that are calls to guard functions.
|
||||
fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
|
||||
let guard_rules = rules::guard_rules(ctx.lang);
|
||||
|
|
@ -620,6 +685,24 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
|
|||
| PredicateKind::ValidationCall
|
||||
) {
|
||||
result.push((idx, Cap::all()));
|
||||
} else if cond_indirect_validator_callee(info, ctx).is_some() {
|
||||
// Indirect-validator pattern:
|
||||
// const err = validate(x); if (err) throw …;
|
||||
// const ok = isValid(x); if (!ok) throw …;
|
||||
// The classifier returns Unknown / NullCheck / ErrorCheck
|
||||
// because the if-condition is a bare result variable, not
|
||||
// a direct call expression. `cond_indirect_validator_callee`
|
||||
// handles that by scanning the CFG for nodes whose
|
||||
// `TaintMeta.defines` matches the condition variable and
|
||||
// checking whether any defining Call has an
|
||||
// `is_input_validator_callee`-recognised callee. This keeps
|
||||
// cfg-unguarded-sink suppression aligned with the same
|
||||
// structural validator recognition the SSA branch-narrowing
|
||||
// layer uses, without requiring the condition itself to be
|
||||
// a direct call expression.
|
||||
//
|
||||
// Motivated by Novu CVE GHSA-4x48-cgf9-q33f.
|
||||
result.push((idx, Cap::all()));
|
||||
} else if matches!(
|
||||
kind,
|
||||
PredicateKind::ShellMetaValidated | PredicateKind::BoundedLength
|
||||
|
|
@ -733,7 +816,7 @@ fn sink_arg_is_parameter_only(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
|
|||
|
||||
let sink_uses = &sink_info.taint.uses;
|
||||
if sink_uses.is_empty() {
|
||||
// No identifiable arguments — could be a constant call like Command::new("ls")
|
||||
// No identifiable arguments, could be a constant call like Command::new("ls")
|
||||
return true; // treat as non-dangerous (constant arg)
|
||||
}
|
||||
|
||||
|
|
@ -787,7 +870,7 @@ pub(crate) fn has_redirect_path_prefix(source_bytes: &[u8], span: (usize, usize)
|
|||
false
|
||||
}
|
||||
|
||||
/// Check if this sink is an internal redirect — a `res.redirect` (SSRF sink)
|
||||
/// Check if this sink is an internal redirect, a `res.redirect` (SSRF sink)
|
||||
/// whose argument is a template literal or string starting with `/`, indicating
|
||||
/// a server-relative path rather than an attacker-controlled URL.
|
||||
fn is_internal_redirect(ctx: &AnalysisContext, sink: NodeIndex, sink_caps: Cap) -> bool {
|
||||
|
|
@ -896,7 +979,7 @@ impl CfgAnalysis for UnguardedSink {
|
|||
let source_derived = sink_arg_is_source_derived(ctx, *sink);
|
||||
|
||||
// If sink args are all constants (including one-hop constant bindings)
|
||||
// and taint didn't confirm, this is a false positive — skip it.
|
||||
// and taint didn't confirm, this is a false positive, skip it.
|
||||
if is_all_args_constant(ctx, *sink) && !has_taint {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -904,7 +987,7 @@ impl CfgAnalysis for UnguardedSink {
|
|||
// SSA latest-def suppression: when the taint engine has already
|
||||
// proved no source-tainted data reaches this sink (`!has_taint`)
|
||||
// and every SSA operand resolves to a constant, callee-fragment
|
||||
// pseudo-name, OR a function parameter that is not a Source —
|
||||
// pseudo-name, OR a function parameter that is not a Source,
|
||||
// the sink's actual arguments cannot carry an injection payload.
|
||||
// Catches the reassign-to-constant idiom (`name := req.x; name =
|
||||
// "Guest"; sink(name)`) where the latest SSA def is a literal
|
||||
|
|
@ -919,7 +1002,7 @@ impl CfgAnalysis for UnguardedSink {
|
|||
// Type-aware suppression: when all SSA operand values of the sink
|
||||
// are proven to carry non-injectable types (e.g. integers parsed
|
||||
// from a raw source), the arguments cannot form a payload for
|
||||
// SHELL/SQL/FILE sinks. Skip the structural finding — the taint
|
||||
// SHELL/SQL/FILE sinks. Skip the structural finding, the taint
|
||||
// engine already covers the source→sink flow via type-aware
|
||||
// suppression. Unknown-typed or mixed operands fall through.
|
||||
if !has_taint && sink_args_typed_safe(ctx, *sink, sink_caps) {
|
||||
|
|
@ -936,13 +1019,13 @@ impl CfgAnalysis for UnguardedSink {
|
|||
|
||||
// Parameterized SQL queries: arg 0 is a string literal with
|
||||
// placeholders ($1, ?, %s, :name) and a params argument exists.
|
||||
// These are safe by construction — the driver handles escaping.
|
||||
// These are safe by construction, the driver handles escaping.
|
||||
if sink_info.parameterized_query {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Internal redirects: res.redirect(`/path/...`) with a path-prefix
|
||||
// argument are server-relative — not attacker-controlled URLs.
|
||||
// argument are server-relative, not attacker-controlled URLs.
|
||||
if is_internal_redirect(ctx, *sink, sink_caps) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -953,10 +1036,10 @@ impl CfgAnalysis for UnguardedSink {
|
|||
let (severity, confidence) = if has_taint || source_derived {
|
||||
(Severity::High, Confidence::High)
|
||||
} else if param_only && !in_entrypoint {
|
||||
// Wrapper function with param-only args — zero signal. Suppress.
|
||||
// Wrapper function with param-only args, zero signal. Suppress.
|
||||
continue;
|
||||
} else if !ctx.taint_active {
|
||||
// AST-only / cfg-only mode — preserve as LOW (unchanged)
|
||||
// AST-only / cfg-only mode, preserve as LOW (unchanged)
|
||||
(Severity::Low, Confidence::Low)
|
||||
} else {
|
||||
// taint_active=true but found nothing.
|
||||
|
|
@ -970,7 +1053,7 @@ impl CfgAnalysis for UnguardedSink {
|
|||
// If the function containing the sink has no Source-labeled
|
||||
// nodes AND no parameters (through which taint could flow
|
||||
// from callers), taint ran and found nothing because there
|
||||
// is nothing to find. Suppress — the structural finding
|
||||
// is nothing to find. Suppress, the structural finding
|
||||
// is noise.
|
||||
let sink_func = sink_info.ast.enclosing_func.as_deref();
|
||||
let has_sources = ctx.cfg.node_indices().any(|n| {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))]
|
||||
|
||||
pub mod auth;
|
||||
pub mod dominators;
|
||||
pub mod error_handling;
|
||||
|
|
@ -30,17 +32,15 @@ pub struct BodyConstFacts {
|
|||
pub type_facts: TypeFactResult,
|
||||
/// Field-sensitive Steensgaard points-to facts.
|
||||
///
|
||||
/// Computed only when [`crate::pointer::is_enabled()`] (i.e. the
|
||||
/// `NYX_POINTER_ANALYSIS=1` env var is set). Phase 2 of the
|
||||
/// pointer-analysis rollout consumes this in `state::transfer.rs`
|
||||
/// to suppress proxy-acquire mis-attribution on field-aliased
|
||||
/// locals like `m := c.mu`. When `None`, every consumer must fall
|
||||
/// back to its existing pointer-unaware behaviour.
|
||||
/// Computed only when [`crate::pointer::is_enabled()`].
|
||||
/// `state::transfer.rs` consumes this to suppress proxy-acquire
|
||||
/// mis-attribution on field-aliased locals like `m := c.mu`. When
|
||||
/// `None`, consumers fall back to pointer-unaware behaviour.
|
||||
pub pointer_facts: Option<crate::pointer::PointsToFacts>,
|
||||
}
|
||||
|
||||
/// Lower a body to SSA and run constant propagation. Returns `None` when
|
||||
/// lowering fails (empty CFG, invalid entry) — callers treat absence as
|
||||
/// lowering fails (empty CFG, invalid entry), callers treat absence as
|
||||
/// "no SSA facts available" and fall back to the syntactic path.
|
||||
pub fn build_body_const_facts(body: &crate::cfg::BodyCfg, lang: Lang) -> Option<BodyConstFacts> {
|
||||
let mut ssa = crate::ssa::lower_to_ssa_with_params(
|
||||
|
|
@ -116,13 +116,13 @@ pub struct AnalysisContext<'a> {
|
|||
/// Structural analyses use it to suppress findings when a sink's argument
|
||||
/// SSA values are proven to carry non-injectable types (e.g. integers
|
||||
/// parsed from a raw source can't form SHELL/SQL/path payloads). Sourced
|
||||
/// from `body_const_facts` when present — keep both pointers coherent.
|
||||
/// from `body_const_facts` when present; keep both pointers coherent.
|
||||
pub type_facts: Option<&'a TypeFactResult>,
|
||||
/// Decorators / annotations / attributes attached to the body's
|
||||
/// declaration (e.g. Python `@login_required`, Java `@PreAuthorize`,
|
||||
/// Symfony `#[IsGranted(...)]`). Consumed by the AuthGap analysis to
|
||||
/// suppress `cfg-auth-gap` when the framework already enforces auth at
|
||||
/// the function-declaration level — the gap only matters when the
|
||||
/// the function-declaration level, the gap only matters when the
|
||||
/// auth call has to live inside the body.
|
||||
pub auth_decorators: &'a [String],
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ fn find_acquire_nodes(
|
|||
}
|
||||
if let Some(callee) = &info.call.callee {
|
||||
let callee_lower = callee.to_ascii_lowercase();
|
||||
// Check exclusions first — if the callee matches an exclude
|
||||
// Check exclusions first: if the callee matches an exclude
|
||||
// pattern, it is NOT an acquire even if it also matches an
|
||||
// acquire pattern (e.g. `freopen` ends with `fopen`).
|
||||
let excluded = exclude_patterns.iter().any(|p| {
|
||||
|
|
@ -74,7 +74,7 @@ fn find_release_nodes(ctx: &AnalysisContext, release_patterns: &[&str]) -> Vec<N
|
|||
/// `if (acquire_var)` (or `if (!acquire_var)`) and the edge represents
|
||||
/// "acquire_var is null", the resource was never actually produced on that
|
||||
/// path, so a release is unnecessary. This closes the canonical
|
||||
/// `FILE *f = fopen(...); if (f) fclose(f);` idiom — without this rule the
|
||||
/// `FILE *f = fopen(...); if (f) fclose(f);` idiom, without this rule the
|
||||
/// false edge of the null check provides a path acquire→exit that misses
|
||||
/// the release, producing a may-leak FP.
|
||||
fn release_on_all_exit_paths(
|
||||
|
|
@ -103,8 +103,8 @@ fn release_on_all_exit_paths(
|
|||
/// the resource handle is null/falsy and therefore not actually acquired.
|
||||
///
|
||||
/// Recognises:
|
||||
/// * `if (var)` — false edge means `var` is null
|
||||
/// * `if (!var)` — true edge means `var` is null
|
||||
/// * `if (var)`, false edge means `var` is null
|
||||
/// * `if (!var)`, true edge means `var` is null
|
||||
///
|
||||
/// Rejects comparisons (`if (var != NULL)`), method calls
|
||||
/// (`if (var.is_valid())`), and composite conditions (`if (var && cond)`).
|
||||
|
|
@ -198,7 +198,7 @@ fn all_paths_pass_through(
|
|||
/// - `obj.field = var` (C dot / generic field store)
|
||||
/// - `list->next = ...` (linked-list insertion)
|
||||
///
|
||||
/// If the variable is transferred, there is no leak — the receiving struct is
|
||||
/// If the variable is transferred, there is no leak, the receiving struct is
|
||||
/// responsible for the lifetime.
|
||||
fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
|
||||
let acquired_var = match &ctx.cfg[acquire].taint.defines {
|
||||
|
|
@ -258,7 +258,7 @@ fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
|
|||
false
|
||||
};
|
||||
if !is_field_write {
|
||||
continue; // genuine redefinition — stop this path
|
||||
continue; // genuine redefinition, stop this path
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -343,7 +343,7 @@ fn is_consumed_by_owner(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
|
|||
}
|
||||
}
|
||||
|
||||
// Also check the span text for consuming calls — handles cases where
|
||||
// Also check the span text for consuming calls, handles cases where
|
||||
// the call is embedded in a return statement (e.g. `return FileResponse(f)`)
|
||||
if info.taint.uses.iter().any(|u| u == &acquired_var) {
|
||||
let (start, end) = info.ast.span;
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ static JAVA_AUTH: &[AuthRule] = &[AuthRule {
|
|||
"hasPermission",
|
||||
"requireRole",
|
||||
// Spring Security / JAX-RS annotation names (used by decorator
|
||||
// detection — see `extract_auth_decorators` in src/cfg.rs).
|
||||
// detection, see `extract_auth_decorators` in src/cfg.rs).
|
||||
"PreAuthorize",
|
||||
"PostAuthorize",
|
||||
"Secured",
|
||||
|
|
@ -174,7 +174,7 @@ static JS_AUTH: &[AuthRule] = &[AuthRule {
|
|||
"jwt.verify",
|
||||
// NestJS-style decorators and guard class names (seeded by decorator
|
||||
// arg extraction in `extract_auth_decorators`). `UseGuards` alone is
|
||||
// too generic — we still match on guard *argument* identifiers here.
|
||||
// too generic, we still match on guard *argument* identifiers here.
|
||||
"Authenticated",
|
||||
"AuthGuard",
|
||||
"JwtAuthGuard",
|
||||
|
|
@ -268,7 +268,7 @@ static CPP_AUTH: &[AuthRule] = &[AuthRule {
|
|||
"check_auth",
|
||||
"verify_token",
|
||||
"validate_token",
|
||||
// Custom C++ attributes — framework-defined, bare-name match.
|
||||
// Custom C++ attributes, framework-defined, bare-name match.
|
||||
"authenticated",
|
||||
"require_auth",
|
||||
"admin_only",
|
||||
|
|
@ -287,7 +287,7 @@ static RUST_AUTH: &[AuthRule] = &[AuthRule {
|
|||
"check_auth",
|
||||
"verify_token",
|
||||
"validate_token",
|
||||
// Custom proc-macro attributes — framework-defined, bare-name match.
|
||||
// Custom proc-macro attributes, framework-defined, bare-name match.
|
||||
"authenticated",
|
||||
"require_auth",
|
||||
"admin_only",
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ fn unreachable_code_detection_runs_without_panic() {
|
|||
|
||||
#[test]
|
||||
fn all_branches_reachable_no_findings() {
|
||||
// All branches reachable — no unreachable-code findings
|
||||
// All branches reachable, no unreachable-code findings
|
||||
let src = br#"
|
||||
use std::process::Command;
|
||||
fn main() {
|
||||
|
|
@ -282,7 +282,7 @@ fn ssa_const_prop_preserves_sink_on_dynamic_source_arg() {
|
|||
|
||||
#[test]
|
||||
fn unguarded_sink_detected() {
|
||||
// Sink with no validation — should be flagged
|
||||
// Sink with no validation, should be flagged
|
||||
let src = br#"
|
||||
use std::process::Command;
|
||||
fn main() {
|
||||
|
|
@ -335,6 +335,90 @@ fn guarded_sink_with_sanitizer_not_flagged() {
|
|||
);
|
||||
}
|
||||
|
||||
/// Regression: `cond_indirect_validator_callee` used to pick the
|
||||
/// textually-last call defining the condition variable across the
|
||||
/// whole function, including reassignments that occur **after** the
|
||||
/// `if`. When that later call wasn't a recognised validator, the
|
||||
/// validator pattern was missed and the downstream sink was
|
||||
/// (incorrectly) flagged as `cfg-unguarded-sink`.
|
||||
///
|
||||
/// Pattern:
|
||||
/// let err = validateInput(cmd); // real validator, before the if
|
||||
/// if (err) throw …; // sink-guarding branch
|
||||
/// eval(cmd); // sink dominated by the guard
|
||||
/// err = recordMetric(); // later reassignment, NOT a validator
|
||||
///
|
||||
/// The defining call reaching the `if` is `validateInput`; the
|
||||
/// `recordMetric` reassignment is downstream of the use and must not
|
||||
/// shadow it.
|
||||
#[test]
|
||||
fn indirect_validator_ignores_reassignment_after_if() {
|
||||
let src = br#"
|
||||
async function handler(req) {
|
||||
const cmd = req.query.cmd;
|
||||
let err = await validateInput(cmd);
|
||||
if (err) {
|
||||
throw new Error('blocked');
|
||||
}
|
||||
eval(cmd);
|
||||
err = recordMetric();
|
||||
}
|
||||
"#;
|
||||
|
||||
let findings = parse_and_analyse(
|
||||
&guards::UnguardedSink,
|
||||
src,
|
||||
"javascript",
|
||||
Language::from(tree_sitter_javascript::LANGUAGE),
|
||||
);
|
||||
|
||||
let guard_findings: Vec<_> = findings
|
||||
.iter()
|
||||
.filter(|f| f.rule_id == "cfg-unguarded-sink")
|
||||
.collect();
|
||||
assert!(
|
||||
guard_findings.is_empty(),
|
||||
"later non-validator reassignment must not shadow the real validator def reaching the if; got {:?}",
|
||||
guard_findings
|
||||
);
|
||||
}
|
||||
|
||||
/// Companion sanity check for `indirect_validator_ignores_reassignment_after_if`:
|
||||
/// without the trailing reassignment the same pattern is already
|
||||
/// suppressed by `cond_indirect_validator_callee`. Pinned so a future
|
||||
/// change to the indirect-validator recognition can't silently regress
|
||||
/// this baseline alongside the regression case above.
|
||||
#[test]
|
||||
fn indirect_validator_baseline_suppresses_dominated_sink() {
|
||||
let src = br#"
|
||||
async function handler(req) {
|
||||
const cmd = req.query.cmd;
|
||||
const err = await validateInput(cmd);
|
||||
if (err) {
|
||||
throw new Error('blocked');
|
||||
}
|
||||
eval(cmd);
|
||||
}
|
||||
"#;
|
||||
|
||||
let findings = parse_and_analyse(
|
||||
&guards::UnguardedSink,
|
||||
src,
|
||||
"javascript",
|
||||
Language::from(tree_sitter_javascript::LANGUAGE),
|
||||
);
|
||||
|
||||
let guard_findings: Vec<_> = findings
|
||||
.iter()
|
||||
.filter(|f| f.rule_id == "cfg-unguarded-sink")
|
||||
.collect();
|
||||
assert!(
|
||||
guard_findings.is_empty(),
|
||||
"indirect-validator pattern (no reassignment) must suppress dominated sink; got {:?}",
|
||||
guard_findings
|
||||
);
|
||||
}
|
||||
|
||||
// ─── Auth gap tests ────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
|
@ -397,7 +481,7 @@ fn auth_check_before_sink_no_finding() {
|
|||
#[test]
|
||||
fn error_fallthrough_analysis_runs_on_go() {
|
||||
// Go pattern: err check without return, followed by dangerous call.
|
||||
// This is a heuristic analysis — we verify it runs without panicking.
|
||||
// This is a heuristic analysis, we verify it runs without panicking.
|
||||
let src = br#"
|
||||
package main
|
||||
import "os/exec"
|
||||
|
|
@ -422,7 +506,7 @@ fn error_fallthrough_analysis_runs_on_go() {
|
|||
|
||||
#[test]
|
||||
fn proper_error_return_no_finding_go() {
|
||||
// Go pattern: err check with return — should not flag error fallthrough.
|
||||
// Go pattern: err check with return, should not flag error fallthrough.
|
||||
let src = br#"
|
||||
package main
|
||||
import "os/exec"
|
||||
|
|
@ -820,6 +904,7 @@ fn taint_and_unguarded_sink_deduped() {
|
|||
path_hash: 0,
|
||||
finding_id: String::new(),
|
||||
alternative_finding_ids: smallvec::SmallVec::new(),
|
||||
effective_sink_caps: crate::labels::Cap::empty(),
|
||||
}];
|
||||
|
||||
let findings = parse_and_run_all_with_taint(
|
||||
|
|
@ -949,7 +1034,7 @@ function readFile() {
|
|||
|
||||
#[test]
|
||||
fn js_throw_terminates_block() {
|
||||
// throw should act as a terminator — code directly after throw in the same
|
||||
// throw should act as a terminator, code directly after throw in the same
|
||||
// block should be unreachable.
|
||||
let src = br#"
|
||||
function fail() {
|
||||
|
|
@ -1031,7 +1116,7 @@ fn configured_terminator_stops_flow() {
|
|||
"eval should be unreachable after process.exit terminator"
|
||||
);
|
||||
}
|
||||
// If eval_nodes is empty it means the node wasn't created (also acceptable —
|
||||
// If eval_nodes is empty it means the node wasn't created (also acceptable:
|
||||
// it's after a terminator so the CFG may not even emit it)
|
||||
}
|
||||
|
||||
|
|
@ -1480,7 +1565,7 @@ void process() {
|
|||
|
||||
let reachable = dominators::reachable_set(cfg, entry);
|
||||
|
||||
// All nodes should be reachable — the preproc recovery should prevent
|
||||
// All nodes should be reachable, the preproc recovery should prevent
|
||||
// the dangling-else from orphaning downstream code.
|
||||
let unreachable_count = cfg.node_count() - reachable.len();
|
||||
assert!(
|
||||
|
|
@ -1515,7 +1600,7 @@ void process() {
|
|||
|
||||
let reachable = dominators::reachable_set(cfg, entry);
|
||||
|
||||
// All nodes should be reachable — break exits the loop and post-loop
|
||||
// All nodes should be reachable, break exits the loop and post-loop
|
||||
// code (free(x)) should be connected.
|
||||
let unreachable_count = cfg.node_count() - reachable.len();
|
||||
assert!(
|
||||
|
|
@ -1878,7 +1963,7 @@ def run():
|
|||
|
||||
#[test]
|
||||
fn python_one_hop_constant_still_suppressed() {
|
||||
// cmd = "ls"; os.system(cmd) — `all_args_literal` is false (identifier arg),
|
||||
// cmd = "ls"; os.system(cmd): `all_args_literal` is false (identifier arg),
|
||||
// but should still be suppressed via existing one-hop constant trace in cfg_analysis.
|
||||
let src = br#"
|
||||
import os
|
||||
|
|
@ -1959,7 +2044,7 @@ def run():
|
|||
|
||||
#[test]
|
||||
fn python_constant_receiver_tainted_arg_produces_finding() {
|
||||
// safe_obj.system(user_input) — constant receiver is irrelevant, tainted arg must report
|
||||
// safe_obj.system(user_input): constant receiver is irrelevant; tainted arg must report
|
||||
let src = br#"
|
||||
import os
|
||||
import sys
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ fn event_handler_callbacks(ctx: &AnalysisContext) -> HashSet<String> {
|
|||
.iter()
|
||||
.any(|h| callee_lower.ends_with(&h.to_ascii_lowercase()));
|
||||
if is_handler {
|
||||
// The callback function is typically used within the call — any function
|
||||
// The callback function is typically used within the call, any function
|
||||
// that appears as `uses` of this call node is a potential callback.
|
||||
for u in &info.taint.uses {
|
||||
callbacks.insert(u.clone());
|
||||
|
|
@ -113,7 +113,7 @@ impl CfgAnalysis for UnreachableCode {
|
|||
Severity::Medium,
|
||||
)
|
||||
} else {
|
||||
// Plain unreachable code — low severity
|
||||
// Plain unreachable code, low severity
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ fn print_toml_with_highlights(toml_str: &str) {
|
|||
continue;
|
||||
}
|
||||
// key = value lines (but not `[xxx]`). Split on the first `=`
|
||||
// that isn't inside a quoted string — TOML keys don't contain
|
||||
// that isn't inside a quoted string, TOML keys don't contain
|
||||
// `=` outside quotes, so a leading-segment split is safe enough
|
||||
// for the common case. Continuation lines from multi-line
|
||||
// arrays/strings won't have `=` and fall through to plain.
|
||||
|
|
@ -149,7 +149,7 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option<tom
|
|||
}
|
||||
}
|
||||
None => {
|
||||
// Key absent in defaults — keep entirely.
|
||||
// Key absent in defaults, keep entirely.
|
||||
out.insert(k.clone(), v.clone());
|
||||
}
|
||||
}
|
||||
|
|
@ -160,9 +160,9 @@ fn prune_matching(effective: &toml::Value, defaults: &toml::Value) -> Option<tom
|
|||
Some(toml::Value::Table(out))
|
||||
}
|
||||
}
|
||||
// Identical leaf — drop.
|
||||
// Identical leaf, drop.
|
||||
_ if effective == defaults => None,
|
||||
// Differing leaf or shape change — keep the effective value.
|
||||
// Differing leaf or shape change, keep the effective value.
|
||||
_ => Some(effective.clone()),
|
||||
}
|
||||
}
|
||||
|
|
@ -180,13 +180,13 @@ fn count_top_level_keys(toml_str: &str) -> usize {
|
|||
continue;
|
||||
}
|
||||
if trimmed.starts_with('[') {
|
||||
// Section header — not an override on its own. Reset
|
||||
// Section header, not an override on its own. Reset
|
||||
// any stuck multi-line state defensively.
|
||||
in_multiline = false;
|
||||
continue;
|
||||
}
|
||||
if in_multiline {
|
||||
// Inside a multi-line array/inline table — closing bracket
|
||||
// Inside a multi-line array/inline table, closing bracket
|
||||
// ends it, intermediate lines don't count.
|
||||
if trimmed.starts_with(']') || trimmed.starts_with('}') {
|
||||
in_multiline = false;
|
||||
|
|
|
|||
|
|
@ -123,7 +123,7 @@ pub fn build_index_with_observer(
|
|||
logs: Option<&Arc<ScanLogCollector>>,
|
||||
) -> NyxResult<()> {
|
||||
// Pass 1 of the indexed scan reads persisted summaries produced here, so
|
||||
// framework context must be populated at index-build time — otherwise
|
||||
// framework context must be populated at index-build time, otherwise
|
||||
// framework-conditional label rules never contribute to the summaries
|
||||
// and indexed scans diverge from non-indexed ones. Matches the
|
||||
// auto-fill in scan_filesystem_with_observer /
|
||||
|
|
@ -152,7 +152,7 @@ pub fn build_index_with_observer(
|
|||
|
||||
let walk_start = std::time::Instant::now();
|
||||
let (rx, handle) = spawn_file_walker(project_path, config);
|
||||
// Drain the channel BEFORE joining — the bounded channel will deadlock
|
||||
// Drain the channel BEFORE joining: the bounded channel will deadlock
|
||||
// if we join first and the walker blocks on send.
|
||||
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
|
||||
if let Err(err) = handle.join() {
|
||||
|
|
@ -205,7 +205,7 @@ pub fn build_index_with_observer(
|
|||
.try_for_each(|path| -> NyxResult<()> {
|
||||
let mut idx = Indexer::from_pool(project_name, &pool)?;
|
||||
|
||||
// Read once, hash once — pass bytes to both rule execution and
|
||||
// Read once, hash once, pass bytes to both rule execution and
|
||||
// summary extraction. Use pre-computed hash for upsert to avoid
|
||||
// a redundant file read inside upsert_file.
|
||||
let bytes = std::fs::read(&path)?;
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ pub fn handle_command(
|
|||
// Resolve engine options once for the whole process. Scan overlays CLI
|
||||
// flags below; other subcommands use the config values verbatim. The
|
||||
// install is a no-op after the first call, so Scan's overlay must happen
|
||||
// before we reach this point for its own call path — we delay the install
|
||||
// before we reach this point for its own call path, we delay the install
|
||||
// to the Scan arm and gate non-scan commands behind a fallback install of
|
||||
// the bare config values.
|
||||
let install_from_config = |config: &Config| {
|
||||
|
|
@ -378,7 +378,7 @@ fn print_engine_explanation(config: &Config, engine_profile: Option<EngineProfil
|
|||
use console::style;
|
||||
|
||||
// Plain-text on/off, padded to 3 chars so the trailing column aligns
|
||||
// regardless of which value is rendered. Colour is layered on top —
|
||||
// regardless of which value is rendered. Colour is layered on top;
|
||||
// the visible width stays 3 characters because `console::style` emits
|
||||
// zero-width ANSI codes (and nothing at all when NO_COLOR is set).
|
||||
fn onoff(b: bool) -> String {
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ fn record_persist_error(errors: &Arc<Mutex<Vec<String>>>, message: String) {
|
|||
/// When `enabled` is true, a panic inside `f` is caught, logged, and
|
||||
/// converted into a `NyxError::Msg`; callers that already match on
|
||||
/// `Err(_)` will gracefully skip the file. When `enabled` is false,
|
||||
/// the panic propagates unchanged — preserving the default behaviour
|
||||
/// the panic propagates unchanged, preserving the default behaviour
|
||||
/// for users who want to catch engine bugs loudly.
|
||||
///
|
||||
/// `AssertUnwindSafe` is load-bearing: closures over `&Config` /
|
||||
|
|
@ -222,7 +222,7 @@ fn is_false(b: &bool) -> bool {
|
|||
/// Framework detection drives framework-conditional label rules (e.g. actix /
|
||||
/// axum / rocket handler-arg sources, Rails route helpers) and auth-analysis
|
||||
/// extractors. If any scan entry point forgets to populate it, the indexed
|
||||
/// and non-indexed paths silently diverge — missing framework-specific
|
||||
/// and non-indexed paths silently diverge, missing framework-specific
|
||||
/// findings in whichever path skipped detection. This helper exists so the
|
||||
/// auto-fill stays consistent across `scan_filesystem_with_observer`,
|
||||
/// `scan_with_index_parallel_observer`, and `build_index_with_observer`.
|
||||
|
|
@ -239,7 +239,7 @@ pub(crate) fn ensure_framework_ctx(root: &Path, cfg: &Config) -> Option<Config>
|
|||
///
|
||||
/// Drives the one-time `preview-tier scan` banner in `handle()`. Tracks
|
||||
/// the extensions `lang_for_path` in `ast.rs` maps to the `"c"` and `"cpp"`
|
||||
/// slugs — keep this aligned with that mapping.
|
||||
/// slugs, keep this aligned with that mapping.
|
||||
pub(crate) fn is_preview_tier_path(path: &Path) -> bool {
|
||||
matches!(
|
||||
path.extension()
|
||||
|
|
@ -514,14 +514,14 @@ pub fn retain_converged_findings(diags: &mut Vec<Diag>) {
|
|||
/// the same function; tiebreak by source line asc, source col asc).
|
||||
///
|
||||
/// Rule IDs of the form `taint-unsanitised-flow (source L:C)` share a single
|
||||
/// base `taint-unsanitised-flow`. The grouping key is column-agnostic —
|
||||
/// base `taint-unsanitised-flow`. The grouping key is column-agnostic:
|
||||
/// multiple flows to the same sink line differing only in column or source
|
||||
/// are collapsed to one. The rule_id preserves the source location, so the
|
||||
/// kept representative still identifies which flow was reported.
|
||||
///
|
||||
/// The grouping key **includes the resolved sink capability bits** so that
|
||||
/// two different sinks on the same line (e.g. `sink_sql(x); sink_shell(x);`)
|
||||
/// are not collapsed into one finding — they represent materially different
|
||||
/// are not collapsed into one finding, they represent materially different
|
||||
/// vulnerabilities and must surface independently. Findings with different
|
||||
/// base rule IDs (e.g. `js.code_exec.eval`) or different severities are
|
||||
/// left untouched per guardrails.
|
||||
|
|
@ -560,7 +560,7 @@ pub(crate) fn deduplicate_taint_flows(diags: &mut Vec<Diag>) {
|
|||
let src_col = src.map(|s| s.col).unwrap_or(u32::MAX);
|
||||
// Same-function check: first flow_step (Source) and the step at the
|
||||
// sink share an `enclosing_func`. If flow_steps are absent or the
|
||||
// function markers are missing, treat as "unknown" — worse than a
|
||||
// function markers are missing, treat as "unknown", worse than a
|
||||
// confirmed same-function match but better than a confirmed mismatch.
|
||||
let same_function_flag: u32 = ev
|
||||
.and_then(|e| {
|
||||
|
|
@ -677,7 +677,7 @@ pub const SCC_UNCONVERGED_CROSS_FILE_NOTE_PREFIX: &str = "scc_unconverged:cross-
|
|||
/// [`GlobalSummaries::snapshot_caps`] results.
|
||||
///
|
||||
/// Used by the Phase-B worklist to derive the next iteration's dirty
|
||||
/// file set. Semantics match [`diff_cap_snapshots`] — a key that
|
||||
/// file set. Semantics match [`diff_cap_snapshots`], a key that
|
||||
/// appears or disappears counts as changed.
|
||||
fn changed_cap_keys_of(
|
||||
before: &HashMap<crate::symbol::FuncKey, (u16, u16, u16, Vec<usize>)>,
|
||||
|
|
@ -728,7 +728,7 @@ fn changed_ssa_keys_of(
|
|||
///
|
||||
/// Called once per unconverged batch (after the pass-2 rayon parallelism
|
||||
/// has collected `iteration_diags`) so the cost is O(n) over the batch's
|
||||
/// findings — much cheaper than a per-finding `warn!`.
|
||||
/// findings, much cheaper than a per-finding `warn!`.
|
||||
///
|
||||
/// Confidence is **capped** at `Low` rather than unconditionally set:
|
||||
/// upstream analysis may have proven something particularly strong about
|
||||
|
|
@ -795,7 +795,7 @@ fn tag_unconverged_findings(
|
|||
|
||||
/// Safety cap on SCC fixed-point iterations.
|
||||
///
|
||||
/// The convergence predicate is *snapshot equality* — we break as soon as
|
||||
/// The convergence predicate is *snapshot equality*, we break as soon as
|
||||
/// an iteration leaves both `snapshot_caps()` and `snapshot_ssa()`
|
||||
/// unchanged. The cap only triggers if something prevents monotone
|
||||
/// progress (e.g. a non-monotone SSA summary refinement or an SCC larger
|
||||
|
|
@ -809,7 +809,7 @@ fn tag_unconverged_findings(
|
|||
/// SCC with `k` functions arranged in a chain, fresh taint introduced at
|
||||
/// one end of the chain needs up to `k` iterations to reach the other
|
||||
/// end. A hard cap of 3 was silently truncating propagation for any
|
||||
/// SCC of 4+ cross-file functions — findings vanished with no warning.
|
||||
/// SCC of 4+ cross-file functions, findings vanished with no warning.
|
||||
///
|
||||
/// `FuncSummary` is a finite-height lattice (≤ 48 bits of caps + a
|
||||
/// bounded vector of parameter indices) and `insert()` is strictly
|
||||
|
|
@ -865,7 +865,7 @@ fn effective_scc_cap() -> usize {
|
|||
/// persisted by non-recursive topo batches in the most recent
|
||||
/// [`run_topo_batches`] invocation. Intended for the regression tests
|
||||
/// that prove the topo-refinement pipeline is wired and producing
|
||||
/// observable cross-batch state — see
|
||||
/// observable cross-batch state, see
|
||||
/// `tests/topo_pass2_refinement_tests.rs`. Cheap relaxed load.
|
||||
static LAST_TOPO_NONRECURSIVE_REFINEMENTS: AtomicUsize = AtomicUsize::new(0);
|
||||
|
||||
|
|
@ -905,7 +905,7 @@ fn topo_refine_enabled() -> bool {
|
|||
///
|
||||
/// When `call_graph` is missing an edge (e.g. a summary was inserted
|
||||
/// after graph construction), we conservatively fall back to
|
||||
/// re-analysing the full batch — correctness is preserved at the cost
|
||||
/// re-analysing the full batch, correctness is preserved at the cost
|
||||
/// of the worklist optimisation for that iteration.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn run_topo_batches(
|
||||
|
|
@ -1104,7 +1104,7 @@ fn run_topo_batches(
|
|||
// A file becomes dirty for iteration N+1 iff it
|
||||
// contains at least one caller of a FuncKey that
|
||||
// changed in iteration N. If no key changed, the
|
||||
// dirty set is empty — which implies convergence (and
|
||||
// dirty set is empty, which implies convergence (and
|
||||
// matches `iter_converged` above).
|
||||
let changed_cap_keys = changed_cap_keys_of(&snap_before, &snap_after);
|
||||
let changed_ssa_keys =
|
||||
|
|
@ -1124,7 +1124,7 @@ fn run_topo_batches(
|
|||
// changed key. Fall back to the full batch when the
|
||||
// call graph does not resolve any caller (e.g. all
|
||||
// changes happened in leaf functions that no one in
|
||||
// this batch calls — rare but must not regress to
|
||||
// this batch calls, rare but must not regress to
|
||||
// missed analysis).
|
||||
let namespaces_needing_reanalysis =
|
||||
crate::callgraph::namespaces_for_callers(call_graph, &all_changed_keys);
|
||||
|
|
@ -1165,7 +1165,7 @@ fn run_topo_batches(
|
|||
}
|
||||
if iter_converged {
|
||||
// Snapshots equal but dirty_files non-empty is
|
||||
// anomalous — log and treat as converged
|
||||
// anomalous; log and treat as converged
|
||||
// (snapshot equality is the correctness-preserving
|
||||
// signal).
|
||||
tracing::debug!(
|
||||
|
|
@ -1182,7 +1182,7 @@ fn run_topo_batches(
|
|||
// After the loop, flatten per-file diags into the
|
||||
// iteration_diags vector in batch order for deterministic
|
||||
// output. Files that were in the batch but never made
|
||||
// dirty (shouldn't happen — iter 0 runs all of them) are
|
||||
// dirty (shouldn't happen, iter 0 runs all of them) are
|
||||
// skipped silently.
|
||||
let mut iteration_diags: Vec<Diag> = Vec::new();
|
||||
for p in &batch.files {
|
||||
|
|
@ -1268,7 +1268,7 @@ fn run_topo_batches(
|
|||
// parallel section completes, persist those refinements into
|
||||
// `global_summaries` sequentially. Subsequent batches in
|
||||
// topo order (caller-most batches) then resolve their call
|
||||
// sites against the refined cross-file context — the final
|
||||
// sites against the refined cross-file context, the final
|
||||
// step in the callee-first topo pipeline that pass-2
|
||||
// sequencing was always meant to deliver.
|
||||
//
|
||||
|
|
@ -1455,7 +1455,7 @@ fn run_topo_batches(
|
|||
}
|
||||
}
|
||||
|
||||
// Orphan files (no functions in call graph) — process last, single pass.
|
||||
// Orphan files (no functions in call graph), process last, single pass.
|
||||
if !orphans.is_empty() {
|
||||
let orphan_diags: Vec<Diag> = orphans
|
||||
.par_iter()
|
||||
|
|
@ -2099,7 +2099,7 @@ pub fn scan_with_index_parallel_observer(
|
|||
if let Some(p) = &progress_ref {
|
||||
p.set_current_file(&path.to_string_lossy());
|
||||
}
|
||||
// Read once, hash once — use the hash for the change check
|
||||
// Read once, hash once, use the hash for the change check
|
||||
// to avoid a second file read inside should_scan.
|
||||
if let Ok(bytes) = std::fs::read(path) {
|
||||
let hash = Indexer::digest_bytes(&bytes);
|
||||
|
|
@ -2681,7 +2681,7 @@ pub fn scan_with_index_parallel_observer(
|
|||
// pipeline intends to produce (taint + cfg-* + state-* from state
|
||||
// analysis + auth.* when configured). A previous revision clipped this
|
||||
// to `taint*`/`cfg-*` only, silently dropping state-model findings and
|
||||
// breaking parity with `scan_filesystem` — fixed. Mode-scoped
|
||||
// breaking parity with `scan_filesystem`; now fixed. Mode-scoped
|
||||
// filtering, if ever needed, belongs in the analysis layer, not here.
|
||||
|
||||
let post_process_start = std::time::Instant::now();
|
||||
|
|
@ -3134,7 +3134,7 @@ mod dedup_taint_flow_tests {
|
|||
|
||||
#[test]
|
||||
fn dedup_collapses_same_line_different_columns() {
|
||||
// Two findings at line 10 but different columns — the widened key
|
||||
// Two findings at line 10 but different columns, the widened key
|
||||
// (path, line, severity) collapses them; the tighter source wins.
|
||||
let mut diags = vec![
|
||||
make_taint("a.rs", 10, 3, 4, 1),
|
||||
|
|
@ -3151,7 +3151,7 @@ mod dedup_taint_flow_tests {
|
|||
|
||||
#[test]
|
||||
fn dedup_does_not_drop_different_sink_caps_on_same_line() {
|
||||
// Two findings at line 10, same column, same severity — but with
|
||||
// Two findings at line 10, same column, same severity, but with
|
||||
// different resolved sink capability bits (SQL vs SHELL). They must
|
||||
// NOT collapse: different sink kinds are materially different
|
||||
// vulnerabilities. Regression guard.
|
||||
|
|
@ -3175,7 +3175,7 @@ mod dedup_taint_flow_tests {
|
|||
|
||||
#[test]
|
||||
fn dedup_collapses_same_sink_caps_on_same_line() {
|
||||
// Same line, same severity, same sink caps — this is the canonical
|
||||
// Same line, same severity, same sink caps, this is the canonical
|
||||
// dedup case (two flows to the same sink, differing only in source).
|
||||
let mut diags = vec![
|
||||
make_taint("a.rs", 10, 5, 3, 1),
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ pub fn handle(
|
|||
|
||||
// Invalidate the findings cache whenever a scan finishes so the next
|
||||
// request rebuilds against fresh diags. The next-request rebuild keeps
|
||||
// this hot-path simple — we only clear the slot here, never recompute.
|
||||
// this hot-path simple, we only clear the slot here, never recompute.
|
||||
let cache_for_invalidate = Arc::clone(&state.findings_cache);
|
||||
let mut event_rx = event_tx.subscribe();
|
||||
tokio::spawn(async move {
|
||||
|
|
@ -152,7 +152,7 @@ async fn shutdown_signal() {
|
|||
.expect("failed to listen for Ctrl+C");
|
||||
eprintln!("\n Shutting down...");
|
||||
// SSE connections block graceful shutdown indefinitely.
|
||||
// Use a raw OS thread to force exit — tokio tasks may not
|
||||
// Use a raw OS thread to force exit; tokio tasks may not
|
||||
// run reliably during shutdown.
|
||||
std::thread::spawn(|| {
|
||||
std::thread::sleep(std::time::Duration::from_millis(250));
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ impl ConstValue {
|
|||
if let Ok(i) = t.parse::<i64>() {
|
||||
return Some(ConstValue::Int(i));
|
||||
}
|
||||
// Negative with space: "- 5" — not supported, conservative
|
||||
// Negative with space: "- 5" is not supported; be conservative
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
@ -118,9 +118,9 @@ impl ConstValue {
|
|||
pub struct TypeSet(u16);
|
||||
|
||||
impl TypeSet {
|
||||
/// All 12 type bits set — no type constraint (Top).
|
||||
/// All 12 type bits set, no type constraint (Top).
|
||||
pub const TOP: Self = Self(0x0FFF);
|
||||
/// No type bits — unsatisfiable (Bottom).
|
||||
/// No type bits, unsatisfiable (Bottom).
|
||||
pub const BOTTOM: Self = Self(0);
|
||||
|
||||
pub fn singleton(kind: &TypeKind) -> Self {
|
||||
|
|
@ -149,7 +149,7 @@ impl TypeSet {
|
|||
self == Self::TOP
|
||||
}
|
||||
|
||||
/// Complement — all types NOT in this set.
|
||||
/// Complement, all types NOT in this set.
|
||||
pub fn complement(self) -> Self {
|
||||
Self(!self.0 & Self::TOP.0)
|
||||
}
|
||||
|
|
@ -184,7 +184,7 @@ fn type_kind_index(kind: &TypeKind) -> u32 {
|
|||
TypeKind::Url => 10,
|
||||
TypeKind::HttpClient => 11,
|
||||
TypeKind::LocalCollection => 12,
|
||||
// Phase 6 DTO types carry per-field structural info that the
|
||||
// DTO types carry per-field structural info that the
|
||||
// bitset domain can't represent. Collapse to Unknown so callers
|
||||
// still see "any type possible" rather than crashing on an
|
||||
// unhandled variant. Same-file/cross-file Dto-aware paths read
|
||||
|
|
@ -274,7 +274,7 @@ impl Nullability {
|
|||
|
||||
/// Boolean state lattice.
|
||||
///
|
||||
/// Same shape as [`Nullability`]. No `negate()` — negation is structural
|
||||
/// Same shape as [`Nullability`]. No `negate()`, negation is structural
|
||||
/// on [`ConditionExpr`](super::lower::ConditionExpr).
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum BoolState {
|
||||
|
|
@ -313,7 +313,7 @@ impl BoolState {
|
|||
/// Abstract fact about a single SSA value.
|
||||
///
|
||||
/// Combines interval, constant, type, null, and boolean constraints.
|
||||
/// There is intentionally no generic `negate()` on ValueFact — negation
|
||||
/// There is intentionally no generic `negate()` on ValueFact, negation
|
||||
/// is structural on [`ConditionExpr`](super::lower::ConditionExpr) and
|
||||
/// then applied as atomic refinements by the solver.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
|
|
@ -857,14 +857,14 @@ impl PathEnv {
|
|||
// `assume_neq`, and a few internal sites. Large generated inputs
|
||||
// (thousands of short statements on one line) can drive millions
|
||||
// of calls and overflow a plain u16 `refine_count`. Saturate to
|
||||
// stay within bounds — the refinement pipeline is already
|
||||
// stay within bounds, the refinement pipeline is already
|
||||
// idempotent past the cap, so saturation is semantically a no-op.
|
||||
self.refine_count = self.refine_count.saturating_add(1);
|
||||
|
||||
// Check size bound
|
||||
let pos = self.facts.binary_search_by_key(&v, |(k, _)| *k);
|
||||
if pos.is_err() && self.facts.len() >= MAX_PATH_ENV_ENTRIES {
|
||||
return; // bounded — don't grow
|
||||
return; // bounded, don't grow
|
||||
}
|
||||
|
||||
// Get meet count for widening
|
||||
|
|
@ -963,7 +963,7 @@ impl PathEnv {
|
|||
let ra = self.uf.find_immutable(a);
|
||||
let rb = self.uf.find_immutable(b);
|
||||
if ra == rb {
|
||||
// Already known equal — contradiction
|
||||
// Already known equal, contradiction
|
||||
self.unsat = true;
|
||||
return;
|
||||
}
|
||||
|
|
@ -1040,7 +1040,7 @@ impl PathEnv {
|
|||
return;
|
||||
}
|
||||
|
||||
// Step 4: dedup check — if this exact constraint already exists, skip
|
||||
// Step 4: dedup check. If this exact constraint already exists, skip
|
||||
let already_present = self
|
||||
.relational
|
||||
.iter()
|
||||
|
|
@ -1052,7 +1052,7 @@ impl PathEnv {
|
|||
if self.relational.len() < MAX_RELATIONAL {
|
||||
self.relational.push((ra, op, rb));
|
||||
}
|
||||
// If at capacity, skip — conservative: losing a constraint only
|
||||
// If at capacity, skip. This is conservative: losing a constraint only
|
||||
// loses pruning power, never introduces unsoundness.
|
||||
}
|
||||
|
||||
|
|
@ -1089,7 +1089,7 @@ impl PathEnv {
|
|||
if has_strict || op == RelOp::Lt {
|
||||
return true;
|
||||
}
|
||||
// All Le: a <= b <= ... <= a means all equal — satisfiable
|
||||
// All Le: a <= b <= ... <= a means all equal, satisfiable
|
||||
return false;
|
||||
}
|
||||
// Continue walking (take first outgoing edge)
|
||||
|
|
@ -1181,11 +1181,11 @@ impl PathEnv {
|
|||
while i < self.facts.len() && j < other.facts.len() {
|
||||
match self.facts[i].0.cmp(&other.facts[j].0) {
|
||||
std::cmp::Ordering::Less => {
|
||||
// Only in self — drop (absent on other side = Top)
|
||||
// Only in self, drop (absent on other side = Top)
|
||||
i += 1;
|
||||
}
|
||||
std::cmp::Ordering::Greater => {
|
||||
// Only in other — drop
|
||||
// Only in other, drop
|
||||
j += 1;
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@
|
|||
//! 1. **Structural:** `condition_negated` (AST-level boolean)
|
||||
//! 2. **Structural:** `condition_vars` (AST-extracted identifiers)
|
||||
//! 3. **Structural:** compound decomposition (already handled by
|
||||
//! `build_condition_chain` — each leaf is a separate Block/Branch)
|
||||
//! 4. **Structural:** `value_defs` — resolve var names to [`SsaValue`]s
|
||||
//! 5. **Structural:** `const_values` — augment with known constants
|
||||
//! 6. **Text fallback:** `condition_text` — parse comparison operator and
|
||||
//! `build_condition_chain`, each leaf is a separate Block/Branch)
|
||||
//! 4. **Structural:** `value_defs`, resolve var names to [`SsaValue`]s
|
||||
//! 5. **Structural:** `const_values`, augment with known constants
|
||||
//! 6. **Text fallback:** `condition_text`, parse comparison operator and
|
||||
//! literal operand. Necessary because individual comparisons are NOT
|
||||
//! decomposed into separate SSA operations (condition nodes → `Nop`).
|
||||
|
||||
|
|
@ -82,7 +82,7 @@ impl CompOp {
|
|||
/// Structured condition expression with SSA-resolved operands.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum ConditionExpr {
|
||||
/// `lhs op rhs` — e.g., `x > 5`, `x == y`.
|
||||
/// `lhs op rhs`, e.g., `x > 5`, `x == y`.
|
||||
Comparison {
|
||||
lhs: Operand,
|
||||
op: CompOp,
|
||||
|
|
@ -98,7 +98,7 @@ pub enum ConditionExpr {
|
|||
},
|
||||
/// Boolean truthiness test: `if (x)`.
|
||||
BoolTest { var: SsaValue },
|
||||
/// Could not parse or resolve — conservatively no refinement.
|
||||
/// Could not parse or resolve, conservatively no refinement.
|
||||
Unknown,
|
||||
}
|
||||
|
||||
|
|
@ -240,7 +240,7 @@ pub fn lower_condition_with_stacks(
|
|||
.map(|(name, val)| (name.as_str(), *val))
|
||||
.collect();
|
||||
|
||||
// No const_values at lowering time — empty lookup
|
||||
// No const_values at lowering time, empty lookup
|
||||
let const_lookup: HashMap<SsaValue, super::domain::ConstValue> = HashMap::new();
|
||||
|
||||
let lower = text.to_ascii_lowercase();
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
//! Constraint solver: apply conditions to [`PathEnv`] and check satisfiability.
|
||||
//!
|
||||
//! The solver operates on structured [`ConditionExpr`] values — never on raw
|
||||
//! The solver operates on structured [`ConditionExpr`] values, never on raw
|
||||
//! text. Negation is always structural (via [`ConditionExpr::negate`] /
|
||||
//! [`CompOp::negate`]), not via a generic "negate ValueFact" operation.
|
||||
|
||||
|
|
@ -13,7 +13,7 @@ use super::lower::{CompOp, ConditionExpr, Operand};
|
|||
/// for the branch where the condition has the given polarity.
|
||||
///
|
||||
/// `polarity = true`: condition holds (true branch).
|
||||
/// `polarity = false`: condition does NOT hold (false branch) — negate
|
||||
/// `polarity = false`: condition does NOT hold (false branch), negate
|
||||
/// the condition structurally, then apply.
|
||||
pub fn refine_env(env: &PathEnv, cond: &ConditionExpr, polarity: bool) -> PathEnv {
|
||||
if env.is_unsat() {
|
||||
|
|
@ -97,7 +97,7 @@ fn apply_condition(env: &mut PathEnv, cond: &ConditionExpr) {
|
|||
}
|
||||
|
||||
ConditionExpr::Unknown => {
|
||||
// No information — no refinement
|
||||
// No information, no refinement
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -232,7 +232,7 @@ pub fn class_name_to_type_kind(name: &str) -> Option<TypeKind> {
|
|||
"Boolean" => Some(TypeKind::Bool),
|
||||
"List" | "ArrayList" | "Collection" | "Set" | "HashSet" => Some(TypeKind::Array),
|
||||
"URL" | "URI" => Some(TypeKind::Url),
|
||||
// Framework HTTP clients — also listed in JAVA_HIERARCHY (type_facts.rs)
|
||||
// Framework HTTP clients, also listed in JAVA_HIERARCHY (type_facts.rs)
|
||||
// for subtype resolution. Both locations needed: this function is called
|
||||
// directly by the constraint solver, while the hierarchy provides
|
||||
// is_subtype_of() for instanceof checks.
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ fn valuefact_widen_stable_bound() {
|
|||
b.lo = Some(0);
|
||||
b.lo_strict = true;
|
||||
let w = a.widen(&b);
|
||||
assert_eq!(w.lo, Some(0)); // stable — preserved
|
||||
assert_eq!(w.lo, Some(0)); // stable, preserved
|
||||
assert!(w.lo_strict);
|
||||
}
|
||||
|
||||
|
|
@ -357,7 +357,7 @@ fn pathenv_max_refine_per_block() {
|
|||
let v = SsaValue(0);
|
||||
// Reset counter
|
||||
env.reset_refine_count();
|
||||
// Refine many times — should stop after MAX_REFINE_PER_BLOCK
|
||||
// Refine many times, should stop after MAX_REFINE_PER_BLOCK
|
||||
for _ in 0..(MAX_REFINE_PER_BLOCK + 50) {
|
||||
let mut f = ValueFact::top();
|
||||
f.null = Nullability::NonNull;
|
||||
|
|
|
|||
|
|
@ -1,69 +1,20 @@
|
|||
//! Convergence-loop telemetry: per-batch and per-file JSONL sidecar.
|
||||
//!
|
||||
//! Records how many iterations each fix-point loop (cross-file SCC;
|
||||
//! JS/TS in-file pass-2) actually used on real inputs, plus the
|
||||
//! per-iteration change-set size trajectory, so we can tune caps on
|
||||
//! evidence rather than by guess.
|
||||
//!
|
||||
//! # Why this module exists
|
||||
//!
|
||||
//! The SCC fix-point safety cap ([`crate::commands::scan::SCC_FIXPOINT_SAFETY_CAP`])
|
||||
//! and the JS/TS pass-2 cap ([`crate::taint::JS_TS_PASS2_SAFETY_CAP`])
|
||||
//! are both 64 iterations — chosen as "generous for every realistic
|
||||
//! input we've seen". Neither value is backed by telemetry from a
|
||||
//! production corpus (React, VSCode, Webpack, enterprise
|
||||
//! monorepos). Without that data we cannot:
|
||||
//!
|
||||
//! * tell how often the cap actually fires under real workloads,
|
||||
//! * distinguish tuneable-budget problems from non-monotonicity
|
||||
//! regressions (Phase-D classifier addresses this on cap-hit, but
|
||||
//! tells us nothing about the near-cap distribution),
|
||||
//! * decide whether further Phase-B worklist optimisation is needed.
|
||||
//!
|
||||
//! The telemetry emitted here is consumed by offline analysis tools
|
||||
//! (`tools/convergence_report.py`, not tracked here) that compute
|
||||
//! P50/P95/P99 iteration counts per corpus.
|
||||
//!
|
||||
//! # Lifecycle
|
||||
//!
|
||||
//! Telemetry is **opt-in** via `NYX_CONVERGENCE_TELEMETRY=1` — production
|
||||
//! scans are unaffected by default. When enabled:
|
||||
//!
|
||||
//! * [`is_enabled`] returns true.
|
||||
//! * The SCC loop and JS/TS pass-2 loop each call [`record`] when
|
||||
//! they terminate (early-convergence or cap-hit).
|
||||
//! * On scan shutdown, the collected records are written to a JSONL
|
||||
//! file alongside the SARIF output (or to the path specified by
|
||||
//! `NYX_CONVERGENCE_TELEMETRY_PATH`).
|
||||
//!
|
||||
//! Records never touch the critical path — [`record`] is a cheap
|
||||
//! push onto a `Mutex<Vec<_>>` and the write happens once at scan end.
|
||||
//!
|
||||
//! # Schema stability
|
||||
//!
|
||||
//! Records serialize as JSONL (one JSON object per line, newline
|
||||
//! separated). The `kind` tag is snake_case and stable; adding new
|
||||
//! fields is backwards-compatible because unknown fields are ignored
|
||||
//! by downstream tooling. Removing fields, or changing existing
|
||||
//! fields' types, is a **breaking change** — bump the schema version
|
||||
//! in [`SCHEMA_VERSION`] if you must.
|
||||
//! Opt-in via `NYX_CONVERGENCE_TELEMETRY=1`. Records iteration counts
|
||||
//! and change-set trajectories for the cross-file SCC and JS/TS
|
||||
//! pass-2 fix-point loops so caps can be tuned from evidence. Output
|
||||
//! goes to `NYX_CONVERGENCE_TELEMETRY_PATH` or a SARIF-adjacent file.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
|
||||
/// Stable schema version for the JSONL records emitted by this module.
|
||||
///
|
||||
/// Bump when the record shape changes in a way that breaks downstream
|
||||
/// consumers (field removed, type changed). Adding optional fields is
|
||||
/// backwards-compatible and does not require a bump.
|
||||
/// JSONL schema version. Bump on breaking shape changes; optional
|
||||
/// fields don't require a bump.
|
||||
pub const SCHEMA_VERSION: u32 = 1;
|
||||
|
||||
/// One convergence event: either a cross-file SCC batch or a JS/TS
|
||||
/// in-file pass-2 run. The `kind` discriminator selects between them.
|
||||
///
|
||||
/// Serialized as JSON with `kind` as a snake_case tag so downstream
|
||||
/// tooling can pattern-match without depending on Rust enum layout.
|
||||
/// One convergence event, either a cross-file SCC batch or a JS/TS
|
||||
/// in-file pass-2 run.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum ConvergenceEvent {
|
||||
|
|
@ -98,7 +49,7 @@ pub struct SccBatchRecord {
|
|||
/// True iff the batch reached the fixed point before the cap
|
||||
/// fired.
|
||||
pub converged: bool,
|
||||
/// Per-iteration change-set size — the same trajectory the
|
||||
/// Per-iteration change-set size, the same trajectory the
|
||||
/// [`crate::engine_notes::CapHitReason`] classifier consumes. Empty
|
||||
/// when the loop terminated on iteration 0 (pathological case).
|
||||
pub trajectory: SmallVec<[u32; 4]>,
|
||||
|
|
@ -130,20 +81,10 @@ pub struct InFilePass2Record {
|
|||
pub trajectory: SmallVec<[u32; 4]>,
|
||||
}
|
||||
|
||||
/// Global collector for convergence events recorded during a scan.
|
||||
///
|
||||
/// Stored behind a `OnceLock<Mutex<Vec<_>>>` so multiple rayon workers
|
||||
/// can record events concurrently without a startup cost when
|
||||
/// telemetry is disabled. The mutex contention is negligible because
|
||||
/// each scan produces O(batches + JS/TS files) events, not per-task
|
||||
/// events.
|
||||
static COLLECTOR: OnceLock<Mutex<Vec<ConvergenceEvent>>> = OnceLock::new();
|
||||
|
||||
/// Returns true when telemetry collection is active for this process.
|
||||
///
|
||||
/// Controlled by the `NYX_CONVERGENCE_TELEMETRY` env var: any value
|
||||
/// except `"0"`, `"false"`, or empty enables it. Cached on first
|
||||
/// read so the env lookup is paid once per process.
|
||||
/// True when `NYX_CONVERGENCE_TELEMETRY` is set to anything other than
|
||||
/// `"0"`, `"false"`, or empty. Cached.
|
||||
pub fn is_enabled() -> bool {
|
||||
static ENABLED: OnceLock<bool> = OnceLock::new();
|
||||
*ENABLED.get_or_init(|| match std::env::var("NYX_CONVERGENCE_TELEMETRY") {
|
||||
|
|
@ -152,11 +93,7 @@ pub fn is_enabled() -> bool {
|
|||
})
|
||||
}
|
||||
|
||||
/// Record a convergence event. No-op when telemetry is disabled.
|
||||
///
|
||||
/// Safe to call from parallel rayon contexts — the underlying mutex
|
||||
/// is reentrant-safe and the push is O(1). Events are retained in
|
||||
/// memory until [`drain`] is called at scan end.
|
||||
/// Record a convergence event. No-op when telemetry is disabled.
|
||||
pub fn record(event: ConvergenceEvent) {
|
||||
if !is_enabled() {
|
||||
return;
|
||||
|
|
@ -167,9 +104,7 @@ pub fn record(event: ConvergenceEvent) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Drain and return all recorded events. Leaves the collector empty
|
||||
/// so subsequent scans in the same process (e.g. integration tests)
|
||||
/// do not see stale events.
|
||||
/// Drain all recorded events.
|
||||
pub fn drain() -> Vec<ConvergenceEvent> {
|
||||
let Some(lock) = COLLECTOR.get() else {
|
||||
return Vec::new();
|
||||
|
|
@ -207,7 +142,7 @@ pub fn write_jsonl(path: &std::path::Path) -> std::io::Result<usize> {
|
|||
/// Canonical sidecar path: uses `NYX_CONVERGENCE_TELEMETRY_PATH` if
|
||||
/// set, otherwise derives from the current working directory.
|
||||
///
|
||||
/// The `_derive_from_root` hint is the scan root — when no explicit
|
||||
/// The `_derive_from_root` hint is the scan root, when no explicit
|
||||
/// path is configured we place the sidecar next to it as
|
||||
/// `nyx-convergence.jsonl` so the file lands alongside the SARIF
|
||||
/// output by default.
|
||||
|
|
@ -230,7 +165,7 @@ mod tests {
|
|||
static COLLECTOR_TEST_GUARD: Mutex<()> = Mutex::new(());
|
||||
|
||||
/// Clear the global collector so each test starts with a known
|
||||
/// state. Does **not** force `is_enabled()` true — the unit
|
||||
/// state. Does **not** force `is_enabled()` true, the unit
|
||||
/// tests below bypass `record()` (which is a no-op unless
|
||||
/// env-enabled) by pushing directly into the collector.
|
||||
fn reset_and_enable_telemetry() {
|
||||
|
|
|
|||
|
|
@ -202,16 +202,16 @@ pub mod index {
|
|||
///
|
||||
/// Bumped independently of `ENGINE_VERSION` whenever the serialized
|
||||
/// layout or identity of a cached artefact changes in an incompatible
|
||||
/// way — e.g. a `FuncKey` field semantic change that would cause old
|
||||
/// way, e.g. a `FuncKey` field semantic change that would cause old
|
||||
/// summaries to misbehave when rehydrated.
|
||||
///
|
||||
/// History:
|
||||
/// * `"1"` — initial.
|
||||
/// * `"2"` — 0.5.0: `FuncKey.disambig` changed from the function-node
|
||||
/// * `"1"`: initial.
|
||||
/// * `"2"`: 0.5.0, `FuncKey.disambig` changed from the function-node
|
||||
/// byte offset to a depth-first structural index. Pre-0.5.0 caches
|
||||
/// store byte-offset disambigs and would fail to match bodies built
|
||||
/// by the new engine, so they are silently rebuilt on open.
|
||||
/// * `"3"` — `ssa_function_bodies.body` changed from JSON TEXT to
|
||||
/// * `"3"`: `ssa_function_bodies.body` changed from JSON TEXT to
|
||||
/// bincode BLOB. Old JSON payloads cannot be deserialised by the
|
||||
/// new engine, so they are silently rebuilt on open.
|
||||
pub const SCHEMA_VERSION: &str = "3";
|
||||
|
|
@ -432,7 +432,7 @@ pub mod index {
|
|||
|
||||
match stored {
|
||||
Some(ref v) if v == current => {
|
||||
// Schema version matches — nothing to do.
|
||||
// Schema version matches, nothing to do.
|
||||
}
|
||||
_ => {
|
||||
let old = stored.as_deref().unwrap_or("<none>");
|
||||
|
|
@ -475,7 +475,7 @@ pub mod index {
|
|||
|
||||
match stored {
|
||||
Some(ref v) if v == current => {
|
||||
// Version matches — nothing to do.
|
||||
// Version matches, nothing to do.
|
||||
}
|
||||
_ => {
|
||||
let old = stored.as_deref().unwrap_or("<none>");
|
||||
|
|
@ -601,10 +601,10 @@ pub mod index {
|
|||
Ok(match row {
|
||||
Some((stored_hash, stored_mtime)) => {
|
||||
if stored_mtime != mtime {
|
||||
// mtime changed — must re-scan
|
||||
// mtime changed, must re-scan
|
||||
true
|
||||
} else {
|
||||
// mtime matches — compare hash only if cheap
|
||||
// mtime matches, compare hash only if cheap
|
||||
// (the caller already read the file and can use
|
||||
// should_scan_with_hash instead for full accuracy)
|
||||
let digest = Self::digest_file(path)?;
|
||||
|
|
@ -811,7 +811,7 @@ pub mod index {
|
|||
/// Atomically replace all SSA function summaries for a single file.
|
||||
///
|
||||
/// The input tuple is
|
||||
/// `(name, arity, lang, namespace, container, disambig, kind, summary)` —
|
||||
/// `(name, arity, lang, namespace, container, disambig, kind, summary)`,
|
||||
/// matching the fields required to reconstruct a full [`crate::symbol::FuncKey`]
|
||||
/// on load.
|
||||
pub fn replace_ssa_summaries_for_file(
|
||||
|
|
@ -1040,7 +1040,7 @@ pub mod index {
|
|||
/// Load symbol metadata (name, arity, lang, namespace, container, kind)
|
||||
/// for a single file.
|
||||
///
|
||||
/// Lighter than `load_all_ssa_summaries` — skips JSON deserialization of
|
||||
/// Lighter than `load_all_ssa_summaries`, skips JSON deserialization of
|
||||
/// the full summary body and filters by file_path in the query. `kind`
|
||||
/// is the [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`,
|
||||
/// `"closure"`, ...) so consumers can distinguish anonymous functions
|
||||
|
|
@ -1074,7 +1074,7 @@ pub mod index {
|
|||
///
|
||||
/// Persists cross-file callee bodies for interprocedural symex.
|
||||
/// Bodies are serialized as MessagePack (rmp-serde, named-field
|
||||
/// encoding) BLOBs — JSON proved too costly at indexing time on
|
||||
/// encoding) BLOBs, JSON proved too costly at indexing time on
|
||||
/// large SSA structures, and bincode's positional format trips
|
||||
/// over the `#[serde(skip_serializing_if = ...)]` attributes
|
||||
/// scattered through `OptimizeResult` and friends.
|
||||
|
|
@ -1260,7 +1260,7 @@ pub mod index {
|
|||
///
|
||||
/// Mirrors [`Self::replace_ssa_summaries_for_file`]. Each input tuple
|
||||
/// is `(name, arity, lang, namespace, container, disambig, kind, summary)`
|
||||
/// — the full identity needed to reconstruct the callee's
|
||||
/// i.e. the full identity needed to reconstruct the callee's
|
||||
/// [`crate::symbol::FuncKey`] on load.
|
||||
pub fn replace_auth_summaries_for_file(
|
||||
&mut self,
|
||||
|
|
@ -1326,7 +1326,7 @@ pub mod index {
|
|||
/// [`Self::replace_ssa_summaries_for_file`],
|
||||
/// [`Self::replace_ssa_bodies_for_file`] and
|
||||
/// [`Self::replace_auth_summaries_for_file`] in sequence, but
|
||||
/// issues a single fsync at commit instead of four — the
|
||||
/// issues a single fsync at commit instead of four, the
|
||||
/// dominant cost on large scans.
|
||||
///
|
||||
/// Behaviour parity with the four-call sequence:
|
||||
|
|
@ -1376,7 +1376,7 @@ pub mod index {
|
|||
let path_str = file_path.to_string_lossy();
|
||||
let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
|
||||
|
||||
// function_summaries — always replace.
|
||||
// function_summaries, always replace.
|
||||
tx.execute(
|
||||
"DELETE FROM function_summaries WHERE project = ?1 AND file_path = ?2",
|
||||
params![self.project, path_str],
|
||||
|
|
@ -1408,7 +1408,7 @@ pub mod index {
|
|||
}
|
||||
}
|
||||
|
||||
// ssa_function_summaries — only touched when non-empty.
|
||||
// ssa_function_summaries, only touched when non-empty.
|
||||
if !ssa_summaries.is_empty() {
|
||||
tx.execute(
|
||||
"DELETE FROM ssa_function_summaries
|
||||
|
|
@ -1444,7 +1444,7 @@ pub mod index {
|
|||
}
|
||||
}
|
||||
|
||||
// ssa_function_bodies — only touched when non-empty.
|
||||
// ssa_function_bodies, only touched when non-empty.
|
||||
if !ssa_bodies.is_empty() {
|
||||
tx.execute(
|
||||
"DELETE FROM ssa_function_bodies
|
||||
|
|
@ -1478,7 +1478,7 @@ pub mod index {
|
|||
}
|
||||
}
|
||||
|
||||
// auth_check_summaries — always replace, even when empty,
|
||||
// auth_check_summaries, always replace, even when empty,
|
||||
// so a helper that lost its ownership check no longer
|
||||
// leaks lifts into subsequent pass-2 runs.
|
||||
tx.execute(
|
||||
|
|
@ -2203,7 +2203,7 @@ pub mod index {
|
|||
Ok(rows)
|
||||
}
|
||||
|
||||
/// Record the first time a finding fingerprint was observed. Idempotent —
|
||||
/// Record the first time a finding fingerprint was observed. Idempotent:
|
||||
/// the earliest call wins via INSERT OR IGNORE. Used by the overview
|
||||
/// backlog-age computation; ts should be the originating scan's
|
||||
/// `started_at` (RFC-3339).
|
||||
|
|
@ -2246,7 +2246,7 @@ pub mod index {
|
|||
if fingerprints.is_empty() {
|
||||
return Ok(std::collections::HashMap::new());
|
||||
}
|
||||
// SQLite IN-clause cap is high but parameter count is bounded — chunk
|
||||
// SQLite IN-clause cap is high but parameter count is bounded, chunk
|
||||
// for safety with large fingerprint sets.
|
||||
let mut out = std::collections::HashMap::with_capacity(fingerprints.len());
|
||||
let conn = self.c();
|
||||
|
|
@ -2590,7 +2590,7 @@ fn ssa_summaries_round_trip() {
|
|||
/// asserts that `return_path_facts` survive serialise → SQLite persist →
|
||||
/// load → deserialise. Regression guard for the per-return-path PathFact
|
||||
/// decomposition that closes the rs-safe-014 / tar-rs / rs-safe-016 FP
|
||||
/// cluster — without this round-trip working, cross-file callers lose
|
||||
/// cluster, without this round-trip working, cross-file callers lose
|
||||
/// the per-arm narrowing and inline-only callees regain the joined-fact
|
||||
/// dilution.
|
||||
#[test]
|
||||
|
|
@ -2955,7 +2955,7 @@ fn ssa_bodies_replace_on_rescan() {
|
|||
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1);
|
||||
assert_eq!(idx.load_all_ssa_bodies().unwrap()[0].8.ssa.blocks.len(), 2);
|
||||
|
||||
// Store v2 with 5 blocks — should replace, not accumulate
|
||||
// Store v2 with 5 blocks, should replace, not accumulate
|
||||
let hash2 = index::Indexer::digest_bytes(b"v2");
|
||||
let bodies2 = vec![(
|
||||
"func".to_string(),
|
||||
|
|
@ -3053,7 +3053,7 @@ fn ssa_bodies_removed_on_file_delete() {
|
|||
idx.replace_ssa_bodies_for_file(&f, &hash, &bodies).unwrap();
|
||||
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 1);
|
||||
|
||||
// Delete file — should also remove bodies
|
||||
// Delete file, should also remove bodies
|
||||
idx.remove_file_and_related(&f).unwrap();
|
||||
assert_eq!(idx.load_all_ssa_bodies().unwrap().len(), 0);
|
||||
}
|
||||
|
|
@ -3215,7 +3215,7 @@ fn version_mismatch_triggers_reset() {
|
|||
1
|
||||
);
|
||||
|
||||
// Reopen — version mismatch should trigger full wipe
|
||||
// Reopen, version mismatch should trigger full wipe
|
||||
drop(pool);
|
||||
let pool2 = index::Indexer::init(&db).unwrap();
|
||||
|
||||
|
|
@ -3286,7 +3286,7 @@ fn multiple_opens_no_repeated_resets() {
|
|||
populate_project(&pool, "proj", td.path());
|
||||
drop(pool);
|
||||
|
||||
// Second open — should preserve data
|
||||
// Second open, should preserve data
|
||||
let pool2 = index::Indexer::init(&db).unwrap();
|
||||
assert_eq!(
|
||||
index::Indexer::count_rows(&pool2, "function_summaries", "proj").unwrap(),
|
||||
|
|
@ -3297,7 +3297,7 @@ fn multiple_opens_no_repeated_resets() {
|
|||
populate_project(&pool2, "proj2", td.path());
|
||||
drop(pool2);
|
||||
|
||||
// Third open — should still preserve both projects
|
||||
// Third open, should still preserve both projects
|
||||
let pool3 = index::Indexer::init(&db).unwrap();
|
||||
assert_eq!(
|
||||
index::Indexer::count_rows(&pool3, "function_summaries", "proj").unwrap(),
|
||||
|
|
@ -3376,7 +3376,7 @@ fn missing_ssa_namespace_column_triggers_recreate() {
|
|||
.unwrap();
|
||||
}
|
||||
|
||||
// Open via init — should detect missing namespace and recreate
|
||||
// Open via init, should detect missing namespace and recreate
|
||||
let pool = index::Indexer::init(&db).unwrap();
|
||||
|
||||
// Verify the table now has the namespace column by inserting with it
|
||||
|
|
@ -3405,12 +3405,12 @@ fn valid_schema_no_recreate() {
|
|||
let td = tempfile::tempdir().unwrap();
|
||||
let db = td.path().join("nyx.sqlite");
|
||||
|
||||
// First init — creates all tables
|
||||
// First init, creates all tables
|
||||
let pool = index::Indexer::init(&db).unwrap();
|
||||
populate_project(&pool, "proj", td.path());
|
||||
drop(pool);
|
||||
|
||||
// Second init — schema is valid, should NOT drop/recreate
|
||||
// Second init: schema is valid, should NOT drop/recreate
|
||||
let pool2 = index::Indexer::init(&db).unwrap();
|
||||
// Data survives because schema was already correct
|
||||
assert_eq!(
|
||||
|
|
@ -3735,7 +3735,7 @@ fn metadata_table_survives_clear() {
|
|||
assert_eq!(stored.as_deref(), Some(index::ENGINE_VERSION));
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5 / A3 audit: field_points_to round-trips through
|
||||
/// field_points_to round-trips through
|
||||
/// the SsaFuncSummary SQLite blob. Pin that the new field_points_to
|
||||
/// records preserve param_field_reads, param_field_writes, the
|
||||
/// receiver sentinel (`u32::MAX`), the container-element marker
|
||||
|
|
@ -3817,7 +3817,7 @@ fn ssa_summaries_round_trip_preserves_field_points_to() {
|
|||
}
|
||||
|
||||
/// Pre-Phase-5 blob compatibility: a summary serialised without
|
||||
/// `field_points_to` deserialises with the empty default — no
|
||||
/// `field_points_to` deserialises with the empty default, no
|
||||
/// migration needed because the field is `#[serde(default)]`.
|
||||
#[test]
|
||||
fn ssa_summaries_pre_phase5_blob_decodes_with_empty_field_points_to() {
|
||||
|
|
|
|||
|
|
@ -1,98 +1,43 @@
|
|||
//! Provenance notes attached to findings when the engine has hit an
|
||||
//! internal budget, widening, or lowering cap.
|
||||
//!
|
||||
//! The notes are surfaced through `Finding.engine_notes` (and
|
||||
//! `Evidence.engine_notes` once the finding reaches the `Diag` layer) so
|
||||
//! downstream consumers can tell "we found nothing" from "we stopped
|
||||
//! looking".
|
||||
//!
|
||||
//! Each note carries a [`LossDirection`] classification that describes
|
||||
//! *how* the engine deviated from a fully-converged analysis. The
|
||||
//! direction drives two downstream behaviours:
|
||||
//!
|
||||
//! * [`crate::evidence::compute_confidence`] caps confidence at
|
||||
//! `Medium` when any attached note has direction
|
||||
//! [`LossDirection::OverReport`] or [`LossDirection::Bail`] (the
|
||||
//! finding itself may be spurious).
|
||||
//! * [`crate::rank`] applies a direction-aware `completeness` penalty
|
||||
//! to the attack-surface score (see `rank.rs::completeness_penalty`).
|
||||
//!
|
||||
//! This replaces the earlier Phase-3 stance of "notes are purely
|
||||
//! additive and never influence score". A release audit flagged that
|
||||
//! users sorting thousands of findings by rank could not distinguish
|
||||
//! converged analysis from capped analysis, which produced false
|
||||
//! confidence in fragile findings. The direction-aware pipeline
|
||||
//! preserves the observability goal while fixing the credibility gap.
|
||||
//! Each note carries a [`LossDirection`] classification.
|
||||
//! [`crate::evidence::compute_confidence`] caps confidence at `Medium`
|
||||
//! for `OverReport`/`Bail` notes, and [`crate::rank`] applies a
|
||||
//! direction-aware completeness penalty.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
|
||||
/// Classification of *why* a fix-point loop hit its safety cap.
|
||||
///
|
||||
/// The cap-hit alone is not actionable — "we ran 64 iterations and did
|
||||
/// not detect convergence" can mean several very different things:
|
||||
///
|
||||
/// * the lattice is still shrinking but slowly (e.g. a 72-function chain
|
||||
/// SCC that legitimately needs >64 iterations),
|
||||
/// * the lattice stopped shrinking but the convergence predicate still
|
||||
/// detects change (the change set stabilised at a non-zero value —
|
||||
/// monotonicity is fine but something in the convergence predicate is
|
||||
/// spurious), or
|
||||
/// * the lattice is oscillating (two iterations alternating with the
|
||||
/// same change-set size; this is a *bug*, not a tuning issue).
|
||||
///
|
||||
/// Recording the reason makes cap-hit telemetry actionable: operators
|
||||
/// can tell when "raise the cap" would actually help vs. when they are
|
||||
/// looking at a summary-non-monotonicity regression.
|
||||
///
|
||||
/// Serialized as a nested snake_case tagged enum so SARIF/JSON consumers
|
||||
/// can pattern-match without depending on Rust layout.
|
||||
/// Why a fix-point loop hit its safety cap. Distinguishes "raise the
|
||||
/// cap" cases from non-monotonicity bugs in cap-hit telemetry.
|
||||
/// Serialized as a tagged snake_case enum for SARIF/JSON consumers.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum CapHitReason {
|
||||
/// The change-set size was still decreasing when the cap fired.
|
||||
/// `trajectory` is the last N iteration deltas (most recent last).
|
||||
/// Operators can safely raise the cap; the underlying analysis is
|
||||
/// healthy but the SCC is larger than the current budget.
|
||||
/// Change-set still decreasing when the cap fired. Safe to raise
|
||||
/// the cap; the SCC is just larger than budget.
|
||||
MonotoneShrinking { trajectory: SmallVec<[u32; 4]> },
|
||||
/// The change-set size stayed constant for the last ≥2 iterations
|
||||
/// without reaching zero. This is unusual: every iteration is
|
||||
/// updating the *same* keys, which suggests a summary that changes
|
||||
/// the same fields back and forth even though the cap bits are
|
||||
/// saturating. Raise the cap **and** investigate.
|
||||
/// Change-set held steady at a non-zero value for ≥2 iterations.
|
||||
/// Same keys updating back and forth, investigate.
|
||||
Plateau { delta: u32 },
|
||||
/// The change-set size oscillated with a detected period ≤ N/2.
|
||||
/// Genuinely bad — the analysis is not monotone, convergence will
|
||||
/// *never* be reached, and raising the cap will not help. File a
|
||||
/// bug with the fixture attached.
|
||||
/// Period-2 oscillation detected. Non-monotone; raising the cap
|
||||
/// will not help. File a bug.
|
||||
SuspectedOscillation {
|
||||
period: u8,
|
||||
trajectory: SmallVec<[u32; 4]>,
|
||||
},
|
||||
/// Default when the engine did not record a trajectory (e.g. the
|
||||
/// cap fired after only one iteration so there is nothing to
|
||||
/// classify). Preserves backwards compatibility for old notes
|
||||
/// deserialized from disk.
|
||||
/// No trajectory recorded (e.g. cap fired after a single iteration).
|
||||
#[default]
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl CapHitReason {
|
||||
/// Classify a trajectory of per-iteration change-set sizes.
|
||||
///
|
||||
/// `deltas` should carry the *changed-key counts* from the last N
|
||||
/// iterations (most recent last). Classification rules:
|
||||
///
|
||||
/// 1. Fewer than 2 samples → `Unknown` (nothing to diff against).
|
||||
/// 2. A period-2 pattern (a,b,a,b) with a ≠ b → `SuspectedOscillation`.
|
||||
/// 3. Last two samples equal and non-zero → `Plateau`.
|
||||
/// 4. Strictly decreasing tail → `MonotoneShrinking`.
|
||||
/// 5. Otherwise → `Unknown` (inconclusive; rare in practice).
|
||||
///
|
||||
/// The function is pure — no allocation beyond the returned
|
||||
/// [`SmallVec`] — so it is safe to call from within a hot loop when
|
||||
/// a cap actually fires. Callers should accumulate deltas in a
|
||||
/// fixed-size ring buffer to bound memory.
|
||||
/// Classify a trajectory of per-iteration change-set sizes
|
||||
/// (most recent last). Rules: <2 samples → `Unknown`; a,b,a,b with
|
||||
/// a≠b → `SuspectedOscillation`; last two equal non-zero →
|
||||
/// `Plateau`; strictly decreasing tail → `MonotoneShrinking`;
|
||||
/// otherwise `Unknown`.
|
||||
pub fn classify(deltas: &[u32]) -> CapHitReason {
|
||||
if deltas.len() < 2 {
|
||||
return CapHitReason::Unknown;
|
||||
|
|
@ -161,44 +106,26 @@ impl CapHitReason {
|
|||
}
|
||||
|
||||
/// Direction of precision loss encoded by an [`EngineNote`].
|
||||
///
|
||||
/// Every new [`EngineNote`] variant must declare a direction via
|
||||
/// [`EngineNote::direction`] — the match is exhaustive by design so the
|
||||
/// classification cannot silently default.
|
||||
///
|
||||
/// Ordering matters: variants are sorted by worsening impact on a
|
||||
/// specific finding's credibility. [`combine`](Self::combine) uses the
|
||||
/// `Ord` impl to merge directions when multiple notes are attached.
|
||||
/// Variants are ordered by worsening credibility impact;
|
||||
/// [`combine`](Self::combine) takes the max.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum LossDirection {
|
||||
/// The note is informational only. Analysis was fully converged;
|
||||
/// the note records a harmless event such as a cache reuse.
|
||||
/// Analysis converged; the note records a harmless event.
|
||||
Informational,
|
||||
/// The analysis may have *missed* additional findings (e.g. the
|
||||
/// worklist was capped before fully propagating taint). Findings
|
||||
/// that *were* reported are still sound — they correspond to real
|
||||
/// flows — but the result set is a lower bound.
|
||||
/// Analysis may have missed findings (worklist was capped). Reported
|
||||
/// findings remain sound, the result set is a lower bound.
|
||||
UnderReport,
|
||||
/// The analysis may have reported a *spurious* finding (e.g.
|
||||
/// predicate state was widened to top, so a validation guard that
|
||||
/// would have suppressed the finding was lost). The specific
|
||||
/// finding is more likely to be a false positive than one produced
|
||||
/// from converged state.
|
||||
/// Analysis may have reported a spurious finding (e.g. predicate
|
||||
/// state widened to top, dropping a guard). Likely FP.
|
||||
OverReport,
|
||||
/// Analysis of this finding's body aborted before producing a
|
||||
/// trustworthy result (e.g. SSA lowering bailed, parse timed out).
|
||||
/// The finding is weakly supported; a human reviewer should treat
|
||||
/// it as a starting point rather than a confirmed flow.
|
||||
/// Analysis aborted before producing a trustworthy result.
|
||||
/// Treat the finding as a starting point, not a confirmed flow.
|
||||
Bail,
|
||||
}
|
||||
|
||||
impl LossDirection {
|
||||
/// Merge two directions by taking the worse (later in `Ord`).
|
||||
///
|
||||
/// A body with both `UnderReport` and `OverReport` notes is treated
|
||||
/// as `OverReport` because over-reporting is the more credibility-
|
||||
/// damaging failure mode for a specific emitted finding.
|
||||
/// Merge by taking the worse (later in `Ord`).
|
||||
pub fn combine(self, other: LossDirection) -> LossDirection {
|
||||
self.max(other)
|
||||
}
|
||||
|
|
@ -215,111 +142,46 @@ impl LossDirection {
|
|||
}
|
||||
|
||||
/// A single provenance event recorded during analysis.
|
||||
///
|
||||
/// `kind` is serialized as a snake_case tag so tooling can pattern-match
|
||||
/// across JSON and SARIF output without depending on Rust enum layout.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum EngineNote {
|
||||
/// The taint worklist hit its iteration budget before converging.
|
||||
/// Direction: [`LossDirection::UnderReport`] — the fixpoint was
|
||||
/// aborted, so some flows may have been missed, but emitted flows
|
||||
/// are still backed by propagated taint.
|
||||
/// Taint worklist hit its iteration budget. UnderReport.
|
||||
WorklistCapped { iterations: u32 },
|
||||
/// Origin tracking was truncated when a value exceeded the configured
|
||||
/// per-value origin cap (`analysis.engine.max_origins`, default 32).
|
||||
/// Direction: [`LossDirection::UnderReport`] — each dropped origin
|
||||
/// corresponds to a real source flow whose independent finding will
|
||||
/// not be emitted. Other survivors still produce findings, so the
|
||||
/// counter is a strict lower bound on under-reporting. Raise
|
||||
/// `max_origins` if operators observe this note on realistic inputs.
|
||||
/// Truncation is deterministic: origins are sorted by source
|
||||
/// location and the largest-by-location are dropped first, so the
|
||||
/// survivor set is stable across runs and merge orderings.
|
||||
/// Per-value origin set truncated to `analysis.engine.max_origins`
|
||||
/// (default 32). UnderReport, dropped origins correspond to real
|
||||
/// source flows whose findings won't emit.
|
||||
OriginsTruncated { dropped: u32 },
|
||||
/// JS/TS pass-2 in-file global propagation hit its iteration cap.
|
||||
/// Direction: [`LossDirection::UnderReport`] — global state may
|
||||
/// not have reached fixpoint; cross-function flows could be missed.
|
||||
///
|
||||
/// `reason` classifies *why* the cap fired (monotone-but-slow,
|
||||
/// plateau, suspected oscillation) so operators can tell a
|
||||
/// tunable-budget problem from a monotonicity regression. Older
|
||||
/// serialized notes without this field default to
|
||||
/// [`CapHitReason::Unknown`].
|
||||
/// JS/TS pass-2 in-file global propagation hit its cap. UnderReport.
|
||||
InFileFixpointCapped {
|
||||
iterations: u32,
|
||||
#[serde(default)]
|
||||
reason: CapHitReason,
|
||||
},
|
||||
/// Cross-file SCC fixpoint hit `SCC_FIXPOINT_SAFETY_CAP`.
|
||||
/// Direction: [`LossDirection::UnderReport`] — the iterative
|
||||
/// cross-file join aborted; summaries for members of this SCC may
|
||||
/// be incomplete.
|
||||
///
|
||||
/// `reason` classifies *why* the cap fired (monotone-but-slow,
|
||||
/// plateau, suspected oscillation) so operators can tell a
|
||||
/// tunable-budget problem from a monotonicity regression. Older
|
||||
/// serialized notes without this field default to
|
||||
/// [`CapHitReason::Unknown`].
|
||||
/// Cross-file SCC fixpoint hit `SCC_FIXPOINT_SAFETY_CAP`. UnderReport.
|
||||
CrossFileFixpointCapped {
|
||||
iterations: u32,
|
||||
#[serde(default)]
|
||||
reason: CapHitReason,
|
||||
},
|
||||
/// SSA lowering produced an empty body (parse failure or
|
||||
/// unsupported shape). Direction: [`LossDirection::Bail`] — any
|
||||
/// finding attributed to this body is weakly supported because the
|
||||
/// IR itself is malformed.
|
||||
/// SSA lowering produced an empty body. Bail.
|
||||
SsaLoweringBailed { reason: String },
|
||||
/// Tree-sitter parse exceeded the configured timeout.
|
||||
/// Direction: [`LossDirection::Bail`] — parse aborted; findings
|
||||
/// surfaced from the partial tree should be treated as a human-
|
||||
/// review starting point.
|
||||
/// Tree-sitter parse exceeded the timeout. Bail.
|
||||
ParseTimeout { timeout_ms: u32 },
|
||||
/// Predicate state was widened to top to maintain monotonicity.
|
||||
/// Direction: [`LossDirection::OverReport`] — validation guards
|
||||
/// that would have suppressed the finding may have been lost, so
|
||||
/// the finding is more likely to be a false positive.
|
||||
/// Predicate state widened to top to keep the lattice monotone.
|
||||
/// OverReport, guards may have been lost.
|
||||
PredicateStateWidened,
|
||||
/// Path-environment constraints exceeded internal cap; widened to
|
||||
/// top. Direction: [`LossDirection::OverReport`] — same reasoning
|
||||
/// as [`Self::PredicateStateWidened`]: dropped path constraints can
|
||||
/// only turn infeasible paths into apparent-feasible ones.
|
||||
/// Path-environment constraints widened to top. OverReport.
|
||||
PathEnvCapped,
|
||||
/// Inline cache reused a cached body summary; origins were
|
||||
/// re-attributed. Direction: [`LossDirection::Informational`] —
|
||||
/// the cache hit does not affect precision, but surfacing the
|
||||
/// re-attribution helps explain why origin locations move between
|
||||
/// runs that share a body signature.
|
||||
/// Inline cache reused a cached body. Informational.
|
||||
InlineCacheReused,
|
||||
/// Points-to analysis dropped heap object members when an
|
||||
/// intra-procedural points-to set exceeded
|
||||
/// `analysis.engine.max_pointsto` (default 32).
|
||||
/// Direction: [`LossDirection::UnderReport`] — stores and loads
|
||||
/// that flow through the truncated set miss the dropped abstract
|
||||
/// heap objects, so any taint into those objects via this alias
|
||||
/// path will not reach downstream sinks. Other aliasing paths to
|
||||
/// the same objects still propagate normally, so the counter is a
|
||||
/// strict lower bound on under-reporting. Raise `max_pointsto`
|
||||
/// if operators observe this note on factory-heavy codebases.
|
||||
/// Points-to set truncated to `analysis.engine.max_pointsto`
|
||||
/// (default 32). UnderReport.
|
||||
PointsToTruncated { dropped: u32 },
|
||||
}
|
||||
|
||||
impl EngineNote {
|
||||
/// Classify this note by direction of precision loss.
|
||||
///
|
||||
/// The match is exhaustive: every `EngineNote` variant must declare
|
||||
/// a direction. When adding a new cap site, pick the direction
|
||||
/// that most honestly describes the impact on an emitted finding:
|
||||
///
|
||||
/// * `Informational` — analysis fully converged; note is a
|
||||
/// provenance breadcrumb (e.g. cache reuse).
|
||||
/// * `UnderReport` — analysis was cut short, but anything emitted
|
||||
/// is still backed by real propagation.
|
||||
/// * `OverReport` — precision was widened, so the emitted finding
|
||||
/// is *more* likely to be a false positive than the baseline.
|
||||
/// * `Bail` — analysis of this body aborted; the finding is weakly
|
||||
/// supported.
|
||||
/// Direction of precision loss for this note. New variants must
|
||||
/// declare one explicitly.
|
||||
pub fn direction(&self) -> LossDirection {
|
||||
match self {
|
||||
EngineNote::WorklistCapped { .. } => LossDirection::UnderReport,
|
||||
|
|
@ -335,23 +197,15 @@ impl EngineNote {
|
|||
}
|
||||
}
|
||||
|
||||
/// True if this note indicates the engine may have deviated from a
|
||||
/// fully-converged analysis (any non-informational direction).
|
||||
///
|
||||
/// This is a convenience over
|
||||
/// `self.direction() != LossDirection::Informational` and drives
|
||||
/// the `confidence_capped` SARIF property.
|
||||
/// True for any non-informational direction. Drives the
|
||||
/// `confidence_capped` SARIF property.
|
||||
pub fn lowers_confidence(&self) -> bool {
|
||||
self.direction() != LossDirection::Informational
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the worst direction across a slice of notes.
|
||||
///
|
||||
/// Returns `None` when `notes` is empty or contains only
|
||||
/// [`LossDirection::Informational`] notes. Returns `Some(dir)` with
|
||||
/// the most impactful direction otherwise — this is what downstream
|
||||
/// consumers (rank, confidence) use to decide how to degrade a finding.
|
||||
/// Worst non-informational direction across a slice of notes, or
|
||||
/// `None` if the slice is empty or only carries informational notes.
|
||||
pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> {
|
||||
let mut worst: Option<LossDirection> = None;
|
||||
for note in notes {
|
||||
|
|
@ -367,9 +221,7 @@ pub fn worst_direction(notes: &[EngineNote]) -> Option<LossDirection> {
|
|||
worst
|
||||
}
|
||||
|
||||
/// Deduplicating push: does not append if an identical note is already
|
||||
/// present. Used to keep per-finding note lists small when a cap site
|
||||
/// fires repeatedly inside the same body.
|
||||
/// Push-if-not-present.
|
||||
pub fn push_unique(notes: &mut smallvec::SmallVec<[EngineNote; 2]>, note: EngineNote) {
|
||||
if !notes.iter().any(|n| n == &note) {
|
||||
notes.push(note);
|
||||
|
|
|
|||
|
|
@ -289,7 +289,7 @@ pub struct StateEvidence {
|
|||
/// (validation guards may have been lost, so the finding is more
|
||||
/// likely to be a false positive); `Bail` means analysis of the body
|
||||
/// aborted before producing a trustworthy result. `UnderReport` notes
|
||||
/// (e.g. `WorklistCapped`) do *not* cap confidence — the reported flow
|
||||
/// (e.g. `WorklistCapped`) do *not* cap confidence, the reported flow
|
||||
/// is still real, just surrounded by an incomplete result set.
|
||||
pub fn compute_confidence(diag: &Diag) -> Confidence {
|
||||
// Degraded analysis caps confidence
|
||||
|
|
@ -343,7 +343,7 @@ fn apply_engine_notes_cap(diag: &Diag, base: Confidence) -> Confidence {
|
|||
| crate::engine_notes::LossDirection::Bail => base.min(Confidence::Medium),
|
||||
// UnderReport: result set is a lower bound, but the emitted
|
||||
// finding itself remains as credible as the analysis decided.
|
||||
// Do not cap — the rank completeness penalty is the right lever
|
||||
// Do not cap, the rank completeness penalty is the right lever
|
||||
// for that case (see rank.rs::completeness_penalty).
|
||||
crate::engine_notes::LossDirection::UnderReport => base,
|
||||
// Informational is filtered out upstream by `worst_direction`,
|
||||
|
|
@ -600,7 +600,7 @@ pub fn generate_explanation(diag: &Diag) -> Option<String> {
|
|||
|
||||
/// Extract a vulnerability category label from the Diag (used in explanation text).
|
||||
fn extract_category_from_id(id: &str) -> String {
|
||||
// Rule IDs like "taint-unsanitised-flow (source 3:1)" — category comes
|
||||
// Rule IDs like "taint-unsanitised-flow (source 3:1)", category comes
|
||||
// from the finding category field, but we approximate from the ID here.
|
||||
if id.contains("sql") || id.contains("SQL") {
|
||||
"SQL injection".to_string()
|
||||
|
|
@ -680,7 +680,7 @@ pub fn compute_confidence_limiters(diag: &Diag) -> Vec<String> {
|
|||
"Backwards demand-driven analysis exceeded its budget (verdict not reached)".into(),
|
||||
);
|
||||
}
|
||||
// Confirmation is *not* a limiter — it is a positive signal. The
|
||||
// Confirmation is *not* a limiter, it is a positive signal. The
|
||||
// taint-confidence scorer picks it up separately.
|
||||
let _ = NOTE_CONFIRMED;
|
||||
}
|
||||
|
|
@ -976,7 +976,7 @@ mod tests {
|
|||
#[test]
|
||||
fn confidence_capped_at_medium_by_over_report() {
|
||||
// OverReport (PredicateStateWidened) means validation predicates
|
||||
// were lost — the emitted finding is more likely to be spurious.
|
||||
// were lost, the emitted finding is more likely to be spurious.
|
||||
let d = with_notes(
|
||||
taint_high_confidence_diag(),
|
||||
vec![crate::engine_notes::EngineNote::PredicateStateWidened],
|
||||
|
|
@ -995,7 +995,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn confidence_cap_does_not_upgrade_low() {
|
||||
// `base.min(Medium)` is what caps — it must not *raise* a Low
|
||||
// `base.min(Medium)` is what caps, it must not *raise* a Low
|
||||
// baseline to Medium. Use a taint finding with weak evidence so
|
||||
// the points scorer gives us Low, then attach a Bail note.
|
||||
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::Low);
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ pub fn render_console(
|
|||
}
|
||||
|
||||
for (path, issues) in &grouped {
|
||||
// File path header — dim blue, never brighter than severity.
|
||||
// File path header, dim blue, never brighter than severity.
|
||||
out.push_str(&format!("{}\n", style(path).blue().dim().underlined()));
|
||||
for d in issues {
|
||||
out.push_str(&render_diag(d, width));
|
||||
|
|
@ -261,7 +261,7 @@ fn render_diag(d: &Diag, width: usize) -> String {
|
|||
// Engine provenance notes: show count + worst direction so a user
|
||||
// scanning the console can see "this finding is from capped analysis"
|
||||
// at a glance. Direction tags ("under-report", "over-report", "bail")
|
||||
// are stable strings from `LossDirection::tag()` — kept in sync with
|
||||
// are stable strings from `LossDirection::tag()`, kept in sync with
|
||||
// the SARIF `result.properties.engine_notes[].kind` serialization so
|
||||
// downstream tooling can cross-reference console and SARIF output.
|
||||
// Informational-only notes (e.g. InlineCacheReused) are not surfaced
|
||||
|
|
@ -453,7 +453,7 @@ fn state_remediation_hint(rule_id: &str) -> Option<&'static str> {
|
|||
/// Colored severity tag with icon. The tag is the visual anchor of each finding.
|
||||
///
|
||||
/// - HIGH: bold red
|
||||
/// - MEDIUM: bold 208 (orange) — distinct from yellow
|
||||
/// - MEDIUM: bold 208 (orange), distinct from yellow
|
||||
/// - LOW: dim 67 (muted blue-gray)
|
||||
fn severity_tag(sev: Severity) -> String {
|
||||
match sev {
|
||||
|
|
@ -503,7 +503,7 @@ fn collapse_chain_spacing(s: &str) -> String {
|
|||
// Collapse: emit `.` directly after `)`
|
||||
continue;
|
||||
} else {
|
||||
// Not a chain continuation — emit the whitespace we skipped
|
||||
// Not a chain continuation, emit the whitespace we skipped
|
||||
for c in &chars[ws_start..i] {
|
||||
out.push(*c);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ pub struct CallSiteKey {
|
|||
/// An explicit cross-language bridge edge.
|
||||
///
|
||||
/// Connects a call site in one language to a function definition in another.
|
||||
/// Without an `InteropEdge`, cross-language resolution is never attempted —
|
||||
/// Without an `InteropEdge`, cross-language resolution is never attempted;
|
||||
/// this prevents false positives from name collisions across languages.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct InteropEdge {
|
||||
|
|
|
|||
|
|
@ -115,8 +115,8 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
|
|||
/// Benchmark-driven output-parameter source positions for known C APIs.
|
||||
/// Maps callee name → argument positions that receive Source taint.
|
||||
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[
|
||||
("fgets", &[0]), // fgets(buf, size, stream) — buf receives input
|
||||
("gets", &[0]), // gets(buf) — buf receives input
|
||||
("fgets", &[0]), // fgets(buf, size, stream), buf receives input
|
||||
("gets", &[0]), // gets(buf), buf receives input
|
||||
("recv", &[1]), // recv(fd, buf, len, flags)
|
||||
("recvfrom", &[1]), // recvfrom(fd, buf, len, flags, ...)
|
||||
];
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
|
|||
// and extract them as separate bodies. Without these, a
|
||||
// `class_specifier` / `struct_specifier` falls through to the
|
||||
// generic `_ =>` arm in `build_sub`, which records a leaf `Seq`
|
||||
// node and never walks the body — so inline member-function
|
||||
// node and never walks the body, so inline member-function
|
||||
// definitions (and methods of nested classes) are silently dropped.
|
||||
"declaration_list" => Kind::Block,
|
||||
"field_declaration_list" => Kind::Block,
|
||||
|
|
@ -160,7 +160,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
|
|||
|
||||
/// Benchmark-driven output-parameter source positions for known C++ APIs.
|
||||
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[
|
||||
("getline", &[1]), // std::getline(stream, str) — str receives input
|
||||
("getline", &[1]), // std::getline(stream, str), str receives input
|
||||
("std::getline", &[1]),
|
||||
("fgets", &[0]),
|
||||
("gets", &[0]),
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SQL_QUERY),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// fmt.Printf/Sprintf write to stdout or build strings in memory — not
|
||||
// fmt.Printf/Sprintf write to stdout or build strings in memory, not
|
||||
// security sinks. fmt.Fprintf writes to an io.Writer (often http.ResponseWriter)
|
||||
// so it IS a security sink for XSS.
|
||||
LabelRule {
|
||||
|
|
@ -110,7 +110,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
// Idiomatic Go SSRF sinks (Owncast CVE-2023-3188) use the
|
||||
// `http.DefaultClient.Get(url)` form rather than the bare
|
||||
// `http.Get(url)` helper, so the suffix-matched callee text needs
|
||||
// an explicit entry here — bare `Get/Post/Do/Head` would
|
||||
// an explicit entry here, bare `Get/Post/Do/Head` would
|
||||
// over-match unrelated method names.
|
||||
"http.DefaultClient.Get",
|
||||
"http.DefaultClient.Post",
|
||||
|
|
|
|||
|
|
@ -53,13 +53,13 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sanitizer(Cap::URL_ENCODE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// OWASP ESAPI input validator — validates and canonicalizes input
|
||||
// OWASP ESAPI input validator, validates and canonicalizes input
|
||||
LabelRule {
|
||||
matchers: &["Validator.getValidInput"],
|
||||
label: DataLabel::Sanitizer(Cap::all()),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Type-check sanitizers — parsing to a primitive erases taint
|
||||
// Type-check sanitizers, parsing to a primitive erases taint
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
"Integer.parseInt",
|
||||
|
|
@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::CODE_EXEC),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// HTTP response sinks — println/print are broad (also match System.out)
|
||||
// HTTP response sinks, println/print are broad (also match System.out)
|
||||
// but necessary to catch response.getWriter().println() via suffix matching.
|
||||
LabelRule {
|
||||
matchers: &["println", "print"],
|
||||
|
|
@ -107,7 +107,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
case_sensitive: false,
|
||||
},
|
||||
// openConnection() is the standard java.net.URL API for initiating a connection.
|
||||
// It is the correct interception point — the URL is already set on the object.
|
||||
// It is the correct interception point, the URL is already set on the object.
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
"openConnection",
|
||||
|
|
@ -153,9 +153,9 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SQL_QUERY),
|
||||
case_sensitive: true,
|
||||
},
|
||||
// NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks —
|
||||
// NOTE: Java logging (logger.info, log.warn, etc.) removed as sinks;
|
||||
// logging format injection is not a real security vulnerability in Java.
|
||||
// String.format also removed — it builds strings in memory (not a sink);
|
||||
// String.format also removed, it builds strings in memory (not a sink);
|
||||
// the real sink is wherever the formatted string is used (SQL, HTTP, etc.).
|
||||
// ─── JNDI injection sinks ───
|
||||
LabelRule {
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
case_sensitive: false,
|
||||
},
|
||||
// `encodeURIComponent` percent-encodes every character outside the
|
||||
// ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'` — so
|
||||
// ASCII identifier alphabet, including `<`, `>`, `&`, `"`, `'`, so
|
||||
// the result is safe to embed in HTML text content and HTML
|
||||
// attribute values, not just URL components. Treating it as
|
||||
// covering both URL_ENCODE and HTML_ESCAPE caps avoids FPs when a
|
||||
|
|
@ -92,7 +92,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// he library — HTML entity encoding
|
||||
// he library, HTML entity encoding
|
||||
LabelRule {
|
||||
matchers: &["he.encode", "he.escape"],
|
||||
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
|
||||
|
|
@ -148,16 +148,16 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: true,
|
||||
},
|
||||
// ── Outbound HTTP clients — modeled as destination-aware gated sinks ──
|
||||
// ── Outbound HTTP clients, modeled as destination-aware gated sinks ──
|
||||
// Flat-Sink modeling of fetch/axios/got/undici/http.request was producing
|
||||
// a dominant FP class where any tainted body/payload arg appeared as SSRF
|
||||
// (e.g. `fetch("/api/telemetry", { body: navigator.userAgent })`). SSRF
|
||||
// semantics require attacker control over the *destination*, not the
|
||||
// payload. The gated entries in `GATED_SINKS` below narrow activation to
|
||||
// URL / host / path / origin arguments or object fields. Taint flowing
|
||||
// only to body / data / json / headers is no longer flagged as SSRF —
|
||||
// cross-boundary data-exfiltration detection is a separate future
|
||||
// capability (`Cap::DATA_EXFIL`, not yet introduced).
|
||||
// payload. The gated entries in `GATED_SINKS` below narrow SSRF
|
||||
// activation to URL / host / path / origin arguments or object fields.
|
||||
// Taint flowing only to body / data / json / headers is captured by a
|
||||
// *separate* gate class (`Cap::DATA_EXFIL`) so the two can coexist on
|
||||
// the same callee without one over-flagging the other.
|
||||
// Express response sinks
|
||||
LabelRule {
|
||||
matchers: &["res.send", "res.json"],
|
||||
|
|
@ -222,6 +222,21 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SSRF),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// ── Cross-boundary data exfiltration (DATA_EXFIL) ─────────────────────
|
||||
//
|
||||
// `XMLHttpRequest.prototype.send(body)`, when the receiver type is
|
||||
// tracked back to `new XMLHttpRequest()`, the SSA engine's type-qualified
|
||||
// resolver converts `xhr.send` to `HttpClient.send`; matching that form
|
||||
// fires DATA_EXFIL on tainted body flow. The explicit
|
||||
// `XMLHttpRequest.prototype.send.apply(...)` form is also covered. The
|
||||
// `fetch` body / headers / json case is covered by the gated entry in
|
||||
// `GATED_SINKS` (so SSRF on the URL and DATA_EXFIL on the payload can
|
||||
// coexist on a single call site).
|
||||
LabelRule {
|
||||
matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"],
|
||||
label: DataLabel::Sink(Cap::DATA_EXFIL),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// ─────────── SQL injection sinks ─────────────
|
||||
// Database drivers: mysql, mysql2, pg, better-sqlite3
|
||||
LabelRule {
|
||||
|
|
@ -314,7 +329,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
// only to body / data / json / headers / payload is silenced. See the
|
||||
// commentary at the top of RULES for the rationale.
|
||||
//
|
||||
// `fetch(input, init)` — arg 0 can be a URL string OR a Request/config
|
||||
// `fetch(input, init)`, arg 0 can be a URL string OR a Request/config
|
||||
// object with `url`. Per WHATWG Fetch, when `input` is a dictionary, the
|
||||
// URL field is canonically `url`. Init-object body/headers at arg 1 are
|
||||
// *not* destination-bearing.
|
||||
|
|
@ -332,7 +347,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["url"],
|
||||
},
|
||||
},
|
||||
// `axios(config)` / `axios.request(config)` — config object exposes
|
||||
// `axios(config)` / `axios.request(config)`, config object exposes
|
||||
// `url` and `baseURL`. Body-ish fields (`data`, `params`, `headers`)
|
||||
// are excluded.
|
||||
SinkGate {
|
||||
|
|
@ -363,7 +378,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["url", "baseURL"],
|
||||
},
|
||||
},
|
||||
// `axios.get(url[, config])` — arg 0 is URL; arg 1 is config.
|
||||
// `axios.get(url[, config])`, arg 0 is URL; arg 1 is config.
|
||||
SinkGate {
|
||||
callee_matcher: "axios.get",
|
||||
arg_index: 0,
|
||||
|
|
@ -378,7 +393,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &[],
|
||||
},
|
||||
},
|
||||
// `axios.post(url, data[, config])` — arg 0 is URL; `data` at arg 1 is
|
||||
// `axios.post(url, data[, config])`, arg 0 is URL; `data` at arg 1 is
|
||||
// the request body and must NOT activate SSRF.
|
||||
SinkGate {
|
||||
callee_matcher: "axios.post",
|
||||
|
|
@ -394,7 +409,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &[],
|
||||
},
|
||||
},
|
||||
// `axios.put / axios.patch / axios.delete` follow the same shape —
|
||||
// `axios.put / axios.patch / axios.delete` follow the same shape:
|
||||
// (url, data?, config?). Keep the model consistent across verbs.
|
||||
SinkGate {
|
||||
callee_matcher: "axios.put",
|
||||
|
|
@ -438,7 +453,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &[],
|
||||
},
|
||||
},
|
||||
// `got(url[, options])` / `got(options)` — options exposes `url` and
|
||||
// `got(url[, options])` / `got(options)`, options exposes `url` and
|
||||
// `prefixUrl`. Body-ish fields (`body`, `json`, `form`, `searchParams`,
|
||||
// `headers`) are excluded.
|
||||
SinkGate {
|
||||
|
|
@ -455,7 +470,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["url", "prefixUrl"],
|
||||
},
|
||||
},
|
||||
// `undici.request(url | opts[, opts])` — opts exposes `origin` and
|
||||
// `undici.request(url | opts[, opts])`, opts exposes `origin` and
|
||||
// `path`. Body-ish fields (`body`, `headers`) are excluded.
|
||||
SinkGate {
|
||||
callee_matcher: "undici.request",
|
||||
|
|
@ -471,11 +486,11 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["origin", "path"],
|
||||
},
|
||||
},
|
||||
// Node `http.request(options[, cb])` / `https.request(options[, cb])` —
|
||||
// Node `http.request(options[, cb])` / `https.request(options[, cb])`:
|
||||
// options exposes `host`, `hostname`, `path`, `protocol`, `port`,
|
||||
// `origin`. Body is sent via `.write()`/`.end()` on the returned
|
||||
// ClientRequest, so it never appears as a positional arg here.
|
||||
// Arg 0 may also be a URL string — the "whole arg is destination"
|
||||
// Arg 0 may also be a URL string; the "whole arg is destination"
|
||||
// fallback (triggered when arg 0 is not an object literal) covers that.
|
||||
SinkGate {
|
||||
callee_matcher: "http.request",
|
||||
|
|
@ -505,7 +520,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
|
||||
},
|
||||
},
|
||||
// Node `http.get(options[, cb])` / `https.get(options[, cb])` —
|
||||
// Node `http.get(options[, cb])` / `https.get(options[, cb])`:
|
||||
// convenience wrappers around `.request()` that auto-call `.end()`.
|
||||
// Same destination semantics as `.request`. Motivated by
|
||||
// CVE-2025-64430 (Parse Server SSRF via http.get(uri)).
|
||||
|
|
@ -537,6 +552,31 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
|
||||
},
|
||||
},
|
||||
// ── Cross-boundary data exfiltration ──────────────────────────────────
|
||||
//
|
||||
// Sensitive data flowing into the *payload* of an outbound request is a
|
||||
// distinct vulnerability class from SSRF: the destination is fixed but
|
||||
// attacker-influenced bytes leave the process via the request body /
|
||||
// headers / json field. These gates fire on the body-bearing positions
|
||||
// and emit `Cap::DATA_EXFIL`, which is intentionally separate from
|
||||
// `Cap::SSRF` so a `fetch(taintedUrl, {body: tainted})` site reports
|
||||
// both classes independently.
|
||||
//
|
||||
// `fetch(input, init)`, `init` at arg 1 carries body / headers / json.
|
||||
SinkGate {
|
||||
callee_matcher: "fetch",
|
||||
arg_index: 1,
|
||||
dangerous_values: &[],
|
||||
dangerous_prefixes: &[],
|
||||
label: DataLabel::Sink(Cap::DATA_EXFIL),
|
||||
case_sensitive: false,
|
||||
payload_args: &[1],
|
||||
keyword_name: None,
|
||||
dangerous_kwargs: &[],
|
||||
activation: GateActivation::Destination {
|
||||
object_destination_fields: &["body", "headers", "json"],
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
pub static KINDS: Map<&'static str, Kind> = phf_map! {
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ pub struct LabelRule {
|
|||
/// expands it to `(0..arity)` using the actual call arity.
|
||||
///
|
||||
/// The value `usize::MAX` is used because `args.get(usize::MAX)` is a guaranteed
|
||||
/// miss for any real argument list — an accidental direct-lookup would be a no-op
|
||||
/// miss for any real argument list, so an accidental direct-lookup would be a no-op
|
||||
/// rather than silently aliasing position 0.
|
||||
pub const ALL_ARGS_PAYLOAD: &[usize] = &[usize::MAX];
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ pub enum GateActivation {
|
|||
/// arg selects the MIME type).
|
||||
ValueMatch,
|
||||
/// Destination-bearing flow activation. The gate fires when taint reaches
|
||||
/// a declared destination location at the call site — no literal
|
||||
/// a declared destination location at the call site, no literal
|
||||
/// inspection, no prefix heuristic.
|
||||
///
|
||||
/// For callees whose destination is a positional argument (e.g. `fetch`'s
|
||||
|
|
@ -80,7 +80,7 @@ pub enum GateActivation {
|
|||
}
|
||||
|
||||
/// Argument-sensitive sink activation. Whether a call becomes a sink is
|
||||
/// determined by the gate's [`GateActivation`] mode — literal-value matching
|
||||
/// determined by the gate's [`GateActivation`] mode, literal-value matching
|
||||
/// for traditional role-selector APIs, or destination-flow activation for
|
||||
/// outbound HTTP clients and other APIs where a specific location in the
|
||||
/// call carries the attacker-controlled destination.
|
||||
|
|
@ -144,6 +144,13 @@ bitflags! {
|
|||
/// carrier cap for folding `auth_analysis` into the SSA/taint
|
||||
/// engine.
|
||||
const UNAUTHORIZED_ID = 0b0001_0000_0000_0000; // bit 12
|
||||
/// Cross-boundary data-exfiltration: tainted sensitive data flowing
|
||||
/// into outbound request bodies, headers, or other payload-bearing
|
||||
/// fields of network egress APIs. Distinct from `SSRF` (attacker
|
||||
/// control over the destination URL), `DATA_EXFIL` fires when the
|
||||
/// destination is fixed but attacker-influenced data leaves the
|
||||
/// process via the request payload.
|
||||
const DATA_EXFIL = 0b0010_0000_0000_0000; // bit 13
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -192,7 +199,7 @@ pub enum Kind {
|
|||
/// reachability does not depend on sibling-case execution order.
|
||||
Switch,
|
||||
Trivia,
|
||||
/// Simple sequential expression (e.g. cast/type-assertion) — treated like
|
||||
/// Simple sequential expression (e.g. cast/type-assertion), treated like
|
||||
/// any other sequential statement in the CFG but explicitly classified so
|
||||
/// code that inspects `Kind` can recognise it.
|
||||
Seq,
|
||||
|
|
@ -472,9 +479,9 @@ pub enum SourceKind {
|
|||
FileSystem,
|
||||
/// Database query results
|
||||
Database,
|
||||
/// Caught exception — may carry user-controlled data
|
||||
/// Caught exception, may carry user-controlled data
|
||||
CaughtException,
|
||||
/// Could not determine — treat conservatively
|
||||
/// Could not determine, treat conservatively
|
||||
Unknown,
|
||||
}
|
||||
|
||||
|
|
@ -511,7 +518,7 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
|
|||
|
||||
// File system patterns
|
||||
if cl.contains("read") || cl.contains("fopen") || cl.contains("open") {
|
||||
// Distinguish from db reads — file reads typically have FILE_IO cap
|
||||
// Distinguish from db reads, file reads typically have FILE_IO cap
|
||||
if caps.contains(Cap::FILE_IO) {
|
||||
return SourceKind::FileSystem;
|
||||
}
|
||||
|
|
@ -570,6 +577,7 @@ pub fn parse_cap(s: &str) -> Option<Cap> {
|
|||
"code_exec" => Some(Cap::CODE_EXEC),
|
||||
"crypto" => Some(Cap::CRYPTO),
|
||||
"unauthorized_id" => Some(Cap::UNAUTHORIZED_ID),
|
||||
"data_exfil" | "data_exfiltration" => Some(Cap::DATA_EXFIL),
|
||||
"all" => Some(Cap::all()),
|
||||
_ => None,
|
||||
}
|
||||
|
|
@ -621,7 +629,7 @@ pub fn build_lang_rules(
|
|||
Vec::new()
|
||||
};
|
||||
|
||||
// Phase C: fold `auth_analysis` into the taint engine by injecting
|
||||
// Fold `auth_analysis` into the taint engine by injecting
|
||||
// `Cap::UNAUTHORIZED_ID` sink/sanitizer rules. Gated by config; default
|
||||
// OFF so the standalone `auth_analysis` subsystem remains authoritative.
|
||||
if config.scanner.enable_auth_as_taint {
|
||||
|
|
@ -636,7 +644,7 @@ pub fn build_lang_rules(
|
|||
}
|
||||
}
|
||||
|
||||
/// Return Phase C auth-as-taint rules for a given language (currently Rust-only).
|
||||
/// Return the auth-as-taint rules for a given language (Rust-only).
|
||||
fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec<RuntimeLabelRule> {
|
||||
match lang_slug {
|
||||
"rust" | "rs" => rust::phase_c_auth_rules(),
|
||||
|
|
@ -718,7 +726,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool {
|
|||
if exact_only {
|
||||
// `=foo` matchers fire only when `text` IS `foo` (no `Mod.foo`,
|
||||
// `Class::foo`, or any preceding namespace). Lets a label rule
|
||||
// distinguish bare `Kernel#open` from `File.open` — the former
|
||||
// distinguish bare `Kernel#open` from `File.open`, the former
|
||||
// shells out on `|cmd`, the latter never does (CVE-2020-8130).
|
||||
start == 0
|
||||
} else {
|
||||
|
|
@ -731,7 +739,7 @@ fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool {
|
|||
|
||||
/// Strip an optional `=` "exact-match" sigil from the start of a matcher.
|
||||
/// Matchers prefixed with `=` (e.g. `"=open"`) only fire when the candidate
|
||||
/// text equals the matcher exactly — the boundary-`.`-or-`:` allowance is
|
||||
/// text equals the matcher exactly, the boundary-`.`-or-`:` allowance is
|
||||
/// suppressed. Used to distinguish bare-callee Ruby/Python builtins from
|
||||
/// methods of the same name on a typed receiver.
|
||||
#[inline]
|
||||
|
|
@ -767,7 +775,7 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
|
|||
let full_normalized = normalize_chained_call(text);
|
||||
let full_norm_bytes = full_normalized.as_bytes();
|
||||
|
||||
// ── Check runtime (config) rules first — they take priority ──────
|
||||
// ── Check runtime (config) rules first, they take priority ──────
|
||||
if let Some(extras) = extra {
|
||||
// Pass 1: exact / suffix
|
||||
for rule in extras {
|
||||
|
|
@ -865,7 +873,7 @@ pub fn classify_all(
|
|||
}
|
||||
}
|
||||
|
||||
// ── Check runtime (config) rules first — they take priority ──────
|
||||
// ── Check runtime (config) rules first, they take priority ──────
|
||||
if let Some(extras) = extra {
|
||||
// Pass 1: exact / suffix
|
||||
for rule in extras {
|
||||
|
|
@ -941,7 +949,7 @@ pub fn classify_all(
|
|||
/// (or [`ALL_ARGS_PAYLOAD`] for dynamic-activation conservative fallback).
|
||||
/// `object_destination_fields`, when non-empty, restricts sink-taint checks
|
||||
/// to identifiers found under those field names within an object-literal
|
||||
/// positional argument — used by destination-aware outbound-HTTP gates so
|
||||
/// positional argument, used by destination-aware outbound-HTTP gates so
|
||||
/// `fetch({url, body})` fires only when taint reaches `url`, not `body`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct GateMatch {
|
||||
|
|
@ -952,9 +960,13 @@ pub struct GateMatch {
|
|||
|
||||
/// Classify a call against gated sink rules.
|
||||
///
|
||||
/// Returns `Some(GateMatch)` if the callee matches a gated rule AND the
|
||||
/// activation conditions fire. Returns `None` if the callee doesn't match
|
||||
/// any gated rule, or matches but the activation is provably safe.
|
||||
/// Returns every gate whose callee matches AND whose activation conditions
|
||||
/// fire. An empty result means the callee did not match any gated rule, or
|
||||
/// every match was provably safe. Multiple matches are possible when the
|
||||
/// same callee carries gates for different sink classes, e.g. `fetch` is
|
||||
/// both an SSRF gate (URL flow) and a `DATA_EXFIL` gate (body / headers /
|
||||
/// json flow); each gate carries its own [`GateMatch`] so downstream code
|
||||
/// can attribute findings per-cap.
|
||||
///
|
||||
/// `const_arg_at` extracts positional argument values.
|
||||
/// `const_keyword_arg` extracts keyword argument values (for languages like Python).
|
||||
|
|
@ -964,11 +976,15 @@ pub fn classify_gated_sink(
|
|||
const_arg_at: impl Fn(usize) -> Option<String>,
|
||||
const_keyword_arg: impl Fn(&str) -> Option<String>,
|
||||
kwarg_present: impl Fn(&str) -> bool,
|
||||
) -> Option<GateMatch> {
|
||||
let gates = GATED_REGISTRY.get(lang).or_else(|| {
|
||||
) -> SmallVec<[GateMatch; 2]> {
|
||||
let mut out: SmallVec<[GateMatch; 2]> = SmallVec::new();
|
||||
let gates = match GATED_REGISTRY.get(lang).or_else(|| {
|
||||
let key = lang.to_ascii_lowercase();
|
||||
GATED_REGISTRY.get(key.as_str())
|
||||
})?;
|
||||
}) {
|
||||
Some(g) => g,
|
||||
None => return out,
|
||||
};
|
||||
|
||||
let callee_bytes = callee_text.as_bytes();
|
||||
|
||||
|
|
@ -985,11 +1001,12 @@ pub fn classify_gated_sink(
|
|||
object_destination_fields,
|
||||
} = gate.activation
|
||||
{
|
||||
return Some(GateMatch {
|
||||
out.push(GateMatch {
|
||||
label: gate.label,
|
||||
payload_args: gate.payload_args,
|
||||
object_destination_fields,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// ── ValueMatch activation (legacy) ───────────────────────────────
|
||||
|
|
@ -1012,7 +1029,7 @@ pub fn classify_gated_sink(
|
|||
any_dangerous = true;
|
||||
break;
|
||||
}
|
||||
// Present with a safe literal — continue checking other kwargs.
|
||||
// Present with a safe literal, continue checking other kwargs.
|
||||
}
|
||||
None => {
|
||||
any_dynamic_present = true;
|
||||
|
|
@ -1020,23 +1037,25 @@ pub fn classify_gated_sink(
|
|||
}
|
||||
}
|
||||
if any_dangerous {
|
||||
return Some(GateMatch {
|
||||
out.push(GateMatch {
|
||||
label: gate.label,
|
||||
payload_args: gate.payload_args,
|
||||
object_destination_fields: &[],
|
||||
});
|
||||
continue;
|
||||
}
|
||||
if any_dynamic_present {
|
||||
// Dynamic kwarg value — we can't prove safe. Conservatively
|
||||
// Dynamic kwarg value, we can't prove safe. Conservatively
|
||||
// flag every positional arg so the activation pathway isn't
|
||||
// silently narrowed to the gate's declared `payload_args`.
|
||||
return Some(GateMatch {
|
||||
out.push(GateMatch {
|
||||
label: gate.label,
|
||||
payload_args: ALL_ARGS_PAYLOAD,
|
||||
object_destination_fields: &[],
|
||||
});
|
||||
continue;
|
||||
}
|
||||
return None; // all listed kwargs absent or safe-literal → suppress
|
||||
continue; // all listed kwargs absent or safe-literal → suppress
|
||||
}
|
||||
|
||||
// Single-kwarg / positional gate path (original semantics).
|
||||
|
|
@ -1058,22 +1077,22 @@ pub fn classify_gated_sink(
|
|||
.iter()
|
||||
.any(|p| lower.starts_with(&p.to_ascii_lowercase()));
|
||||
if is_dangerous {
|
||||
return Some(GateMatch {
|
||||
out.push(GateMatch {
|
||||
label: gate.label,
|
||||
payload_args: gate.payload_args,
|
||||
object_destination_fields: &[],
|
||||
});
|
||||
}
|
||||
return None; // safe constant → suppress
|
||||
// safe constant → suppress (no push)
|
||||
}
|
||||
// Unknown / dynamic activation arg: the gate fires conservatively,
|
||||
// but we can't prove that only the declared `payload_args` carry
|
||||
// risk — a tainted activation arg (e.g. `setAttribute(userAttr, …)`
|
||||
// risk, a tainted activation arg (e.g. `setAttribute(userAttr, …)`
|
||||
// where `userAttr` is user-controlled) is itself a vulnerability
|
||||
// path. Return ALL_ARGS_PAYLOAD so downstream sink scanning
|
||||
// considers every positional argument.
|
||||
None => {
|
||||
return Some(GateMatch {
|
||||
out.push(GateMatch {
|
||||
label: gate.label,
|
||||
payload_args: ALL_ARGS_PAYLOAD,
|
||||
object_destination_fields: &[],
|
||||
|
|
@ -1081,7 +1100,7 @@ pub fn classify_gated_sink(
|
|||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
out
|
||||
}
|
||||
|
||||
/// Public wrapper for [`normalize_chained_call`] so callers outside the module
|
||||
|
|
@ -1090,25 +1109,11 @@ pub fn normalize_chained_call_for_classify(text: &str) -> String {
|
|||
normalize_chained_call(text)
|
||||
}
|
||||
|
||||
/// Return the bare method-name segment of a callee text.
|
||||
///
|
||||
/// Centralised replacement for the textual `callee.rsplit('.').next().unwrap_or(callee)`
|
||||
/// pattern that used to be scattered across the codebase.
|
||||
///
|
||||
/// Behaviour-preserving across the Phase 2 SSA chain decomposition rollout:
|
||||
/// - When SSA lowering rewrites a chained-receiver call (`c.mu.Lock()` →
|
||||
/// `Call("Lock", [v_mu])`), the call's `callee` is already the bare method
|
||||
/// name, so this helper is a no-op pass-through.
|
||||
/// - For 1-dot callees (`obj.method`) and for languages where Phase 2 lowering
|
||||
/// doesn't run yet (PHP/Ruby) the helper still extracts the trailing method
|
||||
/// from the textual form, exactly as the old per-callsite split did.
|
||||
/// - For bare callees (no dot), it returns the input unchanged.
|
||||
///
|
||||
/// Use this helper when you need the *terminal* method name from a callee
|
||||
/// string regardless of whether the call had a chained receiver. When you
|
||||
/// have an `SsaOp::Call` in hand, prefer reading `callee` directly and
|
||||
/// walking `receiver` through `FieldProj` ops — that's the precise path.
|
||||
/// This helper is the textual fallback for callsites that only see a `&str`.
|
||||
/// Return the bare method-name segment of a callee text. Returns the
|
||||
/// input unchanged for bare callees. When you have an `SsaOp::Call`,
|
||||
/// prefer reading `callee` directly and walking `receiver` through
|
||||
/// `FieldProj` ops; this helper is the textual fallback for callsites
|
||||
/// that only see a `&str`.
|
||||
pub fn bare_method_name(callee: &str) -> &str {
|
||||
callee.rsplit('.').next().unwrap_or(callee)
|
||||
}
|
||||
|
|
@ -1314,19 +1319,15 @@ mod tests {
|
|||
fn bare_method_name_strips_chain() {
|
||||
// No-dot input → returned as-is.
|
||||
assert_eq!(bare_method_name("foo"), "foo");
|
||||
// 1-dot → trailing segment (Phase 2 leaves these alone in SSA).
|
||||
// 1-dot → trailing segment.
|
||||
assert_eq!(bare_method_name("obj.method"), "method");
|
||||
// Multi-dot → trailing segment (matches AST-only callees from
|
||||
// PHP/Ruby and any pre-Phase-2 textual paths kept around in
|
||||
// `callee_text` for display).
|
||||
// Multi-dot → trailing segment.
|
||||
assert_eq!(bare_method_name("a.b.c.method"), "method");
|
||||
// Trailing dot → empty trailing segment, matching the legacy
|
||||
// `rsplit('.').next()` behaviour bit-for-bit.
|
||||
// Trailing dot → empty trailing segment.
|
||||
assert_eq!(bare_method_name("foo."), "");
|
||||
// Empty input.
|
||||
assert_eq!(bare_method_name(""), "");
|
||||
// Phase 2 invariant: when SSA decomposed a chain, `callee` is
|
||||
// the bare method already and the helper is a no-op.
|
||||
// SSA-decomposed chains pass through untouched.
|
||||
assert_eq!(bare_method_name("Lock"), "Lock");
|
||||
}
|
||||
|
||||
|
|
@ -1399,7 +1400,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_bare_href_is_none() {
|
||||
// Bare "href" should NOT be a sink — only "location.href" and variants
|
||||
// Bare "href" should NOT be a sink, only "location.href" and variants
|
||||
let result = classify("javascript", "href", None);
|
||||
assert_eq!(result, None);
|
||||
}
|
||||
|
|
@ -1497,7 +1498,7 @@ mod tests {
|
|||
#[test]
|
||||
fn classify_go_user_client_get_is_not_ssrf_sink() {
|
||||
// `client.Get` on a user-named *http.Client variable should NOT
|
||||
// match — the Go SSRF set is restricted to the stdlib package
|
||||
// match, the Go SSRF set is restricted to the stdlib package
|
||||
// helper `http.DefaultClient`. Type-aware resolution would be the
|
||||
// path to a broader rule, not a bare-name match.
|
||||
let result = classify("go", "client.Get", None);
|
||||
|
|
@ -1530,7 +1531,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_ruby_io_open_is_not_shell_escape_sink() {
|
||||
// `IO.open` takes a file descriptor — never pipes. The bare-
|
||||
// `IO.open` takes a file descriptor, never pipes. The bare-
|
||||
// open CMDI rule must leave it alone.
|
||||
let result = classify("ruby", "IO.open", None);
|
||||
assert_ne!(result, Some(DataLabel::Sink(Cap::SHELL_ESCAPE)));
|
||||
|
|
@ -1572,7 +1573,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_cpp_sto_family_is_sanitizer() {
|
||||
// Phase 1: full `std::sto*` family (including 64-bit and `long
|
||||
// The full `std::sto*` family (including 64-bit and `long
|
||||
// double` variants) clears every taint cap that flows through it,
|
||||
// matching the existing `std::stoi`/`std::stol` rule.
|
||||
for callee in [
|
||||
|
|
@ -1621,6 +1622,16 @@ mod tests {
|
|||
false
|
||||
}
|
||||
|
||||
/// Find the first matching gate whose label sink-caps overlap `caps`.
|
||||
/// Lets tests target a specific gate when a callee carries multiple
|
||||
/// (e.g. `fetch` is both an SSRF and a `DATA_EXFIL` gate).
|
||||
fn find_match_with_caps(matches: &[GateMatch], caps: Cap) -> Option<GateMatch> {
|
||||
matches
|
||||
.iter()
|
||||
.find(|m| matches!(m.label, DataLabel::Sink(c) if c.intersects(caps)))
|
||||
.copied()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gated_sink_dangerous_exact() {
|
||||
let result = classify_gated_sink(
|
||||
|
|
@ -1631,12 +1642,12 @@ mod tests {
|
|||
no_kw_present,
|
||||
);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
payload_args: [1usize].as_slice(),
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1650,12 +1661,12 @@ mod tests {
|
|||
no_kw_present,
|
||||
);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
payload_args: [1usize].as_slice(),
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1668,24 +1679,24 @@ mod tests {
|
|||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gated_sink_dynamic_conservative() {
|
||||
// Dynamic activation (e.g. `setAttribute(attrVar, val)`) returns the
|
||||
// ALL_ARGS_PAYLOAD sentinel so callers expand payload tracking to
|
||||
// every positional arg — the activation arg itself is a vulnerability
|
||||
// every positional arg, the activation arg itself is a vulnerability
|
||||
// path when attacker-controlled.
|
||||
let result =
|
||||
classify_gated_sink("javascript", "setAttribute", |_| None, no_kw, no_kw_present);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
payload_args: ALL_ARGS_PAYLOAD,
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1698,7 +1709,7 @@ mod tests {
|
|||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1711,7 +1722,7 @@ mod tests {
|
|||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result.unwrap().payload_args, &[1]);
|
||||
assert_eq!(result[0].payload_args, &[1]);
|
||||
|
||||
// parseFromString: payload is arg 0
|
||||
let result = classify_gated_sink(
|
||||
|
|
@ -1727,7 +1738,7 @@ mod tests {
|
|||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result.unwrap().payload_args, &[0]);
|
||||
assert_eq!(result[0].payload_args, &[0]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1745,7 +1756,7 @@ mod tests {
|
|||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1764,12 +1775,12 @@ mod tests {
|
|||
|kw| kw == "shell",
|
||||
);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
payload_args: [0usize].as_slice(),
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1788,7 +1799,7 @@ mod tests {
|
|||
},
|
||||
|kw| kw == "shell",
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1797,12 +1808,12 @@ mod tests {
|
|||
// literal available → unknown activation → ALL_ARGS_PAYLOAD sentinel.
|
||||
let result = classify_gated_sink("python", "Popen", |_| None, |_| None, no_kw_present);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
payload_args: ALL_ARGS_PAYLOAD,
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1825,12 +1836,12 @@ mod tests {
|
|||
|kw| kw == "shell",
|
||||
);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
payload_args: [0usize].as_slice(),
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1850,7 +1861,7 @@ mod tests {
|
|||
},
|
||||
|kw| kw == "shell",
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
/// `subprocess.run(cmd)` → no shell kwarg → presence-aware gate suppresses.
|
||||
|
|
@ -1864,7 +1875,7 @@ mod tests {
|
|||
|_| None,
|
||||
no_kw_present,
|
||||
);
|
||||
assert_eq!(result, None);
|
||||
assert!(result.is_empty());
|
||||
}
|
||||
|
||||
/// `subprocess.run(cmd, shell=flag)` → shell kwarg present but dynamic →
|
||||
|
|
@ -1880,12 +1891,12 @@ mod tests {
|
|||
|kw| kw == "shell",
|
||||
);
|
||||
assert_eq!(
|
||||
result,
|
||||
Some(GateMatch {
|
||||
result.as_slice(),
|
||||
&[GateMatch {
|
||||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
payload_args: ALL_ARGS_PAYLOAD,
|
||||
object_destination_fields: &[],
|
||||
})
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1893,18 +1904,18 @@ mod tests {
|
|||
/// verbatim for the caller to apply object-literal field filtering.
|
||||
#[test]
|
||||
fn gated_sink_destination_positional_always_fires() {
|
||||
// `fetch(url)` — arg 0 is the URL (positional destination) OR an
|
||||
// `fetch(url)`, arg 0 is the URL (positional destination) OR an
|
||||
// object with a `url` field. The gate fires unconditionally, with
|
||||
// `url` declared as the object-literal destination-field for the
|
||||
// `fetch({url, body})` shape.
|
||||
let result = classify_gated_sink(
|
||||
"javascript",
|
||||
"fetch",
|
||||
|_| None, // no literal — Destination mode doesn't inspect it
|
||||
|_| None, // no literal, Destination mode doesn't inspect it
|
||||
no_kw,
|
||||
no_kw_present,
|
||||
);
|
||||
let m = result.expect("fetch gate should fire");
|
||||
let m = find_match_with_caps(&result, Cap::SSRF).expect("fetch SSRF gate should fire");
|
||||
assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
|
||||
assert_eq!(m.payload_args, &[0]);
|
||||
assert_eq!(m.object_destination_fields, &["url"]);
|
||||
|
|
@ -1914,10 +1925,13 @@ mod tests {
|
|||
/// the CFG caller to drive object-literal field filtering.
|
||||
#[test]
|
||||
fn gated_sink_destination_object_fields_surfaced() {
|
||||
// `http.request(opts, cb)` — opts is an object with destination fields.
|
||||
// `http.request(opts, cb)`, opts is an object with destination fields.
|
||||
let result =
|
||||
classify_gated_sink("javascript", "http.request", |_| None, no_kw, no_kw_present);
|
||||
let m = result.expect("http.request gate should fire");
|
||||
let m = result
|
||||
.first()
|
||||
.copied()
|
||||
.expect("http.request gate should fire");
|
||||
assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
|
||||
assert_eq!(m.payload_args, &[0]);
|
||||
assert!(
|
||||
|
|
@ -1929,6 +1943,27 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// `fetch` carries both SSRF (URL flow) and `DATA_EXFIL` (body / headers /
|
||||
/// json flow) gates. Both must fire from a single classify call so the
|
||||
/// downstream CFG can build per-cap filters.
|
||||
#[test]
|
||||
fn gated_sink_fetch_emits_ssrf_and_data_exfil() {
|
||||
let result = classify_gated_sink("javascript", "fetch", |_| None, no_kw, no_kw_present);
|
||||
let ssrf = find_match_with_caps(&result, Cap::SSRF).expect("SSRF gate fires");
|
||||
assert_eq!(ssrf.label, DataLabel::Sink(Cap::SSRF));
|
||||
assert_eq!(ssrf.payload_args, &[0]);
|
||||
assert_eq!(ssrf.object_destination_fields, &["url"]);
|
||||
|
||||
let exfil = find_match_with_caps(&result, Cap::DATA_EXFIL).expect("DATA_EXFIL gate fires");
|
||||
assert_eq!(exfil.label, DataLabel::Sink(Cap::DATA_EXFIL));
|
||||
assert_eq!(exfil.payload_args, &[1]);
|
||||
assert!(
|
||||
exfil.object_destination_fields.contains(&"body"),
|
||||
"expected body in DATA_EXFIL destination fields, got {:?}",
|
||||
exfil.object_destination_fields,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_all_single_label() {
|
||||
let result = classify_all("javascript", "innerHTML", None);
|
||||
|
|
|
|||
|
|
@ -106,6 +106,19 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sanitizer(Cap::URL_ENCODE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// SQLAlchemy bound-parameter sanitizer. Values passed as keyword
|
||||
// arguments to `text("…:name…").bindparams(name=value)` are bound
|
||||
// by the driver, so injection cannot break out of the literal
|
||||
// context. The accompanying SQL-string check (py.sqli.text_format)
|
||||
// already flags the `text(f"…")` shape at construction, so this
|
||||
// sanitizer only clears flow when the SQL is a literal and the
|
||||
// values reach the engine via bindparams. Recognises both the
|
||||
// method form (`text(…).bindparams(...)`) and the bare call form.
|
||||
LabelRule {
|
||||
matchers: &["bindparams", ".bindparams"],
|
||||
label: DataLabel::Sanitizer(Cap::SQL_QUERY),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Path canonicalization
|
||||
LabelRule {
|
||||
matchers: &["os.path.abspath", "os.path.normpath"],
|
||||
|
|
@ -119,7 +132,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::CODE_EXEC),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Jinja2 / string.Template — tainted template string enables SSTI
|
||||
// Jinja2 / string.Template, tainted template string enables SSTI
|
||||
LabelRule {
|
||||
matchers: &["Template"],
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
|
|
@ -141,7 +154,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Flask Markup — bypasses auto-escaping
|
||||
// Flask Markup, bypasses auto-escaping
|
||||
LabelRule {
|
||||
matchers: &["Markup"],
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
|
|
@ -216,7 +229,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SSRF),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// aiohttp HTTP client — SSRF sinks
|
||||
// aiohttp HTTP client, SSRF sinks
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
"aiohttp.get",
|
||||
|
|
@ -228,6 +241,30 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SSRF),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Type-qualified SSRF sinks: when the receiver is tracked as
|
||||
// TypeKind::HttpClient (e.g. `client = requests.Session()`,
|
||||
// `client = httpx.Client()`, or `s = aiohttp.ClientSession()`),
|
||||
// resolve_type_qualified_labels() constructs `"HttpClient.<method>"`
|
||||
// call texts so the receiver-name is no longer load-bearing. Matches
|
||||
// the existing Rust HttpClient.<method> sink set so both languages
|
||||
// stay in step on the type-aware SSRF model. Motivated by the
|
||||
// upstream LMDeploy CVE-2026-33626 shape:
|
||||
// client = requests.Session()
|
||||
// response = client.get(url, ...)
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
"HttpClient.get",
|
||||
"HttpClient.post",
|
||||
"HttpClient.put",
|
||||
"HttpClient.delete",
|
||||
"HttpClient.patch",
|
||||
"HttpClient.head",
|
||||
"HttpClient.request",
|
||||
"HttpClient.send",
|
||||
],
|
||||
label: DataLabel::Sink(Cap::SSRF),
|
||||
case_sensitive: false,
|
||||
},
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
"pickle.loads",
|
||||
|
|
@ -256,7 +293,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
dangerous_kwargs: &[],
|
||||
activation: GateActivation::ValueMatch,
|
||||
},
|
||||
// subprocess.run(cmd, shell=True) — multi-kwarg gate using the new
|
||||
// subprocess.run(cmd, shell=True), multi-kwarg gate using the new
|
||||
// presence-aware mechanism. Payload is arg 1 (after receiver offset
|
||||
// applied by the CFG layer when the call is modelled method-style).
|
||||
SinkGate {
|
||||
|
|
@ -361,7 +398,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
|
|||
let mut rules = Vec::new();
|
||||
|
||||
if ctx.has(DetectedFramework::Django) {
|
||||
// QuerySet.extra() — raw SQL injection risk.
|
||||
// QuerySet.extra(), raw SQL injection risk.
|
||||
// Framework-conditional because `extra` is too generic as a static matcher.
|
||||
rules.push(RuntimeLabelRule {
|
||||
matchers: vec!["extra".into()],
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Source(Cap::all()),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Rails request object — user-controlled HTTP request data.
|
||||
// Rails request object, user-controlled HTTP request data.
|
||||
// Dotted matchers work via push_node receiver.method text construction
|
||||
// (confirmed by existing Net::HTTP.get matcher in ssrf_net_http fixture).
|
||||
LabelRule {
|
||||
|
|
@ -75,7 +75,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
},
|
||||
// Bare `Kernel#open(path)` interprets a path beginning with `|` as a
|
||||
// shell command (`open("|cmd")` runs `cmd`). `=open` exact-matcher
|
||||
// syntax limits this rule to the bare call — `File.open`, `IO.open`,
|
||||
// syntax limits this rule to the bare call; `File.open`, `IO.open`,
|
||||
// `URI.open` etc. each have their own non-pipe semantics and are
|
||||
// covered by their own labels (or intentionally not labeled as CMDI).
|
||||
// CVE-2020-8130 (rake `Rake::FileList#egrep`) was the canonical
|
||||
|
|
@ -99,7 +99,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
// File I/O sinks: user-controlled paths flowing into File.open/File.new
|
||||
// are a path-traversal / arbitrary-read vector. File.open also participates
|
||||
// in the resource-lifecycle acquire/release pair (cfg_analysis::RUBY_RESOURCES),
|
||||
// so this entry is additive — it does not disturb resource-leak detection.
|
||||
// so this entry is additive; it does not disturb resource-leak detection.
|
||||
LabelRule {
|
||||
matchers: &["File.open", "File.new", "File.read", "IO.read"],
|
||||
label: DataLabel::Sink(Cap::FILE_IO),
|
||||
|
|
@ -115,7 +115,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// URI.open is the network-capable Kernel#open wrapper — more specific than
|
||||
// URI.open is the network-capable Kernel#open wrapper, more specific than
|
||||
// plain `open` (excluded to avoid file I/O false positives).
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
|
|
@ -140,7 +140,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::DESERIALIZE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// Reflection / dynamic class resolution — arbitrary class instantiation from
|
||||
// Reflection / dynamic class resolution, arbitrary class instantiation from
|
||||
// user-controlled names enables gadget chains (similar risk profile to
|
||||
// deserialization). Rails adds `constantize`/`safe_constantize` to String.
|
||||
LabelRule {
|
||||
|
|
@ -157,7 +157,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
// SQL injection: ActiveRecord query methods that accept raw SQL strings.
|
||||
// `where` and `order` are the most common Rails SQLi vectors when called
|
||||
// with string interpolation (e.g., User.where("name = '#{params[:name]}'")).
|
||||
// Broad matchers — verified against fixture fallout.
|
||||
// Broad matchers, verified against fixture fallout.
|
||||
LabelRule {
|
||||
matchers: &["where", "order", "group", "having", "joins", "pluck"],
|
||||
label: DataLabel::Sink(Cap::SQL_QUERY),
|
||||
|
|
@ -240,7 +240,7 @@ pub static PARAM_CONFIG: ParamConfig = ParamConfig {
|
|||
|
||||
/// ActiveRecord query methods that the static [`RULES`] table classifies as
|
||||
/// `Sink(Cap::SQL_QUERY)`. These are SQL injection vectors only when arg 0
|
||||
/// is a string with interpolation (`#{x}`) or a non-literal identifier — the
|
||||
/// is a string with interpolation (`#{x}`) or a non-literal identifier, the
|
||||
/// hash form (`where(id: x)`) and the parameterised form (`where("a = ?", x)`)
|
||||
/// are intrinsically safe because Rails escapes the values.
|
||||
const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "joins", "pluck"];
|
||||
|
|
@ -249,7 +249,7 @@ const AR_QUERY_METHOD_NAMES: &[&str] = &["where", "order", "group", "having", "j
|
|||
/// shape-safe. Hash literals (`pair`, `hash`), symbol literals
|
||||
/// (`simple_symbol`, `hash_key_symbol`), array literals (`array`), and pure
|
||||
/// string literals without `#{...}` interpolation are all safe. Strings WITH
|
||||
/// interpolation and identifiers / method calls are *not* in this list —
|
||||
/// interpolation and identifiers / method calls are *not* in this list;
|
||||
/// callers must check `has_interpolation` and the kind separately.
|
||||
const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[
|
||||
"pair",
|
||||
|
|
@ -270,15 +270,15 @@ const AR_QUERY_SAFE_ARG0_KINDS: &[&str] = &[
|
|||
/// `cfg-unguarded-sink` (sanitiser dominates the sink reflexively).
|
||||
///
|
||||
/// Real-world FP shapes this closes (redmine, mastodon, diaspora):
|
||||
/// * `Issue.where(:id => params[:id])` — hash form
|
||||
/// * `Model.where(id: x, name: y)` — keyword-shorthand pairs
|
||||
/// * `Project.order(:created_at)` — symbol literal
|
||||
/// * `Issue.pluck(:id, :name)` — symbol literals
|
||||
/// * `Model.where("active = ?", x)` — parameterised string
|
||||
/// * `Issue.where(:id => params[:id])`, hash form
|
||||
/// * `Model.where(id: x, name: y)`, keyword-shorthand pairs
|
||||
/// * `Project.order(:created_at)`, symbol literal
|
||||
/// * `Issue.pluck(:id, :name)`, symbol literals
|
||||
/// * `Model.where("active = ?", x)`, parameterised string
|
||||
///
|
||||
/// Real-world TPs preserved:
|
||||
/// * `User.where("name = '#{name}'")` — string with interpolation
|
||||
/// * `Model.where(some_string_var)` — dynamic identifier (conservative)
|
||||
/// * `User.where("name = '#{name}'")`, string with interpolation
|
||||
/// * `Model.where(some_string_var)`, dynamic identifier (conservative)
|
||||
pub fn ar_query_safe_shape(callee_text: &str, arg0_kind: &str, has_interpolation: bool) -> bool {
|
||||
// Match the callee's last segment ("Model.where" → "where", "where" → "where").
|
||||
let leaf = callee_text.rsplit(['.', ':']).next().unwrap_or(callee_text);
|
||||
|
|
@ -297,7 +297,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
|
|||
let mut rules = Vec::new();
|
||||
|
||||
if ctx.has(DetectedFramework::Rails) {
|
||||
// Strong parameters — permit/require sanitize user input
|
||||
// Strong parameters, permit/require sanitize user input
|
||||
rules.push(RuntimeLabelRule {
|
||||
matchers: vec!["permit".into(), "require".into()],
|
||||
label: DataLabel::Sanitizer(Cap::all()),
|
||||
|
|
@ -306,7 +306,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
|
|||
}
|
||||
|
||||
if ctx.has(DetectedFramework::Sinatra) {
|
||||
// Sinatra template rendering — user content flows to rendered output
|
||||
// Sinatra template rendering, user content flows to rendered output
|
||||
rules.push(RuntimeLabelRule {
|
||||
matchers: vec!["erb".into(), "haml".into()],
|
||||
label: DataLabel::Sink(Cap::HTML_ESCAPE),
|
||||
|
|
@ -323,7 +323,7 @@ mod ar_query_tests {
|
|||
|
||||
#[test]
|
||||
fn hash_form_is_safe() {
|
||||
// Model.where(:id => x) — pair node directly in argument_list
|
||||
// Model.where(:id => x): pair node directly in argument_list
|
||||
assert!(ar_query_safe_shape("Model.where", "pair", false));
|
||||
// Model.where(id: x)
|
||||
assert!(ar_query_safe_shape("where", "pair", false));
|
||||
|
|
@ -338,32 +338,32 @@ mod ar_query_tests {
|
|||
|
||||
#[test]
|
||||
fn parameterised_string_is_safe() {
|
||||
// Model.where("a = ?", x) — first arg is a string literal w/o interpolation
|
||||
// Model.where("a = ?", x): first arg is a string literal w/o interpolation
|
||||
assert!(ar_query_safe_shape("where", "string", false));
|
||||
assert!(ar_query_safe_shape("where", "string_literal", false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn interpolated_string_is_dangerous() {
|
||||
// Model.where("a = #{x}") — string node WITH interpolation child
|
||||
// Model.where("a = #{x}"): string node WITH interpolation child
|
||||
assert!(!ar_query_safe_shape("where", "string", true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dynamic_identifier_is_dangerous() {
|
||||
// Model.where(some_var) — kind is identifier, not in safe list
|
||||
// Model.where(some_var), kind is identifier, not in safe list
|
||||
assert!(!ar_query_safe_shape("where", "identifier", false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn array_form_is_safe() {
|
||||
// Model.pluck([:id, :name]) — uncommon but valid
|
||||
// Model.pluck([:id, :name]), uncommon but valid
|
||||
assert!(ar_query_safe_shape("pluck", "array", false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_ar_method_is_never_suppressed() {
|
||||
// find_by_sql is a real raw-SQL sink — never suppress.
|
||||
// find_by_sql is a real raw-SQL sink, never suppress.
|
||||
assert!(!ar_query_safe_shape("find_by_sql", "string", false));
|
||||
assert!(!ar_query_safe_shape("connection.execute", "pair", false));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -168,7 +168,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
|
|||
"expression_statement" => Kind::CallWrapper,
|
||||
"assignment_expression" => Kind::Assignment,
|
||||
|
||||
// struct expressions — recurse so env::var() calls inside field
|
||||
// struct expressions, recurse so env::var() calls inside field
|
||||
// initialisers produce Source-labelled CFG nodes (needed for summaries).
|
||||
"struct_expression" => Kind::Block,
|
||||
"field_initializer_list" => Kind::Block,
|
||||
|
|
@ -287,7 +287,7 @@ pub fn framework_rules(ctx: &FrameworkContext) -> Vec<RuntimeLabelRule> {
|
|||
rules
|
||||
}
|
||||
|
||||
/// Phase C: auth-as-taint label rules for Rust. Gated by
|
||||
/// Auth-as-taint label rules for Rust. Gated by
|
||||
/// `config.scanner.enable_auth_as_taint`; appended to the runtime rule set
|
||||
/// when the flag is enabled. These declare **sinks** (state-changing or
|
||||
/// outbound operations that should not be reached by an un-checked
|
||||
|
|
@ -343,10 +343,8 @@ pub fn phase_c_auth_rules() -> Vec<RuntimeLabelRule> {
|
|||
case_sensitive: false,
|
||||
},
|
||||
// ── Sanitizers clearing Cap::UNAUTHORIZED_ID ──
|
||||
// Ownership and membership guards from the auth_analysis default
|
||||
// `authorization_check_names` list. Phase C consumes these via
|
||||
// call-site argument sanitization (see
|
||||
// `is_auth_as_taint_arg_sanitizer` in ssa_transfer).
|
||||
// Ownership and membership guards consumed via call-site
|
||||
// argument sanitization (see `is_auth_as_taint_arg_sanitizer`).
|
||||
RuntimeLabelRule {
|
||||
matchers: vec![
|
||||
"check_ownership".into(),
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// he library — HTML entity encoding
|
||||
// he library, HTML entity encoding
|
||||
LabelRule {
|
||||
matchers: &["he.encode", "he.escape"],
|
||||
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
|
||||
|
|
@ -131,7 +131,7 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
|
||||
case_sensitive: true,
|
||||
},
|
||||
// ── Outbound HTTP clients — modeled as destination-aware gated sinks ──
|
||||
// ── Outbound HTTP clients, modeled as destination-aware gated sinks ──
|
||||
// See GATED_SINKS below; rationale mirrors javascript.rs.
|
||||
LabelRule {
|
||||
matchers: &[
|
||||
|
|
@ -206,6 +206,14 @@ pub static RULES: &[LabelRule] = &[
|
|||
label: DataLabel::Sink(Cap::SSRF),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// ── Cross-boundary data exfiltration (DATA_EXFIL) ─────────────────────
|
||||
// See javascript.rs for rationale. `xhr.send(body)` resolves to
|
||||
// `HttpClient.send` via type-qualified resolution.
|
||||
LabelRule {
|
||||
matchers: &["HttpClient.send", "XMLHttpRequest.prototype.send"],
|
||||
label: DataLabel::Sink(Cap::DATA_EXFIL),
|
||||
case_sensitive: false,
|
||||
},
|
||||
// ─────────── SQL injection sinks ─────────────
|
||||
// Database drivers: mysql, mysql2, pg, better-sqlite3
|
||||
LabelRule {
|
||||
|
|
@ -283,7 +291,7 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
dangerous_kwargs: &[],
|
||||
activation: GateActivation::ValueMatch,
|
||||
},
|
||||
// ── Outbound HTTP clients (SSRF) — see javascript.rs for rationale ────
|
||||
// ── Outbound HTTP clients (SSRF), see javascript.rs for rationale ────
|
||||
SinkGate {
|
||||
callee_matcher: "fetch",
|
||||
arg_index: 0,
|
||||
|
|
@ -452,6 +460,24 @@ pub static GATED_SINKS: &[SinkGate] = &[
|
|||
object_destination_fields: &["host", "hostname", "path", "protocol", "port", "origin"],
|
||||
},
|
||||
},
|
||||
// ── Cross-boundary data exfiltration ──────────────────────────────────
|
||||
// `fetch(input, init)`: payload-bearing fields of `init` (arg 1) flow
|
||||
// into the request body / headers / json, distinct from SSRF on the URL
|
||||
// (arg 0). See javascript.rs for full rationale.
|
||||
SinkGate {
|
||||
callee_matcher: "fetch",
|
||||
arg_index: 1,
|
||||
dangerous_values: &[],
|
||||
dangerous_prefixes: &[],
|
||||
label: DataLabel::Sink(Cap::DATA_EXFIL),
|
||||
case_sensitive: false,
|
||||
payload_args: &[1],
|
||||
keyword_name: None,
|
||||
dangerous_kwargs: &[],
|
||||
activation: GateActivation::Destination {
|
||||
object_destination_fields: &["body", "headers", "json"],
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
pub static KINDS: Map<&'static str, Kind> = phf_map! {
|
||||
|
|
|
|||
49
src/lib.rs
49
src/lib.rs
|
|
@ -1,43 +1,14 @@
|
|||
//! # Nyx Scanner
|
||||
//! Multi-language static vulnerability scanner. Tree-sitter parsing, petgraph
|
||||
//! CFGs, SSA-based dataflow, and cross-file taint analysis with a
|
||||
//! capability-based sanitizer system. Supports Rust, C, C++, Java, Go, PHP,
|
||||
//! Python, Ruby, TypeScript, and JavaScript.
|
||||
//!
|
||||
//! A multi-language static vulnerability scanner. Nyx parses source files with
|
||||
//! [tree-sitter](https://tree-sitter.github.io/), builds intra-procedural
|
||||
//! control-flow graphs ([petgraph](https://docs.rs/petgraph)), and runs
|
||||
//! cross-file taint analysis with a capability-based sanitizer system.
|
||||
//!
|
||||
//! ## Architecture
|
||||
//!
|
||||
//! Nyx uses a **two-pass architecture**:
|
||||
//!
|
||||
//! 1. **Pass 1 — Summary extraction**: Parse each file, build a CFG per function,
|
||||
//! and export a [`summary::FuncSummary`] capturing source/sanitizer/sink capabilities,
|
||||
//! taint propagation behavior, and callee lists. Summaries are persisted to SQLite.
|
||||
//!
|
||||
//! 2. **Pass 2 — Analysis**: Load all summaries into a [`summary::GlobalSummaries`] map,
|
||||
//! re-parse files, and run taint analysis with cross-file callee resolution. CFG
|
||||
//! structural analysis checks for auth gaps, unguarded sinks, and resource leaks.
|
||||
//!
|
||||
//! ## Four Detector Families
|
||||
//!
|
||||
//! - **Taint** ([`taint`]) — Monotone forward dataflow tracking source-to-sink flows
|
||||
//! - **CFG Structural** ([`cfg_analysis`]) — Dominator-based guard and auth-gap detection
|
||||
//! - **State Model** ([`state`]) — Resource lifecycle and authentication state lattices
|
||||
//! - **AST Patterns** ([`patterns`]) — Tree-sitter structural queries per language
|
||||
//!
|
||||
//! ## Supported Languages
|
||||
//!
|
||||
//! Rust, C, C++, Java, Go, PHP, Python, Ruby, TypeScript, JavaScript.
|
||||
//!
|
||||
//! ## Entry Points
|
||||
//!
|
||||
//! - [`scan_no_index`] — Run a two-pass scan without indexing (for tests)
|
||||
//! - [`commands::scan::scan_filesystem`] — Filesystem scan with optional indexing
|
||||
//! - [`commands::scan::scan_with_index_parallel`] — Index-backed parallel scan
|
||||
//!
|
||||
//! ## Documentation
|
||||
//!
|
||||
//! See the [`docs/`](https://github.com/elicpeter/nyx/tree/master/docs) directory
|
||||
//! for user and contributor documentation.
|
||||
//! The handbook below is embedded verbatim from
|
||||
//! [`docs/how-it-works.md`](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md).
|
||||
//! Per-detector documentation lives on the [`taint`], [`cfg_analysis`],
|
||||
//! [`state`], [`patterns`], and [`auth_analysis`] modules. The primary
|
||||
//! library entry point for tests and embedders is [`scan_no_index`].
|
||||
#![doc = include_str!(concat!(env!("OUT_DIR"), "/lib_intro.md"))]
|
||||
|
||||
pub mod abstract_interp;
|
||||
pub mod ast;
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ fn main() -> NyxResult<()> {
|
|||
let quiet = config.output.quiet || cli.command.is_structured_output(&config);
|
||||
|
||||
// Print config note before scanning (human-readable mode only). Pure
|
||||
// informational commands suppress it too — their output is usually
|
||||
// informational commands suppress it too; their output is usually
|
||||
// piped or grepped and the preamble is noise.
|
||||
if let Some(note) = config_note.filter(|_| !quiet && !is_info) {
|
||||
eprint!("{note}");
|
||||
|
|
|
|||
|
|
@ -47,14 +47,28 @@ fn cfg_rule_description(id: &str) -> Option<&'static str> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Look up a human-readable description for any rule ID.
|
||||
fn rule_description(id: &str) -> &str {
|
||||
// Strip taint-specific suffix for lookup (e.g. "taint-unsanitised-flow:foo.rs:42" → base)
|
||||
let base_id = if id.starts_with("taint-") {
|
||||
/// Normalise a finding's id to the base SARIF rule id.
|
||||
///
|
||||
/// Findings carry source-location-suffixed ids like
|
||||
/// `"taint-unsanitised-flow (source 12:3)"` so identical (source, sink)
|
||||
/// pairs can be deduped, but SARIF wants a single rule per category.
|
||||
/// Cap-specific taint rule classes (e.g. `taint-data-exfiltration`) are
|
||||
/// preserved as distinct bases so consumers can filter on them rather than
|
||||
/// folding everything into `taint-unsanitised-flow`.
|
||||
fn sarif_base_id(id: &str) -> &str {
|
||||
if id.starts_with("taint-data-exfiltration") {
|
||||
"taint-data-exfiltration"
|
||||
} else if id.starts_with("taint-") {
|
||||
"taint-unsanitised-flow"
|
||||
} else {
|
||||
id
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a human-readable description for any rule ID.
|
||||
fn rule_description(id: &str) -> &str {
|
||||
// Normalise to the base rule id for lookup (e.g. "taint-unsanitised-flow (source 12:3)" → "taint-unsanitised-flow")
|
||||
let base_id = sarif_base_id(id);
|
||||
|
||||
if let Some(desc) = PATTERN_DESCRIPTIONS.get(base_id) {
|
||||
return desc;
|
||||
|
|
@ -62,10 +76,13 @@ fn rule_description(id: &str) -> &str {
|
|||
if let Some(desc) = cfg_rule_description(base_id) {
|
||||
return desc;
|
||||
}
|
||||
if base_id == "taint-unsanitised-flow" {
|
||||
return "Unsanitised data flows from source to sink";
|
||||
match base_id {
|
||||
"taint-unsanitised-flow" => "Unsanitised data flows from source to sink",
|
||||
"taint-data-exfiltration" => {
|
||||
"Sensitive data flows into the payload of an outbound network request"
|
||||
}
|
||||
_ => id,
|
||||
}
|
||||
id
|
||||
}
|
||||
|
||||
fn severity_to_level(sev: Severity) -> &'static str {
|
||||
|
|
@ -83,11 +100,7 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
|
|||
let mut rule_index_map: HashMap<String, usize> = HashMap::new();
|
||||
|
||||
for d in diags {
|
||||
let base = if d.id.starts_with("taint-") {
|
||||
"taint-unsanitised-flow".to_string()
|
||||
} else {
|
||||
d.id.clone()
|
||||
};
|
||||
let base = sarif_base_id(&d.id).to_string();
|
||||
if !rule_index_map.contains_key(&base) {
|
||||
let idx = rule_ids.len();
|
||||
rule_index_map.insert(base.clone(), idx);
|
||||
|
|
@ -108,15 +121,11 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
|
|||
let results: Vec<Value> = diags
|
||||
.iter()
|
||||
.map(|d| {
|
||||
let base = if d.id.starts_with("taint-") {
|
||||
"taint-unsanitised-flow"
|
||||
} else {
|
||||
&d.id
|
||||
};
|
||||
let base = sarif_base_id(&d.id);
|
||||
let rule_index = rule_index_map[base];
|
||||
|
||||
// Make path relative to scan root. Fall back to a deterministic
|
||||
// sentinel instead of the absolute path — SARIF must not leak
|
||||
// sentinel instead of the absolute path; SARIF must not leak
|
||||
// home-directory or host-specific prefixes.
|
||||
let uri = match Path::new(&d.path).strip_prefix(scan_root) {
|
||||
Ok(p) => p.to_string_lossy().to_string(),
|
||||
|
|
@ -213,17 +222,17 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
|
|||
props.insert("relatedFindings".into(), json!(d.alternative_finding_ids));
|
||||
}
|
||||
|
||||
// Engine provenance notes — surface any cap-hit / lowering
|
||||
// Engine provenance notes, surface any cap-hit / lowering
|
||||
// bail / timeout signals recorded by the analysis engine so
|
||||
// downstream consumers can tell "nothing found" from "engine
|
||||
// stopped looking".
|
||||
//
|
||||
// Three properties are emitted together:
|
||||
// * `engine_notes` — raw list of {kind, ...} entries
|
||||
// * `confidence_capped` — true iff any non-informational
|
||||
// * `engine_notes`: raw list of {kind, ...} entries
|
||||
// * `confidence_capped`: true iff any non-informational
|
||||
// note is present (back-compat
|
||||
// boolean; drives legacy dashboards)
|
||||
// * `loss_direction` — worst `LossDirection` across
|
||||
// * `loss_direction`: worst `LossDirection` across
|
||||
// the list ("under-report",
|
||||
// "over-report", "bail"). Absent
|
||||
// when only informational notes
|
||||
|
|
@ -590,7 +599,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn build_sarif_path_outside_scan_root_is_redacted() {
|
||||
// Absolute host paths leak home-directory information — SARIF must
|
||||
// Absolute host paths leak home-directory information, SARIF must
|
||||
// substitute a deterministic token when a finding falls outside the
|
||||
// scan root.
|
||||
let mut diag = make_diag("rule-x", Severity::High);
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ pub fn scan_ejs_file(path: &Path, bytes: &[u8]) -> Vec<Diag> {
|
|||
// Advance past this match for the next iteration.
|
||||
search_from = abs_end + 2; // skip "%>"
|
||||
|
||||
// Skip <%- include(...) %> — EJS partial inclusion, not user-controlled.
|
||||
// Skip <%- include(...) %>, EJS partial inclusion, not user-controlled.
|
||||
if is_include_call(expr) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ pub const PATTERNS: &[Pattern] = &[
|
|||
Pattern {
|
||||
id: "java.deser.readobject",
|
||||
description: "ObjectInputStream.readObject() performs unsafe deserialization",
|
||||
// Match any .readObject() call — the method name is specific enough.
|
||||
// Match any .readObject() call, the method name is specific enough.
|
||||
query: r#"(method_invocation
|
||||
name: (identifier) @id (#eq? @id "readObject"))
|
||||
@vuln"#,
|
||||
|
|
@ -21,6 +21,46 @@ pub const PATTERNS: &[Pattern] = &[
|
|||
category: PatternCategory::Deserialization,
|
||||
confidence: Confidence::High,
|
||||
},
|
||||
// ── Tier A: SnakeYAML deserialization (CVE-2022-1471) ──────────────
|
||||
// `new Yaml()` constructed without a `SafeConstructor` argument
|
||||
// accepts arbitrary YAML tags (`!!javax.script.ScriptEngineManager`,
|
||||
// `!!java.net.URLClassLoader`, …) and instantiates any class via
|
||||
// reflection. SnakeYAML 2.0 swapped the default to SafeConstructor
|
||||
// but pre-2.0 deployments stay vulnerable until call sites are
|
||||
// patched. We match the empty-arg form `new Yaml()` only, so the
|
||||
// explicit-SafeConstructor remediation form
|
||||
// `new Yaml(new SafeConstructor(new LoaderOptions()))` is silent.
|
||||
Pattern {
|
||||
id: "java.deser.snakeyaml_unsafe_constructor",
|
||||
description: "new Yaml() without SafeConstructor accepts arbitrary class tags (CVE-2022-1471)",
|
||||
query: r#"(object_creation_expression
|
||||
type: (type_identifier) @t (#eq? @t "Yaml")
|
||||
arguments: (argument_list) @args (#eq? @args "()"))
|
||||
@vuln"#,
|
||||
severity: Severity::High,
|
||||
tier: PatternTier::A,
|
||||
category: PatternCategory::Deserialization,
|
||||
confidence: Confidence::High,
|
||||
},
|
||||
// ── Tier A: Apache Commons Text Text4Shell (CVE-2022-42889) ────────
|
||||
// `StringSubstitutor.createInterpolator()` enables `script:`,
|
||||
// `dns:`, and `url:` lookups by default; `${script:js:…}`
|
||||
// evaluates JavaScript via the JSR-223 ScriptEngineManager. The
|
||||
// factory call is itself the structural bug; the recommended app-
|
||||
// side mitigation builds a `StringSubstitutor` directly with a
|
||||
// restricted lookup map.
|
||||
Pattern {
|
||||
id: "java.code_exec.text4shell_interpolator",
|
||||
description: "StringSubstitutor.createInterpolator() enables script:/dns:/url: evaluation (CVE-2022-42889)",
|
||||
query: r#"(method_invocation
|
||||
object: (identifier) @c (#eq? @c "StringSubstitutor")
|
||||
name: (identifier) @id (#eq? @id "createInterpolator"))
|
||||
@vuln"#,
|
||||
severity: Severity::High,
|
||||
tier: PatternTier::A,
|
||||
category: PatternCategory::CodeExec,
|
||||
confidence: Confidence::High,
|
||||
},
|
||||
// ── Tier A: Command execution ──────────────────────────────────────
|
||||
Pattern {
|
||||
id: "java.cmdi.runtime_exec",
|
||||
|
|
|
|||
|
|
@ -1,42 +1,4 @@
|
|||
//! # AST Pattern Conventions
|
||||
//!
|
||||
//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
|
||||
//!
|
||||
//! ## ID format
|
||||
//!
|
||||
//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
|
||||
//!
|
||||
//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
|
||||
//!
|
||||
//! ## Tiers
|
||||
//!
|
||||
//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
|
||||
//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
|
||||
//! arg, format-string with variable first arg).
|
||||
//!
|
||||
//! ## Severity
|
||||
//!
|
||||
//! * **High** — command exec, deserialization, banned C functions.
|
||||
//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
|
||||
//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
|
||||
//!
|
||||
//! Note: the default `min_severity` filter skips Low patterns; they only appear when
|
||||
//! the user explicitly lowers the threshold.
|
||||
//!
|
||||
//! ## No-duplicate rule
|
||||
//!
|
||||
//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
|
||||
//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
|
||||
//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
|
||||
//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
|
||||
//!
|
||||
//! ## Adding a new pattern
|
||||
//!
|
||||
//! 1. Pick the language file under `src/patterns/<lang>.rs`.
|
||||
//! 2. Choose tier, category, severity per the rules above.
|
||||
//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
|
||||
//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
|
||||
//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
|
||||
#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))]
|
||||
|
||||
pub mod c;
|
||||
pub mod cpp;
|
||||
|
|
@ -68,7 +30,7 @@ pub enum Severity {
|
|||
impl Severity {
|
||||
/// Bracketed, colored, fixed-width tag for aligned console output.
|
||||
///
|
||||
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` — always 8 visible characters
|
||||
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"`, always 8 visible characters
|
||||
/// so the column after the tag lines up regardless of severity.
|
||||
#[allow(dead_code)] // public API for lib consumers
|
||||
pub fn colored_tag(self) -> String {
|
||||
|
|
@ -123,9 +85,9 @@ impl FromStr for Severity {
|
|||
/// A parsed severity filter expression.
|
||||
///
|
||||
/// Supports three forms:
|
||||
/// - Single level: `"HIGH"` — matches only that level
|
||||
/// - Comma list: `"HIGH,MEDIUM"` — matches any listed level
|
||||
/// - Threshold: `">=MEDIUM"` — matches that level and above
|
||||
/// - Single level: `"HIGH"`, matches only that level
|
||||
/// - Comma list: `"HIGH,MEDIUM"`, matches any listed level
|
||||
/// - Threshold: `">=MEDIUM"`, matches that level and above
|
||||
///
|
||||
/// Parsing is case-insensitive and tolerates whitespace around tokens.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
|
|
@ -242,7 +204,7 @@ impl PatternCategory {
|
|||
/// One AST pattern with a tree-sitter query and meta-data.
|
||||
#[derive(Debug, Clone, Serialize, PartialEq)]
|
||||
pub struct Pattern {
|
||||
/// Unique identifier — `<lang>.<category>.<specific>` preferred.
|
||||
/// Unique identifier, `<lang>.<category>.<specific>` preferred.
|
||||
pub id: &'static str,
|
||||
/// Human-readable explanation.
|
||||
pub description: &'static str,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
|
|||
///
|
||||
/// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`,
|
||||
/// and `cursor.execute`. AST patterns here add coverage for **deserialization**,
|
||||
/// **subprocess shell=True** (Tier B — taint doesn't check keyword args), and
|
||||
/// **subprocess shell=True** (Tier B, taint doesn't check keyword args), and
|
||||
/// **code execution** sinks that taint cannot structurally verify.
|
||||
pub const PATTERNS: &[Pattern] = &[
|
||||
// ── Tier A: Code execution ─────────────────────────────────────────
|
||||
|
|
@ -121,14 +121,45 @@ pub const PATTERNS: &[Pattern] = &[
|
|||
confidence: Confidence::High,
|
||||
},
|
||||
// ── Tier B: SQL injection (format/concat heuristic) ────────────────
|
||||
// Catches both `cursor.execute(query + user)` (binary_operator concat)
|
||||
// and `cursor.execute(f"... {user} ...")` (f-string with interpolation).
|
||||
// f-strings appear as a `string` node with `interpolation` children in
|
||||
// tree-sitter-python; the alternation lets the same pattern cover both
|
||||
// the historical % / + concat shapes and the modern f-string SQLi shape
|
||||
// that surfaces in CVE-2025-24793 (snowflake-connector-python),
|
||||
// CVE-2025-69662 (geopandas), and dozens of similar cursor.execute
|
||||
// call sites across the corpus.
|
||||
Pattern {
|
||||
id: "py.sqli.execute_format",
|
||||
description: "cursor.execute with string concatenation risks SQL injection",
|
||||
description: "cursor.execute with string concatenation or f-string risks SQL injection",
|
||||
query: r#"(call
|
||||
function: (attribute
|
||||
attribute: (identifier) @fn (#eq? @fn "execute"))
|
||||
arguments: (argument_list
|
||||
(binary_operator) @arg))
|
||||
[(binary_operator)
|
||||
(string (interpolation))] @arg))
|
||||
@vuln"#,
|
||||
severity: Severity::Medium,
|
||||
tier: PatternTier::B,
|
||||
category: PatternCategory::SqlInjection,
|
||||
confidence: Confidence::Medium,
|
||||
},
|
||||
// SQLAlchemy `text(<concat-or-fstring>)`, same Tier B heuristic
|
||||
// applied to the SQLAlchemy raw-SQL constructor. Catches the
|
||||
// CVE-2025-69662 (geopandas) shape:
|
||||
// connection.execute(text(f"SELECT … '{geom_name}' …"))
|
||||
// where the f-string interpolation is the injection point and the
|
||||
// surrounding `connection.execute` would otherwise hide the unsafe
|
||||
// construction from the simple execute_format pattern.
|
||||
Pattern {
|
||||
id: "py.sqli.text_format",
|
||||
description: "sqlalchemy text() with f-string or string concat risks SQL injection",
|
||||
query: r#"(call
|
||||
function: [(identifier) @fn (attribute attribute: (identifier) @fn)]
|
||||
(#eq? @fn "text")
|
||||
arguments: (argument_list
|
||||
[(binary_operator)
|
||||
(string (interpolation))] @arg))
|
||||
@vuln"#,
|
||||
severity: Severity::Medium,
|
||||
tier: PatternTier::B,
|
||||
|
|
|
|||
|
|
@ -1,33 +1,8 @@
|
|||
//! Field-sensitive Steensgaard points-to analysis driver.
|
||||
//!
|
||||
//! Walks the SSA body once per fixpoint pass, emitting equality
|
||||
//! constraints for each instruction. The constraints are resolved
|
||||
//! via standard union-find with path compression and union-by-rank;
|
||||
//! propagation through `FieldProj` requires a worklist because the
|
||||
//! representative of a receiver may change after the field projection
|
||||
//! is first visited.
|
||||
//!
|
||||
//! The analysis is flow-insensitive (Steensgaard) — every assignment
|
||||
//! that joins two values unifies their points-to sets across the
|
||||
//! entire body. Field sensitivity is recovered by representing each
|
||||
//! `obj.f` access as a structural [`AbsLoc::Field`] location with a
|
||||
//! distinct identity per `(parent_loc, field)` pair.
|
||||
//!
|
||||
//! ## Phase 1 scope
|
||||
//!
|
||||
//! - Field READS via [`SsaOp::FieldProj`] — sufficient for Phase 2's
|
||||
//! resource-lifecycle attribution fix (the gin/`context.go` proxy
|
||||
//! acquire FP).
|
||||
//! - Param/SelfParam → fresh caller-relative locations.
|
||||
//! - Phi/Assign → Steensgaard unification.
|
||||
//! - Call results → fresh allocation-site locations (one per call
|
||||
//! instruction, keyed by SSA value).
|
||||
//! - Source/Const/Nop/Undef → empty (scalars don't reach the heap).
|
||||
//!
|
||||
//! Field WRITES land in Phase 3 alongside the cross-method field-flow
|
||||
//! consumer; they require careful handling of the synthetic
|
||||
//! base-update `Assign` instructions emitted by SSA lowering and are
|
||||
//! not load-bearing for Phase 1's "no behaviour change" gate.
|
||||
//! Flow-insensitive union-find over SSA values; field sensitivity comes
|
||||
//! from representing each `obj.f` access as a structural
|
||||
//! [`AbsLoc::Field`] keyed by `(parent_loc, field)`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
|
|
@ -41,13 +16,9 @@ use super::domain::{AbsLoc, LOC_TOP, LocId, LocInterner, PointsToSet, PtrProxyHi
|
|||
/// in a small number of passes for any well-formed body.
|
||||
const MAX_FIXPOINT_ITERS: usize = 8;
|
||||
|
||||
/// Pointer-Phase 4: container-read callees that pull a single element
|
||||
/// out of a collection without indexing through a key. Recognised
|
||||
/// across the languages nyx supports as a cross-cut surface — exact
|
||||
/// per-language specialisation is intentionally skipped for the
|
||||
/// minimum-viable rollout. The receiver-side projection through
|
||||
/// [`FieldId::ELEM`] is conservative: a callee not in this list still
|
||||
/// gets the existing fresh-alloc behaviour and does not lose precision.
|
||||
/// Container-read callees that pull a single element out of a
|
||||
/// collection without a key. Cross-language; non-listed callees still
|
||||
/// get fresh-alloc behaviour, so the list is conservative.
|
||||
fn is_container_read_callee(callee: &str) -> bool {
|
||||
let bare = match callee.rsplit_once('.') {
|
||||
Some((_, m)) => m,
|
||||
|
|
@ -67,19 +38,12 @@ fn is_container_read_callee(callee: &str) -> bool {
|
|||
| "dequeue"
|
||||
| "remove"
|
||||
| "popleft"
|
||||
// Pointer-Phase 6 / W5: synthetic callee emitted by CFG
|
||||
// lowering for subscript / index-expression reads
|
||||
// (`arr[i]`, `map[k]`, `cmds[0]`).
|
||||
// synthetic callee for subscript reads (`arr[i]`, `map[k]`)
|
||||
| "__index_get__"
|
||||
)
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4: container-write callees that store an element into
|
||||
/// a collection. Mirror of [`is_container_read_callee`]. The pointer
|
||||
/// analysis itself doesn't track stored values (the Steensgaard
|
||||
/// receiver/result aliasing already covers the common cases), but the
|
||||
/// helper is exposed so the taint engine's ELEM-cell write hook can
|
||||
/// share a single classifier with the points-to pass.
|
||||
/// Container-write callees, mirror of [`is_container_read_callee`].
|
||||
pub fn is_container_write_callee(callee: &str) -> bool {
|
||||
let bare = match callee.rsplit_once('.') {
|
||||
Some((_, m)) => m,
|
||||
|
|
@ -97,37 +61,34 @@ pub fn is_container_write_callee(callee: &str) -> bool {
|
|||
| "insert"
|
||||
| "enqueue"
|
||||
| "unshift"
|
||||
// Pointer-Phase 6 / W5: synthetic callee emitted by CFG
|
||||
// lowering for subscript / index-expression writes
|
||||
// (`arr[i] = v`, `map[k] = v`).
|
||||
// synthetic callee for subscript writes (`arr[i] = v`, `map[k] = v`)
|
||||
| "__index_set__"
|
||||
)
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4: callee-name aware container-read recognition.
|
||||
/// Public for unit tests + reuse from the taint engine.
|
||||
/// Public re-export of [`is_container_read_callee`] for the taint engine.
|
||||
pub fn is_container_read_callee_pub(callee: &str) -> bool {
|
||||
is_container_read_callee(callee)
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5: derive a [`crate::summary::points_to::FieldPointsToSummary`]
|
||||
/// from per-body points-to facts.
|
||||
/// Derive a [`crate::summary::points_to::FieldPointsToSummary`] from
|
||||
/// per-body points-to facts.
|
||||
///
|
||||
/// Records two channels:
|
||||
///
|
||||
/// 1. **Reads** — walks every [`SsaOp::FieldProj`] in the body; for
|
||||
/// 1. **Reads**, walks every [`SsaOp::FieldProj`] in the body; for
|
||||
/// each `loc ∈ pt(receiver)` that resolves to a parameter
|
||||
/// location ([`AbsLoc::Param`] / [`AbsLoc::SelfParam`]), records
|
||||
/// the projected field name into the summary's
|
||||
/// `param_field_reads`.
|
||||
/// 2. **Writes** — walks the body's [`SsaBody::field_writes`] side-
|
||||
/// 2. **Writes**, walks the body's [`SsaBody::field_writes`] side-
|
||||
/// table (populated by SSA lowering's W1 synth-Assign hook) and
|
||||
/// records each `(receiver, FieldId)` pair against the receiver's
|
||||
/// pt set the same way reads are recorded.
|
||||
///
|
||||
/// Field name resolution goes through the body's
|
||||
/// [`SsaBody::field_interner`] because [`crate::ssa::ir::FieldId`]
|
||||
/// is body-local — names are the only stable cross-file identity.
|
||||
/// is body-local; names are the only stable cross-file identity.
|
||||
///
|
||||
/// Receiver (`SelfParam`) reads/writes are recorded under the
|
||||
/// [`u32::MAX`] sentinel parameter index, mirroring the convention in
|
||||
|
|
@ -226,7 +187,7 @@ pub fn extract_field_points_to(
|
|||
/// Per-body points-to result.
|
||||
///
|
||||
/// Owns the body-local [`LocInterner`] and a flat `SsaValue → PointsToSet`
|
||||
/// table. The table is dense — one slot per SSA value — so lookups
|
||||
/// table. The table is dense, one slot per SSA value, so lookups
|
||||
/// are O(1).
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PointsToFacts {
|
||||
|
|
@ -242,7 +203,7 @@ pub struct PointsToFacts {
|
|||
}
|
||||
|
||||
impl PointsToFacts {
|
||||
/// Empty result — every value points to nothing. Used by callers
|
||||
/// Empty result, every value points to nothing. Used by callers
|
||||
/// that need a "no facts" placeholder when the analysis is
|
||||
/// disabled or the body could not be analysed.
|
||||
pub fn empty(body: BodyId) -> Self {
|
||||
|
|
@ -288,11 +249,6 @@ impl PointsToFacts {
|
|||
/// [`PtrProxyHint::FieldOnly`] iff every member is an
|
||||
/// [`AbsLoc::Field`].
|
||||
///
|
||||
/// Phase 2 consumer: the resource-lifecycle proxy attribution in
|
||||
/// `state::transfer.rs` uses `FieldOnly` to recognise locals like
|
||||
/// `m` in `m := c.mu` and route the proxy entry through
|
||||
/// `chain_proxies` instead of marking the local as a leakable
|
||||
/// SymbolId-keyed resource.
|
||||
pub fn proxy_hint(&self, v: SsaValue) -> PtrProxyHint {
|
||||
let set = self.pt(v);
|
||||
if set.is_empty() || set.is_top() {
|
||||
|
|
@ -310,7 +266,7 @@ impl PointsToFacts {
|
|||
/// Build a `var_name → PtrProxyHint` map by scanning the body's
|
||||
/// value defs for the latest definition of each named variable.
|
||||
/// Names that resolve to no variable, or whose latest definition is
|
||||
/// `Other`, are omitted — only `FieldOnly` entries appear.
|
||||
/// `Other`, are omitted, only `FieldOnly` entries appear.
|
||||
///
|
||||
/// Iterates over [`SsaBody::value_defs`] in *reverse* order so the
|
||||
/// last (post-renaming) SSA definition for each name wins. Used by
|
||||
|
|
@ -340,13 +296,13 @@ impl PointsToFacts {
|
|||
/// Analyse a single body and return its [`PointsToFacts`].
|
||||
///
|
||||
/// `body_id` is used as the disambiguator inside the abstract
|
||||
/// locations — supplying a stable id (e.g. the file's
|
||||
/// locations, supplying a stable id (e.g. the file's
|
||||
/// `BodyMeta.id`) lets callers compare facts emitted by different
|
||||
/// bodies in the same file.
|
||||
pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
|
||||
let mut state = AnalysisState::new(body_id, body.num_values());
|
||||
|
||||
// Pass 1 — emit constraints from ops that don't depend on
|
||||
// Pass 1, emit constraints from ops that don't depend on
|
||||
// representative resolution (Param, SelfParam, Call result,
|
||||
// etc.). These produce the "leaf" points-to sets.
|
||||
for block in &body.blocks {
|
||||
|
|
@ -355,7 +311,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
|
|||
}
|
||||
}
|
||||
|
||||
// Pass 2+ — propagate through field projections, phis, and
|
||||
// Pass 2+, propagate through field projections, phis, and
|
||||
// assignments until a fixpoint. Field projections need iteration
|
||||
// because a `FieldProj` whose receiver's representative changes
|
||||
// (via a later unification) must re-emit its constraint with the
|
||||
|
|
@ -377,7 +333,7 @@ pub fn analyse_body(body: &SsaBody, body_id: BodyId) -> PointsToFacts {
|
|||
|
||||
// ── Constraint solver internals ────────────────────────────────────
|
||||
|
||||
/// Mutable analysis state — the interner, points-to table, and
|
||||
/// Mutable analysis state, the interner, points-to table, and
|
||||
/// union-find arrays. Lives inside `analyse_body` only.
|
||||
struct AnalysisState {
|
||||
/// Body-id forwarded to [`PointsToFacts::body`] when the analysis
|
||||
|
|
@ -457,7 +413,7 @@ impl AnalysisState {
|
|||
|
||||
/// `pt(rep_a) ∪= pt(rep_b)`. Caller is responsible for passing
|
||||
/// already-resolved representatives if it wants Steensgaard
|
||||
/// unification — see `union` for that.
|
||||
/// unification, see `union` for that.
|
||||
fn copy_pt(&mut self, dst: u32, src: u32) -> bool {
|
||||
let dr = self.find(dst);
|
||||
let sr = self.find(src);
|
||||
|
|
@ -486,7 +442,7 @@ impl AnalysisState {
|
|||
self.add_loc(v, loc);
|
||||
}
|
||||
SsaOp::CatchParam => {
|
||||
// Exception bindings come from the runtime — model as
|
||||
// Exception bindings come from the runtime, model as
|
||||
// an opaque allocation-site keyed by the SSA value.
|
||||
let loc = self.interner.intern_alloc(body_id, v);
|
||||
self.add_loc(v, loc);
|
||||
|
|
@ -494,14 +450,14 @@ impl AnalysisState {
|
|||
SsaOp::Call {
|
||||
callee, receiver, ..
|
||||
} => {
|
||||
// Pointer-Phase 4: container element retrieval ops
|
||||
// container element retrieval ops
|
||||
// (`shift`, `pop`, `peek`, `front`, …) project through
|
||||
// the abstract `Field(pt(receiver), ELEM)` cell so
|
||||
// per-element taint flows independently of the SSA
|
||||
// value referencing the container. The receiver's
|
||||
// points-to set may not be fully resolved on this
|
||||
// pass, so we *also* add a fresh allocation site as a
|
||||
// fallback — the fixpoint pass below absorbs the
|
||||
// fallback, the fixpoint pass below absorbs the
|
||||
// proper Field projection once the receiver's set
|
||||
// converges.
|
||||
let loc = self.interner.intern_alloc(body_id, v);
|
||||
|
|
@ -538,7 +494,7 @@ impl AnalysisState {
|
|||
}
|
||||
}
|
||||
SsaOp::FieldProj { .. } => {
|
||||
// Resolved during the fixpoint pass — see
|
||||
// Resolved during the fixpoint pass, see
|
||||
// `propagate_inst`.
|
||||
}
|
||||
SsaOp::Source | SsaOp::Const(_) | SsaOp::Nop | SsaOp::Undef => {
|
||||
|
|
@ -548,7 +504,7 @@ impl AnalysisState {
|
|||
}
|
||||
|
||||
/// Fixpoint-pass transfer. Re-runs constraints whose result
|
||||
/// depends on the current set of representatives — i.e. field
|
||||
/// depends on the current set of representatives, i.e. field
|
||||
/// projections, phis, and assignments may need to absorb new
|
||||
/// members emitted after the first pass. Returns `true` when
|
||||
/// any points-to set changed.
|
||||
|
|
@ -608,7 +564,7 @@ impl AnalysisState {
|
|||
}
|
||||
|
||||
/// Materialise the dense `SsaValue → PointsToSet` table. Each
|
||||
/// value's set is the set of its representative — values in the
|
||||
/// value's set is the set of its representative, values in the
|
||||
/// same Steensgaard class share the same set.
|
||||
fn into_facts(mut self) -> PointsToFacts {
|
||||
let mut by_value = Vec::with_capacity(self.pt.len());
|
||||
|
|
@ -714,7 +670,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `let c = self; let m = c.mu;` — pt(m) must be `{Field(SelfParam, mu)}`,
|
||||
/// `let c = self; let m = c.mu;`: pt(m) must be `{Field(SelfParam, mu)}`,
|
||||
/// distinct from pt(c) = `{SelfParam}`.
|
||||
#[test]
|
||||
fn field_subobject_distinct_from_receiver() {
|
||||
|
|
@ -762,7 +718,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// `let y = x;` — y and x share the same points-to set.
|
||||
/// `let y = x;`: y and x share the same points-to set.
|
||||
#[test]
|
||||
fn copy_propagation_unifies() {
|
||||
let mut b = BodyBuilder::new();
|
||||
|
|
@ -783,7 +739,7 @@ mod tests {
|
|||
assert!(!facts.pt(y).is_empty());
|
||||
}
|
||||
|
||||
/// `if (cond) z = a; else z = b;` — phi at the merge unifies
|
||||
/// `if (cond) z = a; else z = b;`: phi at the merge unifies
|
||||
/// `pt(z)` with both `pt(a)` and `pt(b)`.
|
||||
#[test]
|
||||
fn phi_unifies_branches() {
|
||||
|
|
@ -793,7 +749,7 @@ mod tests {
|
|||
let b_v = b.fresh(Some("b"));
|
||||
b.emit(b_v, SsaOp::Param { index: 1 }, Some("b"));
|
||||
|
||||
// Phi(0: a, 0: b) — predecessor block ids are placeholders.
|
||||
// Phi(0: a, 0: b), predecessor block ids are placeholders.
|
||||
let z = b.fresh(Some("z"));
|
||||
b.emit(
|
||||
z,
|
||||
|
|
@ -812,7 +768,7 @@ mod tests {
|
|||
assert_eq!(pt_z.len(), 2);
|
||||
}
|
||||
|
||||
/// `node = node.next;` — the `FieldProj` self-cycle must
|
||||
/// `node = node.next;`, the `FieldProj` self-cycle must
|
||||
/// terminate via the union-find / depth bound, not loop.
|
||||
#[test]
|
||||
fn self_referential_field_chain_terminates() {
|
||||
|
|
@ -847,7 +803,7 @@ mod tests {
|
|||
let facts = analyse_body(&body, body_id());
|
||||
let pt_node = facts.pt(node);
|
||||
// Either we converge to a non-empty set including a Field chain,
|
||||
// or we saturate to Top — either is a valid termination outcome.
|
||||
// or we saturate to Top, either is a valid termination outcome.
|
||||
assert!(!pt_node.is_empty());
|
||||
}
|
||||
|
||||
|
|
@ -864,7 +820,7 @@ mod tests {
|
|||
assert!(facts.pt(s).is_empty());
|
||||
}
|
||||
|
||||
/// `Call` produces a fresh allocation-site location for its result —
|
||||
/// `Call` produces a fresh allocation-site location for its result,
|
||||
/// distinct from its arguments.
|
||||
#[test]
|
||||
fn call_result_is_fresh_alloc() {
|
||||
|
|
@ -901,7 +857,7 @@ mod tests {
|
|||
|
||||
/// Driver smoke-test: the analysis runs on an SsaBody produced by
|
||||
/// the real lowering pipeline without panicking. This pins the
|
||||
/// "no behaviour change" gate — analysis runs to completion on
|
||||
/// "no behaviour change" gate, analysis runs to completion on
|
||||
/// representative input.
|
||||
#[test]
|
||||
fn smoke_runs_on_lowered_body() {
|
||||
|
|
@ -929,12 +885,10 @@ mod tests {
|
|||
assert!(facts.is_trivial());
|
||||
assert_eq!(facts.len(), 0);
|
||||
|
||||
// Suppress unused-import warning for `Cfg` — it's exposed for
|
||||
// future Phase 1.b tests that need a real CFG.
|
||||
let _ = std::marker::PhantomData::<Cfg>;
|
||||
}
|
||||
|
||||
/// Pointer-Phase 2 contract pin: a value defined by a `FieldProj`
|
||||
/// Contract pin: a value defined by a `FieldProj`
|
||||
/// classifies as [`PtrProxyHint::FieldOnly`]. Consumed by the
|
||||
/// resource-lifecycle pass to recognise field-aliased locals.
|
||||
#[test]
|
||||
|
|
@ -965,7 +919,7 @@ mod tests {
|
|||
assert_eq!(facts.proxy_hint(c), crate::pointer::PtrProxyHint::Other);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4: container-read callee classifier covers a
|
||||
/// container-read callee classifier covers a
|
||||
/// representative sample across nyx's languages. Pinned because
|
||||
/// the taint engine relies on the same classifier.
|
||||
#[test]
|
||||
|
|
@ -992,7 +946,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4: container-write classifier (mirror).
|
||||
/// container-write classifier (mirror).
|
||||
#[test]
|
||||
fn container_write_callee_classifier() {
|
||||
for c in [
|
||||
|
|
@ -1014,7 +968,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4: a `Call("shift", receiver=container)` projects
|
||||
/// a `Call("shift", receiver=container)` projects
|
||||
/// `Field(pt(container), ELEM)` into the result, alongside the
|
||||
/// fresh allocation site that fall-back paths still emit.
|
||||
#[test]
|
||||
|
|
@ -1023,7 +977,7 @@ mod tests {
|
|||
// `arr` is the parameter container.
|
||||
let arr = b.fresh(Some("arr"));
|
||||
b.emit(arr, SsaOp::Param { index: 0 }, Some("arr"));
|
||||
// `e := arr.shift()` — container read.
|
||||
// `e := arr.shift()`, container read.
|
||||
let e = b.fresh(Some("e"));
|
||||
b.emit(
|
||||
e,
|
||||
|
|
@ -1055,7 +1009,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5: `extract_field_points_to` records a field
|
||||
/// `extract_field_points_to` records a field
|
||||
/// READ on the parameter index when a `FieldProj` traces back to
|
||||
/// an `AbsLoc::Param`.
|
||||
#[test]
|
||||
|
|
@ -1064,7 +1018,7 @@ mod tests {
|
|||
// `obj` is parameter 0.
|
||||
let obj = b.fresh(Some("obj"));
|
||||
b.emit(obj, SsaOp::Param { index: 0 }, Some("obj"));
|
||||
// `let n = obj.name;` — field projection from a param.
|
||||
// `let n = obj.name;`, field projection from a param.
|
||||
let name_field = b.intern_field("name");
|
||||
let n = b.fresh(Some("n"));
|
||||
b.emit(
|
||||
|
|
@ -1088,7 +1042,7 @@ mod tests {
|
|||
assert!(entry.1.iter().any(|s| s == "name"));
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5 / W3: `extract_field_points_to` records field
|
||||
/// `extract_field_points_to` records field
|
||||
/// WRITES from the body's `field_writes` side-table populated by
|
||||
/// SSA lowering. A synth Assign whose receiver traces back to
|
||||
/// `AbsLoc::Param` produces a `param_field_writes` entry.
|
||||
|
|
@ -1124,7 +1078,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5 / W3: writes through the receiver (`this.f =
|
||||
/// writes through the receiver (`this.f =
|
||||
/// rhs`) are recorded under the same `u32::MAX` sentinel as
|
||||
/// reads.
|
||||
#[test]
|
||||
|
|
@ -1151,7 +1105,7 @@ mod tests {
|
|||
assert!(entry.1.iter().any(|s| s == "cache"));
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5 / W3: container-element writes (`<elem>`
|
||||
/// container-element writes (`<elem>`
|
||||
/// marker) flow through the same channel as named-field writes
|
||||
/// when the synth Assign carries `FieldId::ELEM`.
|
||||
#[test]
|
||||
|
|
@ -1180,7 +1134,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 5: receiver projections are recorded under the
|
||||
/// receiver projections are recorded under the
|
||||
/// `u32::MAX` sentinel parameter index (mirror of
|
||||
/// `SsaFuncSummary::receiver_to_*`).
|
||||
#[test]
|
||||
|
|
@ -1233,7 +1187,7 @@ mod tests {
|
|||
assert!(is_container_write_callee("arr.__index_set__"));
|
||||
}
|
||||
|
||||
/// W5: regression guard — neither synth name should match the
|
||||
/// W5: regression guard, neither synth name should match the
|
||||
/// opposite predicate, otherwise the W2 read/write hooks would
|
||||
/// double-fire on the same call.
|
||||
#[test]
|
||||
|
|
@ -1245,10 +1199,10 @@ mod tests {
|
|||
#[test]
|
||||
fn name_proxy_hints_collects_field_only_locals() {
|
||||
let mut b = BodyBuilder::new();
|
||||
// `c` is the receiver — root location, hint=Other.
|
||||
// `c` is the receiver, root location, hint=Other.
|
||||
let c = b.fresh(Some("c"));
|
||||
b.emit(c, SsaOp::SelfParam, Some("c"));
|
||||
// `m := c.mu` — field projection, hint=FieldOnly.
|
||||
// `m := c.mu`, field projection, hint=FieldOnly.
|
||||
let mu = b.intern_field("mu");
|
||||
let m = b.fresh(Some("m"));
|
||||
b.emit(
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
//!
|
||||
//! Locations are interned to compact `LocId(u32)` handles so the
|
||||
//! union-find resolver can operate on dense integer keys. Field
|
||||
//! locations are keyed structurally by `(parent_loc_id, field_id)` —
|
||||
//! locations are keyed structurally by `(parent_loc_id, field_id)`;
|
||||
//! interning a `Field(parent, f)` always returns the same `LocId` no
|
||||
//! matter how many times the same `(parent, f)` pair is requested.
|
||||
|
||||
|
|
@ -29,14 +29,14 @@ pub const MAX_POINTSTO_MEMBERS: usize = 16;
|
|||
/// Compact handle for an interned [`AbsLoc`].
|
||||
///
|
||||
/// All abstract locations referenced by a single body share one
|
||||
/// [`LocInterner`] — `LocId`s are only meaningful relative to that
|
||||
/// [`LocInterner`], `LocId`s are only meaningful relative to that
|
||||
/// interner. IDs are assigned densely from 0 and are stable for the
|
||||
/// lifetime of the interner so the union-find can index parent / rank
|
||||
/// arrays directly.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
pub struct LocId(pub u32);
|
||||
|
||||
/// Sentinel "anywhere" location. Always `LocId(0)` — the interner
|
||||
/// Sentinel "anywhere" location. Always `LocId(0)`, the interner
|
||||
/// reserves the first slot at construction so callers can compare
|
||||
/// against it cheaply.
|
||||
pub const LOC_TOP: LocId = LocId(0);
|
||||
|
|
@ -48,7 +48,7 @@ pub const LOC_TOP: LocId = LocId(0);
|
|||
/// is exceeded the chain folds to [`AbsLoc::Top`].
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum AbsLoc {
|
||||
/// "Anywhere" — the over-approximation used when precision is
|
||||
/// "Anywhere", the over-approximation used when precision is
|
||||
/// unrecoverable (e.g. a value sourced from outside the analysed
|
||||
/// body, or a points-to set that exceeded the cap).
|
||||
Top,
|
||||
|
|
@ -60,7 +60,7 @@ pub enum AbsLoc {
|
|||
/// file. The interned `u32` is the `SsaValue.0` of the call /
|
||||
/// constructor instruction.
|
||||
Alloc(BodyId, u32),
|
||||
/// Function parameter — the abstract identity of the value
|
||||
/// Function parameter, the abstract identity of the value
|
||||
/// supplied by the caller for parameter `index`. The receiver
|
||||
/// (`self` / `this`) uses [`AbsLoc::SelfParam`] instead.
|
||||
Param(BodyId, usize),
|
||||
|
|
@ -69,7 +69,7 @@ pub enum AbsLoc {
|
|||
/// receiver" sentinel index.
|
||||
SelfParam(BodyId),
|
||||
/// Heap field of a parent location: `parent.f`. `parent` is
|
||||
/// itself a [`LocId`] — chains of field accesses produce nested
|
||||
/// itself a [`LocId`], chains of field accesses produce nested
|
||||
/// `Field` locations. Depth is bounded by [`MAX_FIELD_DEPTH`].
|
||||
Field { parent: LocId, field: FieldId },
|
||||
}
|
||||
|
|
@ -130,7 +130,7 @@ impl LocInterner {
|
|||
}
|
||||
|
||||
/// Resolve a [`LocId`] back to its [`AbsLoc`]. Panics on out-of-
|
||||
/// range ids — only ids the interner produced are valid.
|
||||
/// range ids, only ids the interner produced are valid.
|
||||
#[inline]
|
||||
pub fn resolve(&self, id: LocId) -> &AbsLoc {
|
||||
&self.locs[id.0 as usize]
|
||||
|
|
@ -202,7 +202,7 @@ impl LocInterner {
|
|||
}
|
||||
|
||||
/// Coarse classification of a value's points-to set, used by consumers
|
||||
/// (Phase 2: resource lifecycle) that don't need full set membership but
|
||||
/// (e.g. the resource-lifecycle pass) that don't need full set membership but
|
||||
/// do need to know "is this value's heap identity a *field* of some
|
||||
/// other value, or does it stand on its own?".
|
||||
///
|
||||
|
|
@ -213,7 +213,7 @@ impl LocInterner {
|
|||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum PtrProxyHint {
|
||||
/// Every member of the points-to set is an [`AbsLoc::Field`]. The
|
||||
/// value is a sub-object alias — e.g. `m` in `m := c.mu`.
|
||||
/// value is a sub-object alias, e.g. `m` in `m := c.mu`.
|
||||
FieldOnly,
|
||||
/// Anything else: the set is empty, contains a root location
|
||||
/// ([`AbsLoc::SelfParam`] / [`AbsLoc::Param`] / [`AbsLoc::Alloc`]),
|
||||
|
|
@ -242,7 +242,7 @@ impl Default for PointsToSet {
|
|||
}
|
||||
|
||||
impl PointsToSet {
|
||||
/// Empty set — the value points to nothing tracked by the
|
||||
/// Empty set, the value points to nothing tracked by the
|
||||
/// analysis (e.g. a scalar constant).
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
|
|
@ -257,7 +257,7 @@ impl PointsToSet {
|
|||
Self { ids }
|
||||
}
|
||||
|
||||
/// `{Top}` — the universal over-approximation.
|
||||
/// `{Top}`, the universal over-approximation.
|
||||
pub fn top() -> Self {
|
||||
Self::singleton(LOC_TOP)
|
||||
}
|
||||
|
|
@ -313,7 +313,7 @@ impl PointsToSet {
|
|||
}
|
||||
}
|
||||
|
||||
/// Set-union, in place. Returns `true` when `self` changed —
|
||||
/// Set-union, in place. Returns `true` when `self` changed;
|
||||
/// the constraint solver uses the bit to decide whether the
|
||||
/// containing equivalence class needs another pass.
|
||||
pub fn union_in_place(&mut self, other: &PointsToSet) -> bool {
|
||||
|
|
|
|||
|
|
@ -1,24 +1,12 @@
|
|||
//! Field-sensitive Steensgaard alias / points-to analysis.
|
||||
//!
|
||||
//! Sibling pass to [`crate::ssa::heap`]. Where `heap.rs` tracks per-value
|
||||
//! container identity for taint propagation through container element
|
||||
//! abstractions, this module tracks **field-sensitive** points-to so the
|
||||
//! engine can distinguish a receiver from one of its sub-fields:
|
||||
//!
|
||||
//! - `c.mu.Lock()` — the lock is acquired on `Field(c, mu)`, not on `c`
|
||||
//! itself. Without this distinction the resource-lifecycle pass
|
||||
//! mis-attributes the acquire to the receiver and emits a spurious
|
||||
//! "leakable resource" finding (the gin / `context.go` FP class).
|
||||
//! - Cross-method field flow — method A writes `this.cache`, method B
|
||||
//! reads `this.cache`; both observe a shared abstract location
|
||||
//! `Field(SelfParam, cache)` only when fields have a stable identity
|
||||
//! independent of the parent value.
|
||||
//!
|
||||
//! Phase 1 of the rollout (this commit) ships the analysis but no
|
||||
//! consumer. Behaviour is unchanged whether `NYX_POINTER_ANALYSIS=1` is
|
||||
//! set or not — the analysis is opt-in and only computed when callers
|
||||
//! ask for it. Phase 2 (resource lifecycle) and Phase 3 (taint engine)
|
||||
//! will start consuming the resulting facts.
|
||||
//! Sibling to [`crate::ssa::heap`]: where heap tracks per-value
|
||||
//! container identity for element abstractions, this module tracks
|
||||
//! field-sensitive points-to so the engine can distinguish a receiver
|
||||
//! from a sub-field. `c.mu.Lock()` acquires on `Field(c, mu)`, not `c`,
|
||||
//! so the resource-lifecycle pass doesn't mis-attribute the acquire.
|
||||
//! Cross-method field flow (method A writes `this.cache`, method B
|
||||
//! reads it) observes the shared `Field(SelfParam, cache)` location.
|
||||
|
||||
pub mod analysis;
|
||||
pub mod domain;
|
||||
|
|
@ -29,12 +17,8 @@ pub use analysis::{
|
|||
};
|
||||
pub use domain::{AbsLoc, LocId, LocInterner, PointsToSet, PtrProxyHint};
|
||||
|
||||
/// Returns whether the field-sensitive pointer analysis is enabled at runtime.
|
||||
///
|
||||
/// Default: enabled (post-Phase-6 flip on 2026-04-26). Set
|
||||
/// `NYX_POINTER_ANALYSIS=0` (or `false`) to disable for one release
|
||||
/// cycle so customer scans can compare baselines. The env-var
|
||||
/// override is removed entirely in the next release.
|
||||
/// Returns whether the field-sensitive pointer analysis is enabled.
|
||||
/// Set `NYX_POINTER_ANALYSIS=0` (or `false`) to disable.
|
||||
#[inline]
|
||||
pub fn is_enabled() -> bool {
|
||||
!matches!(
|
||||
|
|
|
|||
24
src/rank.rs
24
src/rank.rs
|
|
@ -97,14 +97,14 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank {
|
|||
// direction of precision loss is classified by
|
||||
// `EngineNote::direction()` and drives a bounded penalty:
|
||||
//
|
||||
// * `Bail` — analysis aborted on this body → -8.0
|
||||
// * `OverReport` — widening may have produced a false positive → -8.0
|
||||
// * `UnderReport` — fixpoint was cut short but this finding is
|
||||
// * `Bail`: analysis aborted on this body → -8.0
|
||||
// * `OverReport`: widening may have produced a false positive → -8.0
|
||||
// * `UnderReport`: fixpoint was cut short but this finding is
|
||||
// still a real flow → -3.0
|
||||
// * `Informational` — no penalty (cache reuse etc.)
|
||||
// * `Informational`: no penalty (cache reuse etc.)
|
||||
//
|
||||
// The penalty is the *worst* direction across all attached notes —
|
||||
// not additive — so a body with ten `OriginsTruncated` notes is not
|
||||
// The penalty is the *worst* direction across all attached notes,
|
||||
// not additive, so a body with ten `OriginsTruncated` notes is not
|
||||
// ranked below a body with one `ParseTimeout`. Magnitudes are
|
||||
// chosen so that `High + capped` (60 − 8 = 52) still exceeds
|
||||
// `Medium + taint + UserInput` (30 + 10 + 6 = 46), preserving the
|
||||
|
|
@ -125,7 +125,7 @@ pub fn compute_attack_rank(diag: &Diag) -> AttackRank {
|
|||
///
|
||||
/// `None` when the finding has no evidence struct, no engine notes, or
|
||||
/// only informational notes. Uses `worst_direction` so the penalty is
|
||||
/// the single most credibility-damaging direction present — adding more
|
||||
/// the single most credibility-damaging direction present, adding more
|
||||
/// notes of the same direction does not compound the penalty.
|
||||
struct CompletenessPenalty {
|
||||
value: f64,
|
||||
|
|
@ -289,16 +289,16 @@ fn source_kind_priority(source_value: &str) -> f64 {
|
|||
// Strong user-input signals
|
||||
6.0
|
||||
} else if lower.contains("env") || lower.contains("var(") || lower.contains("getenv") {
|
||||
// Environment / config — still attacker-controllable in many deployments
|
||||
// Environment / config, still attacker-controllable in many deployments
|
||||
5.0
|
||||
} else if lower.contains("read") || lower.contains("file") || lower.contains("open") {
|
||||
// File system — needs indirect vector
|
||||
// File system, needs indirect vector
|
||||
3.0
|
||||
} else if lower.contains("query") || lower.contains("fetch") || lower.contains("select") {
|
||||
// Database — needs prior injection
|
||||
// Database, needs prior injection
|
||||
2.0
|
||||
} else {
|
||||
// Unknown / unrecognised — treat as moderately exploitable
|
||||
// Unknown / unrecognised, treat as moderately exploitable
|
||||
4.0
|
||||
}
|
||||
}
|
||||
|
|
@ -931,7 +931,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn completeness_penalty_is_not_additive_across_notes() {
|
||||
// Ten OriginsTruncated notes must produce the same penalty as one —
|
||||
// Ten OriginsTruncated notes must produce the same penalty as one;
|
||||
// the penalty reflects the worst direction, not a count.
|
||||
let mut d_many = clean_diag_with_evidence();
|
||||
let many = (0..10)
|
||||
|
|
|
|||
|
|
@ -3,11 +3,11 @@
|
|||
//! This module is entirely Rust-flavored helpers for the cross-file call graph.
|
||||
//! Other languages do not need it. The two pieces are:
|
||||
//!
|
||||
//! * [`derive_module_path`] — given a Rust source file path and an optional
|
||||
//! * [`derive_module_path`], given a Rust source file path and an optional
|
||||
//! crate root, produce its canonical crate-relative module path
|
||||
//! (`src/foo/bar.rs` → `"foo::bar"`, `src/lib.rs` → `""`).
|
||||
//!
|
||||
//! * [`parse_rust_use_map`] — walk the top-level `use_declaration` nodes of a
|
||||
//! * [`parse_rust_use_map`], walk the top-level `use_declaration` nodes of a
|
||||
//! parsed tree and produce a [`RustUseMap`] mapping local aliases to fully
|
||||
//! qualified paths plus a list of wildcard imports.
|
||||
//!
|
||||
|
|
@ -27,7 +27,7 @@
|
|||
//! * Macro-expanded `use` statements
|
||||
//! * `pub use` re-exports across modules
|
||||
//! * `extern crate alias_name;`
|
||||
//! * Self-prefixed imports (`use self::sub::foo;`) — treated as `self::sub::foo`
|
||||
//! * Self-prefixed imports (`use self::sub::foo;`), treated as `self::sub::foo`
|
||||
//!
|
||||
//! These are flagged in the final pass-1 telemetry but do not block resolution.
|
||||
|
||||
|
|
@ -102,7 +102,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option<
|
|||
let mut segments: Vec<&str> = rel.iter().filter_map(|s| s.to_str()).collect();
|
||||
|
||||
// Strip a leading `src` directory if present. Files outside `src/` (e.g.
|
||||
// tests, examples, build.rs) get a `None` here — we do not have a stable
|
||||
// tests, examples, build.rs) get a `None` here, we do not have a stable
|
||||
// module path for them and resolution should fall back to file-based.
|
||||
match segments.first().copied() {
|
||||
Some("src") => {
|
||||
|
|
@ -145,7 +145,7 @@ pub fn derive_module_path(file_path: &Path, scan_root: Option<&Path>) -> Option<
|
|||
/// [`RustUseMap`].
|
||||
///
|
||||
/// The walk only inspects direct children of the source root. Nested `use`s
|
||||
/// inside functions or impls are deliberately skipped — their scope is local
|
||||
/// inside functions or impls are deliberately skipped, their scope is local
|
||||
/// and does not influence the cross-file call graph at the module level.
|
||||
pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
|
||||
let mut map = RustUseMap::default();
|
||||
|
|
@ -160,7 +160,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
|
|||
Some(n) => n,
|
||||
None => {
|
||||
// tree-sitter-rust 0.24 sometimes exposes the body as a named
|
||||
// child instead of a field — fall back to the first named child.
|
||||
// child instead of a field, fall back to the first named child.
|
||||
match child.named_child(0) {
|
||||
Some(n) => n,
|
||||
None => continue,
|
||||
|
|
@ -179,7 +179,7 @@ pub fn parse_rust_use_map(src: &[u8], tree: &Tree) -> RustUseMap {
|
|||
/// `b::c` inside `a::{b::c}` is flattened to `a::b::c`).
|
||||
fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut RustUseMap) {
|
||||
match node.kind() {
|
||||
// `crate::auth::token::validate` — terminal scoped path, leaf is the alias.
|
||||
// `crate::auth::token::validate`, terminal scoped path, leaf is the alias.
|
||||
"scoped_identifier" => {
|
||||
let segments = scoped_segments(node, src);
|
||||
if segments.is_empty() {
|
||||
|
|
@ -191,7 +191,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
|
|||
map.aliases.insert(leaf, full);
|
||||
}
|
||||
}
|
||||
// `validate` — bare identifier (e.g. `use foo::validate`).
|
||||
// `validate`, bare identifier (e.g. `use foo::validate`).
|
||||
"identifier" => {
|
||||
let name = node_text(node, src).to_string();
|
||||
if name.is_empty() {
|
||||
|
|
@ -201,7 +201,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
|
|||
segs.push(name.clone());
|
||||
map.aliases.insert(name, segs.join("::"));
|
||||
}
|
||||
// `crate::auth::token::{validate, verify}` — left side is the prefix,
|
||||
// `crate::auth::token::{validate, verify}`, left side is the prefix,
|
||||
// right side is a use_list of further use clauses.
|
||||
"scoped_use_list" => {
|
||||
// path field carries the prefix; the list field carries the body.
|
||||
|
|
@ -239,7 +239,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
|
|||
collect_use_paths(c, src, prefix, map);
|
||||
}
|
||||
}
|
||||
// `crate::auth::token::validate as ok` — alias the leaf identifier.
|
||||
// `crate::auth::token::validate as ok`, alias the leaf identifier.
|
||||
"use_as_clause" => {
|
||||
let path_node = node
|
||||
.child_by_field_name("path")
|
||||
|
|
@ -256,7 +256,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
|
|||
map.aliases.insert(alias_name, full);
|
||||
}
|
||||
}
|
||||
// `crate::auth::token::*` — record the prefix as a wildcard import.
|
||||
// `crate::auth::token::*`, record the prefix as a wildcard import.
|
||||
"use_wildcard" => {
|
||||
// The wildcard's child is the path being wildcarded.
|
||||
let path_node = node.named_child(0);
|
||||
|
|
@ -270,7 +270,7 @@ fn collect_use_paths(node: Node<'_>, src: &[u8], prefix: &[String], map: &mut Ru
|
|||
}
|
||||
_ => {
|
||||
// Unknown/unsupported form (e.g. macro_invocation in use position,
|
||||
// attribute-prefixed clauses) — flag in pass-1 telemetry, skip
|
||||
// attribute-prefixed clauses), flag in pass-1 telemetry, skip
|
||||
// here to keep the walk total.
|
||||
}
|
||||
}
|
||||
|
|
@ -452,7 +452,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn module_path_no_cargo_toml_with_scan_root() {
|
||||
// No Cargo.toml anywhere — fall back to scan root.
|
||||
// No Cargo.toml anywhere, fall back to scan root.
|
||||
let dir = PathBuf::from("/tmp/nyx_mp_test_no_cargo");
|
||||
std::fs::create_dir_all(dir.join("src")).ok();
|
||||
// Make sure no Cargo.toml exists.
|
||||
|
|
@ -535,7 +535,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn use_map_malformed_does_not_panic() {
|
||||
// Truncated input — must not panic.
|
||||
// Truncated input, must not panic.
|
||||
let src = b"use crate::auth::";
|
||||
let tree = parse(std::str::from_utf8(src).unwrap());
|
||||
let _ = parse_rust_use_map(src, &tree);
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ pub struct AppState {
|
|||
pub findings_cache: Arc<RwLock<Option<CachedFindings>>>,
|
||||
}
|
||||
|
||||
/// 50 MiB cap on request bodies — generous for config uploads, tight
|
||||
/// 50 MiB cap on request bodies, generous for config uploads, tight
|
||||
/// enough to prevent OOM from a rogue client.
|
||||
const MAX_BODY_BYTES: usize = 50 * 1024 * 1024;
|
||||
|
||||
|
|
@ -286,7 +286,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// Panic inside a thread that holds a write guard on the shared config lock.
|
||||
/// With `parking_lot::RwLock`, the lock must remain usable afterwards —
|
||||
/// With `parking_lot::RwLock`, the lock must remain usable afterwards;
|
||||
/// this is the poison-recovery contract we rely on in every route handler.
|
||||
#[tokio::test]
|
||||
async fn config_lock_survives_panic_in_write_guard() {
|
||||
|
|
|
|||
|
|
@ -782,7 +782,7 @@ pub struct FuncSummaryView {
|
|||
/// Enclosing container path (class / impl / module / outer function).
|
||||
/// Empty for free top-level functions.
|
||||
pub container: String,
|
||||
/// Structural [`crate::symbol::FuncKind`] slug — `"fn"`, `"method"`,
|
||||
/// Structural [`crate::symbol::FuncKind`] slug, `"fn"`, `"method"`,
|
||||
/// `"closure"`, etc. Lets the UI distinguish anonymous closures from
|
||||
/// named functions for filtering.
|
||||
pub func_kind: String,
|
||||
|
|
@ -934,10 +934,10 @@ pub struct PointerView {
|
|||
pub locations: Vec<PointerLocationView>,
|
||||
pub values: Vec<PointerValueView>,
|
||||
/// Field reads attributed to params/receiver via the field-points-to
|
||||
/// extractor (Phase 5).
|
||||
/// extractor.
|
||||
pub field_reads: Vec<PointerFieldEntryView>,
|
||||
/// Field writes attributed to params/receiver via the field-points-to
|
||||
/// extractor (Phase 5).
|
||||
/// extractor.
|
||||
pub field_writes: Vec<PointerFieldEntryView>,
|
||||
/// Number of distinct interned locations beyond the reserved Top sentinel.
|
||||
pub location_count: usize,
|
||||
|
|
@ -998,7 +998,7 @@ impl PointerView {
|
|||
});
|
||||
}
|
||||
|
||||
// Per-value pt sets — emit only values with non-empty sets to keep
|
||||
// Per-value pt sets, emit only values with non-empty sets to keep
|
||||
// the payload focused on interesting facts.
|
||||
let mut values: Vec<PointerValueView> = Vec::new();
|
||||
for v in 0..ssa.num_values() as u32 {
|
||||
|
|
@ -1064,12 +1064,12 @@ pub struct TypeFactDetailView {
|
|||
pub ssa_value: u32,
|
||||
pub var_name: Option<String>,
|
||||
pub line: usize,
|
||||
/// Type kind tag — matches the [`TypeKind`] discriminant
|
||||
/// Type kind tag, matches the [`TypeKind`] discriminant
|
||||
/// (`String`, `Int`, `HttpClient`, `Dto`, …).
|
||||
pub kind: String,
|
||||
/// True when the value is allowed to be null/None.
|
||||
pub nullable: bool,
|
||||
/// Container/class name — set for `HttpClient`, `DatabaseConnection`,
|
||||
/// Container/class name, set for `HttpClient`, `DatabaseConnection`,
|
||||
/// `Dto`, etc. Mirrors [`TypeKind::container_name`].
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub container: Option<String>,
|
||||
|
|
@ -1437,7 +1437,7 @@ pub fn function_list(analysis: &FileAnalysis) -> Vec<FunctionInfo> {
|
|||
/// Lower a single function to SSA and optimize it.
|
||||
///
|
||||
/// Returns the per-function body graph alongside the SSA. SSA is lowered
|
||||
/// against `body.graph`, whose `NodeIndex` space is body-local — the file's
|
||||
/// against `body.graph`, whose `NodeIndex` space is body-local, the file's
|
||||
/// top-level CFG (`analysis.cfg()`) has a different index space, so any
|
||||
/// downstream analysis that indexes by `inst.cfg_node` must use the returned
|
||||
/// `&Cfg`, not `analysis.cfg()`.
|
||||
|
|
@ -1638,7 +1638,7 @@ pub fn analyse_file_summaries(
|
|||
/// Run the file-level authorization extraction pipeline for the debug UI.
|
||||
///
|
||||
/// Returns the structured `AuthorizationModel` (routes, units, sensitive
|
||||
/// operations, auth checks) plus the file bytes and an `enabled` flag —
|
||||
/// operations, auth checks) plus the file bytes and an `enabled` flag;
|
||||
/// the bytes drive line-number resolution in the view, and `enabled`
|
||||
/// surfaces "auth analysis is off for this language" without conflating
|
||||
/// it with an empty result.
|
||||
|
|
@ -1651,7 +1651,7 @@ pub fn analyse_file_auth(
|
|||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
|
||||
.ok_or(StatusCode::BAD_REQUEST)?;
|
||||
// Determine whether the auth rules were actually enabled for this
|
||||
// file's language — `extract_auth_model_for_debug` returns an empty
|
||||
// file's language, `extract_auth_model_for_debug` returns an empty
|
||||
// model both when the rules are disabled and when the file just
|
||||
// happens to have no routes. The view distinguishes the two so the
|
||||
// UI can show "analysis disabled" instead of "no routes found".
|
||||
|
|
@ -2122,7 +2122,7 @@ fn main() {
|
|||
// Belt-and-suspenders: assert that calling with the wrong (top-level)
|
||||
// CFG would have panicked. We can't catch the panic across rayon
|
||||
// worker threads here, but we can confirm at least one `inst.cfg_node`
|
||||
// index lies outside `analysis.cfg()`'s range — that's what triggers
|
||||
// index lies outside `analysis.cfg()`'s range, that's what triggers
|
||||
// the OOB indexing inside `transfer_inst`.
|
||||
let toplevel_count = analysis.cfg().node_count();
|
||||
let max_inst_idx = ssa
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
//! Health-score scoring engine — v3.5.
|
||||
//! Health-score scoring engine, v3.5.
|
||||
//!
|
||||
//! Pure-function scoring over a `HealthInputs` struct. Documented in
|
||||
//! `docs/health-score-audit.md` (calibration, rationale) and
|
||||
|
|
@ -15,7 +15,7 @@
|
|||
//!
|
||||
//! 2. **HIGH-count guardrails.** The *qualitative* axis: HIGH counts
|
||||
//! cap the maximum grade and floor "no HIGH" to at least C. These
|
||||
//! are non-negotiable promises — even a perfect-everywhere-else
|
||||
//! are non-negotiable promises, even a perfect-everywhere-else
|
||||
//! repo with 6 confirmed HIGHs grades F.
|
||||
//!
|
||||
//! Modifiers (triage, trend, stale, regression, suppression hygiene)
|
||||
|
|
@ -27,17 +27,17 @@
|
|||
//! * Verdict-weighted credibility (`Confirmed > NotAttempted >
|
||||
//! Inconclusive > Infeasible`). This is the structural protection
|
||||
//! against false-positive-driven F grades while the scanner is
|
||||
//! still maturing — it auto-tightens as symex coverage grows.
|
||||
//! still maturing, it auto-tightens as symex coverage grows.
|
||||
//! * Cross-file vs intra-file vs AST-only weighting via
|
||||
//! `context_factor`.
|
||||
//! * Test-path downweight (0.3×) — a HIGH in a test fixture is
|
||||
//! * Test-path downweight (0.3×), a HIGH in a test fixture is
|
||||
//! genuinely less concerning than one in a request handler.
|
||||
//! * Effective HIGH count for ceilings — the HIGH-count caps key on
|
||||
//! * Effective HIGH count for ceilings, the HIGH-count caps key on
|
||||
//! credibility-adjusted HIGHs, not raw HIGHs. A repo with 5
|
||||
//! low-confidence HIGHs that got `NotAttempted` from symex doesn't
|
||||
//! pay the same ceiling cost as a repo with 5 `Confirmed` HIGHs.
|
||||
//! * Tighter modifier ranges so they can't flip a band.
|
||||
//! * No `parse_success_rate` (it's actually a cache-miss metric —
|
||||
//! * No `parse_success_rate` (it's actually a cache-miss metric;
|
||||
//! see `project_parse_success_rate_misnomer.md`).
|
||||
|
||||
use crate::commands::scan::Diag;
|
||||
|
|
@ -48,11 +48,11 @@ use crate::server::models::{BacklogStats, FindingSummary, HealthComponent, Healt
|
|||
// ── Tunables ─────────────────────────────────────────────────────────────────
|
||||
//
|
||||
// Calibrated for v0.5.0 scanner FP rate. As Nyx symex coverage and
|
||||
// rule precision improve, the HIGH ceilings should tighten — see
|
||||
// rule precision improve, the HIGH ceilings should tighten, see
|
||||
// `docs/health-score-audit.md` "Calibration trajectory" for the
|
||||
// roadmap.
|
||||
|
||||
/// Below this file count, we floor the size divisor at 1.0 — tiny
|
||||
/// Below this file count, we floor the size divisor at 1.0, tiny
|
||||
/// repos can't claim infinite per-LOC dilution from one finding.
|
||||
const FILES_FLOOR: f64 = 100.0;
|
||||
|
||||
|
|
@ -66,7 +66,7 @@ const QUALITY_DRAG_PER_FINDING: f64 = 0.05;
|
|||
const QUALITY_DRAG_CAP: f64 = 15.0;
|
||||
|
||||
/// Below this finding count, the Triage component contributes
|
||||
/// weight 0 — we don't punish fresh users for not having triaged
|
||||
/// weight 0, we don't punish fresh users for not having triaged
|
||||
/// what didn't need triaging.
|
||||
const TRIAGE_FLOOR: usize = 20;
|
||||
|
||||
|
|
@ -77,7 +77,7 @@ const STALE_PENALTY_CAP: f64 = 10.0;
|
|||
// ── Public API ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// Pure inputs to the health-score calculation. No app state, no DB
|
||||
/// handles — those upstream concerns are flattened into primitives the
|
||||
/// handles, those upstream concerns are flattened into primitives the
|
||||
/// scorer actually consumes.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct HealthInputs<'a> {
|
||||
|
|
@ -120,7 +120,7 @@ pub fn compute(inp: &HealthInputs<'_>) -> HealthScore {
|
|||
let quality_drag = quality_drag(weighted.quality_count);
|
||||
let base_after_drag = (base_score - quality_drag).clamp(0.0, 100.0);
|
||||
|
||||
// Step 5: HIGH-count guardrails — keyed on *effective* HIGH count
|
||||
// Step 5: HIGH-count guardrails, keyed on *effective* HIGH count
|
||||
// (credibility-weighted), not raw count. This is what protects
|
||||
// users from FP-driven F grades while the scanner is maturing.
|
||||
let ceiling = high_total_ceiling(weighted.effective_high);
|
||||
|
|
@ -161,9 +161,9 @@ struct WeightedAggregate {
|
|||
/// context_factor` across security findings. Quality lints are
|
||||
/// handled separately via `quality_drag`.
|
||||
raw_weight: f64,
|
||||
/// Number of `*.quality.*` findings — drives `quality_drag`.
|
||||
/// Number of `*.quality.*` findings, drives `quality_drag`.
|
||||
quality_count: usize,
|
||||
/// Credibility-adjusted HIGH count (rounded) — drives the HIGH
|
||||
/// Credibility-adjusted HIGH count (rounded), drives the HIGH
|
||||
/// ceiling and floor. A low-confidence + Inconclusive HIGH might
|
||||
/// contribute 0.2; five of them would round to 1.
|
||||
effective_high: usize,
|
||||
|
|
@ -171,10 +171,10 @@ struct WeightedAggregate {
|
|||
raw_high: usize,
|
||||
raw_medium: usize,
|
||||
raw_low_security: usize,
|
||||
/// Confidence rate (high+medium*0.5)/total — drives the
|
||||
/// Confidence rate (high+medium*0.5)/total, drives the
|
||||
/// confidence component. 100 if no findings.
|
||||
confidence_rate: f64,
|
||||
/// Symex coverage — % of taint findings with any non-NotAttempted
|
||||
/// Symex coverage, % of taint findings with any non-NotAttempted
|
||||
/// verdict. Surfaced in component detail; not currently in score.
|
||||
symex_coverage: f64,
|
||||
}
|
||||
|
|
@ -218,7 +218,7 @@ fn aggregate_findings(findings: &[Diag]) -> WeightedAggregate {
|
|||
_ => 0.0,
|
||||
};
|
||||
|
||||
// Symex coverage tracking — only meaningful for findings with
|
||||
// Symex coverage tracking, only meaningful for findings with
|
||||
// taint-flow evidence (the ones symex even attempts).
|
||||
if let Some(ev) = f.evidence.as_ref()
|
||||
&& ev.symbolic.is_some()
|
||||
|
|
@ -294,7 +294,7 @@ fn context_factor(f: &Diag) -> f64 {
|
|||
return 0.3;
|
||||
}
|
||||
let Some(ev) = f.evidence.as_ref() else {
|
||||
return 0.75; // No evidence at all — pattern match
|
||||
return 0.75; // No evidence at all, pattern match
|
||||
};
|
||||
if ev.flow_steps.is_empty() {
|
||||
return 0.75;
|
||||
|
|
@ -351,7 +351,7 @@ fn quality_drag(quality_count: usize) -> f64 {
|
|||
(quality_count as f64 * QUALITY_DRAG_PER_FINDING).min(QUALITY_DRAG_CAP)
|
||||
}
|
||||
|
||||
// ── HIGH guardrails — calibrated for v0.5.0 FP rate ──────────────────────────
|
||||
// ── HIGH guardrails, calibrated for v0.5.0 FP rate ──────────────────────────
|
||||
|
||||
/// Final-score ceiling keyed on *effective* HIGH count (credibility-
|
||||
/// weighted, not raw). See module docstring for the rationale.
|
||||
|
|
@ -398,7 +398,7 @@ fn build_components(
|
|||
let sev_score = base_after_drag.round().clamp(0.0, 100.0) as u8;
|
||||
let sev_detail = severity_detail(weighted, size_divisor, inp.repo_files, inp.backlog);
|
||||
|
||||
// Confidence component — high-conf rate scaled into 0..=100.
|
||||
// Confidence component, high-conf rate scaled into 0..=100.
|
||||
let conf_score = weighted.confidence_rate.round().clamp(0.0, 100.0) as u8;
|
||||
let conf_detail = format!(
|
||||
"High-confidence rate {:.0}% across {} security finding{}",
|
||||
|
|
@ -407,7 +407,7 @@ fn build_components(
|
|||
plural_s(total - weighted.quality_count)
|
||||
);
|
||||
|
||||
// Trend component — only contributes weight when has_history.
|
||||
// Trend component, only contributes weight when has_history.
|
||||
let net = inp.fixed_since_last as i64 - inp.new_since_last as i64;
|
||||
let trend_score = (50 + net * 5).clamp(0, 100) as u8;
|
||||
let trend_weight = if inp.has_history { 0.20 } else { 0.0 };
|
||||
|
|
@ -420,7 +420,7 @@ fn build_components(
|
|||
"Not applicable: no prior scan to compare against (re-scan to populate)".into()
|
||||
};
|
||||
|
||||
// Triage — drops out when total < TRIAGE_FLOOR.
|
||||
// Triage, drops out when total < TRIAGE_FLOOR.
|
||||
let triage_active = total >= TRIAGE_FLOOR;
|
||||
let triage_score = (inp.triage_coverage * 100.0).round().clamp(0.0, 100.0) as u8;
|
||||
let triage_weight = if triage_active { 0.20 } else { 0.0 };
|
||||
|
|
@ -470,7 +470,7 @@ fn build_components(
|
|||
HealthComponent {
|
||||
label: "Severity pressure".into(),
|
||||
score: sev_score,
|
||||
weight: 1.0, // Severity is the *base*, not a modifier — full weight in the blend.
|
||||
weight: 1.0, // Severity is the *base*, not a modifier, full weight in the blend.
|
||||
detail: sev_detail,
|
||||
},
|
||||
HealthComponent {
|
||||
|
|
@ -770,7 +770,7 @@ mod tests {
|
|||
.collect();
|
||||
let s = summary_of(&findings);
|
||||
let h = compute(&first_scan(&s, &findings, 0.0, 100));
|
||||
// The score reflects credibility — should NOT crater to F.
|
||||
// The score reflects credibility, should NOT crater to F.
|
||||
assert!(
|
||||
h.score >= 60,
|
||||
"low-credibility HIGHs shouldn't crater to F, got {}",
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ pub struct JobManager {
|
|||
job_order: Mutex<Vec<String>>,
|
||||
active_job_id: Mutex<Option<String>>,
|
||||
max_jobs: usize,
|
||||
/// Dedicated rayon pool for scans — keeps the global pool (and tokio
|
||||
/// Dedicated rayon pool for scans, keeps the global pool (and tokio
|
||||
/// worker threads) free so the web UI stays responsive during a scan.
|
||||
scan_pool: rayon::ThreadPool,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -632,7 +632,7 @@ pub struct HealthScore {
|
|||
pub struct HealthComponent {
|
||||
/// Human label (e.g. "Severity pressure", "Trend", "Triage").
|
||||
pub label: String,
|
||||
/// 0–100 — already inverted so higher = healthier.
|
||||
/// 0–100, already inverted so higher = healthier.
|
||||
pub score: u8,
|
||||
/// Weight applied when blending into the final score (0.0–1.0).
|
||||
pub weight: f64,
|
||||
|
|
@ -662,7 +662,7 @@ pub struct BacklogStats {
|
|||
pub median_age_days: Option<u32>,
|
||||
/// Findings older than 30 days that remain open.
|
||||
pub stale_count: usize,
|
||||
/// Histogram buckets (label, count) — fixed 5 buckets.
|
||||
/// Histogram buckets (label, count), fixed 5 buckets.
|
||||
pub age_buckets: Vec<OverviewCount>,
|
||||
}
|
||||
|
||||
|
|
@ -691,12 +691,12 @@ pub struct ConfidenceDistribution {
|
|||
pub struct ScannerQuality {
|
||||
pub files_scanned: u64,
|
||||
pub files_skipped: u64,
|
||||
/// 0.0–1.0 — files_scanned / (files_scanned + files_skipped).
|
||||
/// 0.0–1.0, files_scanned / (files_scanned + files_skipped).
|
||||
pub parse_success_rate: f64,
|
||||
pub functions_analyzed: u64,
|
||||
pub call_edges: u64,
|
||||
pub unresolved_calls: u64,
|
||||
/// 0.0–1.0 — call_edges / (call_edges + unresolved_calls).
|
||||
/// 0.0–1.0, call_edges / (call_edges + unresolved_calls).
|
||||
pub call_resolution_rate: f64,
|
||||
/// % of taint findings that received a symbolic verdict (Confirmed|Infeasible|Inconclusive).
|
||||
pub symex_verified_rate: f64,
|
||||
|
|
@ -712,7 +712,7 @@ pub struct IssueCategoryBucket {
|
|||
pub count: usize,
|
||||
}
|
||||
|
||||
/// "Hot sink" — a single callee that absorbs many findings.
|
||||
/// "Hot sink", a single callee that absorbs many findings.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct HotSink {
|
||||
/// Callee name (best-effort; from flow_steps last Sink).
|
||||
|
|
@ -723,7 +723,7 @@ pub struct HotSink {
|
|||
/// One OWASP Top-10 (2021) bucket.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct OwaspBucket {
|
||||
/// "A01:2021 — Broken Access Control" etc.
|
||||
/// "A01:2021, Broken Access Control" etc.
|
||||
pub code: String,
|
||||
pub label: String,
|
||||
pub count: usize,
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ pub async fn observe(mut request: Request, next: Next) -> Response {
|
|||
response.headers_mut().insert(REQUEST_ID_HEADER, value);
|
||||
}
|
||||
|
||||
// Skip noisy SSE channel — long-lived stream pollutes logs.
|
||||
// Skip noisy SSE channel, long-lived stream pollutes logs.
|
||||
if path != "/api/events" {
|
||||
if status.is_server_error() {
|
||||
tracing::error!(
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
//! Static rule-id → OWASP Top-10 (2021) mapping for the dashboard.
|
||||
//!
|
||||
//! Rule IDs follow the convention `{lang}.{family}.{name}` (e.g. `js.xss.outer_html`).
|
||||
//! The family segment is what determines the bucket. Conservative — when in doubt,
|
||||
//! The family segment is what determines the bucket. Conservative, when in doubt,
|
||||
//! map to the closest fit; rules with no obvious bucket are left unbucketed.
|
||||
|
||||
use crate::server::models::OwaspBucket;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Extract the family token from a rule ID. Handles two ID shapes:
|
||||
/// 1. `lang.family.name` — typical (e.g. `js.xss.outer_html`)
|
||||
/// 2. `family-subname` or single-segment — engine-emitted (e.g.
|
||||
/// 1. `lang.family.name`, typical (e.g. `js.xss.outer_html`)
|
||||
/// 2. `family-subname` or single-segment, engine-emitted (e.g.
|
||||
/// `state-resource-leak`, `taint-unsanitised-flow`, `cfg-error-fallthrough`)
|
||||
fn extract_family(rule_id: &str) -> &str {
|
||||
if let Some(idx) = rule_id.find('.') {
|
||||
|
|
@ -33,23 +33,23 @@ pub fn owasp_bucket_for(rule_id: &str) -> Option<(&'static str, &'static str)> {
|
|||
}
|
||||
|
||||
Some(match family {
|
||||
// A01 — Broken Access Control
|
||||
// A01, Broken Access Control
|
||||
"auth" | "csrf" | "mass_assign" | "path" | "redirect" => ("A01", "Broken Access Control"),
|
||||
// A02 — Cryptographic Failures
|
||||
// A02, Cryptographic Failures
|
||||
"crypto" | "secrets" => ("A02", "Cryptographic Failures"),
|
||||
// A03 — Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection,
|
||||
// A03, Injection (covers SQLi, XSS, command, code-eval, template, NoSQL, LDAP, reflection,
|
||||
// and engine-level taint findings without a more specific family tag).
|
||||
"sqli" | "xss" | "cmdi" | "code_exec" | "template" | "nosql" | "ldap" | "reflection"
|
||||
| "taint" => ("A03", "Injection"),
|
||||
// A05 — Security Misconfiguration (TLS verify off, cookie flags, prototype pollution)
|
||||
// A05, Security Misconfiguration (TLS verify off, cookie flags, prototype pollution)
|
||||
"config" | "transport" | "prototype" => ("A05", "Security Misconfiguration"),
|
||||
// A08 — Software and Data Integrity Failures
|
||||
// A08, Software and Data Integrity Failures
|
||||
"deser" => ("A08", "Software and Data Integrity Failures"),
|
||||
// A09 — Logging & Monitoring Failures
|
||||
// A09, Logging & Monitoring Failures
|
||||
"log" => ("A09", "Logging and Monitoring Failures"),
|
||||
// A10 — SSRF
|
||||
// A10, SSRF
|
||||
"ssrf" => ("A10", "Server-Side Request Forgery"),
|
||||
// Memory-safety + state-machine resource lifecycle bugs — closest OWASP fit is
|
||||
// Memory-safety + state-machine resource lifecycle bugs, closest OWASP fit is
|
||||
// A04 Insecure Design (defensive depth).
|
||||
"memory" | "state" => ("A04", "Insecure Design"),
|
||||
// Quality findings (e.g. rs.quality.unwrap) and CFG structural issues
|
||||
|
|
@ -162,7 +162,7 @@ mod tests {
|
|||
fn malformed_rule_returns_none() {
|
||||
// single-segment "not" → family "not" → unmapped → None
|
||||
assert_eq!(owasp_bucket_for("not-a-rule"), None);
|
||||
// "js.onlytwo" — family is "onlytwo" which is unmapped
|
||||
// "js.onlytwo", family is "onlytwo" which is unmapped
|
||||
assert_eq!(owasp_bucket_for("js.onlytwo"), None);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -282,7 +282,7 @@ async fn remove_terminator(
|
|||
// ── Sources / Sinks / Sanitizers (by kind) ───────────────────────────────────
|
||||
|
||||
fn list_by_kind(state: &AppState, target_kind: &str) -> Vec<LabelEntryView> {
|
||||
// Built-in rules live on /api/rules — keep this endpoint focused on the
|
||||
// Built-in rules live on /api/rules, keep this endpoint focused on the
|
||||
// user's own additions in nyx.local.
|
||||
let target_rule_kind = match target_kind {
|
||||
"source" => RuleKind::Source,
|
||||
|
|
|
|||
|
|
@ -306,8 +306,8 @@ async fn get_type_facts(
|
|||
}
|
||||
|
||||
/// GET /api/debug/auth?file=<path>
|
||||
/// Return the file-scoped authorization model — routes, units,
|
||||
/// sensitive operations, and auth checks — for the debug UI.
|
||||
/// Return the file-scoped authorization model, routes, units,
|
||||
/// sensitive operations, and auth checks, for the debug UI.
|
||||
async fn get_auth(
|
||||
State(state): State<AppState>,
|
||||
Query(q): Query<FileQuery>,
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ struct TreeEntry {
|
|||
struct SymbolEntry {
|
||||
name: String,
|
||||
/// Legacy display kind (`"function"` / `"method"`) used by existing CSS
|
||||
/// classes in the frontend. Kept for backward-compat — new consumers
|
||||
/// classes in the frontend. Kept for backward-compat, new consumers
|
||||
/// should prefer `func_kind`.
|
||||
kind: String,
|
||||
/// Structural [`crate::symbol::FuncKind`] slug (`"fn"`, `"method"`,
|
||||
|
|
@ -291,7 +291,7 @@ async fn get_symbols(
|
|||
let entries: Vec<SymbolEntry> = symbols
|
||||
.into_iter()
|
||||
.map(|(name, arity, _lang, namespace, container, func_kind)| {
|
||||
// Legacy `kind` field — still used by existing CSS classes
|
||||
// Legacy `kind` field, still used by existing CSS classes
|
||||
// (`symbol-kind-method`, `symbol-kind-function`). Map any
|
||||
// method-like FuncKind onto `"method"` and everything else
|
||||
// onto `"function"` so the rendered icon stays sensible.
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ fn load_latest_findings_internal(state: &AppState) -> LoadedFindings {
|
|||
/// Build (or fetch from cache) the per-scan derived views.
|
||||
///
|
||||
/// Returns clones of `Arc`s so callers can drop the lock immediately and work
|
||||
/// without contention. Triage state is *not* baked into the cached views — it
|
||||
/// without contention. Triage state is *not* baked into the cached views, it
|
||||
/// changes on a different cadence and is overlaid per request.
|
||||
fn cached_for_latest(state: &AppState) -> CachedFindings {
|
||||
let loaded = load_latest_findings_internal(state);
|
||||
|
|
@ -85,7 +85,7 @@ fn cached_for_latest(state: &AppState) -> CachedFindings {
|
|||
}
|
||||
}
|
||||
|
||||
// Slow path: rebuild. Guard against concurrent rebuilds of the same key —
|
||||
// Slow path: rebuild. Guard against concurrent rebuilds of the same key:
|
||||
// a second writer that finds the cache already populated for our key
|
||||
// simply returns it.
|
||||
let mut guard = state.findings_cache.write();
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ pub fn routes() -> Router<AppState> {
|
|||
.route("/overview/baseline/{scan_id}", post(set_baseline_path))
|
||||
}
|
||||
|
||||
/// GET /api/overview — aggregated dashboard data.
|
||||
/// GET /api/overview, aggregated dashboard data.
|
||||
async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
|
||||
// 1. Load latest findings (in-memory → DB fallback)
|
||||
let findings = crate::server::routes::findings::load_latest_findings(&state);
|
||||
|
|
@ -121,7 +121,7 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
|
|||
new_since_last,
|
||||
fixed_since_last,
|
||||
reintroduced: reintroduced_count,
|
||||
// Files-scanned proxy for repo size — used for size-aware
|
||||
// Files-scanned proxy for repo size, used for size-aware
|
||||
// severity dampening in `health::compute`. See
|
||||
// `docs/health-score-audit.md` for calibration data.
|
||||
repo_files: scanner_quality
|
||||
|
|
@ -129,10 +129,10 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
|
|||
.map(|q| q.files_scanned)
|
||||
.filter(|&f| f > 0),
|
||||
backlog: backlog.as_ref(),
|
||||
// Trend is meaningless without ≥2 completed scans —
|
||||
// Trend is meaningless without ≥2 completed scans;
|
||||
// matches the first-scan check `compare_to_current` uses.
|
||||
has_history: history.scans.len() >= 2,
|
||||
// Suppression-hygiene modifier — populated when the
|
||||
// Suppression-hygiene modifier, populated when the
|
||||
// suppression panel was computable for this scan.
|
||||
blanket_suppression_rate: suppression_hygiene.as_ref().map(|s| s.blanket_rate),
|
||||
},
|
||||
|
|
@ -173,7 +173,7 @@ async fn overview(State(state): State<AppState>) -> Json<OverviewResponse> {
|
|||
})
|
||||
}
|
||||
|
||||
/// GET /api/overview/trends — scan-over-scan finding counts.
|
||||
/// GET /api/overview/trends, scan-over-scan finding counts.
|
||||
async fn overview_trends(State(state): State<AppState>) -> Json<Vec<TrendPoint>> {
|
||||
let mut points = Vec::new();
|
||||
|
||||
|
|
@ -218,7 +218,7 @@ struct BaselineBody {
|
|||
scan_id: String,
|
||||
}
|
||||
|
||||
/// POST /api/overview/baseline { scan_id } — pin a scan as the baseline for drift comparison.
|
||||
/// POST /api/overview/baseline { scan_id }, pin a scan as the baseline for drift comparison.
|
||||
async fn set_baseline(
|
||||
State(state): State<AppState>,
|
||||
Json(body): Json<BaselineBody>,
|
||||
|
|
@ -226,7 +226,7 @@ async fn set_baseline(
|
|||
set_baseline_inner(&state, &body.scan_id)
|
||||
}
|
||||
|
||||
/// POST /api/overview/baseline/:scan_id — convenience path-form for clients without a JSON body.
|
||||
/// POST /api/overview/baseline/:scan_id, convenience path-form for clients without a JSON body.
|
||||
async fn set_baseline_path(
|
||||
State(state): State<AppState>,
|
||||
AxPath(scan_id): AxPath<String>,
|
||||
|
|
@ -248,7 +248,7 @@ fn set_baseline_inner(state: &AppState, scan_id: &str) -> Result<StatusCode, Sta
|
|||
Ok(StatusCode::NO_CONTENT)
|
||||
}
|
||||
|
||||
/// DELETE /api/overview/baseline — clear the pinned baseline.
|
||||
/// DELETE /api/overview/baseline, clear the pinned baseline.
|
||||
async fn clear_baseline(State(state): State<AppState>) -> Result<StatusCode, StatusCode> {
|
||||
let pool = state
|
||||
.db_pool
|
||||
|
|
@ -381,7 +381,7 @@ impl ScanHistory {
|
|||
(new_count, fixed_count, reintroduced)
|
||||
}
|
||||
|
||||
/// Trend slope across the last N totals — 1.0 means strictly improving,
|
||||
/// Trend slope across the last N totals: 1.0 means strictly improving,
|
||||
/// -1.0 strictly regressing, 0.0 unchanged. Returns None with <3 points.
|
||||
fn trend_slope(&self) -> Option<f64> {
|
||||
if self.scans.len() < 3 {
|
||||
|
|
@ -712,7 +712,7 @@ fn compute_cross_file_ratio(findings: &[Diag]) -> f64 {
|
|||
cross as f64 / findings.len() as f64
|
||||
}
|
||||
|
||||
/// Hot sinks are *only* meaningful for taint findings — counting AST rule IDs
|
||||
/// Hot sinks are *only* meaningful for taint findings; counting AST rule IDs
|
||||
/// (e.g. `rs.quality.unwrap`) here just duplicates the Top Rules table. So we
|
||||
/// deliberately require a real Sink-step callee (or a parsable sink snippet)
|
||||
/// and skip everything else. Empty result → frontend hides the card.
|
||||
|
|
@ -751,7 +751,7 @@ fn compute_hot_sinks(findings: &[Diag], limit: usize) -> Vec<HotSink> {
|
|||
rows
|
||||
}
|
||||
|
||||
/// Pull the leading identifier from a sink snippet — a best-effort heuristic
|
||||
/// Pull the leading identifier from a sink snippet, a best-effort heuristic
|
||||
/// for the dashboard's "hot sinks" list.
|
||||
fn extract_callee_from_snippet(s: &str) -> String {
|
||||
let trimmed = s.trim();
|
||||
|
|
@ -932,7 +932,7 @@ fn compute_suppression_hygiene(state: &AppState, findings: &[Diag]) -> Suppressi
|
|||
}
|
||||
|
||||
fn compute_backlog(state: &AppState, findings: &[Diag], history: &ScanHistory) -> BacklogStats {
|
||||
// No useful aging data on the first scan — every fingerprint was first-seen
|
||||
// No useful aging data on the first scan: every fingerprint was first-seen
|
||||
// today by definition. Avoid the misleading "0d / 0d / 0" display.
|
||||
if history.scans.len() <= 1 {
|
||||
return BacklogStats {
|
||||
|
|
@ -1046,7 +1046,7 @@ fn build_posture(
|
|||
current_total: usize,
|
||||
) -> PostureSummary {
|
||||
// First-scan case: no prior data to diff against. Saying "stable / no change"
|
||||
// is misleading — we genuinely don't know yet.
|
||||
// is misleading; we genuinely don't know yet.
|
||||
if history.scans.len() <= 1 {
|
||||
return PostureSummary {
|
||||
trend: "unknown".into(),
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ fn build_rule_list(state: &AppState) -> Vec<RuleInfo> {
|
|||
rules
|
||||
}
|
||||
|
||||
/// GET /api/rules — list all rules with finding counts.
|
||||
/// GET /api/rules, list all rules with finding counts.
|
||||
async fn list_rules(State(state): State<AppState>) -> Json<Vec<RuleListItem>> {
|
||||
let rules = build_rule_list(&state);
|
||||
|
||||
|
|
@ -99,7 +99,7 @@ async fn list_rules(State(state): State<AppState>) -> Json<Vec<RuleListItem>> {
|
|||
Json(items)
|
||||
}
|
||||
|
||||
/// GET /api/rules/:id — full detail for one rule.
|
||||
/// GET /api/rules/:id, full detail for one rule.
|
||||
async fn get_rule(
|
||||
State(state): State<AppState>,
|
||||
Path(id): Path<String>,
|
||||
|
|
@ -140,7 +140,7 @@ async fn get_rule(
|
|||
}))
|
||||
}
|
||||
|
||||
/// POST /api/rules/:id/toggle — enable/disable a rule.
|
||||
/// POST /api/rules/:id/toggle, enable/disable a rule.
|
||||
async fn toggle_rule(
|
||||
State(state): State<AppState>,
|
||||
Path(id): Path<String>,
|
||||
|
|
@ -162,7 +162,7 @@ async fn toggle_rule(
|
|||
Ok(Json(serde_json::json!({ "status": "ok", "rule_id": id })))
|
||||
}
|
||||
|
||||
/// POST /api/rules/clone — clone a built-in rule to custom.
|
||||
/// POST /api/rules/clone, clone a built-in rule to custom.
|
||||
async fn clone_rule(
|
||||
State(state): State<AppState>,
|
||||
Json(body): Json<serde_json::Value>,
|
||||
|
|
|
|||
|
|
@ -213,7 +213,7 @@ async fn delete_scan(
|
|||
Json(serde_json::json!({ "error": msg })),
|
||||
));
|
||||
}
|
||||
// "Scan not found" in memory is fine — may be DB-only
|
||||
// "Scan not found" in memory is fine; the scan may be DB-only
|
||||
}
|
||||
|
||||
// Delete from DB (CASCADE handles metrics + logs)
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
//! This file is designed to be committed to version control so that triage
|
||||
//! decisions travel with the code and are shared across team members.
|
||||
//!
|
||||
//! The file uses **portable fingerprints** — computed with paths relative to the
|
||||
//! project root — so they match across machines regardless of where the repo is
|
||||
//! The file uses **portable fingerprints**, computed with paths relative to the
|
||||
//! project root, so they match across machines regardless of where the repo is
|
||||
//! checked out.
|
||||
|
||||
use crate::commands::scan::Diag;
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ impl BaseAliasResult {
|
|||
///
|
||||
/// For each entry `(dst_val, src_val)` where copy prop replaced `dst` with
|
||||
/// `src`, looks up the original variable names. If both are plain identifiers
|
||||
/// (no dots — i.e. not field paths), they are registered as base aliases.
|
||||
/// (no dots, i.e. not field paths), they are registered as base aliases.
|
||||
/// Transitive closure is computed so `b = a; c = b` yields group `{a, b, c}`.
|
||||
pub fn compute_base_aliases(
|
||||
copy_map: &HashMap<SsaValue, SsaValue>,
|
||||
|
|
@ -103,7 +103,7 @@ pub fn compute_base_aliases(
|
|||
let ra = find(parent, a);
|
||||
let rb = find(parent, b);
|
||||
if ra != rb {
|
||||
// Arbitrary root choice — alphabetically smaller becomes root
|
||||
// Arbitrary root choice: alphabetically smaller becomes root
|
||||
// for determinism.
|
||||
if ra < rb {
|
||||
parent.insert(rb, ra);
|
||||
|
|
@ -130,7 +130,7 @@ pub fn compute_base_aliases(
|
|||
None => continue,
|
||||
};
|
||||
|
||||
// Only alias plain idents — dotted paths (field accesses) are tracked
|
||||
// Only alias plain idents; dotted paths (field accesses) are tracked
|
||||
// independently in SSA and handled by field-aware suppression.
|
||||
if dst_name.contains('.') || src_name.contains('.') {
|
||||
continue;
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue