From 3c89bddbf2832f84f469027921a9369268bbe45d Mon Sep 17 00:00:00 2001 From: Eli Peter <54954007+elicpeter@users.noreply.github.com> Date: Sat, 2 May 2026 03:36:14 -0400 Subject: [PATCH] Improved path traversal detection and enhanced sink classification logic --- .gitignore | 1 + CHANGELOG.md | 19 + Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 10 +- THIRDPARTY-LICENSES.html | 2 +- action-scripts/download.sh | 2 +- action.yml | 4 +- docs/advanced-analysis.md | 21 +- docs/auth.md | 4 +- docs/configuration.md | 2 +- docs/detectors/patterns.md | 4 +- docs/detectors/taint.md | 2 +- docs/how-it-works.md | 2 +- docs/installation.md | 2 +- docs/language-maturity.md | 102 +- docs/rules.md | 2 +- docs/serve.md | 2 +- frontend/package.json | 2 +- src/abstract_interp/path_domain.rs | 258 +++- src/ast.rs | 1267 ++++++++++++++++- src/cfg/cfg_tests.rs | 71 + src/cfg/mod.rs | 21 +- src/labels/mod.rs | 11 + src/labels/ruby.rs | 4 + src/patterns/javascript.rs | 3 +- src/patterns/typescript.rs | 6 +- src/symex/strings.rs | 69 + src/taint/path_state.rs | 106 +- src/taint/ssa_transfer/events.rs | 20 +- src/taint/ssa_transfer/mod.rs | 290 +++- src/taint/ssa_transfer/tests.rs | 74 + tests/benchmark/README.md | 8 +- tests/benchmark/RESULTS.md | 13 +- .../buffer_reinterpret_cast_struct_alias.cpp | 15 + .../safe_reinterpret_cast_byte_pointer.cpp | 52 + .../safe_canonicalise_rooted_startsWith.js | 20 + .../safe/safe_env_empty_fallback.js | 9 + .../php/crypto/crypto_md5_password_hash.php | 42 + .../php/safe/safe_md5_sha1_non_crypto_use.php | 86 ++ .../safe_canonicalise_rooted_startswith.py | 19 + .../path_traversal_yaml_load_file_read.rb | 8 + .../safe/safe_canonicalise_rooted_unless.rb | 18 + .../corpus/ruby/ssrf/ssrf_open_uri.rb | 6 + .../safe/safe_env_empty_fallback.ts | 21 + .../cve_corpus/ruby/CVE-2021-21288/patched.rb | 67 + .../ruby/CVE-2021-21288/vulnerable.rb | 66 + .../cve_corpus/ruby/CVE-2023-38337/patched.rb | 75 + .../ruby/CVE-2023-38337/vulnerable.rb | 82 ++ 
tests/benchmark/ground_truth.json | 619 +++++++- tests/benchmark/results/latest.json | 543 +++++-- .../cpp_reinterpret_cast_byte_pointer/App.cpp | 37 + .../expectations.json | 16 + .../php_md5_sha1_non_crypto_use/App.php | 70 + .../expectations.json | 17 + tests/integration_tests.rs | 38 + 56 files changed, 3989 insertions(+), 345 deletions(-) create mode 100644 tests/benchmark/corpus/cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp create mode 100644 tests/benchmark/corpus/cpp/safe/safe_reinterpret_cast_byte_pointer.cpp create mode 100644 tests/benchmark/corpus/javascript/safe/safe_canonicalise_rooted_startsWith.js create mode 100644 tests/benchmark/corpus/javascript/safe/safe_env_empty_fallback.js create mode 100644 tests/benchmark/corpus/php/crypto/crypto_md5_password_hash.php create mode 100644 tests/benchmark/corpus/php/safe/safe_md5_sha1_non_crypto_use.php create mode 100644 tests/benchmark/corpus/python/safe/safe_canonicalise_rooted_startswith.py create mode 100644 tests/benchmark/corpus/ruby/path_traversal/path_traversal_yaml_load_file_read.rb create mode 100644 tests/benchmark/corpus/ruby/safe/safe_canonicalise_rooted_unless.rb create mode 100644 tests/benchmark/corpus/ruby/ssrf/ssrf_open_uri.rb create mode 100644 tests/benchmark/corpus/typescript/safe/safe_env_empty_fallback.ts create mode 100644 tests/benchmark/cve_corpus/ruby/CVE-2021-21288/patched.rb create mode 100644 tests/benchmark/cve_corpus/ruby/CVE-2021-21288/vulnerable.rb create mode 100644 tests/benchmark/cve_corpus/ruby/CVE-2023-38337/patched.rb create mode 100644 tests/benchmark/cve_corpus/ruby/CVE-2023-38337/vulnerable.rb create mode 100644 tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/App.cpp create mode 100644 tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/expectations.json create mode 100644 tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/App.php create mode 100644 tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/expectations.json diff --git 
a/.gitignore b/.gitignore index e259fd8f..76c01ae3 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ /book .DS_Store .z3-trace +.pitboss .node_modules-target diff --git a/CHANGELOG.md b/CHANGELOG.md index 14f13de4..b86f0853 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to Nyx are documented here. The format is based on [Keep a C ## [Unreleased] +## [0.6.0] - TBD + +A focused release that splits data-exfiltration off from SSRF and ships sinks for outbound HTTP request bodies across all 10 languages, with calibration tuned so plain user input echoed back upstream does not fire. + ### Added - New `taint-data-exfiltration` rule, separate from SSRF. Fires when a Sensitive-tier source (cookie, header, env, file, database, caught exception) reaches the body, headers, or json payload of an outbound HTTP call. Plain user input gets suppressed at emission time so a gateway echoing `req.body` back upstream is not flagged. @@ -15,6 +19,21 @@ All notable changes to Nyx are documented here. The format is based on [Keep a C - Calibration. Severity is High for cookie or env sources, Medium for header, file, database, or caught-exception sources. Confidence stays at Medium even with strong corroboration, drops to Low without abstract or symbolic backing, and drops one tier on path-validated flows. SARIF output carries a `properties.data_exfil_field` entry on data-exfil findings, set to the destination object-literal field the leak reached (`body`, `headers`, or `json`). - Benchmark coverage. 13 vulnerable fixtures across 8 languages under `tests/benchmark/corpus/{lang}/data_exfil/` and 6 paired safe fixtures for the sensitivity gate and sanitizer convention. New `data_exfil` row in the per-class breakdown. Per-class CI floor at P, R, F1 ≥ 0.85 (current baseline is 1.000). - Backwards taint walk recognises `Cap::DATA_EXFIL` and emits the same rule ID. +- Ruby SSRF coverage. 
`OpenURI.open_uri` now classified as an SSRF sink (the low-level fetcher that `URI.open` delegates to). Closes the CarrierWave CVE-2021-21288 download path and equivalent gem shapes that route through `OpenURI` directly. +- Ruby chained-call wrapper classification. Statement-level wrappers like `YAML.safe_load(File.read(filename))` and `Marshal.load(File.read(p))` now classify the inner sink for cross-function summary extraction. Without this, the outer call became a non-sink node and the inner sink was lost when the helper was summarised. +- Ruby CVE corpus. Vulnerable + patched fixtures added for CVE-2021-21288 (CarrierWave SSRF) and CVE-2023-38337 (rswag path traversal). + +### Fixed (false positives) + +- C++ `cpp.memory.reinterpret_cast` no longer fires when the target type is well-defined by C++ aliasing rules. Suppressed targets: byte-pointer family (`char*`, `unsigned char*`, `signed char*`, `wchar_t*`, `uint8_t*`, `int8_t*`, `std::byte*`, `byte*`), `void*`, integer round-trip (`uintptr_t`, `intptr_t`, and `std::` variants, no pointer required), and the BSD socket address family (`sockaddr*`, `struct sockaddr*`, `sockaddr_in*`, `sockaddr_in6*`, `sockaddr_un*`, `sockaddr_storage*`). User-defined struct or class pointer targets keep firing. Closes ~70% over-fire on serialization, hashing, IPC, and socket-API code where the cast is the standard-blessed idiom. +- PHP `php.crypto.md5` and `php.crypto.sha1` suppress when the call's consuming context yields a non-cryptographic identifier name. Recognised contexts: assignment LHS (variable, `$obj->property`, `$arr['key']`), array element keys, subscript indices, return statements (resolved to enclosing method or function name with `get` prefix stripped), and method-call arguments where the method is a key/cache/lookup verb (`get`, `set`, `has`, `delete`, `fetch`, `store`, `find`, `getItem`, `setItem`). 
Names containing a crypto keyword (`password`, `secret`, `token`, `signature`, `hmac`, `digest`, `salt`, `key`) keep firing. Closes ETag generation, cache-key hashing, dedup fingerprint, and `getCacheKey()`-style false positives in real PHP repos (phpmyadmin, nextcloud). +- JS and TS `secrets.fallback_secret` no longer fire on empty-string fallbacks (`process.env.X || ""`). Developers write `|| ""` to satisfy non-undefined string types without committing a real secret. Non-empty literal fallbacks still fire. +- Path-traversal sink suppression accepts canonicalised-and-rooted shapes. New `PathFact::is_path_traversal_safe` predicate clears `Cap::FILE_IO` when the path is dotdot-free and either non-absolute or carries a verified prefix-lock. New `OPAQUE_PREFIX_LOCK` marker records the structural invariant ("rooted under SOME prefix") when the `starts_with`-style guard's argument is a method call, field access, or configured root rather than a string literal. Closes the Ruby `File.expand_path + start_with?(root)` shape (rswag CVE-2023-38337 patched counterpart), the Python `os.path.realpath + .startswith(root)` shape, and the JS `path.resolve + .startsWith(root)` shape. `classify_path_assertion` extended to JS `.startsWith(...)`, Python `.startswith(...)`, Ruby `.start_with?(...)` (paren and paren-less), and Go `strings.HasPrefix(...)`. +- Branch narrowing now flips prefix-lock attachment under condition negation. For `if !target.startsWith(ROOT) { return; }` the lock attaches to the surviving block, not the rejection arm. Rejection-axis narrowing is unchanged because the rejection classifier is text-level and already accounts for leading `!`. + +### Other + +- Action download script warning for the mutable `latest` tag now references `v0.6.0` instead of `v0.5.0`. 
## [0.5.0] - 2026-04-29 diff --git a/Cargo.lock b/Cargo.lock index 42ab62cd..cd5808ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1134,7 +1134,7 @@ dependencies = [ [[package]] name = "nyx-scanner" -version = "0.5.0" +version = "0.6.0" dependencies = [ "assert_cmd", "axum", diff --git a/Cargo.toml b/Cargo.toml index efdd74ff..f37acc84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "nyx-scanner" -version = "0.5.0" +version = "0.6.0" edition = "2024" rust-version = "1.88" description = "A multi-language static analysis tool for detecting security vulnerabilities" diff --git a/README.md b/README.md index cc522345..4ab51350 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ Forward cross-file taint runs in every profile. Symex and the demand-driven back ### GitHub Action ```yaml -- uses: elicpeter/nyx@v0.5.0 +- uses: elicpeter/nyx@v0.6.0 with: format: sarif fail-on: MEDIUM @@ -115,15 +115,15 @@ Requires stable Rust 1.88+. The frontend is compiled and embedded in the binary ## Languages -All 10 languages parse via tree-sitter and run through the full pipeline, but rule depth and engine coverage are uneven. Benchmark F1 on the 433-case corpus at [`tests/benchmark/ground_truth.json`](tests/benchmark/ground_truth.json) is 100% for nine of ten languages and 94.1% for Go, so F1 alone no longer separates the tiers. Tiering reflects rule depth, gated-sink coverage, and structural idioms the synthetic corpus does not fully stress: +All 10 languages parse via tree-sitter and run through the full pipeline, but rule depth and engine coverage are uneven. Benchmark F1 on the 492-case corpus at [`tests/benchmark/ground_truth.json`](tests/benchmark/ground_truth.json) is 100% across all ten languages, so F1 alone no longer separates the tiers. Tiering reflects rule depth, gated-sink coverage, and structural idioms the synthetic corpus does not fully stress: | Tier | Languages | F1 | Use as a CI gate? 
| |---|---|---|---| | **Stable** | Python, JavaScript, TypeScript | 100% | Yes | -| **Beta** | Java, PHP, Ruby, Rust, Go | 94.1% to 100% | Yes, with light FP triage | +| **Beta** | Java, PHP, Ruby, Rust, Go | 100% | Yes, with light FP triage | | **Preview** | C, C++ | 100% on synthetic corpus | No. STL container flow, builder chains, and inline class member functions are tracked, but deep pointer aliasing and function pointers are not. Pair with clang-tidy or Clang Static Analyzer | -Aggregate rule-level F1: 99.8% (P=0.995, R=1.000). All real-CVE fixtures fire; the single open FP is `go-safe-009`. Per-dimension detail and known blind spots live on the [Language maturity page](https://elicpeter.github.io/nyx/language-maturity.html). +Aggregate rule-level F1: 100.0% (P=1.000, R=1.000). All real-CVE fixtures fire and the corpus carries zero open FPs. Per-dimension detail and known blind spots live on the [Language maturity page](https://elicpeter.github.io/nyx/language-maturity.html). ### Validated against real CVEs @@ -200,7 +200,7 @@ Or add rules interactively: `nyx config add-rule --lang javascript --matcher esc ## Status -Under active development. APIs, detector behavior, and configuration options may change between releases. Rule-level F1 on the 433-case corpus is the CI regression floor; per-language detail lives in [`tests/benchmark/RESULTS.md`](tests/benchmark/RESULTS.md). +Under active development. APIs, detector behavior, and configuration options may change between releases. Rule-level F1 on the 492-case corpus is the CI regression floor; per-language detail lives in [`tests/benchmark/RESULTS.md`](tests/benchmark/RESULTS.md). Taint analysis is interprocedural. Persisted per-function SSA summaries carry per-return-path transforms and parameter-granularity points-to, and call-graph SCCs (including SCCs that span files) iterate to a joint fixed-point. The default `balanced` profile also runs k=1 context-sensitive inlining for intra-file callees. 
Symex (with cross-file and interprocedural frames) and the demand-driven backwards walk are opt-in. Enable them individually with `--symex` and `--backwards-analysis`, or together with `--engine-profile deep`. diff --git a/THIRDPARTY-LICENSES.html b/THIRDPARTY-LICENSES.html index 2f9eae1f..56f32c49 100644 --- a/THIRDPARTY-LICENSES.html +++ b/THIRDPARTY-LICENSES.html @@ -4768,7 +4768,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

GNU General Public License v3.0 only

Used by:

 GNU GENERAL PUBLIC LICENSE
diff --git a/action-scripts/download.sh b/action-scripts/download.sh
index fb9c8d4a..9401135a 100755
--- a/action-scripts/download.sh
+++ b/action-scripts/download.sh
@@ -27,7 +27,7 @@ esac
 
 # ── Resolve "latest" to an actual release tag ────────────────────────────────
 if [[ "$VERSION" == "latest" ]]; then
-  echo "::warning::version: latest follows a mutable tag. Pin to a specific release (e.g. v0.5.0) for supply-chain safety."
+  echo "::warning::version: latest follows a mutable tag. Pin to a specific release (e.g. v0.6.0) for supply-chain safety."
   API_URL="https://api.github.com/repos/${REPO}/releases/latest"
   CURL_ARGS=(-fsSL)
   if [[ -n "${GITHUB_TOKEN:-}" ]]; then
diff --git a/action.yml b/action.yml
index 17cee50c..4877d290 100644
--- a/action.yml
+++ b/action.yml
@@ -12,9 +12,9 @@ inputs:
     required: false
     default: '.'
   version:
-    description: 'Nyx release tag (e.g. v0.5.0). "latest" is accepted but discouraged, pinning to a specific tag protects against upstream compromise.'
+    description: 'Nyx release tag (e.g. v0.6.0). "latest" is accepted but discouraged; pinning to a specific tag protects against upstream compromise.'
     required: false
-    default: 'v0.5.0'
+    default: 'v0.6.0'
   format:
     description: 'Output format: sarif, json, or console'
     required: false
diff --git a/docs/advanced-analysis.md b/docs/advanced-analysis.md
index c9641ef0..cd5e389b 100644
--- a/docs/advanced-analysis.md
+++ b/docs/advanced-analysis.md
@@ -31,6 +31,22 @@ SQL sink as an injection risk; an SSRF sink whose URL prefix is locked to a
 trusted host stays quiet. This turns a large class of FPs on numeric and
 locked-prefix paths into true negatives.
 
+**Path traversal.** The path domain accepts canonicalised-and-rooted
+shapes via `PathFact::is_path_traversal_safe`: a path that contains no `..`
+components and is either non-absolute or carries a verified prefix-lock has
+its `Cap::FILE_IO` cleared. When the lock argument is a string literal,
+the lock prefix is recorded directly; when it is a method call, field
+access, or configured root, an `OPAQUE_PREFIX_LOCK` marker captures the
+structural invariant ("rooted under SOME prefix") instead. This closes
+the Ruby `File.expand_path + start_with?(root)`, Python
+`os.path.realpath + .startswith(root)`, and JS
+`path.resolve + .startsWith(root)` shapes. `classify_path_assertion`
+recognises JS `.startsWith(...)`, Python `.startswith(...)`, Ruby
+`.start_with?(...)` (paren and paren-less), and Go `strings.HasPrefix(...)`.
+Branch narrowing flips lock attachment under condition negation
+(`if !target.startsWith(ROOT) { return; }` attaches the lock to the
+surviving block, not the rejection arm).
+
 **How to turn it off.**
 
 | Surface | Value |
@@ -111,9 +127,8 @@ identity independent of the parent value.
 |---|---|
 | Env var | `NYX_POINTER_ANALYSIS=0` |
 
-The pass is **on by default** as of 2026-04-26. The env-var override is
-kept for one release so you can compare against the pre-pointer baseline,
-then will be removed.
+The pass is **on by default**. The env-var override exists so you can
+compare against the pre-pointer baseline.
 
 **Limitations.** This is not a general escape analysis. Function pointers
 and arbitrary indirect calls still resolve to no callee, and deep alias
diff --git a/docs/auth.md b/docs/auth.md
index f0db642b..3dfece39 100644
--- a/docs/auth.md
+++ b/docs/auth.md
@@ -86,6 +86,6 @@ Auth findings render alongside taint findings in the [browser UI](serve.md). The
 
 

Nyx finding detail: numbered source → call → sink walk with a How to fix panel and an inline evidence object

-## Where the work was done +## Benchmark corpus -The remediation work is documented release-by-release in `tests/benchmark/RESULTS.md` under the Rust auth row. Phases A1 through B5 (precision and structural improvements) and Phase C (taint-based variant) all landed on the 0.5.0 release branch. The benchmark corpus at [`tests/benchmark/corpus/rust/auth/`](https://github.com/elicpeter/nyx/tree/master/tests/benchmark/corpus/rust/auth/) is 10 fixtures covering the five FP patterns plus a true-positive control. +The Rust auth corpus at [`tests/benchmark/corpus/rust/auth/`](https://github.com/elicpeter/nyx/tree/master/tests/benchmark/corpus/rust/auth/) is 10 fixtures covering the five FP patterns plus a true-positive control. Per-row metrics live under the Rust auth row in `tests/benchmark/RESULTS.md`. diff --git a/docs/configuration.md b/docs/configuration.md index dacf1b07..15d2b6f2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -394,7 +394,7 @@ engine. No flag is needed; CI pipelines keep working across upgrades. The rebuild is logged at `info` level: ``` -engine version changed (0.4.0 → 0.5.0), rebuilding index +engine version changed (), rebuilding index ``` If you see this once per upgrade it is working as intended. If you see it on diff --git a/docs/detectors/patterns.md b/docs/detectors/patterns.md index adf50b23..38ac501b 100644 --- a/docs/detectors/patterns.md +++ b/docs/detectors/patterns.md @@ -56,8 +56,10 @@ Full list: [rules.md](../rules.md). 
| `eval("hardcoded literal")` | Pattern matches structure | Run `--mode cfg` to drop AST patterns and rely on taint | | `unsafe` block with sound justification | Every `unsafe` matches `rs.quality.unsafe_block` | Filter `>=MEDIUM` (it's Medium) or accept the noise | | `.unwrap()` in tests | Acceptable in test code | Default non-prod severity downgrade reduces it | -| `md5` for non-cryptographic checksums | Pattern can't see intent | Suppress with `--severity ">=MEDIUM"` or per-line `nyx:ignore` | +| `md5` for non-cryptographic checksums | Pattern can't see intent in most languages | PHP recognises non-crypto consuming context structurally (cache keys, ETag, dedup, `getCacheKey()` returns) and suppresses. Other languages: `--severity ">=MEDIUM"` or per-line `nyx:ignore` | | SQL concat with trusted data (Tier B) | Heuristic can't verify the source | Taint is more precise; or convert to a parameterized query | +| C++ `reinterpret_cast(...)` for byte-pointer / void* / `sockaddr` | Pattern fires on every cast regardless of target type | Suppressed when the target is well-defined by C++ aliasing rules: `char*`, `unsigned char*`, `signed char*`, `wchar_t*`, `uint8_t*`, `int8_t*`, `std::byte*`, `byte*`, `void*`, `uintptr_t` / `intptr_t` (and `std::` variants), and the BSD socket address family. User-defined struct or class pointer targets keep firing. | +| JS / TS `secrets.fallback_secret` on `process.env.X \|\| ""` | Empty-string fallback satisfies non-undefined string types without committing a secret | Empty-string fallbacks are excluded from the rule. Non-empty literal fallbacks still fire. | ## Confidence levels diff --git a/docs/detectors/taint.md b/docs/detectors/taint.md index 5cf2b754..4ada6baa 100644 --- a/docs/detectors/taint.md +++ b/docs/detectors/taint.md @@ -130,7 +130,7 @@ Sources, sanitizers, and sinks are linked by named capabilities. 
A sanitizer onl | `shell_escape` | | `shlex.quote`, `shell_escape::escape` | `system`, `Command::new`, `eval` | | `url_encode` | | `encodeURIComponent` | `location.href`, HTTP client URL arg | | `json_parse` | | `JSON.parse` | | -| `file_io` | | `os.path.realpath`, `filepath.Clean` | `open`, `fs::read_to_string`, `send_file` | +| `file_io` | | `os.path.realpath`, `filepath.Clean`, canonicalise + `starts_with`-rooted guard | `open`, `fs::read_to_string`, `send_file` | | `fmt_string` | | | `printf(var)` | | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` with concatenation | | `deserialize` | | | `pickle.loads`, `yaml.load`, `Marshal.load` | diff --git a/docs/how-it-works.md b/docs/how-it-works.md index f1c68001..f9dc9d70 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -16,7 +16,7 @@ Two extra layers tune precision around calls. **Context-sensitive inlining** (k= When a method call has a receiver typed as a super-class, trait, or interface, **hierarchy fan-out** widens the resolved callee set to every concrete implementer the engine has seen. A class diagram extracted in pass 1 (Java extends/implements, Rust impl-for, TS/JS extends, Python bases, Ruby includes, PHP extends/implements, C++ inheritance) feeds an index that the call resolver consults during pass 2. The fan-out is capped at 8 implementers per call site; over-fanning is a precision tax, not a soundness issue. -A separate **field-sensitive points-to** pass tracks abstract locations down to the field level, so `c.mu.Lock()` is a lock on `Field(c, mu)` rather than on `c` as a whole. That distinction is what lets the resource-lifecycle and taint passes tell `obj.field = tainted; sink(obj.other_field)` apart from the conservative whole-variable approximation. Subscript reads and writes (`arr[i]`, `map[k] = v`) lower to synthetic `__index_get__` / `__index_set__` calls so the same container model handles them. 
Set `NYX_POINTER_ANALYSIS=0` to fall back to the pre-pointer-pass behaviour for one release if you need to compare baselines. +A separate **field-sensitive points-to** pass tracks abstract locations down to the field level, so `c.mu.Lock()` is a lock on `Field(c, mu)` rather than on `c` as a whole. That distinction is what lets the resource-lifecycle and taint passes tell `obj.field = tainted; sink(obj.other_field)` apart from the conservative whole-variable approximation. Subscript reads and writes (`arr[i]`, `map[k] = v`) lower to synthetic `__index_get__` / `__index_set__` calls so the same container model handles them. Set `NYX_POINTER_ANALYSIS=0` to fall back to the pre-pointer-pass behaviour for baseline comparison. ## Optional analyses on top diff --git a/docs/installation.md b/docs/installation.md index 73334fd3..9112ee8d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -61,7 +61,7 @@ Optional features: Nyx stores its scanner version in the project's index database. When the binary's version differs from the stored version, the index is wiped on the next scan and rebuilt against the new engine. You'll see one info-level log line: ``` -engine version changed (0.4.0 → 0.5.0), rebuilding index +engine version changed (), rebuilding index ``` No flag needed. If you see this on *every* scan, the metadata row isn't being persisted; file an issue. diff --git a/docs/language-maturity.md b/docs/language-maturity.md index 2e6c279d..4a99fd75 100644 --- a/docs/language-maturity.md +++ b/docs/language-maturity.md @@ -9,24 +9,22 @@ The classifications here are grounded in three concrete signals: 1. **Rule depth**: how many distinct source / sanitizer / sink matchers exist for the language in `src/labels/.rs`, and how many vulnerability classes (Cap bits) those matchers cover. -2. **Benchmark results**: rule-level precision / recall / F1 on the 433-case +2. 
**Benchmark results**: rule-level precision / recall / F1 on the 492-case corpus in - [`tests/benchmark/RESULTS.md`](https://github.com/elicpeter/nyx/blob/master/tests/benchmark/RESULTS.md), - last measured 2026-04-29 with scanner version 0.5.0. + [`tests/benchmark/RESULTS.md`](https://github.com/elicpeter/nyx/blob/master/tests/benchmark/RESULTS.md). 3. **Known weak spots**: FPs and FNs the maintainers have deliberately left in the benchmark rather than suppressed, plus structural engine - limitations the corpus does not stress, documented release-by-release in + limitations the corpus does not stress, documented in [`RESULTS.md`](https://github.com/elicpeter/nyx/blob/master/tests/benchmark/RESULTS.md). -As of 2026-04-29 the synthetic corpus has effectively saturated: every -real-CVE fixture fires and rule-level recall is 100%. Nine of ten -languages report rule-level F1 = 100.0%; Go reports 98.0% on the back of -a single safe-fixture FP. Aggregate rule-level P=0.995, R=1.000, F1=0.998. -That means F1 alone no longer differentiates tiers, so the differentiators -are **rule depth**, **gated-sink coverage**, and **structural idioms the -corpus does not fully stress** (deep pointer aliasing in C/C++, -framework-specific context). All parser integrations use tree-sitter and -are stable; parsing is not a differentiator. +The synthetic corpus has effectively saturated: every +real-CVE fixture fires and rule-level precision and recall are both 100%. +All ten languages report rule-level F1 = 100.0%. Aggregate rule-level +P=1.000, R=1.000, F1=1.000. That means F1 alone no longer differentiates +tiers, so the differentiators are **rule depth**, **gated-sink coverage**, +and **structural idioms the corpus does not fully stress** (deep pointer +aliasing in C/C++, framework-specific context). All parser integrations +use tree-sitter and are stable; parsing is not a differentiator. --- @@ -35,7 +33,7 @@ are stable; parsing is not a differentiator. 
| Tier | Languages | F1 | What to expect | |------|-----------|----|----------------| | **Stable** | Python, JavaScript, TypeScript | 100% | Deep rule sets, gated sinks (argument-role-aware), framework detection, extensive fixtures, and the bulk of advanced-analysis (SSA two-level solve, context-sensitivity, symbolic execution, abstract interpretation) coverage. Safe to depend on in CI gates. | -| **Beta** | Go, Java, PHP, Ruby, Rust | 98.0% to 100% | Solid mid-depth rule sets with narrower cap coverage and **no gated sinks**. Cross-file flows work; some idioms (variable-typed method receivers, framework context, string interpolation, match-arm guards) are partially modeled. Usable in CI; review FP/FN lists before tightening gates. | +| **Beta** | Go, Java, PHP, Ruby, Rust | 100% | Solid mid-depth rule sets with narrower cap coverage and **no gated sinks**. Cross-file flows work; some idioms (variable-typed method receivers, framework context, string interpolation, match-arm guards) are partially modeled. Usable in CI; review FP/FN lists before tightening gates. | | **Preview** | C, C++ | 100% on synthetic corpus | Recent work taught the engine to follow taint through `std::vector` / `std::string` / map containers (including `c_str()`), through fluent builder chains like `Socket::builder().host(h).connect()`, and through inline class member functions. Function pointers and deeper pointer aliasing through `*p` / `p->field` are still not tracked. Rule-level scores against a corpus of obvious unsafe-API uses look perfect, but that is not the same as a clean audit on a real codebase. Pair with clang-tidy, Clang Static Analyzer, or Infer. | --- @@ -90,13 +88,15 @@ are stable; parsing is not a differentiator. 
### Beta tier -#### Go: 96.2% P / 100.0% R / 98.0% F1 *(53-case corpus, 1 FP, 0 FNs)* +#### Go: 100% P / 100% R / 100% F1 *(56-case corpus)* - **Rule depth**: 4 source families, 4 sanitizer families, 9 sink matchers covering HTML, URL, Shell, SQL, SSRF, Crypto, and File I/O. - **Framework context**: Gin, Echo source matchers. -- **Open weak spots**: one safe Go fixture (`go-safe-009`) draws a spurious - CMDi finding. +- **Recent fix**: `strings.ReplaceAll` is now recognised as a CMDi sanitiser + in chain-wrapper / call-site-replace shapes, clearing the last open + Go safe-fixture FP (`go-safe-009`, `validate(s string)` wrapping a + `strings.ReplaceAll` over `;`). - **Known gaps**: no gated sinks, no deserialization class. `fmt.Sprintf` is deliberately not a sink. Cap coverage is narrower than the Stable tier and argument-role-aware sink modeling is not yet implemented for Go, @@ -126,21 +126,25 @@ are stable; parsing is not a differentiator. #### Ruby: 100% P / 100% R / 100% F1 *(39-case corpus)* -- **Rule depth**: 3 source families, 7 sanitizer families, 15 sink matchers - covering HTML, Shell, SQL, Code, SSRF, File I/O, and Deserialization. +- **Rule depth**: 3 source families, 7 sanitizer families, 16 sink matchers + covering HTML, Shell, SQL, Code, SSRF, File I/O, and Deserialization. SSRF + coverage includes `URI.open` and the low-level `OpenURI.open_uri` it + delegates to (the canonical CarrierWave CVE-2021-21288 sink). + Statement-level chained-call wrappers + (`YAML.safe_load(File.read(filename))`, `Marshal.load(File.read(p))`, + `String.new(File.read(x))`) classify the inner sink for cross-function + summary extraction so the outer call does not strip the sink classification + on the helper. - **Framework context**: Rails helpers (`sanitize_sql`, `permit`, `require`). - **Known gaps**: string interpolation inside shell and SQL strings is recognized structurally but not modeled as a distinct operator. 
`begin/rescue/ensure` exception-edge wiring is documented as deferred - (structurally incompatible with `build_try()`). The previous open - `rb-interproc-001` FN closed in the 2026-04-28 baseline after the - Ruby `Kernel#open` CMDI sink and exact-match sigil work landed. + (structurally incompatible with `build_try()`). #### Rust: 100% P / 100% R / 100% F1 *(70-case adversarial corpus)* -Rust holds the largest per-language adversarial corpus and was promoted -from Experimental to Beta in the 2026-04-25 measurement after the PathFact -landings closed every previously-open `rs-safe-*` regression. +Rust holds the largest per-language adversarial corpus. PathFact-driven +path-domain narrowing covers the `rs-safe-*` regression set. - **Rule depth**: 6 source families, **2** sanitizer families (prefix and type-coercion), 11 sink matchers covering HTML, Shell, SQL, SSRF, @@ -149,20 +153,18 @@ landings closed every previously-open `rs-safe-*` regression. narrow sanitizer count is the primary reason Rust is not in the Stable tier. Engine-side path/typed sanitizer recognition (PathFact) compensates, but the ruleset itself is shallow. -- **Recent additions**: SQL class (`rusqlite`, `sqlx`, `diesel`, - `postgres`), Deserialization class (`serde_yaml`, `bincode`, - `rmp_serde`, `ciborium`, `ron`, `toml`), expanded file I/O - (`fs::remove_file/dir/rename/copy`), `reqwest` SSRF builder chain. 
-- **Closed by recent PathFact landings** - (`src/abstract_interp/path_domain.rs` + per-return-path PathFact entries - on `SsaFuncSummary`): `rs-safe-007` (`.replace("..","")` sanitiser), - `rs-safe-008` (negative-validation return), `rs-safe-009` (match-arm - guards via condition lifting), `rs-safe-010` (static-map lookup), - `rs-safe-012` (`.contains("..")` + `.starts_with('/')` rejection), - `rs-safe-014` (Option-returning user sanitiser), `rs-safe-015` - (`Path::new(p).is_absolute()` typed rejection), `rs-safe-016` - (cross-function `.contains("..")` rejection), and CVE patches - `CVE-2018-20997`, `CVE-2022-36113`, `CVE-2024-24576`. +- **Coverage**: SQL class (`rusqlite`, `sqlx`, `diesel`, `postgres`), + Deserialization class (`serde_yaml`, `bincode`, `rmp_serde`, `ciborium`, + `ron`, `toml`), file I/O (`fs::remove_file/dir/rename/copy`), and the + `reqwest` SSRF builder chain. +- **PathFact-narrowed shapes** (`src/abstract_interp/path_domain.rs` plus + per-return-path PathFact entries on `SsaFuncSummary`) cover + `.replace("..","")` sanitisers, negative-validation returns, match-arm + guards via condition lifting, static-map lookups, + `.contains("..")` + `.starts_with('/')` rejection, Option-returning + user sanitisers, `Path::new(p).is_absolute()` typed rejection, + cross-function `.contains("..")` rejection, and the + `CVE-2018-20997` / `CVE-2022-36113` / `CVE-2024-24576` patch shapes. - **Not yet covered**: unsafe FFI / `std::mem::transmute` (no rules), Tokio `process::Command` async variants (not distinguished from sync), `hyper` / `surf` / `ureq` SSRF clients (reqwest family only). @@ -170,17 +172,16 @@ landings closed every previously-open `rs-safe-*` regression. ### Preview tier C and C++ remain **Preview** despite reporting 100% rule-level F1 on the -synthetic corpus. 
A run of additions in late April taught the engine to -follow taint through several constructs that used to be hard cutoffs (STL -containers, builder chains, inline member functions, the wider `std::sto*` -family), so the gap between "passes the synthetic corpus" and "would catch -the same flow on a real codebase" is narrower than it used to be. It is not -zero. The biggest remaining gaps are deep pointer aliasing and function +synthetic corpus. The engine follows taint through STL containers, builder +chains, inline member functions, and the wider `std::sto*` family, so the +gap between "passes the synthetic corpus" and "would catch the same flow +on a real codebase" is narrower than the synthetic numbers suggest. It is +not zero. The biggest remaining gaps are deep pointer aliasing and function pointers, both of which are pervasive in real C/C++ code. Treat a clean report as a starting point, not an audit. Pair Nyx with clang-tidy, the Clang Static Analyzer, or Infer for production use. -**What now works** (added in late April): +**What works:** - STL container flow. `vec.push_back(tainted)` followed by `vec.front().c_str()` carries taint into a downstream `system()` sink. @@ -216,8 +217,8 @@ Clang Static Analyzer, or Infer for production use. `void (*fn)(char *)` resolves to no callee, so cross-pointer flows are invisible. - Array-element taint by index. Writes to `buf[i]` do not always propagate - taint to `buf` as a whole; the recent subscript-handling work helps the - general case but doesn't make `buf` an alias for every element. + taint to `buf` as a whole; subscript-handling helps the general case but + doesn't make `buf` an alias for every element. - Nested classes beyond one level (C++ only). #### C: 100% P / 100% R / 100% F1 *(30-case corpus)* @@ -269,9 +270,8 @@ have moved out of the blind-spot list. Synthetic-corpus F1 is not a reliable signal for Preview-tier languages: a clean report can coexist with structural gaps. 
-(The previous **Experimental** tier was retired in the 2026-04-25 -measurement when Rust's adversarial corpus reached 100% F1; no language -currently sits in that tier.) +(No language currently sits in the **Experimental** tier; it is reserved +for future additions whose corpus has not yet stabilised.) --- diff --git a/docs/rules.md b/docs/rules.md index ea5ea07c..9582277b 100644 --- a/docs/rules.md +++ b/docs/rules.md @@ -1,6 +1,6 @@ # Rule reference -Every finding Nyx emits has a rule ID. This page enumerates the IDs that ship with scanner 0.5.0, grouped by family. +Every finding Nyx emits has a rule ID. This page enumerates the IDs that ship with the scanner, grouped by family. > This page is written by hand and drifts against the code. Authoritative sources: [`src/patterns/.rs`](https://github.com/elicpeter/nyx/tree/master/src/patterns) for AST patterns, [`src/labels/.rs`](https://github.com/elicpeter/nyx/tree/master/src/labels) for taint matchers, and [`src/auth_analysis/config.rs`](https://github.com/elicpeter/nyx/blob/master/src/auth_analysis/config.rs) for auth rules. If a rule fires that isn't listed here, the source file is right and this page is wrong. diff --git a/docs/serve.md b/docs/serve.md index 24d59bd7..648ce2ab 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -82,7 +82,7 @@ Modifiers in the ±5 range nudge the result for trend (only after the second sca It's a Nyx-finding-pressure metric, not a security audit. Score 100 means Nyx didn't find anything under its current rules and language coverage; it doesn't certify the absence of vulnerabilities. The score doesn't see runtime config, IAM, secret stores, dependency CVEs, or anything outside the source tree being scanned. A repo of mostly Kotlin (where Nyx coverage is thin) will score artificially well because most of the code never gets evaluated. -The current ceilings are calibrated for v0.5 scanner false-positive rates. As symex coverage and rule precision improve, the ceilings tighten. 
Calibration data and the rationale behind each tunable lives in [health-score-audit.md](health-score-audit.md). +Ceilings are calibrated for the current scanner false-positive rates. As symex coverage and rule precision improve, the ceilings tighten. Calibration data and the rationale behind each tunable lives in [health-score-audit.md](health-score-audit.md). ### Findings and Finding detail diff --git a/frontend/package.json b/frontend/package.json index fe3a6141..3a4b9c6c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,7 +1,7 @@ { "name": "nyx-frontend", "private": true, - "version": "0.5.0", + "version": "0.6.0", "license": "GPL-3.0-or-later", "type": "module", "scripts": { diff --git a/src/abstract_interp/path_domain.rs b/src/abstract_interp/path_domain.rs index 3888df1e..d46aa3c3 100644 --- a/src/abstract_interp/path_domain.rs +++ b/src/abstract_interp/path_domain.rs @@ -207,6 +207,34 @@ impl PathFact { !self.is_bottom && self.dotdot == Tri::No && self.absolute == Tri::No } + /// True iff the fact proves the path stays inside a trusted region + /// for path-traversal purposes (the FILE_IO sink-suppression + /// predicate). + /// + /// Accepts either of two structural invariants: + /// + /// * `dotdot = No && absolute = No` — the relative-and-`..`-free + /// shape recognised by [`is_path_safe`]. Cannot escape to an + /// attacker-controlled absolute location. + /// * `dotdot = No && prefix_lock.is_some()` — a canonicalised path + /// (typically `File.expand_path` / `realpath` / `fs::canonicalize`) + /// that has been verified-rooted by a `starts_with`-style guard + /// against some prefix. The prefix may be opaque + /// ([`OPAQUE_PREFIX_LOCK`]); the structural guarantee is the same: + /// the path is provably inside the locked subtree. + /// + /// This relaxation closes the rswag CVE-2023-38337 patched-counterpart + /// FP shape (`File.expand_path(File.join(root, p)) + start_with? 
root`) + /// and the equivalent Python (`os.path.realpath + .startswith(root)`) + /// and JS (`path.resolve + .startsWith(root)`) idioms, all of which + /// produce absolute paths but are sound against `..` traversal. + pub fn is_path_traversal_safe(&self) -> bool { + if self.is_bottom || self.dotdot != Tri::No { + return false; + } + self.absolute == Tri::No || self.prefix_lock.is_some() + } + /// True iff the fact has a prefix lock equal to or contained under /// `root`. Used by sink-suppression to confirm that a path derived /// from a locked root is provably still under that root. @@ -391,6 +419,16 @@ pub enum PathAssertion { None, } +/// Sentinel root attached to a [`PathFact::prefix_lock`] when the +/// `starts_with`-style guard's argument is non-literal (a method call, +/// field access, configured root from the application). The structural +/// invariant — "verified rooted under SOME prefix" — is what the sink- +/// suppression layer needs; the *exact* prefix bytes are not. Combined +/// with a `dotdot=No` proof from canonicalisation or `..`-rejection, an +/// opaque prefix-lock is sufficient to prove the path stays inside a +/// trusted region. +pub const OPAQUE_PREFIX_LOCK: &str = "__nyx_opaque_prefix__"; + /// Recognise a Rust path-rejection branch idiom from the raw condition text. /// /// Accepts both atomic conditions (`x.contains("..")`) and multi-clause @@ -449,6 +487,22 @@ pub fn classify_path_rejection_axes(text: &str) -> smallvec::SmallVec<[PathRejec out } +/// True iff any top-level OR clause of `text` is the pre-negated +/// `!filepath.IsLocal()` Go idiom — i.e. a clause whose `!` is +/// already consumed by [`classify_path_rejection_axes`] when reporting +/// the safe arm. Callers use this to decide whether AST-level negation +/// (`condition_negated`) was already accounted for by the classifier +/// (returns `true`) or still needs to flip the safe-arm polarity for +/// polarity-blind atoms like `!path.contains("..")` (returns `false`). 
+pub(crate) fn cond_has_pre_negated_islocal_clause(text: &str) -> bool { + for clause in split_top_level_or(text) { + if has_negated_filepath_is_local(clause.trim()) { + return true; + } + } + false +} + /// Detect `!filepath.IsLocal()`, Go's idiomatic path-traversal /// guard. Whitespace-tolerant: `! filepath.IsLocal(`, `!filepath . IsLocal(`, /// etc. Used by [`classify_path_rejection_axes`] to inject both @@ -651,19 +705,39 @@ fn split_top_level_or(text: &str) -> smallvec::SmallVec<[&str; 4]> { out } -/// Recognise a Rust path-positive-assertion branch idiom. +/// Recognise a path-positive-assertion branch idiom (language-agnostic). +/// +/// Returns: +/// +/// * `PrefixLock()` when the condition is a `starts_with`-style +/// call with a literal prefix of length ≥ 2. Sibling single-character +/// prefixes (`"/"`, `"\\"`) are absolute-axis rejections, not locks. +/// * `PrefixLock(`[`OPAQUE_PREFIX_LOCK`]`)` when the call has a +/// non-empty, *non-literal* argument (method call, field access, local +/// variable). The opaque marker certifies the structural invariant +/// "verified rooted under some prefix" without committing to bytes, +/// which is exactly what FILE_IO sink-suppression needs to combine with +/// a `dotdot=No` proof — the upstream code path +/// `File.expand_path(...) + start_with?()` is the +/// motivating example. +/// * `None` otherwise. pub fn classify_path_assertion(text: &str) -> PathAssertion { let trimmed = text.trim(); - if let Some(needle) = extract_starts_with_arg(trimmed) { - // Positive assertion: a literal-prefix `starts_with` on a locked - // root. Sibling slash ("/") and backslash ("\\") are also - // classified as rejections above; prefix-lock only fires when the - // prefix is multi-character (i.e. carries real locking info). 
- if needle.len() >= 2 { - return PathAssertion::PrefixLock(needle); + match extract_starts_with_arg(trimmed) { + Some(needle) if needle.len() >= 2 => PathAssertion::PrefixLock(needle), + // Single-char literal (`"/"`, `"\\"`) is an absolute-axis + // rejection idiom handled by `classify_path_rejection_axes`, not + // a positive prefix-lock — fall through to None. + Some(_) => PathAssertion::None, + // No literal recovered: check for a non-literal argument + // (method call, field access, configured root) and attach the + // opaque marker so the structural "verified rooted under SOME + // prefix" invariant is recorded for downstream sink suppression. + None if has_starts_with_call_with_nonempty_arg(trimmed) => { + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) } + None => PathAssertion::None, } - PathAssertion::None } /// Recognise a *structural* one-argument enum-variant constructor. @@ -1136,6 +1210,69 @@ fn extract_starts_with_arg(text: &str) -> Option { None } +/// Detect a `starts_with`-style call with a non-empty argument, where the +/// argument is *not* recovered as a string literal by +/// [`extract_starts_with_arg`] (so it's a method call, field access, local +/// variable, etc.). Used by [`classify_path_assertion`] to attach an +/// opaque prefix-lock when the application validates with a configured +/// root rather than an inline string literal. +/// +/// Whitespace-tolerant. Conservative: returns `false` for any shape where +/// the argument cannot be confirmed non-empty. +fn has_starts_with_call_with_nonempty_arg(text: &str) -> bool { + // Method-call forms with parens. The argument-presence check is + // simple: after the opening `(`, the first non-whitespace byte must + // not be `)` (empty arg list). 
+ for method in [ + ".starts_with(", + ".start_with?(", + ".startsWith(", + ".startswith(", + ] { + if let Some(idx) = text.find(method) { + let after = &text[idx + method.len()..]; + if first_non_ws_byte(after).is_some_and(|b| b != b')') { + return true; + } + } + } + // Ruby paren-less call: `r.start_with? <arg>`. Tree-sitter still + // serialises the source text verbatim, so a space (or tab) follows + // the `?`. Require a non-empty, non-clause-terminator token after. + if let Some(idx) = text.find(".start_with?") { + let rest = &text[idx + ".start_with?".len()..]; + // Skip the `(` form (already covered above) and any whitespace. + let after = rest.trim_start(); + if !after.is_empty() { + let first = after.as_bytes()[0]; + // `(` belongs to the parenthesised form; clause terminators + // (`&&` / `||` / `)` / `]` / `;` / `,`) mean the call has no + // arguments at this position. + if !matches!(first, b'(' | b'&' | b'|' | b')' | b']' | b';' | b',') { + return true; + } + } + } + // Go free-function form `strings.HasPrefix(<s>, <prefix>)`. The + // second argument must exist and be non-empty. + if let Some(idx) = text.find("strings.HasPrefix(") { + let inner = &text[idx + "strings.HasPrefix(".len()..]; + if let Some(comma_idx) = top_level_comma(inner) { + let after_comma = inner[comma_idx + 1..].trim_start(); + if !after_comma.is_empty() && !after_comma.starts_with(')') { + return true; + } + } + } + false +} + +/// Return the first non-whitespace byte of `text`, or `None` if the slice +/// is empty or all-whitespace. +fn first_non_ws_byte(text: &str) -> Option<u8> { + text.bytes().find(|b| !b.is_ascii_whitespace()) +} + /// Find the index of the first top-level `,` in a slice (depth 0, ignoring /// commas inside nested parentheses, brackets, braces, or string literals). /// Returns `None` if no top-level comma is present. 
@@ -1716,6 +1853,109 @@ mod tests { ); } + #[test] + fn assertion_opaque_prefix_lock_method_call_arg() { + // rswag CVE-2023-38337 patched shape: `start_with?` with a + // configured-root method call as argument. The exact bytes are + // unknown to the analyser, but the structural invariant "rooted + // under SOME prefix" is captured via the opaque marker. + assert_eq!( + classify_path_assertion("filename.start_with? @config.resolve_swagger_root(env)"), + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) + ); + } + + #[test] + fn assertion_opaque_prefix_lock_paren_method_call() { + // Same shape, parenthesised: `r.start_with?(some_root)`. + assert_eq!( + classify_path_assertion("filename.start_with?(@config.root)"), + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) + ); + } + + #[test] + fn assertion_opaque_prefix_lock_python_startswith() { + // Python: `os.path.realpath(p).startswith(safe_root)` where + // `safe_root` is a local variable, not a literal. + assert_eq!( + classify_path_assertion("p.startswith(safe_root)"), + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) + ); + } + + #[test] + fn assertion_opaque_prefix_lock_js_starts_with() { + assert_eq!( + classify_path_assertion("resolved.startsWith(uploadsDir)"), + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) + ); + } + + #[test] + fn assertion_opaque_prefix_lock_go_hasprefix() { + assert_eq!( + classify_path_assertion("strings.HasPrefix(p, safeRoot)"), + PathAssertion::PrefixLock(OPAQUE_PREFIX_LOCK.to_string()) + ); + } + + #[test] + fn assertion_no_lock_on_empty_arg() { + // `r.starts_with()` (degenerate) should not produce a lock. 
+ assert_eq!( + classify_path_assertion("r.starts_with()"), + PathAssertion::None + ); + } + + #[test] + fn is_path_traversal_safe_relative_dotdot_free() { + let f = PathFact::default() + .with_dotdot_cleared() + .with_absolute_cleared(); + assert!(f.is_path_traversal_safe()); + } + + #[test] + fn is_path_traversal_safe_canonicalised_with_prefix_lock() { + // `File.expand_path + start_with?(root)` shape: dotdot=No, + // absolute=Yes, prefix_lock=Some. The relaxed predicate should + // accept this even though the strict `is_path_safe` rejects it. + let f = PathFact::default() + .with_dotdot_cleared() + .with_prefix_lock("__nyx_opaque_prefix__"); + assert!(!f.is_path_safe(), "absolute axis still Maybe blocks strict"); + // Setting absolute=Yes via expand_path-style transfer: + let mut f2 = f.clone(); + f2.absolute = Tri::Yes; + assert!(!f2.is_path_safe(), "absolute=Yes blocks strict predicate"); + assert!( + f2.is_path_traversal_safe(), + "prefix_lock + dotdot=No is sufficient under relaxed predicate" + ); + } + + #[test] + fn is_path_traversal_safe_rejects_dotdot_maybe() { + let f = PathFact::default().with_prefix_lock("/var/app/"); + // dotdot still Maybe — relaxed predicate must still reject. + assert!(!f.is_path_traversal_safe()); + } + + #[test] + fn is_path_traversal_safe_rejects_absolute_without_lock() { + let mut f = PathFact::default().with_dotdot_cleared(); + f.absolute = Tri::Yes; + // No prefix_lock — relaxed predicate must reject. 
+ assert!(!f.is_path_traversal_safe()); + } + + #[test] + fn is_path_traversal_safe_rejects_bottom() { + assert!(!PathFact::bottom().is_path_traversal_safe()); + } + #[test] fn primitive_canonicalize_normalises() { let f = classify_path_primitive("fs::canonicalize", &PathFact::top()).unwrap(); diff --git a/src/ast.rs b/src/ast.rs index 6d3d12b2..ec621951 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -420,17 +420,72 @@ fn build_taint_diag( // rule paths (Java HTTP clients where type-qualified resolution // attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call, // e.g. `client.send(req)` covering both URL and body channels of the - // request value) produce a single dual-cap event. In that case the - // source's sensitivity tier disambiguates: a Sensitive source - // (cookie, header, env, db, session) leaking into an outbound - // request is canonically DATA_EXFIL even if the sink also carries - // an SSRF label, because operator-bound state is not URL-shaped - // attacker input. Plain user input keeps SSRF routing (the typical - // user-controlled-URL pattern). + // request value) produce a single dual-cap event. Disambiguate using + // the flow path: when a body-bind verb (`.body(`, `.json(`, `.form(`, + // `.multipart(`, `BodyPublishers`, `setEntity`, `bodyValue`, etc.) + // appears anywhere in the SSA flow steps or the sink chain text, the + // taint reached an outbound payload field, route to DATA_EXFIL. When + // no body-bind verb is on the path (Sensitive-tier source flowing + // straight into the URL position via `.get`/`.post`/`.send`), this is + // a real SSRF and routes to taint-unsanitised-flow regardless of + // source sensitivity. Source sensitivity is still required for the + // DATA_EXFIL route, plain user input echoed into a request body is + // not exfiltration. 
+ let flow_has_body_bind = { + let body_bind_substrings = [ + ".body(", + ".json(", + ".form(", + ".multipart(", + ".bodyvalue(", + ".setentity(", + "bodypublishers", + "body_string", + "body_json", + "body_bytes", + "send_string", + "send_json", + "send_form", + // Spring RestTemplate one-shot verbs that take a body argument + // inline (no separate `BodyPublishers` / `setEntity` step in the + // chain). Method-name suffixes are unique enough that bare + // substring matching is safe. + "postforobject", + "postforentity", + "patchforobject", + ]; + let chain_lower = call_site_callee.to_ascii_lowercase(); + let in_sink = body_bind_substrings.iter().any(|m| chain_lower.contains(m)); + let in_steps = finding.flow_steps.iter().any(|step| { + cfg_graph[step.cfg_node] + .call + .callee + .as_deref() + .map(|c| { + let lc = c.to_ascii_lowercase(); + body_bind_substrings.iter().any(|m| lc.contains(m)) + }) + .unwrap_or(false) + }); + in_sink || in_steps + }; + // Java HTTP-client builder pattern hides the body-bind step inside a + // builder chain whose intermediate calls collapse to `HttpRequest.build` + // in the flow. When the source is unambiguously credential-bearing + // (cookies, session attributes, caught exceptions carrying stack + // frames) and the sink fires DATA_EXFIL, treat that as exfil even + // when no body-bind verb is visible in the flow. Env vars stay + // ambiguous (they often carry URL config) so they still require an + // explicit body-bind hit on the path. 
+ let source_is_credential_bearing = matches!( + finding.source_kind, + crate::labels::SourceKind::Cookie | crate::labels::SourceKind::CaughtException + ); let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL) && !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) && (!effective_caps.contains(crate::labels::Cap::SSRF) - || finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive); + || (finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive + && (flow_has_body_bind || source_is_credential_bearing))); let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) { "rs.auth.missing_ownership_check.taint".to_string() @@ -916,6 +971,45 @@ impl<'a> ParsedSource<'a> { { continue; } + // Layer E: C++ `reinterpret_cast(x)` when T is a + // type explicitly defined as safe by the C++ aliasing + // rules — byte-pointer family (`char*`, `unsigned + // char*`, `uint8_t*`, `std::byte*`, etc., per + // [basic.lval]/11), `void*`, the integer round-trip + // types `uintptr_t` / `intptr_t`, and the BSD-socket + // `sockaddr` family (POSIX intentionally type-puns + // `sockaddr*` <-> `sockaddr_in*` etc.). A pattern + // rule cannot tell these from genuinely dangerous + // strict-aliasing UB casts, so it over-fires + // dramatically on serialization, hashing, and + // socket-API code where the cast is the canonical + // (and standard-blessed) idiom. + if self.lang_slug == "cpp" + && is_cpp_cast_target_type_safe(cq.meta.id, cap.node, self.bytes) + { + continue; + } + // Layer F: PHP `md5()` / `sha1()` flagged as weak hash + // functions, but used in a non-cryptographic context + // (ETag generation, cache-key / array-index hashing, + // identifier fingerprinting, deduplication). The + // pattern rule cannot distinguish weak-hash crypto + // misuse from these idiomatic uses, so it over-fires + // on every `md5(...)` callsite regardless of the + // surrounding consuming context. 
Suppress when the + // call's *consuming context* yields a name that + // matches a recognised non-cryptographic identifier + // pattern (variable / field / array-key / method + // suffix). Genuine weak-hash crypto misuse — + // `$password_hash = md5(...)`, `$signature = md5(...)`, + // `$tokenHash = md5(...)` — keeps firing because the + // name contains an excluded crypto-keyword substring. + if (cq.meta.id == "php.crypto.md5" || cq.meta.id == "php.crypto.sha1") + && self.lang_slug == "php" + && is_php_weak_hash_non_crypto_use(cap.node, self.bytes) + { + continue; + } let point = cap.node.start_position(); out.push(Diag { path: self.path.to_string_lossy().into_owned(), @@ -2603,6 +2697,778 @@ fn is_string_literal_with_text(node: tree_sitter::Node, text: &str, bytes: &[u8] false } +/// C++-only Layer E: structural suppression of `cpp.memory.reinterpret_cast` +/// when the cast's target type is explicitly defined as safe by the C++ +/// aliasing rules. +/// +/// `reinterpret_cast(x)` is *not* always undefined behaviour — the C++ +/// standard ([basic.lval]/11) explicitly permits accessing any object +/// representation through a pointer to `char`, `unsigned char`, or +/// `std::byte` (and, by long-standing convention, `int8_t` / `uint8_t`). +/// `void*` is similarly safe because reads / writes are illegal through it +/// (the program must always cast back before dereferencing). The integer +/// round-trip `uintptr_t` / `intptr_t` is guaranteed lossless by the +/// standard. POSIX additionally type-puns the `sockaddr` family — the +/// BSD-socket API takes `struct sockaddr *` and the program must cast from +/// `sockaddr_in*` / `sockaddr_in6*` / `sockaddr_un*` / `sockaddr_storage*`, +/// which is the API's intended use. 
+/// +/// The pattern rule `cpp.memory.reinterpret_cast` cannot distinguish these +/// well-defined casts from genuinely dangerous strict-aliasing UB casts +/// (`reinterpret_cast(buf)`), so it over-fires by ~70% on +/// real-repo serialization, hashing, IPC, and socket-API code where the +/// cast is the canonical (and standard-blessed) idiom. Suppressing the +/// well-defined target-type set is a layer-2 structural fix (per the +/// bughunt depth hierarchy): the engine recognises the property +/// (well-defined target type) that makes the cast safe in C++ and +/// suppresses based on it. Genuine strict-aliasing risk casts (target is +/// a user struct / class type) keep firing. +/// +/// Shapes recognised (any pointer depth `>= 1` unless noted): +/// - `char*`, `signed char*`, `unsigned char*`, `wchar_t*` +/// - `uint8_t*`, `int8_t*`, `std::byte*`, `byte*` +/// - `void*` +/// - `uintptr_t`, `std::uintptr_t`, `intptr_t`, `std::intptr_t` (no +/// pointer depth required — the standard guarantees the lossless +/// round-trip even for the integer form) +/// - `sockaddr*`, `struct sockaddr*`, `sockaddr_in*`, `sockaddr_in6*`, +/// `sockaddr_un*`, `sockaddr_storage*` (any of the BSD-socket +/// address-structure family) +/// +/// Conservative refusals (kept firing): user-defined struct / class +/// pointer targets, template type parameters (`T*`), and any target the +/// normaliser cannot identify. +fn is_cpp_cast_target_type_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool { + if rule_id != "cpp.memory.reinterpret_cast" { + return false; + } + // `cap_node` is the `(identifier) @n` "reinterpret_cast" capture (the + // pattern's index-0 capture, by query-string order — see Layer A's + // `c.index == 0` selection in `run_ast_queries`). Walk up via + // `find_enclosing_call` to reach the outer `call_expression`. Its + // `function` field is a `template_function` whose `arguments` field is + // the `template_argument_list` carrying the target type. 
+ let call = find_enclosing_call(cap_node); + let Some(call) = call else { return false }; + let func = call.child_by_field_name("function"); + let Some(func) = func else { return false }; + if func.kind() != "template_function" { + return false; + } + let targs = func.child_by_field_name("arguments"); + let Some(targs) = targs else { return false }; + if targs.kind() != "template_argument_list" { + return false; + } + let Ok(text) = std::str::from_utf8(&bytes[targs.byte_range()]) else { + return false; + }; + let inner = text + .trim() + .trim_start_matches('<') + .trim_end_matches('>') + .trim(); + cpp_cast_target_type_is_safe(inner) +} + +/// Normalise a C++ cast target type string and report whether it names a +/// well-defined-by-aliasing-rules type per the policy in +/// [`is_cpp_cast_target_type_safe`]. Public to the module so the unit +/// tests can pin the canonical and adversarial shapes. +pub(crate) fn cpp_cast_target_type_is_safe(s: &str) -> bool { + // Collapse all internal whitespace (tabs, newlines, multiple spaces) + // to single spaces so the normalised form is `const char *` with one + // space between every token. + let normalised: String = { + let mut out = String::with_capacity(s.len()); + let mut prev_ws = true; + for ch in s.chars() { + if ch.is_whitespace() { + if !prev_ws { + out.push(' '); + prev_ws = true; + } + } else { + out.push(ch); + prev_ws = false; + } + } + out.trim().to_string() + }; + let Some(base) = strip_pointer_and_cv(&normalised) else { + return false; + }; + // Pointer-indirection depth = count of `*` tokens in the normalised + // form (whitespace already collapsed; compound forms with parens / + // brackets / templates are filtered by `strip_pointer_and_cv`). + let depth = normalised.chars().filter(|c| *c == '*').count(); + + // Depth 0 (value cast): only the pointer<->integer round-trip types + // are well-defined. 
Aliasing *through* a `uintptr_t*` / `intptr_t*` + // is **not** covered by the standard exemption — only converting a + // pointer value to/from the integer type is defined behaviour + // ([basic.compound]/3). Therefore we accept these names only at + // depth 0. + if depth == 0 { + return matches!( + base.as_str(), + "uintptr_t" | "intptr_t" | "std::uintptr_t" | "std::intptr_t" + ); + } + + // Depth >= 2 (pointer-to-pointer and beyond) is never safe: the + // [basic.lval]/11 aliasing exemption is for accessing an object's + // representation as bytes through a single pointer indirection. + // Reading a `char*` object through a `char**` is a strict-aliasing + // violation, and the same logic applies to `void**`, `uint8_t**`, + // etc. + if depth != 1 { + return false; + } + + // Depth 1: standard aliasing exemption for byte-view access plus + // POSIX socket type-punning and the opaque `void*` target. + matches!( + base.as_str(), + "char" + | "signed char" + | "unsigned char" + | "wchar_t" + | "uint8_t" + | "int8_t" + | "std::byte" + | "byte" + | "void" + | "sockaddr" + | "struct sockaddr" + | "sockaddr_in" + | "sockaddr_in6" + | "sockaddr_un" + | "sockaddr_storage" + | "struct sockaddr_in" + | "struct sockaddr_in6" + | "struct sockaddr_un" + | "struct sockaddr_storage" + ) +} + +/// Strip a single C++ cast target's leading/trailing `const`/`volatile` +/// qualifiers and trailing `*` characters (any depth). Returns the bare +/// base type identifier on success. Returns `None` if anything left over +/// after pointer/cv stripping is not a plain identifier or scoped name +/// (e.g. function-pointer `void(*)(int)` or template `vector<int>`). +fn strip_pointer_and_cv(s: &str) -> Option<String> { + let mut t: &str = s.trim(); + // Strip leading `const` / `volatile`, possibly multiple. 
+ loop { + let after = t + .strip_prefix("const ") + .or_else(|| t.strip_prefix("volatile ")); + match after { + Some(rest) => t = rest.trim_start(), + None => break, + } + } + // Repeatedly strip trailing `*` and trailing cv-qualifiers in either + // order — `T*`, `T* const`, `T*const`, `T const*`, `T**`, `const T*` + // are all reachable. The loop terminates when neither suffix + // matches. + loop { + let mut progressed = false; + // Strip trailing const/volatile that appears AFTER any `*` or + // before the first `*` (e.g. `T const`). Forms: ` const`, ` volatile`. + loop { + let after = t + .trim_end() + .strip_suffix(" const") + .or_else(|| t.trim_end().strip_suffix(" volatile")); + match after { + Some(rest) => { + t = rest; + progressed = true; + } + None => break, + } + } + // Strip trailing `*`s. + let trimmed = t.trim_end(); + if let Some(stripped) = trimmed.strip_suffix('*') { + t = stripped; + progressed = true; + } + if !progressed { + break; + } + } + let base = t.trim(); + if base.is_empty() { + return None; + } + // Refuse anything that contains characters typical of compound + // type forms we don't want to reason about: parens (function + // pointer), angle brackets (template instantiation), brackets + // (array), commas (multiple arguments). Accept identifier + // characters, `_`, `:` (for `std::byte`), spaces (for `unsigned + // char` / `struct sockaddr`). + for ch in base.chars() { + if !(ch.is_ascii_alphanumeric() || ch == '_' || ch == ':' || ch == ' ') { + return None; + } + } + Some(base.to_string()) +} + +/// PHP-only Layer F: structural suppression of `php.crypto.md5` / +/// `php.crypto.sha1` when the call's *consuming context* yields a name +/// that matches a recognised non-cryptographic identifier pattern. +/// +/// The pattern rule fires syntactically on every `md5(...)` / +/// `sha1(...)` callsite regardless of how the result is used. 
In real +/// PHP code these functions are pervasively used for non-cryptographic +/// purposes — ETag generation (HTTP cache validators), array/cache-key +/// hashing, dedup fingerprints, content addressing for templates — and +/// those uses do not realise the "weak hash function" risk the rule +/// names. Suppress only when the consuming context yields a name from +/// a recognised non-crypto suffix set, while keeping every callsite +/// whose name contains a crypto-keyword substring (`password`, +/// `secret`, `token`, `signature`, `hmac`, `digest`, `salt`, …). +/// +/// Consuming contexts inspected (walk up through transparent wrappers +/// — `binary_expression` for concat / equality, `parenthesized_expression`, +/// `conditional_expression`, `argument`): +/// - `assignment_expression` (covers `=`, `??=`, `+=`, …) — resolve +/// the LHS to a final identifier (variable name, member-access +/// property name, or string-literal subscript index). +/// - `array_element_initializer` — the key is a string literal whose +/// contents are the consuming name. +/// - `subscript_expression` where the call sits in the index position +/// — using a hash as an array index is intrinsically non-crypto. +/// - `return_statement` — resolve the enclosing +/// `function_definition` / `method_declaration` name (with the +/// conventional `get` prefix stripped). +/// +/// All other consuming forms (bare expression statements, comparison +/// operands without an LHS, lambda returns, arguments to user-defined +/// helpers) keep firing. 
+fn is_php_weak_hash_non_crypto_use(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool { + let call = if cap_node.kind() == "function_call_expression" { + cap_node + } else { + let mut cur = cap_node; + let mut found = None; + for _ in 0..4 { + if cur.kind() == "function_call_expression" { + found = Some(cur); + break; + } + match cur.parent() { + Some(p) => cur = p, + None => break, + } + } + match found { + Some(c) => c, + None => return false, + } + }; + + let mut cur = call; + let mut steps = 0u32; + while let Some(parent) = cur.parent() { + if steps > 16 { + return false; + } + steps += 1; + match parent.kind() { + // Transparent wrappers — keep walking to find the + // consumer. These node kinds preserve the value flowing + // out of the md5/sha1 call without transforming its + // semantics, so we let the OUTER context (LHS name, + // array key, return method, etc.) classify the use. + // + // - `binary_expression`: concat (`'foo_' . md5($x)`), + // equality (`md5($x) === $stored`), arithmetic. + // - `parenthesized_expression`: redundant parens. + // - `conditional_expression`: `$cond ? md5($x) : ''`. + // - `argument` / `arguments`: positional / wrapped arg + // lists — the enclosing call (`substr(md5($x), 0, 8)`, + // `$q->createNamedParameter(md5($x))`) is what matters. + // - `function_call_expression`: identity-shaped wrappers + // such as `substr(...)`, `strtolower(...)`, + // `urlencode(...)` which propagate the hash to its + // real consumer. + // - `encapsed_string`: `"prefix-{md5($x)}"` interpolation. + // + // `member_call_expression` / `nullsafe_member_call_expression` + // are NOT in this transparent set — they have their own + // arm below that performs lookup-verb classification on + // the method name (`->get(md5($k))`, `->set(...)`, …) + // before optionally falling through to the outer + // consumer. 
+ "binary_expression" + | "parenthesized_expression" + | "conditional_expression" + | "argument" + | "arguments" + | "function_call_expression" + | "encapsed_string" => {} + "assignment_expression" | "augmented_assignment_expression" => { + let lhs = parent + .child_by_field_name("left") + .or_else(|| parent.named_child(0)); + let Some(lhs) = lhs else { + return false; + }; + return resolve_php_lvalue_name(lhs, bytes) + .map(|n| name_is_non_crypto(&n)) + .unwrap_or(false); + } + "array_element_initializer" => { + if parent.named_child_count() < 2 { + return false; + } + let key = parent.named_child(0); + let Some(key) = key else { + return false; + }; + let Some(key_text) = string_literal_text(key, bytes) else { + return false; + }; + return name_is_non_crypto(&key_text); + } + "subscript_expression" => { + // tree-sitter-php: subscript_expression has the receiver as + // the first named child and the index as the second. If our + // call sits past the receiver's end byte, we are the index. + let r0 = parent.named_child(0); + let Some(r0) = r0 else { + cur = parent; + continue; + }; + if call.start_byte() >= r0.end_byte() { + return true; + } + // Otherwise we're inside the receiver chain; the surrounding + // `assignment_expression` (if any) will resolve the LHS name. + } + "member_call_expression" | "nullsafe_member_call_expression" => { + // The md5/sha1 result is being passed as an argument to a + // method call. When the method name is a recognised + // key/cache/lookup verb (`get`, `set`, `has`, `delete`, + // `fetch`, `store`, `find`, `getItem`, `setItem`, …), the + // result is being used as a non-cryptographic lookup key — + // canonical for cache backends, hash maps, and storage + // adapters where the developer is hashing arbitrary input + // to a fixed-length, character-safe key. Genuine + // crypto-comparison wrappers (`hash_equals`, `verify`, + // `password_verify`) keep firing because their method + // name does not match the verb set. 
+ let name_node = parent.child_by_field_name("name").or_else(|| { + // Fallback: last named child is the method name. + let count = parent.named_child_count(); + if count == 0 { + None + } else { + parent.named_child(count as u32 - 1) + } + }); + if let Some(nn) = name_node + && nn.kind() == "name" + && let Ok(method) = std::str::from_utf8(&bytes[nn.byte_range()]) + && method_is_lookup_verb(method) + { + return true; + } + // Otherwise treat as transparent so the OUTER consumer can + // classify (`$x = $cache->get(sha1($k))` resolves LHS `x`). + } + "return_statement" => { + let mut p = parent; + for _ in 0..10 { + let Some(pp) = p.parent() else { + return false; + }; + p = pp; + let kind = p.kind(); + if kind == "method_declaration" || kind == "function_definition" { + let Some(nn) = p + .child_by_field_name("name") + .or_else(|| find_named_child_of_kind(p, "name")) + else { + return false; + }; + let Ok(name) = std::str::from_utf8(&bytes[nn.byte_range()]) else { + return false; + }; + return method_name_is_non_crypto(name); + } + if kind == "anonymous_function" + || kind == "arrow_function" + || kind == "anonymous_function_creation_expression" + { + return false; + } + } + return false; + } + // Halt at scope / statement boundaries we cannot resolve through. + "expression_statement" + | "compound_statement" + | "method_declaration" + | "function_definition" + | "anonymous_function" + | "anonymous_function_creation_expression" + | "arrow_function" + | "program" => return false, + _ => return false, + } + cur = parent; + } + false +} + +/// Resolve the final identifier of a PHP l-value expression to a string +/// suitable for [`name_is_non_crypto`] classification. 
+/// +/// Handles: +/// - `$variable` (`variable_name` → inner name child) +/// - `$obj->property` (`member_access_expression` → name field) +/// - `$arr['literal_key']` (`subscript_expression` → string-literal index) +/// - `Class::$static` / `self::$prop` (`scoped_property_access_expression`) +/// +/// Returns `None` for unrecognised l-value shapes (dynamic property +/// access, computed indices, function-call l-values, etc.); the caller +/// then falls back to keeping the finding. +fn resolve_php_lvalue_name(lhs: tree_sitter::Node, bytes: &[u8]) -> Option { + let lhs = unwrap_php_paren(lhs); + match lhs.kind() { + "variable_name" => { + let name_node = lhs.named_child(0)?; + std::str::from_utf8(&bytes[name_node.byte_range()]) + .ok() + .map(String::from) + } + "member_access_expression" => { + let n = lhs.child_by_field_name("name").or_else(|| { + let count = lhs.named_child_count(); + if count == 0 { + None + } else { + lhs.named_child(count as u32 - 1) + } + })?; + // Property access can name a `name` (bare ident) or a + // `variable_name` (dynamic ${$x} — which we don't resolve). + if n.kind() == "name" { + std::str::from_utf8(&bytes[n.byte_range()]) + .ok() + .map(String::from) + } else { + None + } + } + "subscript_expression" => { + if lhs.named_child_count() >= 2 { + let idx = lhs.named_child(1)?; + if let Some(txt) = string_literal_text(idx, bytes) { + return Some(txt); + } + } + // Dynamic / non-literal index: recurse into the receiver + // so `$columnNamesHashes[$col]` resolves to + // `columnNamesHashes`. This handles canonical + // `$lookup_by_hash[$key] = md5($key)` shapes. + let r = lhs.named_child(0)?; + resolve_php_lvalue_name(r, bytes) + } + "scoped_property_access_expression" => { + let count = lhs.named_child_count(); + if count == 0 { + return None; + } + let prop = lhs.named_child(count as u32 - 1)?; + // The static property is a `variable_name`. Reuse this + // function recursively to extract the bare name. 
+ resolve_php_lvalue_name(prop, bytes) + } + _ => None, + } +} + +/// Return the textual contents of a PHP string literal node (`string` +/// or `encapsed_string`), stripping surrounding quotes. Returns `None` +/// for any non-string node and for interpolated `encapsed_string`s +/// containing template variables. +fn string_literal_text(node: tree_sitter::Node, bytes: &[u8]) -> Option { + if node.kind() != "string" && node.kind() != "encapsed_string" { + return None; + } + if has_interpolation(node) { + return None; + } + for i in 0..node.named_child_count() as u32 { + if let Some(c) = node.named_child(i) + && (c.kind() == "string_content" || c.kind() == "string_value") + { + return std::str::from_utf8(&bytes[c.byte_range()]) + .ok() + .map(String::from); + } + } + if let Ok(s) = std::str::from_utf8(&bytes[node.byte_range()]) { + let trimmed = s.trim_matches(|c| c == '\'' || c == '"'); + return Some(trimmed.to_string()); + } + None +} + +fn unwrap_php_paren(mut node: tree_sitter::Node) -> tree_sitter::Node { + for _ in 0..4 { + if node.kind() == "parenthesized_expression" + && let Some(inner) = node.named_child(0) + { + node = inner; + continue; + } + break; + } + node +} + +/// Classify a PHP identifier as non-cryptographic by name. Two-tier +/// check: any name containing a crypto-keyword substring is hard-rejected +/// (kept as a finding); the remaining names are accepted when their +/// form ends in a recognised non-crypto suffix at a word boundary +/// (underscore, digit, camelCase transition) or via a long-enough +/// stand-alone suffix (≥4 chars). +/// +/// The crypto-keyword exclude list uses substring match (not just +/// suffix) so compound names like `hashedPassword` / `tokenHash` / +/// `sigStore` are conservatively kept. False rejections of safe +/// shapes are acceptable; false acceptances of crypto shapes are not. 
+pub(crate) fn name_is_non_crypto(name: &str) -> bool { + if name.is_empty() { + return false; + } + let lower = name.to_ascii_lowercase(); + static CRYPTO_EXCLUDES: &[&str] = &[ + "password", + "passwd", + "pw_hash", + "pwhash", + "pwdhash", + "pwd_hash", + "passhash", + "pass_hash", + "secret", + "token", + "signature", + "signed", + "hmac", + "digest", + "verifier", + "challenge", + "csrf", + "salt", + "nonce_secret", + "auth_code", + "authcode", + "auth_key", + "authkey", + "private", + "credential", + "creds", + "encryption", + "decryption", + "encryptkey", + "decryptkey", + "encrypt_key", + "decrypt_key", + "apikey", + "api_key", + ]; + for ex in CRYPTO_EXCLUDES { + if lower.contains(ex) { + return false; + } + } + // `sig` / `mac` are excluded only at word boundaries — the substrings + // appear in legitimate non-crypto names (`signal`, `unsigned`, + // `assignee`, `design`, `magic`). + if lower == "sig" || lower.ends_with("_sig") || lower.ends_with("sig_") { + return false; + } + if lower == "mac" || lower.ends_with("_mac") { + return false; + } + // Permissive safe-suffix recognition. + static SAFE_SUFFIXES: &[&str] = &[ + "hash", + "hashes", + "etag", + "etags", + "md5", + "sha1", + "fingerprint", + "fingerprints", + "cachekey", + "cache_key", + "cacheid", + "cache_id", + "id", + "uid", + "uuid", + "guid", + "name_hash", + "checksum", + "slot", + "bucket", + "seed", + "marker", + "tag", + "gravatar", + "hashid", + "opaque", + "shortid", + "short_id", + "fnv", + "fingerprintkey", + "anchor", + "version", + "buster", + "cachebuster", + "cache_buster", + "revision", + "rev", + ]; + let bytes_orig = name.as_bytes(); + for s in SAFE_SUFFIXES { + if lower == *s { + return true; + } + if !lower.ends_with(s) { + continue; + } + let prev_pos = lower.len() - s.len(); + if prev_pos == 0 { + return true; + } + let prev_char_orig = bytes_orig[prev_pos - 1] as char; + // Word boundary: underscore, digit, etc. 
+ if !prev_char_orig.is_ascii_alphabetic() { + return true; + } + // CamelCase boundary: suffix starts with an uppercase letter + // in the original casing (`storageId`, `tableHash`, `sqlMd5`). + let suffix_first_orig = bytes_orig[prev_pos] as char; + if suffix_first_orig.is_ascii_uppercase() { + return true; + } + // Long stand-alone suffix (≥4 chars) — accept without boundary. + if s.len() >= 4 { + return true; + } + } + false +} + +/// Like [`name_is_non_crypto`] but with a leading `get` prefix stripped +/// to recognise the canonical `getETag` / `getHash` / `getCacheKey` +/// accessor naming convention. Pass the original-case name through so +/// downstream camelCase-boundary detection still works. +fn method_name_is_non_crypto(name: &str) -> bool { + let stripped = name + .strip_prefix("get") + .or_else(|| name.strip_prefix("Get")) + .unwrap_or(name); + if name_is_non_crypto(stripped) { + return true; + } + // Some accessors keep the prefix (e.g., `recoveryKeyId`, + // `formatPath` returning a hashed-path identifier). Also try the + // raw name for camelCase-boundary suffix detection. + name_is_non_crypto(name) +} + +/// Recognise PHP method names that signal a lookup / cache / store / +/// container key-or-value operation. When `md5(...)` / `sha1(...)` is +/// passed to such a method, the result is being used as a content- +/// addressed key — not for cryptographic strength. The verb set is +/// purposely narrow so cryptographic comparison helpers +/// (`hash_equals`, `verify`, `password_verify`, `decryptWith`) keep +/// firing. 
+fn method_is_lookup_verb(method: &str) -> bool { + let lower = method.to_ascii_lowercase(); + static VERBS: &[&str] = &[ + "get", + "set", + "has", + "delete", + "remove", + "fetch", + "store", + "put", + "save", + "exists", + "find", + "lookup", + "getitem", + "setitem", + "hasitem", + "deleteitem", + "addtag", + "addtotag", + "key", + "keyfor", + "containskey", + "haskey", + "loadbykey", + "fetchbykey", + "getbykey", + "setbykey", + "deletebykey", + "incr", + "incrby", + "decr", + "decrby", + "expire", + "ttl", + "namespacekey", + "cachekey", + ]; + if VERBS.contains(&lower.as_str()) { + return true; + } + // Composite forms like `getCacheKey`, `setCacheKey`, `getRoute` — + // very common in cache adapters, accept any name ending in one of + // a few non-crypto-typed-result suffixes preceded by a get/set/has + // verb. + static SUFFIX_HINTS: &[&str] = &[ + "cachekey", + "key", + "id", + "hash", + "etag", + "uid", + "tag", + "fingerprint", + ]; + if let Some(rest) = lower + .strip_prefix("get") + .or_else(|| lower.strip_prefix("set")) + .or_else(|| lower.strip_prefix("has")) + .or_else(|| lower.strip_prefix("create")) + .or_else(|| lower.strip_prefix("build")) + { + for h in SUFFIX_HINTS { + if rest.ends_with(h) { + return true; + } + } + } + false +} + /// Check if a string node contains interpolation (e.g., PHP `"Hello $name"`). fn has_interpolation(node: tree_sitter::Node) -> bool { for i in 0..node.child_count() as u32 { @@ -3552,6 +4418,221 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() { ); } +#[test] +fn php_weak_hash_non_crypto_use_recognises_canonical_shapes() { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP); + parser.set_language(&lang).unwrap(); + let q = r#"(function_call_expression function: (name) @n (#match? @n "^(md5|sha1)$")) @vuln"#; + + // ETag concat returned from getETag() — return-statement enclosing + // method name path. + let code = b"data) . 
'\"'; } }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + is_php_weak_hash_non_crypto_use(cap, code), + "getETag concat should be suppressed" + ); + + // Array element value with a string-literal key whose name is non-crypto. + let code = b" md5($x)]; }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + is_php_weak_hash_non_crypto_use(cap, code), + "array element with `*_hash` key should be suppressed" + ); + + // Subscript LHS with a string-literal index `'etag'`. + let code = b"storageId = md5($this->id); } }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + is_php_weak_hash_non_crypto_use(cap, code), + "member-access LHS `storageId` should be suppressed" + ); + + // Null-coalescing assignment with subscript LHS. + let code = b"get(sha1(...))`). + let code = b"cache->get(sha1($u)); } }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + is_php_weak_hash_non_crypto_use(cap, code), + "method call to lookup-verb `get(sha1(..))` should be suppressed" + ); + + // Createnamedparameter wrapper around md5 inside an array element value. + let code = b"q->insert('t')->values(['etag' => $this->q->createNamedParameter(md5($d))]); } }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + is_php_weak_hash_non_crypto_use(cap, code), + "wrapper-call inside array element with `etag` key should be suppressed" + ); + + // Dynamic-index subscript LHS with a non-crypto receiver name. + let code = b"password = md5($pwd). + let code = + b"password = md5($p); } }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_php_capture(&tree, code, q); + assert!( + !is_php_weak_hash_non_crypto_use(cap, code), + "$this->password = md5(...) 
is crypto storage and must NOT be suppressed" + ); + + // Compound name with crypto-keyword substring. $tokenHash = md5(...). + let code = b"( .node } +#[cfg(test)] +fn first_cpp_capture<'tree>( + tree: &'tree tree_sitter::Tree, + code: &[u8], + query_str: &str, +) -> tree_sitter::Node<'tree> { + use tree_sitter::StreamingIterator; + let lang = tree_sitter::Language::from(tree_sitter_cpp::LANGUAGE); + let query = tree_sitter::Query::new(&lang, query_str).expect("query compiles"); + let mut cursor = tree_sitter::QueryCursor::new(); + let mut matches = cursor.matches(&query, tree.root_node(), code); + let m = matches.next().expect("query should match"); + m.captures + .iter() + .find(|c| c.index == 0) + .expect("capture index 0") + .node +} + +#[test] +fn cpp_cast_target_type_is_safe_recognises_canonical_shapes() { + use crate::ast::cpp_cast_target_type_is_safe as f; + // Byte-pointer family — C++ explicitly permits byte-level access. + assert!(f("char*")); + assert!(f("char *")); + assert!(f("const char*")); + assert!(f("const char *")); + assert!(f("unsigned char*")); + assert!(f("const unsigned char*")); + assert!(f("signed char*")); + assert!(f("uint8_t*")); + assert!(f("const uint8_t*")); + assert!(f("int8_t*")); + assert!(f("std::byte*")); + assert!(f("const std::byte*")); + assert!(f("byte*")); + assert!(f("wchar_t*")); + // void* — well-defined target. + assert!(f("void*")); + assert!(f("const void*")); + // Integer round-trip — value cast only (depth 0). Aliasing + // *through* a `uintptr_t*` / `intptr_t*` is NOT covered by the + // standard exemption — only the pointer<->integer value + // conversion is well-defined. + assert!(f("uintptr_t")); + assert!(f("std::uintptr_t")); + assert!(f("intptr_t")); + assert!(f("std::intptr_t")); + // BSD socket family — POSIX intentionally type-puns these. 
+ assert!(f("sockaddr*")); + assert!(f("struct sockaddr*")); + assert!(f("sockaddr_in*")); + assert!(f("sockaddr_in6*")); + assert!(f("sockaddr_un*")); + assert!(f("sockaddr_storage*")); + + // Multi-token / extra whitespace — normaliser should collapse it. + assert!(f("const uint8_t *")); + assert!(f("uint8_t * const")); + assert!(f("const unsigned char *")); + + // Pointer-to-pointer is NOT covered by the [basic.lval]/11 + // aliasing exemption — accessing a `char*` object through a + // `char**` is a strict-aliasing violation. Same for `void**`, + // `uint8_t**`, etc. + assert!(!f("char**")); + assert!(!f("uint8_t**")); + assert!(!f("void**")); + assert!(!f("void **")); + // Pointer-to-integer-roundtrip-type (`uintptr_t*`, `intptr_t*`) + // is also not safe: only the pointer<->integer **value** cast is + // well-defined, not aliasing through a pointer-to-uintptr_t. + assert!(!f("uintptr_t*")); + assert!(!f("intptr_t*")); + assert!(!f("std::uintptr_t*")); + + // Non-safe shapes — must NOT be suppressed. + assert!(!f("MyStruct*")); + assert!(!f("InstanceType*")); + assert!(!f("DBImpl*")); + assert!(!f("C*")); + assert!(!f("CPP*")); + assert!(!f("T*")); + assert!(!f("secp256k1_keypair*")); + assert!(!f("PIP_ADAPTER_ADDRESSES")); + assert!(!f("std::vector*")); + assert!(!f("void(*)(int)")); + assert!(!f("char[10]")); + // Bare integer (no pointer) is only safe for the round-trip + // types — `int`, `size_t`, `uint64_t` should NOT match. + assert!(!f("int")); + assert!(!f("size_t")); + assert!(!f("uint64_t")); + assert!(!f("char")); // bare char without pointer + assert!(!f("uint8_t")); // bare uint8_t without pointer +} + +#[test] +fn cpp_reinterpret_cast_layer_e_recognises_byte_pointer_targets() { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_cpp::LANGUAGE); + parser.set_language(&lang).unwrap(); + let q = r#"(call_expression + function: (template_function + name: (identifier) @n (#eq? 
@n "reinterpret_cast"))) + @vuln"#; + + // reinterpret_cast(p) — the leveldb / serialization shape. + let code = b"void f(int* p) { auto q = reinterpret_cast(p); (void)q; }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must be suppressed (byte-pointer target)" + ); + + // reinterpret_cast(p) — qualified scoped name. + let code = b"#include \nvoid f(int* p) { auto q = reinterpret_cast(p); (void)q; }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must be suppressed" + ); + + // reinterpret_cast(0x08000000) — synthetic-address shape. + let code = b"void* f() { return reinterpret_cast(0x08000000); }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must be suppressed (synthetic address)" + ); + + // reinterpret_cast(p) — integer round-trip. + let code = + b"#include \nuintptr_t f(int* p) { return reinterpret_cast(p); }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must be suppressed (integer round-trip)" + ); + + // reinterpret_cast(&addr) — POSIX socket-API shape. 
+ let code = b"struct sockaddr_in { int x; };\nstruct sockaddr;\nvoid f(struct sockaddr_in* a) { auto* s = reinterpret_cast(a); (void)s; }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must be suppressed (BSD socket pun)" + ); + + // reinterpret_cast(buf) — strict-aliasing UB risk, must NOT + // be suppressed. + let code = b"struct MyStruct { int a; };\nMyStruct* f(char* buf) { return reinterpret_cast(buf); }\n"; + let tree = parser.parse(code, None).unwrap(); + let cap = first_cpp_capture(&tree, code, q); + assert!( + !is_cpp_cast_target_type_safe("cpp.memory.reinterpret_cast", cap, code), + "reinterpret_cast must NOT be suppressed (genuine strict-aliasing risk)" + ); + + // Other rule ids are unaffected. + assert!( + !is_cpp_cast_target_type_safe("cpp.memory.const_cast", cap, code), + "Layer E must only fire for cpp.memory.reinterpret_cast" + ); +} + #[test] fn c_buffer_call_literal_safe_recognises_canonical_shapes() { let mut parser = tree_sitter::Parser::new(); diff --git a/src/cfg/cfg_tests.rs b/src/cfg/cfg_tests.rs index de7edc0d..e63796e5 100644 --- a/src/cfg/cfg_tests.rs +++ b/src/cfg/cfg_tests.rs @@ -85,6 +85,77 @@ fn inner_call_override_narrows_classification_span() { ); } +/// Ruby (and any language without an `expression_statement` wrapper) +/// reaches `push_node` with `ast.kind() == "call"` (`Kind::CallMethod`) +/// for top-level statement-position calls. The inner-call fallback at +/// `push_node` line ~1690 must include `Kind::CallFn | Kind::CallMethod +/// | Kind::CallMacro` in its kind gate, otherwise an unclassified outer +/// wrapper around a sink (e.g. `YAML.safe_load(File.read(filename))`, +/// `String.new(File.read(x))`, `JSON.parse(File.read(x))` — every +/// chain-style sink wrapper used in real Ruby helpers) loses the inner +/// sink's classification entirely. 
Cross-function summary extraction +/// then misses the wrapper's `param_to_sink` and downstream callers +/// silently lose detection. Regression guard for CVE-2023-38337 +/// (rswag-api `parse_file → load_yaml/load_json → File.read` chain) +/// and CVE-2021-21288 (CarrierWave `download → OpenURI.open_uri`). +#[test] +fn ruby_inner_call_fallback_classifies_wrapper_around_file_read() { + let src = b"def f(x)\n YAML.safe_load(File.read(x))\nend\n"; + let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE); + let (cfg, _entry) = parse_and_build(src, "ruby", ts_lang); + + // The outer call `YAML.safe_load(...)` does not classify by itself; + // the fallback must descend into its argument list and pick up the + // inner `File.read(x)` Sink(FILE_IO) label. + let sink = cfg + .node_indices() + .find(|&i| cfg[i].call.callee.as_deref() == Some("File.read")) + .expect( + "inner-call fallback should override the outer YAML.safe_load callee with File.read", + ); + + let info = &cfg[sink]; + assert!( + info.taint + .labels + .iter() + .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::FILE_IO))), + "wrapper-around-File.read node must carry the FILE_IO sink label" + ); + // outer_callee should preserve the original callee text so cross-fn + // summary lookup can still find the wrapping function. + assert_eq!( + info.call.outer_callee.as_deref(), + Some("YAML.safe_load"), + "outer_callee must preserve the original wrapping callee" + ); +} + +/// Identical-shape regression guard for the *bare-function* call +/// variant (`outer(File.read(x))`) — exercises the `Kind::CallFn` +/// branch of the gate, where Ruby/Python/etc.'s top-level free +/// function calls lacking a method receiver land. 
+#[test] +fn ruby_inner_call_fallback_classifies_bare_outer_around_file_read() { + let src = b"def f(x)\n outer(File.read(x))\nend\n"; + let ts_lang = Language::from(tree_sitter_ruby::LANGUAGE); + let (cfg, _entry) = parse_and_build(src, "ruby", ts_lang); + + let sink = cfg + .node_indices() + .find(|&i| cfg[i].call.callee.as_deref() == Some("File.read")) + .expect("inner-call fallback must override `outer` callee with File.read"); + + let info = &cfg[sink]; + assert!( + info.taint + .labels + .iter() + .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::FILE_IO))), + "wrapper-around-File.read node must carry FILE_IO sink label" + ); +} + /// `classification_span()` must fall back to `ast.span` when no narrower /// sub-expression was recorded, so existing structural code paths keep /// working unchanged for nodes whose classification applies to the whole diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 428e4dd2..0ab57237 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -1681,12 +1681,31 @@ pub(super) fn push_node<'a>( // When the callee is overridden, save the original for container ops // (e.g. `parts.add(req.getParameter(...))`, callee becomes // "req.getParameter" but outer_callee preserves "parts.add"). + // + // Statement-level calls in languages without a separate + // `expression_statement` wrapper (Ruby, where `body_statement` directly + // contains the call AST node) reach `push_node` with `ast.kind() == + // "call"` (`Kind::CallMethod`) rather than `Kind::CallWrapper`. Without + // including the call kinds in the gate, an unclassified outer wrapper + // around a sink (e.g. `YAML.safe_load(File.read(filename))` or + // `String.new(File.read(x))`) loses the inner sink's classification + // entirely — the outer call becomes a non-sink node, and the inner call + // is not emitted as a standalone CFG node because it sits inside the + // outer's `argument_list`. 
Cross-function summary extraction then + // misses the `param_to_sink` for the wrapper helper, breaking detection + // of every chain-style sink wrapper used in real Ruby CVEs (rswag + // CVE-2023-38337, the Marshal/JSON/YAML-of-File.read pattern, etc.). let mut outer_callee: Option = None; let mut inner_callee_span: Option<(usize, usize)> = None; if labels.is_empty() && matches!( lookup(lang, ast.kind()), - Kind::CallWrapper | Kind::Assignment | Kind::Return + Kind::CallWrapper + | Kind::Assignment + | Kind::Return + | Kind::CallFn + | Kind::CallMethod + | Kind::CallMacro ) && let Some((inner_text, inner_label, inner_span)) = find_classifiable_inner_call(ast, lang, code, extra) diff --git a/src/labels/mod.rs b/src/labels/mod.rs index 39d7b877..26d58b33 100644 --- a/src/labels/mod.rs +++ b/src/labels/mod.rs @@ -576,6 +576,7 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind { || cl.contains("form") || cl.contains("query") || cl.contains("params") + || cl.contains("param") || cl.contains("input") || cl.contains("body") || cl.contains("location") @@ -1691,6 +1692,16 @@ mod tests { assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF))); } + #[test] + fn classify_ruby_openuri_open_uri_is_ssrf_sink() { + // OpenURI.open_uri is the canonical low-level URI fetcher that + // URI.open delegates to. CarrierWave / Paperclip / similar gems + // route SSRF-vulnerable downloads through it directly. + // CVE-2021-21288 (CarrierWave) regression guard. 
+ let result = classify("ruby", "OpenURI.open_uri", None); + assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF))); + } + #[test] fn unpack_matcher_strips_exact_sigil() { let (m, exact) = unpack_matcher(b"=open"); diff --git a/src/labels/ruby.rs b/src/labels/ruby.rs index 0dc5d1ac..90656daa 100644 --- a/src/labels/ruby.rs +++ b/src/labels/ruby.rs @@ -127,11 +127,15 @@ pub static RULES: &[LabelRule] = &[ }, // URI.open is the network-capable Kernel#open wrapper, more specific than // plain `open` (excluded to avoid file I/O false positives). + // OpenURI.open_uri is the canonical low-level URI fetcher that URI.open + // delegates to — every SSRF-vulnerable Ruby download helper (CarrierWave + // pre-2.1.1 / 1.3.2, Paperclip, etc.) ultimately reaches it. LabelRule { matchers: &[ "Net::HTTP.get", "Net::HTTP.post", "URI.open", + "OpenURI.open_uri", "HTTParty.get", "HTTParty.post", ], diff --git a/src/patterns/javascript.rs b/src/patterns/javascript.rs index bd8fa418..6f720009 100644 --- a/src/patterns/javascript.rs +++ b/src/patterns/javascript.rs @@ -255,6 +255,7 @@ pub const PATTERNS: &[Pattern] = &[ confidence: Confidence::High, }, // ── Tier A: Hardcoded fallback secret ────────────────────────────── + // Empty-string fallback (`|| ""`) is excluded — see typescript.rs for rationale. Pattern { id: "js.secrets.fallback_secret", description: "Environment variable with secret-like name has hardcoded fallback value", @@ -266,7 +267,7 @@ pub const PATTERNS: &[Pattern] = &[ property: (property_identifier) @key (#match? @key "(?i)(secret|password|key|token)")) operator: "||" - right: (string) @fallback) + right: (string) @fallback (#match? 
@fallback "[^\"']")) @vuln"#, severity: Severity::Medium, tier: PatternTier::A, diff --git a/src/patterns/typescript.rs b/src/patterns/typescript.rs index cf052bc9..b8e13184 100644 --- a/src/patterns/typescript.rs +++ b/src/patterns/typescript.rs @@ -244,6 +244,10 @@ pub const PATTERNS: &[Pattern] = &[ confidence: Confidence::High, }, // ── Tier A: Hardcoded fallback secret ────────────────────────────── + // The `(#match? @fallback "[^\"']")` predicate excludes empty-string + // fallbacks (`process.env.X || ""`), which are the dominant FP shape + // in production TypeScript: developers write `|| ""` to satisfy the + // non-undefined string type without committing a real secret. Pattern { id: "ts.secrets.fallback_secret", description: "Environment variable with secret-like name has hardcoded fallback value", @@ -255,7 +259,7 @@ pub const PATTERNS: &[Pattern] = &[ property: (property_identifier) @key (#match? @key "(?i)(secret|password|key|token)")) operator: "||" - right: (string) @fallback) + right: (string) @fallback (#match? @fallback "[^\"']")) @vuln"#, severity: Severity::Medium, tier: PatternTier::A, diff --git a/src/symex/strings.rs b/src/symex/strings.rs index 4cae38cd..78d9309f 100644 --- a/src/symex/strings.rs +++ b/src/symex/strings.rs @@ -1037,6 +1037,75 @@ pub fn detect_replace_sanitizer( } } +/// Detect a call-site Replace sanitizer from syntactic argument literals. +/// +/// Used by SSA transfer to recognize replace-based shell/HTML/SQL escapers +/// without requiring a label rule per pattern. Returns the sanitized caps +/// when: +/// * the callee is a recognized Replace string method (per language), +/// * the pattern argument is a concrete string literal, and +/// * the pattern matches a security-relevant escape pattern in +/// [`detect_replace_sanitizer`]. +/// +/// Non-global replaces (e.g. 
JS `s.replace(";", "")` only replaces the first +/// occurrence) are excluded because partial replacement does not provide a +/// sanitiser-strength guarantee at the call site. +pub fn detect_call_site_replace_sanitizer( + callee: &str, + lang: Lang, + arg_string_literals: &[Option], +) -> Option { + let pattern_pos = pattern_arg_position(callee, lang)?; + let pattern = arg_string_literals + .get(pattern_pos) + .and_then(|o| o.as_deref())?; + let replacement = arg_string_literals + .get(pattern_pos + 1) + .and_then(|o| o.as_deref()) + .unwrap_or(""); + let info = detect_replace_sanitizer(pattern, replacement, callee, lang)?; + if !info.is_global || info.sanitized_caps.is_empty() { + return None; + } + Some(info.sanitized_caps) +} + +fn pattern_arg_position(callee: &str, lang: Lang) -> Option { + let method = bare_method_name(callee); + match lang { + Lang::JavaScript | Lang::TypeScript => match method { + "replace" | "replaceAll" => Some(0), + _ => None, + }, + Lang::Python => match method { + "replace" => Some(0), + "sub" if callee == "re.sub" => Some(0), + _ => None, + }, + Lang::Ruby => match method { + "gsub" | "sub" => Some(0), + _ => None, + }, + Lang::Java => match method { + "replace" | "replaceAll" => Some(0), + _ => None, + }, + Lang::Go => match callee { + "strings.Replace" | "strings.ReplaceAll" => Some(1), + _ => None, + }, + Lang::Php => match callee { + "str_replace" => Some(0), + _ => None, + }, + Lang::Rust => match method { + "replace" | "replacen" => Some(0), + _ => None, + }, + _ => None, + } +} + /// Determine whether a replace call is global (replaces all occurrences). 
fn is_global_replace(callee: &str, lang: Lang) -> bool { let method = bare_method_name(callee); diff --git a/src/taint/path_state.rs b/src/taint/path_state.rs index 692dca3d..c655f48a 100644 --- a/src/taint/path_state.rs +++ b/src/taint/path_state.rs @@ -566,6 +566,57 @@ fn count_call_args(text: &str) -> Option { Some(count) } +/// Extract the first top-level argument from `args_part`, the substring +/// immediately following the open paren of a call expression. Walks +/// paren/bracket/brace depth and skips quoted strings so nested calls and +/// punctuation inside string literals do not confuse the scan. Returns +/// the trimmed argument substring up to the first top-level `,` or +/// matching `)`, or `None` when no balanced close paren is found. +/// +/// Robust against trailing wrapper parens such as +/// `(!ALLOWED.includes(cmd))` where naïve `strip_suffix(')')` would leave +/// `cmd)` and lose the argument. +fn first_call_arg(args_part: &str) -> Option<&str> { + let bytes = args_part.as_bytes(); + let mut depth: usize = 1; + let mut end: Option = None; + let mut first_comma: Option = None; + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + match b { + b'(' | b'[' | b'{' => depth += 1, + b')' | b']' | b'}' => { + depth -= 1; + if depth == 0 { + end = Some(i); + break; + } + } + b',' if depth == 1 && first_comma.is_none() => first_comma = Some(i), + b'"' | b'\'' => { + let quote = b; + i += 1; + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + i += 2; + continue; + } + if bytes[i] == quote { + break; + } + i += 1; + } + } + _ => {} + } + i += 1; + } + let end = end?; + let cut = first_comma.unwrap_or(end); + Some(args_part[..cut].trim()) +} + /// Extract the validated variable from a condition text. 
/// /// Handles two patterns: @@ -592,11 +643,10 @@ fn extract_validation_target(text: &str) -> Option { } } - // Function call pattern: `func(x, ...)`, extract first argument - // Strip closing paren if present - let args_inner = args_part.trim_end().strip_suffix(')').unwrap_or(args_part); - // Take text up to first comma (first argument) - let first_arg = args_inner.split(',').next()?.trim(); + // Function call pattern: `func(x, ...)`, extract first argument with + // balanced-paren scan so trailing wrapper parens (`(validate(x))`) do + // not corrupt the argument substring. + let first_arg = first_call_arg(args_part)?; // Strip reference operators (e.g. `&x` → `x`) let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim(); @@ -630,11 +680,11 @@ fn extract_allowlist_target(text: &str) -> Option { if let Some(pos) = lower.find(method) { let args_start = pos + method.len(); let args_part = &trimmed[args_start..]; - let inner = args_part.strip_suffix(')').unwrap_or(args_part); - let first_arg = inner.split(',').next()?.trim(); - let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); - if !first_arg.is_empty() && is_identifier(first_arg) { - return Some(first_arg.to_string()); + if let Some(first_arg) = first_call_arg(args_part) { + let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); + if !first_arg.is_empty() && is_identifier(first_arg) { + return Some(first_arg.to_string()); + } } } } @@ -643,11 +693,11 @@ fn extract_allowlist_target(text: &str) -> Option { if let Some(pos) = lower.find("in_array(") { let args_start = pos + "in_array(".len(); let args_part = &trimmed[args_start..]; - let inner = args_part.strip_suffix(')').unwrap_or(args_part); - let first_arg = inner.split(',').next()?.trim(); - let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg); - if !first_arg.is_empty() && is_identifier(first_arg) { - return Some(first_arg.to_string()); + if let Some(first_arg) = first_call_arg(args_part) { + let first_arg = 
first_arg.strip_prefix('$').unwrap_or(first_arg); + if !first_arg.is_empty() && is_identifier(first_arg) { + return Some(first_arg.to_string()); + } } } @@ -1063,6 +1113,32 @@ mod tests { ); } + #[test] + fn extract_allowlist_target_negated_paren_wrapper() { + // Tree-sitter records the if-condition as `(!ALLOWED.includes(cmd))`, + // including the surrounding parens. Naïve `strip_suffix(')')` left + // `cmd)` and `is_identifier` rejected the trailing `)`, dropping the + // structural guard for `cfg-unguarded-sink` suppression. The + // balanced-paren scan must return `Some("cmd")`. + let (kind, target) = classify_condition_with_target("(!ALLOWED.includes(cmd))"); + assert_eq!(kind, PredicateKind::AllowlistCheck); + assert_eq!(target.as_deref(), Some("cmd")); + } + + #[test] + fn extract_allowlist_target_java_contains_paren_wrapper() { + let (kind, target) = classify_condition_with_target("(!ALLOWED.contains(cmd))"); + assert_eq!(kind, PredicateKind::AllowlistCheck); + assert_eq!(target.as_deref(), Some("cmd")); + } + + #[test] + fn extract_allowlist_target_in_array_paren_wrapper() { + let (kind, target) = classify_condition_with_target("(!in_array($cmd, $allowed))"); + assert_eq!(kind, PredicateKind::AllowlistCheck); + assert_eq!(target.as_deref(), Some("cmd")); + } + // ── TypeCheck classification ────────────────────────────────────── #[test] diff --git a/src/taint/ssa_transfer/events.rs b/src/taint/ssa_transfer/events.rs index df9c0cf7..eccf8755 100644 --- a/src/taint/ssa_transfer/events.rs +++ b/src/taint/ssa_transfer/events.rs @@ -296,16 +296,16 @@ pub fn ssa_events_to_findings( crate::taint::ssa_transfer::state::record_all_validated_span(span); // Mirror the path-safety pathway: when the SSA engine has - // already proved every tainted input to a privileged - // FILE_IO sink passed through validation, publish the sink - // span so the state-analysis pass suppresses - // `state-unauthed-access` on the same span. 
Trust here - // matches the trust the engine already extends when - // dropping the taint flow finding. Scoped to FILE_IO sinks - // because that is the only sink class state-unauthed-access - // currently fires on; broadening would risk stretching - // validator-name heuristics into unrelated finding classes. - if event.sink_caps.intersects(Cap::FILE_IO) { + // already proved every tainted input to a privileged sink + // passed through validation, publish the sink span so the + // state-analysis pass suppresses `state-unauthed-access` + // on the same span. Trust here matches the trust the + // engine already extends when dropping the taint flow + // finding. Covers the privileged sink classes + // [`is_privileged_sink`] keys on (FILE_IO + SHELL_ESCAPE); + // broadening past those would stretch the validator-trust + // heuristic into unrelated finding classes. + if event.sink_caps.intersects(Cap::FILE_IO | Cap::SHELL_ESCAPE) { crate::taint::ssa_transfer::state::record_path_safe_suppressed_span(span); } continue; diff --git a/src/taint/ssa_transfer/mod.rs b/src/taint/ssa_transfer/mod.rs index a893d512..ff07c614 100644 --- a/src/taint/ssa_transfer/mod.rs +++ b/src/taint/ssa_transfer/mod.rs @@ -987,6 +987,7 @@ fn compute_succ_states( &effective_vars, ssa, Some(transfer.interner), + effective_negated, ); // Validation-call err-check narrowing. When the condition @@ -1522,7 +1523,13 @@ fn resolve_var_to_ssa_value(var_name: &str, ssa: &SsaBody, block: BlockId) -> Op /// variables) and updates its [`PathFact`] according to the classified /// rejection / assertion idiom. /// -/// Gated on `transfer.lang == Lang::Rust` by the caller. +/// `negated` reflects the effective negation of `cond_text`: when true, +/// the condition's surface form is `!` (or `not `) +/// and the True/False successor states correspond to the *rejection* / +/// *surviving* arms inverted relative to the unwrapped condition. 
The +/// narrowing functions are written against the unwrapped condition; this +/// flag lets the caller route prefix-lock / rejection-axis narrowing to +/// the arm where the unwrapped condition holds. #[cfg(test)] fn apply_path_fact_branch_narrowing( true_state: &mut SsaTaintState, @@ -1538,6 +1545,7 @@ fn apply_path_fact_branch_narrowing( effective_vars, ssa, None, + false, ); } @@ -1548,10 +1556,12 @@ fn apply_path_fact_branch_narrowing_with_interner( effective_vars: &[String], ssa: &SsaBody, interner: Option<&SymbolInterner>, + negated: bool, ) { use crate::abstract_interp::PathFact; use crate::abstract_interp::path_domain::{ PathAssertion, PathRejection, classify_path_assertion, classify_path_rejection_axes, + cond_has_pre_negated_islocal_clause, }; let rejection_axes = classify_path_rejection_axes(cond_text); @@ -1561,24 +1571,44 @@ fn apply_path_fact_branch_narrowing_with_interner( return; } - // Mark validated_may on the false branch when a path-rejection + // Resolve the "safe arm" for the rejection axes. + // + // `classify_path_rejection_axes` reports axes that hold on the FALSE + // branch of `cond_text` AS WRITTEN, with one exception: the + // `!filepath.IsLocal(...)` Go idiom is matched at the clause level + // and the classifier consumes the leading `!` itself (the safe arm + // remains the FALSE branch of the whole condition). + // + // For polarity-blind atoms like `!path.contains("..")`, the + // classifier ignores the leading `!` and still extracts `..`. In + // that shape, AST detects the unary `!` and sets + // `condition_negated = true`, but the rejection axis's *true* safe + // arm is the TRUE branch of the whole condition. So when + // `negated == true` AND no clause is the pre-negated IsLocal idiom, + // flip the narrow target. 
+ let rejection_pre_negated = cond_has_pre_negated_islocal_clause(cond_text); + let rejection_safe_is_true = negated && !rejection_pre_negated; + + // Mark validated_may on the safe arm when a path-rejection // pattern fires. Mirrors the AllowlistCheck quirk that already // marks validated on the rejection-arm via `apply_branch_predicates` // for languages whose `.contains(...)` / membership idiom hits the // AllowlistCheck classifier, but normalises behaviour for shapes // like C `strstr(path, "..") != NULL` that hit the NullCheck arm // first and never get a chance to mark validation through the - // allowlist path. Once the path-rejection classifier has accepted - // the condition, the false branch (where the sink is reached after - // the rejection-arm terminates) is the validated arm by - // construction. + // allowlist path. if !rejection_axes.is_empty() && let Some(intern) = interner { + let safe_state: &mut SsaTaintState = if rejection_safe_is_true { + &mut *true_state + } else { + &mut *false_state + }; for var in effective_vars { if let Some(sym) = intern.get(var) { - false_state.validated_may.insert(sym); - false_state.validated_must.insert(sym); + safe_state.validated_may.insert(sym); + safe_state.validated_must.insert(sym); } } } @@ -1632,15 +1662,47 @@ fn apply_path_fact_branch_narrowing_with_interner( } }; + // Apply rejection axes to the safe arm. The rejection classifier + // (`has_negated_filepath_is_local` + `classify_path_rejection_atom`) + // reports axes that hold on the FALSE branch of `cond_text` AS + // WRITTEN, with one exception: the `!filepath.IsLocal(...)` Go idiom + // is matched at the clause level and the classifier consumes the + // leading `!` itself (safe arm remains the FALSE branch). + // + // For polarity-blind atoms like `!path.contains("..")` the classifier + // ignores the leading `!` but AST-level negation flips the safe arm + // to TRUE. 
Use the same `rejection_safe_is_true` resolution as the + // validated-marker block above so soundness is consistent. + let rejection_state: &mut SsaTaintState = if rejection_safe_is_true { + &mut *true_state + } else { + &mut *false_state + }; for v in &targets { - if let Some(ref mut abs) = false_state.abstract_state { + if let Some(ref mut abs) = rejection_state.abstract_state { let mut av = abs.get(*v); narrow_false(&mut av.path); if !av.is_top() { abs.set(*v, av); } } - if let Some(ref mut abs) = true_state.abstract_state { + } + + // Apply prefix-lock assertion to the cond-holds branch. Unlike the + // rejection classifier, `classify_path_assertion` is naive about + // leading negation — it just searches cond_text for a + // `starts_with`-like substring. When `condition_negated` is true + // (e.g. `if !target.startsWith(ROOT) { return; }`) the assertion + // actually holds on the *false* CFG edge, where the sink is reached. + // Flip the destination state in that case so the lock attaches to + // the surviving block. + let assertion_state = if negated { + &mut *false_state + } else { + &mut *true_state + }; + for v in &targets { + if let Some(ref mut abs) = assertion_state.abstract_state { let mut av = abs.get(*v); narrow_true(&mut av.path); if !av.is_top() { @@ -3024,6 +3086,80 @@ pub(super) fn transfer_inst( return; } + // Chain-wrapper sanitiser detection. Computed up-front so + // both the container-element-write hook and the outer- + // callee taint suppression block below can consult it. + // Walks `info.arg_callees` for the chain shape + // `outer(... wrapper() ...)`, collecting any + // sanitiser caps the wrapper's summary or label exposes. + // The set is empty when there is no chain wrapper or when + // none of the wrappers expose sanitisation. 
+ // + // Argument attribution: when `find_classifiable_inner_call` + // overrode the callee to an inner Source, the source can be + // either (a) a direct argument call (`outer(escape(x), + // source())`) or (b) nested inside one wrapper + // (`outer(escape(source(x)))`). Crediting any wrapper's + // sanitizer caps when the source sits in a different argument + // position would suppress real taint flow. + // + // * `source_arg_pos = Some(N)` — the source call is the + // immediate callee of arg N (`arg_callees[N] == callee`). + // No other-arg wrapper can sanitize it. Credit nothing. + // * `source_arg_pos = None` — the source is nested inside + // some arg's wrapper. Credit only when exactly one arg + // has a sanitizing wrapper, since that one must be the + // parent of the nested source. Multiple sanitizing + // wrappers across different positions is ambiguous; stay + // conservative and credit nothing. + let caller_func_for_chain = info.ast.enclosing_func.as_deref().unwrap_or(""); + let mut chain_wrapper_sanitizer_caps = Cap::empty(); + if !info.arg_callees.is_empty() { + let source_arg_pos = info + .arg_callees + .iter() + .position(|c| c.as_deref() == Some(callee.as_str())); + let mut per_arg_sanitizer_caps: SmallVec<[Cap; 4]> = SmallVec::new(); + for (idx, maybe_callee) in info.arg_callees.iter().enumerate() { + if Some(idx) == source_arg_pos { + continue; + } + let Some(wrap_callee) = maybe_callee else { + continue; + }; + if Some(wrap_callee.as_str()) == info.call.outer_callee.as_deref() { + continue; + } + let mut caps_here = Cap::empty(); + if let Some(resolved) = resolve_callee_hinted( + transfer, + wrap_callee, + caller_func_for_chain, + info.call.call_ordinal, + None, + ) { + caps_here |= resolved.sanitizer_caps; + } else { + let labels = crate::labels::classify_all( + transfer.lang.as_str(), + wrap_callee, + transfer.extra_labels, + ); + for lbl in &labels { + if let DataLabel::Sanitizer(bits) = lbl { + caps_here |= *bits; + } + } + } + if 
!caps_here.is_empty() { + per_arg_sanitizer_caps.push(caps_here); + } + } + if source_arg_pos.is_none() && per_arg_sanitizer_caps.len() == 1 { + chain_wrapper_sanitizer_caps = per_arg_sanitizer_caps[0]; + } + } + // Container element-write hook. Runs before other Call-arm // processing so `try_container_propagation`'s early-return // can't bypass us. Writes only into `(loc, ELEM)` cells on @@ -3033,8 +3169,48 @@ pub(super) fn transfer_inst( // through: cell `must = AND` over args (every writer must be // must-validated), `may = OR` over args. Anonymous SSA temps // contribute `false/false` and break the `must` invariant. - if let (Some(pf), Some(rcv)) = (transfer.pointer_facts, *receiver) { - if crate::pointer::is_container_write_callee(callee) { + // + // Two callee shapes: + // * Method-style write (`receiver.push(val)`) — `receiver` + // channel resolves the container, value args start at + // position 0. + // * Go `append` builtin (or chain shape with + // `outer_callee == "append"`) — no receiver channel, + // `args[0]` is the slice itself, value args start at + // position 1. + if let Some(pf) = transfer.pointer_facts { + let go_append_chain = transfer.lang == Lang::Go + && receiver.is_none() + && (callee == "append" || info.call.outer_callee.as_deref() == Some("append")); + // For Go append, args[0] is the input slice whose + // points-to set may be empty when the slice was just + // initialised with a composite literal (`cmds := + // []string{}`). The call result (inst.value) carries + // the fresh allocation site that pointer analysis + // attaches to every Call op, and downstream uses of + // the slice flow through that result, so it is the + // authoritative container identity. Fall back to + // args[0] when the result has no pt set yet. 
+ let resolved_recv: Option = if let Some(rcv) = *receiver { + Some(rcv) + } else if go_append_chain { + let result_v = inst.value; + let result_pt = pf.pt(result_v); + if !result_pt.is_empty() && !result_pt.is_top() { + Some(result_v) + } else { + args.first().and_then(|a| a.first().copied()) + } + } else { + None + }; + let value_arg_start = if go_append_chain { 1 } else { 0 }; + let write_callee_match = if go_append_chain { + true + } else { + crate::pointer::is_container_write_callee(callee) + }; + if let (Some(rcv), true) = (resolved_recv, write_callee_match) { let pt = pf.pt(rcv); if !pt.is_empty() && !pt.is_top() { let mut elem_caps = Cap::empty(); @@ -3043,7 +3219,7 @@ pub(super) fn transfer_inst( let mut elem_must_all = true; // AND over args (vacuously true for empty args) let mut elem_may_any = false; // OR over args let mut saw_any_arg = false; - for arg_group in args { + for arg_group in args.iter().skip(value_arg_start) { for &arg_v in arg_group { saw_any_arg = true; if let Some(t) = state.get(arg_v) { @@ -3059,6 +3235,35 @@ pub(super) fn transfer_inst( elem_may_any |= av; } } + // Chain-shape Go append: the inner Source label + // fires on this same call instruction, so its + // caps are not yet on any positional arg's SSA + // value at this point. Pull them in directly + // from the source labels so the W4 cell sees + // the real source caps; without this the cell + // is empty for the chain shape and the index- + // read taint flow appears clean for the wrong + // reason. + if go_append_chain { + for lbl in &info.taint.labels { + if let DataLabel::Source(bits) = lbl { + elem_caps |= *bits; + saw_any_arg = true; + } + } + // A chain-shape sanitising wrapper around the + // source counts as the validation that the + // ELEM cell needs. 
Each entry in + // `info.arg_callees` whose summary or label + // exposes non-empty `sanitizer_caps` + // contributes to validation, the cell's + // must/may bits flip on so the index-read + // counterpart sees the value as validated. + if !chain_wrapper_sanitizer_caps.is_empty() { + elem_must_all = true; + elem_may_any = true; + } + } // Vacuous AND: a zero-arg container write supplies // no validation source, so coerce must to false. if !saw_any_arg { @@ -3204,6 +3409,20 @@ pub(super) fn transfer_inst( } } + // Call-site replace sanitizer detection. Recognises + // `s.replace*(pat, rep)` / `strings.ReplaceAll(s, pat, rep)` / + // `str_replace($pat, $rep, $s)` shapes whose pattern is a + // concrete shell/HTML/SQL escape literal and treats the call + // as a sanitizer for the corresponding caps. Mirrors the + // semantics that label-rule sanitizers already provide. + if let Some(extra) = crate::symex::strings::detect_call_site_replace_sanitizer( + callee, + transfer.lang, + &info.call.arg_string_literals, + ) { + sanitizer_bits |= extra; + } + // Resolve callee summary, always attempt, even when explicit // labels are present. Labels take precedence for source caps, but // summary propagation and sanitizer behaviour must still apply @@ -4006,7 +4225,10 @@ pub(super) fn transfer_inst( // produces return_bits. Check if the wrapper function blocks taint: // if its SSA summary shows no propagation, no source_caps, and no // container identity return, the return value is independent of its - // arguments, clear return_bits. + // arguments, clear return_bits. Additionally apply the wrapper's + // sanitizer caps (StripBits transforms) so a sanitising wrapper + // like `validate()` clears the relevant cap bits even + // when the wrapper still propagates other taint. 
if !return_bits.is_empty() && has_source_label { if let Some(ref oc) = info.call.outer_callee { if let Some(ref oc_sum) = resolve_callee_hinted( @@ -4021,11 +4243,36 @@ pub(super) fn transfer_inst( // no internal sources reaching return. return_bits = Cap::empty(); return_origins.clear(); + } else if !oc_sum.sanitizer_caps.is_empty() { + return_bits &= !oc_sum.sanitizer_caps; } } } } + // Chain-wrapper sanitizer suppression: when the chain shape + // `outer(... wrapper() ...)` puts a sanitising wrapper + // function between the inner Source and the outer call, + // mark the call result's symbol as validated so any + // downstream sink event over the same value fires with + // `all_validated = true`, suppressing the taint finding and + // (via [`record_path_safe_suppressed_span`]) the + // `state-unauthed-access` finding on the same span. + // `chain_wrapper_sanitizer_caps` is computed up-front above + // so the container-element-write hook can also consult it. + if has_source_label && !chain_wrapper_sanitizer_caps.is_empty() { + if let Some(name) = ssa + .value_defs + .get(inst.value.0 as usize) + .and_then(|vd| vd.var_name.as_deref()) + { + if let Some(sym) = transfer.interner.get(name) { + state.validated_must.insert(sym); + state.validated_may.insert(sym); + } + } + } + // Constructor cap narrowing: a `new X(...)` call returns an object // instance, not a string. Caps that name a string-shaped sink // pattern (path argument, format string, URL component, JSON @@ -7654,11 +7901,12 @@ fn is_abstract_safe_for_sink( } /// Check every tainted leaf flowing into `inst`'s used values carries a -/// PathFact proving it is dotdot-free and non-absolute. +/// PathFact proving it cannot perform path traversal. /// -/// Core gate for the rs-safe-0** FP closure (see [`PathFact::is_path_safe`]). -/// Traces through Assign chains so `Path::new(sanitised)` still resolves -/// to the sanitised string's fact. 
+/// Core gate for the rs-safe-0** FP closure plus the canonicalised+rooted +/// shape (see [`PathFact::is_path_traversal_safe`]). Traces through +/// Assign chains so `Path::new(sanitised)` still resolves to the +/// sanitised string's fact. fn is_path_safe_for_sink( inst: &SsaInst, state: &SsaTaintState, @@ -7670,7 +7918,9 @@ fn is_path_safe_for_sink( if leaves.is_empty() { return false; } - let safe = leaves.iter().all(|v| abs.get(*v).path.is_path_safe()); + let safe = leaves + .iter() + .all(|v| abs.get(*v).path.is_path_traversal_safe()); if safe { // Publish the suppression to the file-level set so the // state-analysis pass can suppress `state-unauthed-access` on @@ -7925,7 +8175,7 @@ fn trace_single_leaf( // existing trace-through-args behaviour. let proves_path_safe = state.abstract_state.as_ref().is_some_and(|abs_state| { let f = abs_state.get(v).path; - !f.is_top() && f.is_path_safe() + !f.is_top() && f.is_path_traversal_safe() }); if is_source || proves_path_safe { leaves.push(v); diff --git a/src/taint/ssa_transfer/tests.rs b/src/taint/ssa_transfer/tests.rs index 930fc1ae..18cfd491 100644 --- a/src/taint/ssa_transfer/tests.rs +++ b/src/taint/ssa_transfer/tests.rs @@ -1229,6 +1229,80 @@ mod goto_succ_propagation_tests { ); } + #[test] + fn path_fact_negated_contains_dotdot_narrows_true_branch() { + // `if !path.contains("..") { return; } sink(path);` — the surviving + // (sink-reaching) arm is the TRUE branch of the IF condition. The + // rejection axis (DotDot) must narrow `true_state`, not `false_state`, + // otherwise the unsafe arm gets dotdot=No and the sink suppression + // masks the bug. 
+ let ssa = ssa_body_with_named_value("path"); + let mut true_state = initial_state_with_abstract(); + let mut false_state = initial_state_with_abstract(); + + super::super::apply_path_fact_branch_narrowing_with_interner( + &mut true_state, + &mut false_state, + "!path.contains(\"..\")", + &["path".to_string()], + &ssa, + None, + true, + ); + + let true_abs = true_state.abstract_state.as_ref().unwrap(); + let false_abs = false_state.abstract_state.as_ref().unwrap(); + assert_eq!( + true_abs.get(SsaValue(0)).path.dotdot, + crate::abstract_interp::Tri::No, + "negated-contains: TRUE arm (sink-reaching, safe) must narrow" + ); + assert_eq!( + false_abs.get(SsaValue(0)).path.dotdot, + crate::abstract_interp::Tri::Maybe, + "negated-contains: FALSE arm (rejection arm) must NOT narrow" + ); + } + + #[test] + fn path_fact_negated_filepath_islocal_narrows_false_branch() { + // `if !filepath.IsLocal(p) { return; } sink(p);` — Go idiom. The + // classifier consumes the `!` itself (pre-negated handler), so the + // safe arm remains the FALSE branch of the whole condition even + // though `condition_negated == true` at AST level. 
+ let ssa = ssa_body_with_named_value("p"); + let mut true_state = initial_state_with_abstract(); + let mut false_state = initial_state_with_abstract(); + + super::super::apply_path_fact_branch_narrowing_with_interner( + &mut true_state, + &mut false_state, + "!filepath.IsLocal(p)", + &["p".to_string()], + &ssa, + None, + true, + ); + + let true_abs = true_state.abstract_state.as_ref().unwrap(); + let false_abs = false_state.abstract_state.as_ref().unwrap(); + assert_eq!( + false_abs.get(SsaValue(0)).path.dotdot, + crate::abstract_interp::Tri::No, + "!filepath.IsLocal: FALSE arm (sink-reaching, IsLocal=true) must narrow" + ); + assert_eq!( + false_abs.get(SsaValue(0)).path.absolute, + crate::abstract_interp::Tri::No, + "!filepath.IsLocal: FALSE arm absolute axis must narrow" + ); + assert_eq!( + true_abs.get(SsaValue(0)).path.dotdot, + crate::abstract_interp::Tri::Maybe, + "!filepath.IsLocal: TRUE arm (return) must NOT narrow" + ); + } + #[test] fn path_fact_no_match_leaves_state_untouched() { let ssa = ssa_body_with_named_value("x"); diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 9ad003eb..8b6b7d3d 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -105,11 +105,11 @@ cargo test --release --all-features --test benchmark_test -- --ignored --nocaptu and fails if the corpus rule-level metrics fall below the thresholds encoded at the bottom of `tests/benchmark_test.rs`: -| Metric | Floor | Current baseline (~432 cases) | +| Metric | Floor | Current baseline (491 cases run) | |---|---|---| -| Precision | ≥ 0.861 | 0.991 | -| Recall | ≥ 0.944 | 0.995 | -| F1 | ≥ 0.901 | 0.993 | +| Precision | ≥ 0.861 | 1.000 | +| Recall | ≥ 0.944 | 1.000 | +| F1 | ≥ 0.901 | 1.000 | The floors sit roughly 8 pp below the current baseline. 
A single-case flip is about 0.2 pp on this corpus, so the headroom absorbs honest FP/TN diff --git a/tests/benchmark/RESULTS.md b/tests/benchmark/RESULTS.md index 60a93c87..bf229a37 100644 --- a/tests/benchmark/RESULTS.md +++ b/tests/benchmark/RESULTS.md @@ -1,14 +1,14 @@ # Benchmark Results -Current baseline (2026-04-29): +Current baseline (2026-05-02): | Metric | File-level | Rule-level | CI floor | |-----------|------------|------------|----------| -| Precision | 0.996 | 0.996 | 0.861 | +| Precision | 1.000 | 1.000 | 0.861 | | Recall | 1.000 | 1.000 | 0.944 | -| F1 | 0.998 | 0.998 | 0.901 | +| F1 | 1.000 | 1.000 | 0.901 | -Corpus: 451 cases across 10 languages, 449 evaluated (no disabled). Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. +Corpus: 492 cases across 10 languages, 491 evaluated (1 disabled). Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. The corpus is mostly synthetic 8-20 line fixtures, one vulnerability or one safe pattern per file. A smaller real-CVE replay set under `cve_corpus/` covers 20 published CVEs across all 10 languages. Both contribute to the headline numbers. 
@@ -35,6 +35,8 @@ Real disclosed CVEs reduced to minimal reproducers, vulnerable + patched pair pe | CVE-2022-42889 | Java | Apache Commons Text | Apache-2.0 | code_exec | detected | | CVE-2013-0156 | Ruby | Ruby on Rails | MIT | Deserialization | detected | | CVE-2020-8130 | Ruby | Rake | MIT | CMDI | detected | +| CVE-2021-21288 | Ruby | CarrierWave | MIT | SSRF | detected | +| CVE-2023-38337 | Ruby | rswag | MIT | path_traversal | detected | | CVE-2017-9841 | PHP | PHPUnit | BSD-3-Clause | code_exec | detected | | CVE-2018-15133 | PHP | Laravel | MIT | Deserialization | detected | | CVE-2016-3714 | C | ImageMagick (ImageTragick) | ImageMagick License | CMDI | detected | @@ -65,6 +67,9 @@ Most recent first. Metrics are rule-level on the corpus size at that point. | Date | Change | Corpus | P | R | F1 | |------------|------------------------------------------------------------------------------|--------|-------|-------|-------| +| 2026-05-02 | `strings.ReplaceAll` recognised as CMDi sanitiser in chain-wrapper / call-site-replace shapes; clears `go-safe-009` (last open corpus FP); aggregate rule-level reaches P=R=F1=1.000 | 492 | 1.000 | 1.000 | 1.000 | +| 2026-05-01 | PathFact opaque-prefix-lock (`canonicalise + start_with?()` recognised across Ruby/Python/JS) + `is_path_traversal_safe` predicate + negated-form polarity flip on assertion narrowing; rswag CVE-2023-38337 detected | 490 | 0.972 | 0.992 | 0.982 | +| 2026-05-01 | Ruby `OpenURI.open_uri` SSRF sink + inner-call fallback for statement-level Ruby calls (`YAML.safe_load(File.read(x))` shape now classifies); CVE-2021-21288 (CarrierWave) detected | 482 | 0.972 | 0.992 | 0.982 | | 2026-04-29 | Java SnakeYAML + Text4Shell patterns; CVE-2022-1471 and CVE-2022-42889 detected | 449 | 0.996 | 1.000 | 0.998 | | 2026-04-29 | Indirect-validator branch narrowing (`const err = validate(x); if (err) throw …;`) + helper-summary all_validated propagation; Novu GHSA-4x48-cgf9-q33f detected | 445 | 0.991 | 1.000 | 0.995 | | 
2026-04-29 | Python f-string SQLi pattern + bindparams sanitizer + HttpClient SSRF rules; CVE-2025-69662 (geopandas) and CVE-2026-33626 (LMDeploy) detected | 439 | 0.991 | 1.000 | 0.995 | diff --git a/tests/benchmark/corpus/cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp b/tests/benchmark/corpus/cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp new file mode 100644 index 00000000..d99ee4e2 --- /dev/null +++ b/tests/benchmark/corpus/cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp @@ -0,0 +1,15 @@ +// Vulnerable counterpart to `cpp/safe/safe_reinterpret_cast_byte_pointer.cpp`. +// `reinterpret_cast<UserStruct*>(buf)` (or any user-defined struct / +// class pointer target) is a genuine strict-aliasing UB risk: the +// program writes through a pointer to one type while the underlying +// storage was written as another, violating [basic.lval]/11. The +// `cpp.memory.reinterpret_cast` pattern must continue to fire on these. + +struct UserStruct { + int a; + int b; +}; + +UserStruct* alias_byte_buffer(char* buf) { + return reinterpret_cast<UserStruct*>(buf); +} diff --git a/tests/benchmark/corpus/cpp/safe/safe_reinterpret_cast_byte_pointer.cpp b/tests/benchmark/corpus/cpp/safe/safe_reinterpret_cast_byte_pointer.cpp new file mode 100644 index 00000000..457b65ac --- /dev/null +++ b/tests/benchmark/corpus/cpp/safe/safe_reinterpret_cast_byte_pointer.cpp @@ -0,0 +1,52 @@ +// Canonical safe `reinterpret_cast(x)` shapes — Layer E in +// `src/ast.rs::is_cpp_cast_target_type_safe` recognises these as +// well-defined-by-aliasing-rules per [basic.lval]/11 and POSIX socket +// API contracts and suppresses the `cpp.memory.reinterpret_cast` +// pattern finding. 
+// +// Distilled from real-repo shapes: +// - `reinterpret_cast(...)` — bitcoin/leveldb serialization +// - `reinterpret_cast(...)` — bitcoin crc32c hashing +// - `reinterpret_cast(0x08000000)` — bitcoin lockedpool synth +// - `reinterpret_cast(...)` — bitcoin crc32c round-up +// - `reinterpret_cast(...)` — bitcoin netif BSD socket pun + +#include +#include + +struct sockaddr { + int family; +}; +struct sockaddr_in { + int family; + int port; +}; + +void serialize_to_byte_buffer(int* dst) { + auto* p = reinterpret_cast(dst); + auto* q = reinterpret_cast(dst); + auto* r = reinterpret_cast(dst); + (void)p; + (void)q; + (void)r; +} + +void hash_input_via_byte_view(const int* src) { + const auto* a = reinterpret_cast(src); + const auto* b = reinterpret_cast(src); + (void)a; + (void)b; +} + +void* make_synthetic_address() { + return reinterpret_cast(0x08000000); +} + +uintptr_t pointer_to_int(int* p) { + return reinterpret_cast(p); +} + +void bsd_socket_addr_pun(sockaddr_in* in) { + auto* generic = reinterpret_cast(in); + (void)generic; +} diff --git a/tests/benchmark/corpus/javascript/safe/safe_canonicalise_rooted_startsWith.js b/tests/benchmark/corpus/javascript/safe/safe_canonicalise_rooted_startsWith.js new file mode 100644 index 00000000..6e369e42 --- /dev/null +++ b/tests/benchmark/corpus/javascript/safe/safe_canonicalise_rooted_startsWith.js @@ -0,0 +1,20 @@ +// js-safe-canonicalise-rooted: path.resolve + .startsWith with a +// non-literal root variable (an opaque prefix-lock). Combined with +// path.resolve's dotdot=No proof, is_path_traversal_safe should suppress +// the FILE_IO sink even though the canonicalised path is absolute. 
+const fs = require("fs"); +const path = require("path"); + +const UPLOAD_ROOT = path.resolve("/srv/uploads"); + +function serveFile(req, res) { + const name = req.query.name; + const target = path.resolve(path.join(UPLOAD_ROOT, name)); + if (!target.startsWith(UPLOAD_ROOT)) { + res.status(403).end(); + return; + } + fs.readFile(target, (err, data) => res.send(data)); +} + +module.exports = { serveFile }; diff --git a/tests/benchmark/corpus/javascript/safe/safe_env_empty_fallback.js b/tests/benchmark/corpus/javascript/safe/safe_env_empty_fallback.js new file mode 100644 index 00000000..06cafcb1 --- /dev/null +++ b/tests/benchmark/corpus/javascript/safe/safe_env_empty_fallback.js @@ -0,0 +1,9 @@ +// Empty-string fallback on a secret-named env var is not a hardcoded +// secret — `js.secrets.fallback_secret` must not fire on this shape. + +const stripeApiKey = process.env.STRIPE_API_KEY || ""; +const sendgridKey = process.env.SENDGRID_API_KEY || ''; +const sessionSecret = process.env.SESSION_SECRET || ""; +const vapidPrivateKey = process.env.VAPID_PRIVATE_KEY || ""; + +module.exports = { stripeApiKey, sendgridKey, sessionSecret, vapidPrivateKey }; diff --git a/tests/benchmark/corpus/php/crypto/crypto_md5_password_hash.php b/tests/benchmark/corpus/php/crypto/crypto_md5_password_hash.php new file mode 100644 index 00000000..86577f83 --- /dev/null +++ b/tests/benchmark/corpus/php/crypto/crypto_md5_password_hash.php @@ -0,0 +1,42 @@ +password = md5($password); + } + + /** Token generation via sha1 — used as a session/credential token. */ + public function rotateToken(string $secret): string { + $token = sha1($secret . microtime(true)); + $_SESSION['csrf_token'] = $token; + return $token; + } + + /** Signature comparison value built with sha1 — explicit crypto intent. */ + public function signRequest(string $payload, string $key): string { + $signature = sha1($key . $payload); + return $signature; + } + + /** Compound `*_hash` name preceded by a crypto-keyword token. 
*/ + public function storeUser(string $username, string $pwd): void { + $pw_hash = md5($pwd); + $this->saveUser($username, $pw_hash); + } + + /** Returns a pre-shared digest used for HMAC-style comparison. */ + public function digest(string $msg, string $key): string { + return sha1($key . $msg); + } + + private function saveUser(string $u, string $pw): void {} + + public string $password = ''; +} diff --git a/tests/benchmark/corpus/php/safe/safe_md5_sha1_non_crypto_use.php b/tests/benchmark/corpus/php/safe/safe_md5_sha1_non_crypto_use.php new file mode 100644 index 00000000..f6f535c9 --- /dev/null +++ b/tests/benchmark/corpus/php/safe/safe_md5_sha1_non_crypto_use.php @@ -0,0 +1,86 @@ +data) . '"'; + } + + /** Array-element value with an ETag-flagged key. */ + public function rowFor(string $objectData): array { + return [ + 'etag' => md5($objectData), + 'size' => strlen($objectData), + ]; + } + + /** Subscript-LHS with a string-literal index. */ + public function memo(string $favoriteTableName): array { + $row = []; + $row['table_name_hash'] = md5($favoriteTableName); + return $row; + } + + /** Null-coalescing assignment with subscript LHS. */ + public function lazyHash(string $table, array &$tables): void { + $tables[$table]['hash'] ??= md5($table); + } + + /** Bare variable LHS named `*Hash` / `*Md5` / `*etag`. */ + public function trio(string $sql): array { + $sqlMd5 = md5($sql); + $tableHash = md5($sql . '.t'); + $etag = md5($sql . '.e'); + return [$sqlMd5, $tableHash, $etag]; + } + + /** Dynamic-index subscript LHS — receiver name carries the signal. */ + public function indexByCol(array $columnNames): array { + $columnNamesHashes = []; + foreach ($columnNames as $col) { + $columnNamesHashes[$col] = md5($col); + } + return $columnNamesHashes; + } + + /** md5 result used as an array index — hash-table lookup. */ + public function fetch(array $arr, string $x): mixed { + return $arr[md5($x)] ?? null; + } + + /** Concatenation feeding a non-crypto-named LHS. 
*/ + public function recoveryKeyId(): string { + return 'recoveryKey_' . substr(md5((string)time()), 0, 8); + } + + /** Cache-buster — return from a method whose name encodes intent. */ + public function getCacheBuster(string $version): string { + return substr(sha1($version), 0, 8); + } + + /** Receiver with `Method`-typed lookup verb — `cache->get`/`cache->set`. */ + public function lookup(string $uid): mixed { + return $this->cache->get(sha1($uid)); + } + + /** Cross-language non-crypto: ID hashing for DB-safe characters. */ + public function safeStorageId(string $storageId): string { + if (strlen($storageId) > 64) { + $storageId = md5($storageId); + } + return $storageId; + } +} diff --git a/tests/benchmark/corpus/python/safe/safe_canonicalise_rooted_startswith.py b/tests/benchmark/corpus/python/safe/safe_canonicalise_rooted_startswith.py new file mode 100644 index 00000000..381cc337 --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_canonicalise_rooted_startswith.py @@ -0,0 +1,19 @@ +# py-safe-canonicalise-rooted: os.path.realpath + .startswith with a +# non-literal root variable (an opaque prefix-lock). Combined with +# realpath's dotdot=No proof, is_path_traversal_safe should suppress the +# FILE_IO sink even though the canonicalised path is absolute. 
+import os +from flask import Flask, request + +UPLOAD_ROOT = "/srv/uploads" +app = Flask(__name__) + + +@app.route("/file") +def file(): + name = request.args.get("name", "") + target = os.path.realpath(os.path.join(UPLOAD_ROOT, name)) + if not target.startswith(UPLOAD_ROOT): + return "forbidden", 403 + with open(target) as f: + return f.read() diff --git a/tests/benchmark/corpus/ruby/path_traversal/path_traversal_yaml_load_file_read.rb b/tests/benchmark/corpus/ruby/path_traversal/path_traversal_yaml_load_file_read.rb new file mode 100644 index 00000000..fb4eb309 --- /dev/null +++ b/tests/benchmark/corpus/ruby/path_traversal/path_traversal_yaml_load_file_read.rb @@ -0,0 +1,8 @@ +require 'yaml' + +def load_yaml(filename) + YAML.safe_load(File.read(filename)) +end + +filename = params[:p] +load_yaml(filename) diff --git a/tests/benchmark/corpus/ruby/safe/safe_canonicalise_rooted_unless.rb b/tests/benchmark/corpus/ruby/safe/safe_canonicalise_rooted_unless.rb new file mode 100644 index 00000000..c748bf7f --- /dev/null +++ b/tests/benchmark/corpus/ruby/safe/safe_canonicalise_rooted_unless.rb @@ -0,0 +1,18 @@ +# ruby-safe-021: File.expand_path + `unless start_with?` with a non-literal +# prefix (configured root reachable through a method call). The opaque +# prefix-lock combined with `expand_path`'s dotdot=No proof is sufficient +# under PathFact::is_path_traversal_safe to suppress the FILE_IO sink. +class Config + def root + '/srv/app/uploads' + end +end + +def serve(env, config) + path = env['PATH_INFO'] + filename = File.expand_path(File.join(config.root, path)) + unless filename.start_with? 
config.root + return [403, {}, []] + end + File.read(filename) +end diff --git a/tests/benchmark/corpus/ruby/ssrf/ssrf_open_uri.rb b/tests/benchmark/corpus/ruby/ssrf/ssrf_open_uri.rb new file mode 100644 index 00000000..fd17d3cb --- /dev/null +++ b/tests/benchmark/corpus/ruby/ssrf/ssrf_open_uri.rb @@ -0,0 +1,6 @@ +require 'open-uri' + +def fetch_url(params) + url = params[:url] + OpenURI.open_uri(url) +end diff --git a/tests/benchmark/corpus/typescript/safe/safe_env_empty_fallback.ts b/tests/benchmark/corpus/typescript/safe/safe_env_empty_fallback.ts new file mode 100644 index 00000000..5e625f1f --- /dev/null +++ b/tests/benchmark/corpus/typescript/safe/safe_env_empty_fallback.ts @@ -0,0 +1,21 @@ +// Empty-string fallback on a secret-named env var is not a hardcoded +// secret. Developers commonly write `|| ""` to satisfy TypeScript's +// non-undefined string typing while leaving the actual secret to be +// supplied at runtime. The `ts.secrets.fallback_secret` / +// `js.secrets.fallback_secret` patterns must not fire here. + +const stripeApiKey: string = process.env.STRIPE_API_KEY || ""; +const sendgridKey: string = process.env.SENDGRID_API_KEY || ''; +const sessionSecret: string = process.env.SESSION_SECRET || ""; +const vapidPrivateKey: string = process.env.VAPID_PRIVATE_KEY || ""; +const calendsoEncryptionKey: string = process.env.CALENDSO_ENCRYPTION_KEY || ""; + +export function bootstrap() { + return { + stripeApiKey, + sendgridKey, + sessionSecret, + vapidPrivateKey, + calendsoEncryptionKey, + }; +} diff --git a/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/patched.rb b/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/patched.rb new file mode 100644 index 00000000..5597ef0a --- /dev/null +++ b/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/patched.rb @@ -0,0 +1,67 @@ +# Nyx CVE benchmark fixture (patched counterpart). 
+# +# CVE: CVE-2021-21288 +# Project: carrierwaveuploader/carrierwave +# License: MIT (carrierwave.gemspec: s.licenses = ["MIT"]) +# Advisory: https://github.com/advisories/GHSA-fwcm-636p-68r5 +# Patched: 012702eb3ba1663452aa025831caa304d1a665c0 +# lib/carrierwave/downloader/base.rb:7-78 +# +# Carrierwave 2.1.1 / 1.3.2 routes the URL through the `ssrf_filter` +# gem's `SsrfFilter.get` helper, which resolves the URL host and aborts +# if it lands in a private / link-local / loopback IP range — closing +# the SSRF vector. +# +# Patched-fix simplifications: +# - The upstream `download` body keeps an `if skip_ssrf_protection?(uri)` +# escape hatch (a configurable opt-out the library exposes for users +# who genuinely need to fetch from intranet hosts; default returns +# `false`). This fixture omits that branch because it is dead code in +# the default configuration AND a CVE benchmark should not score the +# library on a documented opt-out. The remaining `SsrfFilter.get` +# branch is the actual CVE fix. +# - `remote_file.rb` is omitted (same as the vulnerable counterpart). +# - The Sinatra controller is scaffold so the source statement +# (`params[:remote_image_url]`) and the safe sink site live together. +# +# Load-bearing line copied verbatim from upstream +# `lib/carrierwave/downloader/base.rb` (commit 012702eb…): +# - `response = SsrfFilter.get(uri, headers: headers) do |req|` (33) +# - `request = req` (34) and `response.uri = request.uri` (36) +# - `response.value` (37) +require 'sinatra/base' +require 'open-uri' +require 'ssrf_filter' +require 'addressable' + +class Downloader + def download(url, remote_headers = {}) + headers = remote_headers. 
+ reverse_merge('User-Agent' => "CarrierWave/2.1.1") + uri = process_uri(url.to_s) + begin + request = nil + response = SsrfFilter.get(uri, headers: headers) do |req| + request = req + end + response.uri = request.uri + response.value + rescue StandardError => e + raise "could not download file: #{e.message}" + end + response + end + + def process_uri(uri) + uri_parts = uri.split('?') + encoded_uri = Addressable::URI.parse(uri_parts.shift).normalize.to_s + query = uri_parts.any? ? "?#{uri_parts.join('?')}" : '' + URI.parse("#{encoded_uri}#{query}") + end +end + +class UploaderController < Sinatra::Base + post '/upload' do + Downloader.new.download(params[:remote_image_url]) + end +end diff --git a/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/vulnerable.rb b/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/vulnerable.rb new file mode 100644 index 00000000..2d05fb07 --- /dev/null +++ b/tests/benchmark/cve_corpus/ruby/CVE-2021-21288/vulnerable.rb @@ -0,0 +1,66 @@ +# Nyx CVE benchmark fixture. +# +# CVE: CVE-2021-21288 +# Project: carrierwaveuploader/carrierwave +# License: MIT (carrierwave.gemspec: s.licenses = ["MIT"]) +# Advisory: https://github.com/advisories/GHSA-fwcm-636p-68r5 +# Vulnerable: 09f9f27c0259d5c644caf0e93cf2582f32784018 +# lib/carrierwave/downloader/base.rb:7-49 +# +# CarrierWave's remote-file download path passed an attacker-controlled URL +# straight into `OpenURI.open_uri` after only running it through a +# normalisation helper (`process_uri`). open-uri follows redirects and +# performs no host-allowlisting, so an attacker who could control the +# `remote_image_url` setter (typical for any uploader that exposes a +# remote-URL form field) reached internal services / cloud metadata +# endpoints / file:// URIs. Fixed in 2.1.1 / 1.3.2 by routing the +# request through `SsrfFilter.get`, which validates the resolved host +# against private IP ranges before issuing the request. 
+# +# Trims: +# - `remote_file.rb` is omitted; only the `download` / `process_uri` +# pair from `base.rb` is in scope. +# - The Sinatra controller wrapping `Downloader.new.download(...)` is +# scaffold so the source statement (`params[:remote_image_url]`) and +# the sink (`OpenURI.open_uri(...)`) live in one parseable file. +# - Constants (`User-Agent`) replace the `CarrierWave::VERSION` lookup. +# - The `attr_reader`/`initialize`/`uploader` plumbing is dropped; the +# CVE flows through `download(url, ...)` regardless of how the +# instance was constructed. +# +# Load-bearing lines copied verbatim from upstream +# `lib/carrierwave/downloader/base.rb` (commit 09f9f27c…): +# - `def download(url, remote_headers = {})` signature (line 22) +# - `headers = remote_headers.reverse_merge('User-Agent' => ...)` (23-24) +# - `file = OpenURI.open_uri(process_uri(url.to_s), headers)` — the CVE (26) +# - `def process_uri(uri)` body (38-49) including the +# `Addressable::URI.parse(uri_parts.shift).normalize.to_s` step +require 'sinatra/base' +require 'open-uri' +require 'addressable' + +class Downloader + def download(url, remote_headers = {}) + headers = remote_headers. + reverse_merge('User-Agent' => "CarrierWave/2.1.0") + begin + file = OpenURI.open_uri(process_uri(url.to_s), headers) + rescue StandardError => e + raise "could not download file: #{e.message}" + end + file + end + + def process_uri(uri) + uri_parts = uri.split('?') + encoded_uri = Addressable::URI.parse(uri_parts.shift).normalize.to_s + query = uri_parts.any? ? 
"?#{uri_parts.join('?')}" : '' + URI.parse("#{encoded_uri}#{query}") + end +end + +class UploaderController < Sinatra::Base + post '/upload' do + Downloader.new.download(params[:remote_image_url]) + end +end diff --git a/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/patched.rb b/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/patched.rb new file mode 100644 index 00000000..3782a464 --- /dev/null +++ b/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/patched.rb @@ -0,0 +1,75 @@ +# Nyx CVE benchmark fixture (patched counterpart). +# +# CVE: CVE-2023-38337 +# Project: rswag/rswag (rswag-api gem) +# License: MIT (per rswag.gemspec) +# Advisory: https://github.com/advisories/GHSA-vc79-65pr-q82v +# Patched: tag 2.10.1, rswag-api/lib/rswag/api/middleware.rb:1-72 +# https://github.com/rswag/rswag/blob/2.10.1/rswag-api/lib/rswag/api/middleware.rb +# +# rswag 2.10.1 routes the path through `File.expand_path(File.join(root, +# path))` (canonicalising it) and aborts with `return @app.call(env)` +# when the resolved filename does not start with the configured swagger +# root. Any `..`-style PATH_INFO either resolves outside the root and +# is rejected, or stays inside and is therefore safe to read. +# +# Trims (same as vulnerable.rb): +# - The Rswag::Api::Config object (`@config`) is a stub method +# `resolve_swagger_root` returning a hardcoded path string. +# - `Rack::Mime` content-type lookup at upstream L25-26 is dropped. +# - `unload_swagger`/`load_yaml`/`load_json` retained verbatim. +# +# Load-bearing lines copied verbatim from upstream +# `rswag-api/lib/rswag/api/middleware.rb` (lines 16-22 in 2.10.1): +# - `filename = File.expand_path(File.join(@config.resolve_swagger_root(env), path))` (16) +# - `unless filename.start_with? 
@config.resolve_swagger_root(env)` (17) +# - ` return @app.call(env)` (18) — the rejection branch +# - `end` (19) +require 'json' +require 'yaml' + +class Config + def resolve_swagger_root(env) + '/srv/rswag-root' + end +end + +class Middleware + def initialize(app, config) + @app = app + @config = config + end + + def call(env) + path = env['PATH_INFO'] + filename = File.expand_path(File.join(@config.resolve_swagger_root(env), path)) + unless filename.start_with? @config.resolve_swagger_root(env) + return @app.call(env) + end + + if env['REQUEST_METHOD'] == 'GET' && File.file?(filename) + swagger = parse_file(filename) + return ['200', {}, [swagger.to_s]] + end + + return @app.call(env) + end + + private + + def parse_file(filename) + if /\.ya?ml$/ === filename + load_yaml(filename) + else + load_json(filename) + end + end + + def load_yaml(filename) + YAML.safe_load(File.read(filename)) + end + + def load_json(filename) + JSON.parse(File.read(filename)) + end +end diff --git a/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/vulnerable.rb b/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/vulnerable.rb new file mode 100644 index 00000000..34c8027f --- /dev/null +++ b/tests/benchmark/cve_corpus/ruby/CVE-2023-38337/vulnerable.rb @@ -0,0 +1,82 @@ +# Nyx CVE benchmark fixture. +# +# CVE: CVE-2023-38337 +# Project: rswag/rswag (rswag-api gem) +# License: MIT (per rswag.gemspec) +# Advisory: https://github.com/advisories/GHSA-vc79-65pr-q82v +# Vulnerable: tag 2.9.0, rswag-api/lib/rswag/api/middleware.rb:1-67 +# https://github.com/rswag/rswag/blob/2.9.0/rswag-api/lib/rswag/api/middleware.rb +# +# rswag's Rack middleware concatenated the request `PATH_INFO` directly +# into the Swagger root path before reading it, with no validation that +# the resolved file stayed inside the configured root. A request like +# `GET /../config/secrets.yml` therefore served arbitrary YAML / JSON +# files (with `safe_load`/`JSON.parse` content disclosure). 
Fixed in +# 2.10.1 by `File.expand_path` + `start_with?` rooted-path check. +# +# Trims: +# - The Rswag::Api::Config object (`@config`) is left as a stub method +# `resolve_swagger_root` that returns a hardcoded path; the only +# load-bearing behaviour is that the root is a constant string. +# - `Rack::Mime` content-type lookup at upstream L25-26 is dropped; +# not on the source-to-sink path. +# - `unload_swagger`/`load_yaml`/`load_json` helpers retained verbatim +# as the indirection is exactly where engines tend to lose flows +# (per fixture-design rule). +# +# Load-bearing lines copied verbatim from upstream +# `rswag-api/lib/rswag/api/middleware.rb` (lines 14-49 in 2.9.0): +# - `def call(env)` Rack middleware entry (line 14) +# - `path = env['PATH_INFO']` source statement (line 15) +# - `filename = "#{@config.resolve_swagger_root(env)}/#{path}"` — the +# unsanitised join (line 16) +# - `if env['REQUEST_METHOD'] == 'GET' && File.file?(filename)` (line 18) +# - `swagger = parse_file(filename)` (line 19) — flow continues +# - `def parse_file(filename)` body (lines 41-47) verbatim +# - `def load_yaml(filename); YAML.safe_load(File.read(filename)); end` (line 49-51) +# - `def load_json(filename); JSON.parse(File.read(filename)); end` (line 53-55) +require 'json' +require 'yaml' + +class Config + def resolve_swagger_root(env) + '/srv/rswag-root' + end +end + +class Middleware + def initialize(app, config) + @app = app + @config = config + end + + def call(env) + path = env['PATH_INFO'] + filename = "#{@config.resolve_swagger_root(env)}/#{path}" + + if env['REQUEST_METHOD'] == 'GET' && File.file?(filename) + swagger = parse_file(filename) + return ['200', {}, [swagger.to_s]] + end + + return @app.call(env) + end + + private + + def parse_file(filename) + if /\.ya?ml$/ === filename + load_yaml(filename) + else + load_json(filename) + end + end + + def load_yaml(filename) + YAML.safe_load(File.read(filename)) + end + + def load_json(filename) + 
JSON.parse(File.read(filename)) + end +end diff --git a/tests/benchmark/ground_truth.json b/tests/benchmark/ground_truth.json index 5f0cbc6f..c5a29258 100644 --- a/tests/benchmark/ground_truth.json +++ b/tests/benchmark/ground_truth.json @@ -3,7 +3,7 @@ "metadata": { "description": "Nyx benchmark ground truth", "created": "2026-03-20", - "corpus_size": 477 + "corpus_size": 492 }, "cases": [ { @@ -2902,7 +2902,7 @@ "user-input" ], "disabled": false, - "notes": "Vulnerable counterpart to php-safe-018: included variable is built from concatenation of $_GET inside the function — not a parameter pass-through. Pattern must still fire." + "notes": "Vulnerable counterpart to php-safe-018: included variable is built from concatenation of $_GET inside the function \u2014 not a parameter pass-through. Pattern must still fire." }, { "case_id": "php-deser-001", @@ -3432,6 +3432,44 @@ "disabled": false, "notes": "SSRF via HTTParty.get() with user-controlled URL" }, + { + "case_id": "ruby-ssrf-003", + "file": "ruby/ssrf/ssrf_open_uri.rb", + "language": "ruby", + "is_vulnerable": true, + "vuln_class": "ssrf", + "cwe": "CWE-918", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [ + "cfg-unguarded-sink" + ], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 5, + 5 + ] + ], + "expected_source_lines": [ + [ + 4, + 4 + ] + ], + "tags": [ + "open-uri", + "ssrf" + ], + "disabled": false, + "notes": "SSRF via OpenURI.open_uri() with user-controlled URL — canonical low-level URI fetcher; CarrierWave / Paperclip / similar gems route SSRF-vulnerable downloads through it" + }, { "case_id": "js-ssrf-safe-001", "file": "javascript/ssrf/safe_ssrf_hardcoded.js", @@ -4052,6 +4090,42 @@ "disabled": false, "notes": "Path traversal via send_file() with user-controlled path" }, + { + "case_id": 
"ruby-path_traversal-002", + "file": "ruby/path_traversal/path_traversal_yaml_load_file_read.rb", + "language": "ruby", + "is_vulnerable": true, + "vuln_class": "path_traversal", + "cwe": "CWE-22", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 4, + 4 + ] + ], + "expected_source_lines": [ + [ + 7, + 7 + ] + ], + "tags": [ + "wrapper-sink", + "path-traversal" + ], + "disabled": false, + "notes": "Path traversal via cross-fn helper that wraps File.read inside YAML.safe_load (the `outer(File.read(x))` shape used in real Ruby helpers — rswag CVE-2023-38337 chain). Regression guard for the inner-call fallback fix in src/cfg/mod.rs::push_node so a wrapper around an FILE_IO sink continues to surface in summary extraction." + }, { "case_id": "ruby-sqli-001", "file": "ruby/sqli/sqli_find_by_sql.rb", @@ -6430,6 +6504,67 @@ "disabled": false, "notes": "Builder chain with hardcoded host literal; terminal connect() must NOT fire (no taint on chain)." 
}, + { + "case_id": "cpp-safe-019", + "file": "cpp/safe/safe_reinterpret_cast_byte_pointer.cpp", + "language": "cpp", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-704", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "cpp.memory.reinterpret_cast" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "safe", + "reinterpret-cast", + "byte-pointer", + "real-repo-precision-2026-05-01" + ], + "disabled": false, + "notes": "Layer E: reinterpret_cast targets that are well-defined-by-aliasing-rules (byte pointer / void* / uintptr_t / sockaddr family) must NOT fire cpp.memory.reinterpret_cast. Distilled from bitcoin leveldb/serialization/socket shapes." + }, + { + "case_id": "cpp-buf-003", + "file": "cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp", + "language": "cpp", + "is_vulnerable": true, + "vuln_class": "buffer_overflow", + "cwe": "CWE-704", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "cpp.memory.reinterpret_cast" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 14, + 14 + ] + ], + "expected_source_lines": null, + "tags": [ + "vuln", + "reinterpret-cast", + "strict-aliasing", + "real-repo-precision-2026-05-01" + ], + "disabled": false, + "notes": "reinterpret_cast(buf) genuinely violates [basic.lval]/11 strict aliasing \u2014 the pattern must continue to fire because the user-defined struct target is not in the byte-pointer / void* / sockaddr safe set." 
+ }, { "case_id": "rs-cmdi-001", "file": "rust/cmdi/cmdi_command.rs", @@ -10125,7 +10260,137 @@ "negative" ], "disabled": false, - "notes": "CVE-2020-8130 patched counterpart: open(fn, ...) replaced with File.open(fn, ...); File.open never pipes a leading |. Fixture additionally hardcodes the filename + pattern (patched-fix simplification) so the regression guard is class-clean — see vulnerable.rb header for rationale." + "notes": "CVE-2020-8130 patched counterpart: open(fn, ...) replaced with File.open(fn, ...); File.open never pipes a leading |. Fixture additionally hardcodes the filename + pattern (patched-fix simplification) so the regression guard is class-clean \u2014 see vulnerable.rb header for rationale." + }, + { + "case_id": "cve-rb-2021-21288-vulnerable", + "file": "cve_corpus/ruby/CVE-2021-21288/vulnerable.rb", + "language": "ruby", + "is_vulnerable": true, + "vuln_class": "ssrf", + "cwe": "CWE-918", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 64, + 64 + ] + ], + "expected_source_lines": [ + [ + 64, + 64 + ] + ], + "tags": [ + "cve", + "carrierwave", + "ssrf", + "open-uri" + ], + "disabled": false, + "notes": "CVE-2021-21288: CarrierWave Downloader::Base#download passed an attacker-controlled URL straight into OpenURI.open_uri (only host-normalised via process_uri). open-uri follows redirects without an allowlist, so any uploader exposing a remote_url field reached internal services / cloud metadata / file:// URIs. Fixed in 2.1.1 / 1.3.2 by routing through SsrfFilter.get. 
MIT" + }, + { + "case_id": "cve-rb-2021-21288-patched", + "file": "cve_corpus/ruby/CVE-2021-21288/patched.rb", + "language": "ruby", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "carrierwave", + "patched", + "negative" + ], + "disabled": false, + "notes": "CVE-2021-21288 patched counterpart: OpenURI.open_uri replaced with SsrfFilter.get(uri, ...) which validates the resolved host against private IP ranges before issuing the request. The upstream skip_ssrf_protection? escape hatch (default returns false) is omitted as a patched-fix simplification." + }, + { + "case_id": "cve-rb-2023-38337-vulnerable", + "file": "cve_corpus/ruby/CVE-2023-38337/vulnerable.rb", + "language": "ruby", + "is_vulnerable": true, + "vuln_class": "path_traversal", + "cwe": "CWE-22", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 58, + 58 + ] + ], + "expected_source_lines": [ + [ + 54, + 54 + ] + ], + "tags": [ + "cve", + "rswag", + "path_traversal", + "rack-middleware" + ], + "notes": "CVE-2023-38337: rswag-api Rack middleware concatenated env['PATH_INFO'] into the swagger root path with no validation; GET /../config/secrets.yml served arbitrary YAML/JSON files. Fixed in 2.10.1 by File.expand_path + start_with? rooted-path check. 
MIT" + }, + { + "case_id": "cve-rb-2023-38337-patched", + "file": "cve_corpus/ruby/CVE-2023-38337/patched.rb", + "language": "ruby", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "rswag", + "patched", + "negative" + ], + "notes": "CVE-2023-38337 patched counterpart: filename = File.expand_path(File.join(root, path)); unless filename.start_with?(root); return reject; end. Stays clean once Nyx recognises the canonicalize+rooted-path pattern." }, { "case_id": "cve-java-2015-7501-vulnerable", @@ -10301,7 +10566,7 @@ "fediverse" ], "disabled": false, - "notes": "CVE-2023-3188: Owncast 0.0.x webfinger SSRF — `account` parsed off the @-delimited handle reaches `http.DefaultClient.Get(requestURL.String())` with no host validation. Engine detects via cross-function taint flow (handler → GetWebfingerLinks param 0 → http.DefaultClient.Get SSRF sink). MIT." + "notes": "CVE-2023-3188: Owncast 0.0.x webfinger SSRF \u2014 `account` parsed off the @-delimited handle reaches `http.DefaultClient.Get(requestURL.String())` with no host validation. Engine detects via cross-function taint flow (handler \u2192 GetWebfingerLinks param 0 \u2192 http.DefaultClient.Get SSRF sink). MIT." }, { "case_id": "cve-go-2023-3188-patched", @@ -11794,6 +12059,35 @@ "disabled": false, "notes": "Python equivalent of rs-safe-014: direct-return sanitiser with `\"..\" in s` / `s.startswith(...)` rejection chain returning empty string." 
}, + { + "case_id": "py-safe-022", + "file": "python/safe/safe_canonicalise_rooted_startswith.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-22", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "pathfact-cross-language", + "path-sanitizer", + "canonicalise-and-rooted", + "opaque-prefix-lock", + "negated-form" + ], + "disabled": false, + "notes": "Pins `os.path.realpath` + `if not target.startswith()` shape (rswag CVE-2023-38337 sibling). Combined dotdot=No (from realpath) + opaque PrefixLock (from non-literal startswith guard) suppresses FILE_IO under is_path_traversal_safe with the negated-form polarity flip." + }, { "case_id": "py-safe-016", "file": "python/safe/safe_cross_function_dotdot.py", @@ -11848,6 +12142,62 @@ "disabled": false, "notes": "JS direct-return sanitiser. Standalone `nyx scan --index off` is clean, but the benchmark harness (single-thread + state/auth analysis enabled) reproduces a FP \u2014 diverges from production scan path. Disabled until benchmark/binary parity is re-established." 
}, + { + "case_id": "js-safe-021", + "file": "javascript/safe/safe_canonicalise_rooted_startsWith.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-22", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "pathfact-cross-language", + "path-sanitizer", + "canonicalise-and-rooted", + "opaque-prefix-lock", + "negated-form" + ], + "disabled": false, + "notes": "Pins `path.resolve` + `if (!target.startsWith())` shape (rswag CVE-2023-38337 sibling). Combined dotdot=No (from path.resolve) + opaque PrefixLock (from non-literal startsWith guard) suppresses FILE_IO under is_path_traversal_safe with the negated-form polarity flip." + }, + { + "case_id": "js-safe-022", + "file": "javascript/safe/safe_env_empty_fallback.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "js.secrets.fallback_secret" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "secrets", + "empty-fallback", + "real-repo-precision-2026-05-01" + ], + "disabled": false, + "notes": "Empty-string fallback (`process.env.X || \"\"`) is not a hardcoded secret. JS counterpart of ts-safe-020. Engine fix: pattern-level regex (#match? @fallback \"[^\\\"']\") in src/patterns/javascript.rs." 
+ }, { "case_id": "ts-safe-014", "file": "typescript/safe/safe_direct_path_sanitizer.ts", @@ -12037,6 +12387,34 @@ "disabled": false, "notes": "Ruby equivalent of rs-safe-014: direct-return sanitiser with `include?` / `start_with?` rejection." }, + { + "case_id": "rb-safe-021", + "file": "ruby/safe/safe_canonicalise_rooted_unless.rb", + "language": "ruby", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-22", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "pathfact-cross-language", + "path-sanitizer", + "canonicalise-and-rooted", + "opaque-prefix-lock" + ], + "disabled": false, + "notes": "Pins `File.expand_path` + `unless filename.start_with? ` shape (rswag CVE-2023-38337 patched). Combined dotdot=No (from expand_path) + opaque PrefixLock (from non-literal start_with? guard) suppresses FILE_IO under is_path_traversal_safe." + }, { "case_id": "rb-safe-015", "file": "ruby/safe/safe_nil_path_sanitizer.rb", @@ -12198,7 +12576,7 @@ "allowed_classes" ], "disabled": false, - "notes": "PHP 7+ structural mitigation against object injection — unserialize($x, ['allowed_classes' => ...]) with false / array literal / class constant must not fire php.deser.unserialize. Distilled from nextcloud lib/private/Profiler/FileProfilerStorage.php and apps/dav CustomPropertiesBackend." + "notes": "PHP 7+ structural mitigation against object injection \u2014 unserialize($x, ['allowed_classes' => ...]) with false / array literal / class constant must not fire php.deser.unserialize. Distilled from nextcloud lib/private/Profiler/FileProfilerStorage.php and apps/dav CustomPropertiesBackend." 
}, { "case_id": "php-safe-018", @@ -12229,6 +12607,64 @@ "disabled": false, "notes": "Composer-style autoloader: closure / method takes a file path parameter and `include`s it. Pattern rule is heuristic without taint and over-fires; suppress when included variable is a formal parameter of the immediately enclosing function with no reassignment. Distilled from nextcloud composer/ClassLoader.php (32 copies), Router.php, Installer.php, Template/Base.php." }, + { + "case_id": "php-safe-019", + "file": "php/safe/safe_md5_sha1_non_crypto_use.php", + "language": "php", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-327", + "provenance": "real-repo-precision-2026-05-01", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "php.crypto.md5", + "php.crypto.sha1" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "real-repo-precision-2026-05-01", + "weak-hash", + "non-crypto-use", + "etag", + "cache-key" + ], + "disabled": false, + "notes": "md5() / sha1() pervasively used for non-cryptographic purposes — ETag generation, cache-key / array-index hashing, dedup fingerprints, content-addressed identifier derivation. Layer F suppression recognises the consuming context (variable LHS, member-access LHS, subscript LHS, array element key, lookup-verb argument, return-from-method, hash-as-index) and refuses to fire. Distilled from nextcloud apps/dav CalDavBackend, contactsinteraction Card, Files/Cache, theming Util / CommonThemeTrait, encryption KeyManager; phpmyadmin src/Controllers/Database/StructureController, Controllers/Table/{RelationController, SearchController, ZoomSearchController}, src/Display/Results, Database/MultiTableQuery, Favorites/RecentFavoriteTables." 
+ }, + { + "case_id": "php-crypto-001", + "file": "php/crypto/crypto_md5_password_hash.php", + "language": "php", + "is_vulnerable": true, + "vuln_class": "crypto", + "cwe": "CWE-327", + "provenance": "real-repo-precision-2026-05-01", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "php.crypto.md5", + "php.crypto.sha1" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": null, + "expected_source_lines": null, + "tags": [ + "real-repo-precision-2026-05-01", + "weak-hash", + "credential-storage" + ], + "disabled": false, + "notes": "Vulnerable counterpart to php-safe-019: md5 / sha1 used to store / sign / digest credentials, tokens, signatures. Consumer names contain crypto-keyword substrings (`password`, `token`, `signature`, `pw_hash`, `digest`) so Layer F suppression refuses to fire." + }, { "case_id": "c-safe-014", "file": "c/safe/safe_direct_path_sanitizer.c", @@ -12336,7 +12772,7 @@ "real-repo-precision-2026-04-28" ], "disabled": false, - "notes": "Postgres `pg_prewarm/autoprewarm.c` + `formatting.c::DCH_a_m` shape — strcpy/strcat with string-literal or ternary-of-literals source. Layer D suppression (src/ast.rs::is_c_buffer_call_literal_safe)." + "notes": "Postgres `pg_prewarm/autoprewarm.c` + `formatting.c::DCH_a_m` shape \u2014 strcpy/strcat with string-literal or ternary-of-literals source. Layer D suppression (src/ast.rs::is_c_buffer_call_literal_safe)." }, { "case_id": "c-safe-018", @@ -12362,7 +12798,7 @@ "real-repo-precision-2026-04-28" ], "disabled": false, - "notes": "Postgres `datetime.c::EncodeDateTime` shape — sprintf with literal format string containing only width/precision-bounded specifiers. Layer D suppression." + "notes": "Postgres `datetime.c::EncodeDateTime` shape \u2014 sprintf with literal format string containing only width/precision-bounded specifiers. Layer D suppression." 
}, { "case_id": "cpp-safe-014", @@ -12661,7 +13097,7 @@ "negative" ], "disabled": false, - "notes": "Indirect-validator branch narrowing — `const err = validateUrlSsrf(target); if (err) throw …;` should suppress the downstream axios.get sink. Pinned by tests/lib::indirect_validator_narrowing_marks_arg_validated." + "notes": "Indirect-validator branch narrowing \u2014 `const err = validateUrlSsrf(target); if (err) throw \u2026;` should suppress the downstream axios.get sink. Pinned by tests/lib::indirect_validator_narrowing_marks_arg_validated." }, { "case_id": "ts-safe-019", @@ -12688,7 +13124,34 @@ "negative" ], "disabled": false, - "notes": "Helper-summary all_validated propagation — when a helper's body validates the param via `validateXxx`, the per-param probe's all_validated event should be skipped during summary extraction so callers don't refire the cross-fn SSRF. Pinned by tests/lib::helper_with_validator_does_not_propagate_to_caller_via_summary." + "notes": "Helper-summary all_validated propagation \u2014 when a helper's body validates the param via `validateXxx`, the per-param probe's all_validated event should be skipped during summary extraction so callers don't refire the cross-fn SSRF. Pinned by tests/lib::helper_with_validator_does_not_propagate_to_caller_via_summary." 
+ }, + { + "case_id": "ts-safe-020", + "file": "typescript/safe/safe_env_empty_fallback.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "ts.secrets.fallback_secret" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "secrets", + "empty-fallback", + "real-repo-precision-2026-05-01" + ], + "disabled": false, + "notes": "Empty-string fallback (`process.env.X || \"\"`) is not a hardcoded secret. Distilled from /Users/elipeter/oss/cal.com/apps/api/v2/src/modules/stripe/utils/newStripeInstance.ts and ~30 sibling cal.com calendar/stripe/sendgrid integration files. Engine fix: pattern-level regex (#match? @fallback \"[^\\\"']\") in src/patterns/typescript.rs." }, { "case_id": "py-auth-decorator-001", @@ -13377,7 +13840,7 @@ "real-repo-precision-2026-04-29" ], "disabled": false, - "notes": "Panic guard: CodeMirror Gherkin tokenizer ships a long localised regex inside a boolean sub-condition. Naive byte-slice truncation in CFG condition-text (`t[..MAX_CONDITION_TEXT_LEN]`) panicked when byte 256 landed inside a multi-byte UTF-8 character (Gurmukhi `ਖ`). Engine fix: src/utils/snippet.rs::truncate_at_char_boundary applied at three CFG sites + two symex display sites (gogs public/plugins/codemirror-5.17.0/mode/gherkin/gherkin.js:107)." + "notes": "Panic guard: CodeMirror Gherkin tokenizer ships a long localised regex inside a boolean sub-condition. Naive byte-slice truncation in CFG condition-text (`t[..MAX_CONDITION_TEXT_LEN]`) panicked when byte 256 landed inside a multi-byte UTF-8 character (Gurmukhi `\u0a16`). 
Engine fix: src/utils/snippet.rs::truncate_at_char_boundary applied at three CFG sites + two symex display sites (gogs public/plugins/codemirror-5.17.0/mode/gherkin/gherkin.js:107)." }, { "case_id": "go-safe-realrepo-001", @@ -13963,7 +14426,7 @@ "real-repo-precision-2026-04-29" ], "disabled": false, - "notes": "Excalidraw `Map` / `Set` / `WeakMap` / `WeakSet` / `Array` / `T[]` / `readonly T[]` receivers — direct annotation, same-file `type X = Map<...>` aliasing, and inline `new Map()` constructor. SSA `constructor_type` JS/TS arm + `cfg::params::ts_type_to_local_collection` + `cfg::dto::collect_type_alias_local_collections` route every shape through `TypeKind::LocalCollection` → `SinkClass::InMemoryLocal`, suppressing missing-ownership." + "notes": "Excalidraw `Map` / `Set` / `WeakMap` / `WeakSet` / `Array` / `T[]` / `readonly T[]` receivers \u2014 direct annotation, same-file `type X = Map<...>` aliasing, and inline `new Map()` constructor. SSA `constructor_type` JS/TS arm + `cfg::params::ts_type_to_local_collection` + `cfg::dto::collect_type_alias_local_collections` route every shape through `TypeKind::LocalCollection` \u2192 `SinkClass::InMemoryLocal`, suppressing missing-ownership." }, { "case_id": "ts-auth-realrepo-007", @@ -13989,7 +14452,7 @@ "real-repo-precision-2026-04-29" ], "disabled": false, - "notes": "Vulnerable counterpart to ts-auth-realrepo-006: `prisma.user.findUnique` / `prisma.user.update` with attacker-supplied id and no preceding auth check. Receiver is NOT a tracked Map / Set / Array, so the LocalCollection fix must NOT suppress this — proves the type-aware suppression doesn't blanket-cover real DB clients that share method names (`get`, `find`, `update`) with JS containers." + "notes": "Vulnerable counterpart to ts-auth-realrepo-006: `prisma.user.findUnique` / `prisma.user.update` with attacker-supplied id and no preceding auth check. 
Receiver is NOT a tracked Map / Set / Array, so the LocalCollection fix must NOT suppress this \u2014 proves the type-aware suppression doesn't blanket-cover real DB clients that share method names (`get`, `find`, `update`) with JS containers." }, { "case_id": "rs-auth-realrepo-009", @@ -14493,10 +14956,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [6, 9] + [ + 6, + 9 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14524,10 +14993,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [6, 9] + [ + 6, + 9 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14556,10 +15031,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [8, 8] + [ + 8, + 8 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14587,10 +15068,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [6, 9] + [ + 6, + 9 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14618,10 +15105,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [6, 9] + [ + 6, + 9 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14649,10 +15142,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [14, 14] + [ + 14, + 14 + ] ], "expected_source_lines": [ - [12, 13] + [ + 12, + 13 + ] ], "tags": [ "data_exfil", @@ -14681,10 +15180,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [12, 15] + [ + 12, + 15 + ] ], "expected_source_lines": [ - [11, 11] + [ + 11, + 11 + ] ], "tags": [ "data_exfil", @@ -14713,10 +15218,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [16, 20] + [ + 16, + 
20 + ] ], "expected_source_lines": [ - [13, 14] + [ + 13, + 14 + ] ], "tags": [ "data_exfil", @@ -14744,10 +15255,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [15, 21] + [ + 15, + 21 + ] ], "expected_source_lines": [ - [13, 13] + [ + 13, + 13 + ] ], "tags": [ "data_exfil", @@ -14775,10 +15292,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [12, 12] + [ + 12, + 12 + ] ], "expected_source_lines": [ - [10, 11] + [ + 10, + 11 + ] ], "tags": [ "data_exfil", @@ -14806,10 +15329,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [5, 8] + [ + 5, + 8 + ] ], "expected_source_lines": [ - [5, 5] + [ + 5, + 5 + ] ], "tags": [ "data_exfil", @@ -14838,10 +15367,16 @@ "expected_severity": "HIGH", "expected_category": "Security", "expected_sink_lines": [ - [9, 9] + [ + 9, + 9 + ] ], "expected_source_lines": [ - [7, 7] + [ + 7, + 7 + ] ], "tags": [ "data_exfil", @@ -14869,10 +15404,16 @@ "expected_severity": "MEDIUM", "expected_category": "Security", "expected_sink_lines": [ - [14, 14] + [ + 14, + 14 + ] ], "expected_source_lines": [ - [9, 9] + [ + 9, + 9 + ] ], "tags": [ "data_exfil", @@ -15046,4 +15587,4 @@ "notes": "fgets stdin user input echoed into curl_easy_setopt CURLOPT_POSTFIELDS at fixed URL; sensitivity-gate suppresses Plain-tier sources." 
} ] -} +} \ No newline at end of file diff --git a/tests/benchmark/results/latest.json b/tests/benchmark/results/latest.json index 136163f1..46022178 100644 --- a/tests/benchmark/results/latest.json +++ b/tests/benchmark/results/latest.json @@ -1,7 +1,7 @@ { "benchmark_version": "1.0", - "timestamp": "2026-04-30T23:44:32Z", - "scanner_version": "0.5.0", + "timestamp": "2026-05-02T07:03:06Z", + "scanner_version": "0.6.0", "scanner_config": { "analysis_mode": "Full", "taint_enabled": true, @@ -9,9 +9,9 @@ "state_analysis_enabled": true, "worker_threads": 1 }, - "ground_truth_hash": "sha256:228d1577d9560cfa08521e783ec513509363470455743a43a4102df713af1849", - "corpus_size": 477, - "cases_run": 476, + "ground_truth_hash": "sha256:ba8f5f6e20ce478b6032b1df98e5dc57a7b7a8ced8f1d3294dc811034bc6fc3c", + "corpus_size": 492, + "cases_run": 491, "cases_skipped": 1, "outcomes": [ { @@ -552,6 +552,25 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "cpp-buf-003", + "file": "cpp/buffer_overflow/buffer_reinterpret_cast_struct_alias.cpp", + "language": "cpp", + "vuln_class": "buffer_overflow", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "cpp.memory.reinterpret_cast" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "cpp.memory.reinterpret_cast" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "cpp-cmdi-001", "file": "cpp/cmdi/cmdi_system.cpp", @@ -968,6 +987,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "cpp-safe-019", + "file": "cpp/safe/safe_reinterpret_cast_byte_pointer.cpp", + "language": "cpp", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + 
"non_security_finding_count": 0 + }, { "case_id": "cpp-ssrf-001", "file": "cpp/ssrf/ssrf_curl.cpp", @@ -1765,6 +1799,74 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "cve-rb-2021-21288-patched", + "file": "cve_corpus/ruby/CVE-2021-21288/patched.rb", + "language": "ruby", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-rb-2021-21288-vulnerable", + "file": "cve_corpus/ruby/CVE-2021-21288/vulnerable.rb", + "language": "ruby", + "vuln_class": "ssrf", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 64:29)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 64:29)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-rb-2023-38337-patched", + "file": "cve_corpus/ruby/CVE-2023-38337/patched.rb", + "language": "ruby", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-rb-2023-38337-vulnerable", + "file": "cve_corpus/ruby/CVE-2023-38337/vulnerable.rb", + "language": "ruby", + "vuln_class": "path_traversal", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 54:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 54:5)" + ], + 
"security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "cve-rs-2018-20997-patched", "file": "cve_corpus/rust/CVE-2018-20997/patched.rs", @@ -2432,21 +2534,13 @@ "language": "go", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "go.cmdi.exec_command", - "state-unauthed-access", - "taint-unsanitised-flow (source 17:31)" - ], - "all_finding_ids": [ - "go.cmdi.exec_command", - "state-unauthed-access", - "taint-unsanitised-flow (source 17:31)" - ], - "security_finding_count": 3, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3112,17 +3206,13 @@ "language": "java", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3206,17 +3296,13 @@ "language": "java", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3436,14 +3522,14 @@ "vuln_class": "ssrf", "is_vulnerable": true, "outcome_file_level": "TP", - 
"outcome_rule_level": "FN", - "outcome_location_level": "FN", - "matched_rule_ids": [], - "unexpected_rule_ids": [ - "taint-data-exfiltration (source 7:22)" + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 7:22)" ], + "unexpected_rule_ids": [], "all_finding_ids": [ - "taint-data-exfiltration (source 7:22)" + "taint-unsanitised-flow (source 7:22)" ], "security_finding_count": 1, "non_security_finding_count": 0 @@ -3473,17 +3559,13 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3734,17 +3816,13 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3768,17 +3846,13 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + 
"all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3862,17 +3936,13 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "FP", - "outcome_rule_level": "FP", + "outcome_file_level": "TN", + "outcome_rule_level": "TN", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [ - "cfg-unguarded-sink" - ], - "all_finding_ids": [ - "cfg-unguarded-sink" - ], - "security_finding_count": 1, + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, "non_security_finding_count": 0 }, { @@ -3920,6 +3990,36 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "js-safe-021", + "file": "javascript/safe/safe_canonicalise_rooted_startsWith.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "js-safe-022", + "file": "javascript/safe/safe_env_empty_fallback.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "js-safe-data_exfil-001", "file": "javascript/safe/safe_data_exfil_sanitizer_wrap.js", @@ -4322,6 +4422,33 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "php-crypto-001", + "file": "php/crypto/crypto_md5_password_hash.php", + "language": "php", + "vuln_class": "crypto", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + 
"matched_rule_ids": [ + "php.crypto.md5", + "php.crypto.sha1", + "php.crypto.sha1", + "php.crypto.md5", + "php.crypto.sha1" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "php.crypto.md5", + "php.crypto.sha1", + "php.crypto.sha1", + "php.crypto.md5", + "php.crypto.sha1" + ], + "security_finding_count": 5, + "non_security_finding_count": 0 + }, { "case_id": "php-deser-001", "file": "php/deser/deser_unserialize.php", @@ -4688,6 +4815,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "php-safe-019", + "file": "php/safe/safe_md5_sha1_non_crypto_use.php", + "language": "php", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "php-safe-filter-001", "file": "php/safe/safe_filter_input.php", @@ -5484,6 +5626,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "py-safe-022", + "file": "python/safe/safe_canonicalise_rooted_startswith.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "py-safe-data_exfil-001", "file": "python/safe/safe_data_exfil_user_input_echo.py", @@ -5794,6 +5951,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "rb-safe-021", + "file": "ruby/safe/safe_canonicalise_rooted_unless.rb", + "language": "ruby", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + 
"all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "rb-safe-data_exfil-001", "file": "ruby/safe/safe_data_exfil_user_input_echo.rb", @@ -7042,15 +7214,15 @@ "vuln_class": "ssrf", "is_vulnerable": true, "outcome_file_level": "TP", - "outcome_rule_level": "FN", - "outcome_location_level": "FN", - "matched_rule_ids": [], - "unexpected_rule_ids": [ - "taint-data-exfiltration (source 4:15)" + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 4:15)" ], + "unexpected_rule_ids": [], "all_finding_ids": [ "rs.quality.unwrap", - "taint-data-exfiltration (source 4:15)" + "taint-unsanitised-flow (source 4:15)" ], "security_finding_count": 1, "non_security_finding_count": 1 @@ -7251,6 +7423,27 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "ruby-path_traversal-002", + "file": "ruby/path_traversal/path_traversal_yaml_load_file_read.rb", + "language": "ruby", + "vuln_class": "path_traversal", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 3:1)", + "taint-unsanitised-flow (source 7:1)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 3:1)", + "taint-unsanitised-flow (source 7:1)" + ], + "security_finding_count": 2, + "non_security_finding_count": 0 + }, { "case_id": "ruby-safe-001", "file": "ruby/safe/safe_constant.rb", @@ -7545,6 +7738,25 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "ruby-ssrf-003", + "file": "ruby/ssrf/ssrf_open_uri.rb", + "language": "ruby", + "vuln_class": "ssrf", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-unsanitised-flow (source 4:3)" + ], + "unexpected_rule_ids": [], + 
"all_finding_ids": [ + "taint-unsanitised-flow (source 4:3)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "ruby-ssrf-safe-001", "file": "ruby/ssrf/safe_ssrf_hardcoded.rb", @@ -8254,6 +8466,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "ts-safe-020", + "file": "typescript/safe/safe_env_empty_fallback.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "ts-secrets-001", "file": "typescript/secrets/fallback_secret.ts", @@ -8558,22 +8785,22 @@ } ], "aggregate_file_level": { - "tp": 238, - "fp": 7, + "tp": 244, + "fp": 0, "fn_": 0, - "tn": 231, - "precision": 0.9714285714285714, + "tn": 247, + "precision": 1.0, "recall": 1.0, - "f1": 0.9855072463768115 + "f1": 1.0 }, "aggregate_rule_level": { - "tp": 236, - "fp": 7, - "fn_": 2, - "tn": 231, - "precision": 0.9711934156378601, - "recall": 0.9915966386554622, - "f1": 0.9812889812889812 + "tp": 244, + "fp": 0, + "fn_": 0, + "tn": 247, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0 }, "by_language": { "c": { @@ -8586,46 +8813,46 @@ "f1": 1.0 }, "cpp": { - "tp": 18, + "tp": 19, "fp": 0, "fn_": 0, - "tn": 15, + "tn": 16, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "go": { "tp": 26, - "fp": 1, + "fp": 0, "fn_": 0, - "tn": 29, - "precision": 0.9629629629629629, + "tn": 30, + "precision": 1.0, "recall": 1.0, - "f1": 0.9811320754716981 + "f1": 1.0 }, "java": { - "tp": 20, - "fp": 2, - "fn_": 1, - "tn": 18, - "precision": 0.9090909090909091, - "recall": 0.9523809523809523, - "f1": 0.9302325581395349 + "tp": 21, + "fp": 0, + "fn_": 0, + "tn": 20, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0 }, "javascript": { "tp": 22, - "fp": 4, - "fn_": 0, - "tn": 22, - 
"precision": 0.8461538461538461, - "recall": 1.0, - "f1": 0.9166666666666666 - }, - "php": { - "tp": 18, "fp": 0, "fn_": 0, - "tn": 19, + "tn": 28, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0 + }, + "php": { + "tp": 19, + "fp": 0, + "fn_": 0, + "tn": 20, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8634,34 +8861,34 @@ "tp": 28, "fp": 0, "fn_": 0, - "tn": 29, + "tn": 30, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "ruby": { - "tp": 20, + "tp": 24, "fp": 0, "fn_": 0, - "tn": 21, + "tn": 24, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "rust": { - "tp": 34, + "tp": 35, "fp": 0, - "fn_": 1, + "fn_": 0, "tn": 39, "precision": 1.0, - "recall": 0.9714285714285714, - "f1": 0.9855072463768115 + "recall": 1.0, + "f1": 1.0 }, "typescript": { "tp": 34, "fp": 0, "fn_": 0, - "tn": 23, + "tn": 24, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8678,7 +8905,7 @@ "f1": 1.0 }, "buffer_overflow": { - "tp": 6, + "tp": 7, "fp": 0, "fn_": 0, "tn": 0, @@ -8714,7 +8941,7 @@ "f1": 1.0 }, "crypto": { - "tp": 1, + "tp": 2, "fp": 0, "fn_": 0, "tn": 0, @@ -8777,7 +9004,7 @@ "f1": 1.0 }, "path_traversal": { - "tp": 25, + "tp": 27, "fp": 0, "fn_": 0, "tn": 0, @@ -8796,12 +9023,12 @@ }, "safe": { "tp": 0, - "fp": 7, + "fp": 0, "fn_": 0, - "tn": 231, - "precision": 0.0, + "tn": 247, + "precision": 1.0, "recall": 1.0, - "f1": 0.0 + "f1": 1.0 }, "secrets": { "tp": 1, @@ -8831,13 +9058,13 @@ "f1": 1.0 }, "ssrf": { - "tp": 26, + "tp": 30, "fp": 0, - "fn_": 2, + "fn_": 0, "tn": 0, "precision": 1.0, - "recall": 0.9285714285714286, - "f1": 0.962962962962963 + "recall": 1.0, + "f1": 1.0 }, "xss": { "tp": 23, @@ -8852,30 +9079,30 @@ "by_confidence": { ">=High": { "tp": 74, - "fp": 106, - "fn_": 164, - "tn": 132, - "precision": 0.4111111111111111, - "recall": 0.31092436974789917, - "f1": 0.354066985645933 + "fp": 108, + "fn_": 170, + "tn": 139, + "precision": 0.4065934065934066, + "recall": 0.30327868852459017, + "f1": 0.3474178403755868 }, ">=Low": { - "tp": 76, - "fp": 133, - "fn_": 162, - "tn": 
105, - "precision": 0.36363636363636365, - "recall": 0.31932773109243695, - "f1": 0.34004474272930646 + "tp": 75, + "fp": 129, + "fn_": 169, + "tn": 118, + "precision": 0.36764705882352944, + "recall": 0.3073770491803279, + "f1": 0.3348214285714286 }, ">=Medium": { - "tp": 76, - "fp": 123, - "fn_": 162, - "tn": 115, - "precision": 0.38190954773869346, - "recall": 0.31932773109243695, - "f1": 0.34782608695652173 + "tp": 75, + "fp": 124, + "fn_": 169, + "tn": 123, + "precision": 0.3768844221105528, + "recall": 0.3073770491803279, + "f1": 0.33860045146726864 } } } \ No newline at end of file diff --git a/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/App.cpp b/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/App.cpp new file mode 100644 index 00000000..e3d72dfd --- /dev/null +++ b/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/App.cpp @@ -0,0 +1,37 @@ +// FP guard for Layer E: `cpp.memory.reinterpret_cast` must NOT fire on +// `reinterpret_cast(x)` whose target T is a type explicitly defined +// as safe by the C++ aliasing rules — byte-pointer family, void*, +// integer round-trip, BSD socket address family. 
+ +#include +#include + +struct sockaddr { + int family; +}; +struct sockaddr_in { + int family; + int port; +}; + +void byte_view(int* p) { + auto* a = reinterpret_cast(p); + auto* b = reinterpret_cast(p); + auto* c = reinterpret_cast(p); + auto* d = reinterpret_cast(p); + auto* e = reinterpret_cast(p); + (void)a; (void)b; (void)c; (void)d; (void)e; +} + +void* synth() { + return reinterpret_cast(0x08000000); +} + +uintptr_t roundtrip(int* p) { + return reinterpret_cast(p); +} + +void socket_pun(sockaddr_in* in) { + auto* s = reinterpret_cast(in); + (void)s; +} diff --git a/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/expectations.json b/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/expectations.json new file mode 100644 index 00000000..041f3495 --- /dev/null +++ b/tests/fixtures/fp_guards/cpp_reinterpret_cast_byte_pointer/expectations.json @@ -0,0 +1,16 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "cpp.memory.reinterpret_cast" } + ], + "noise_budget": { + "max_total_findings": 0, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/App.php b/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/App.php new file mode 100644 index 00000000..20c29aab --- /dev/null +++ b/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/App.php @@ -0,0 +1,70 @@ +data) . 
'"'; + } + + public function rowFor(string $objectData): array { + return [ + 'etag' => md5($objectData), + 'size' => strlen($objectData), + ]; + } + + public function memo(string $favoriteTableName): array { + $row = []; + $row['table_name_hash'] = md5($favoriteTableName); + return $row; + } + + public function lazyHash(string $table, array &$tables): void { + $tables[$table]['hash'] ??= md5($table); + } + + public function trio(string $sql): array { + $sqlMd5 = md5($sql); + $tableHash = md5($sql . '.t'); + $etag = md5($sql . '.e'); + return [$sqlMd5, $tableHash, $etag]; + } + + public function indexByCol(array $columnNames): array { + $columnNamesHashes = []; + foreach ($columnNames as $col) { + $columnNamesHashes[$col] = md5($col); + } + return $columnNamesHashes; + } + + public function fetch(array $arr, string $x): mixed { + return $arr[md5($x)] ?? null; + } + + public function recoveryKeyId(): string { + return 'recoveryKey_' . substr(md5((string)time()), 0, 8); + } + + public function getCacheBuster(string $version): string { + return substr(sha1($version), 0, 8); + } + + public function lookup(string $uid): mixed { + return $this->cache->get(sha1($uid)); + } + + public function safeStorageId(string $storageId): string { + if (strlen($storageId) > 64) { + $storageId = md5($storageId); + } + return $storageId; + } +} diff --git a/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/expectations.json b/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/expectations.json new file mode 100644 index 00000000..4cea3c24 --- /dev/null +++ b/tests/fixtures/fp_guards/php_md5_sha1_non_crypto_use/expectations.json @@ -0,0 +1,17 @@ +{ + "required_findings": [], + "forbidden_findings": [ + { "id_prefix": "php.crypto.md5" }, + { "id_prefix": "php.crypto.sha1" } + ], + "noise_budget": { + "max_total_findings": 0, + "max_high_findings": 0 + }, + "performance_expectations": { + "max_ms_no_index": 1000, + "max_ms_index_cold": 1500, + "max_ms_index_warm": 500, + "ci_mode": 
"lenient" + } +} diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 82dad1c1..46a8cbda 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -986,6 +986,25 @@ fn fp_guard_php_unserialize_allowed_classes() { validate_expectations(&diags, &dir); } +/// FP guard, PHP `md5()` / `sha1()` weak-hash pattern rule firing +/// syntactically on every callsite. Real-world PHP uses these +/// functions pervasively for non-cryptographic purposes (ETag +/// generation, cache-key / array-index hashing, dedup fingerprints). +/// Layer F suppression recognises the consuming context — variable +/// LHS, member-access LHS, subscript LHS, array element key, +/// lookup-verb argument, return-from-method, hash-as-index — and +/// refuses to fire. Distilled from nextcloud apps/dav (CalDavBackend, +/// CardDavBackend, CardDav PhotoCache), apps/contactsinteraction, +/// apps/theming (Util / CommonThemeTrait), apps/encryption KeyManager, +/// apps/files Cache, and phpmyadmin Controllers/Database / Table / +/// Display / Favorites. +#[test] +fn fp_guard_php_md5_sha1_non_crypto_use() { + let dir = fixture_path("fp_guards/php_md5_sha1_non_crypto_use"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + /// FP guard, JS / TS local-collection receivers. Pinned from the /// excalidraw element-manipulation cluster (66 → ~9 on /// `js.auth.missing_ownership_check` over the repo). The fix lives at @@ -1022,6 +1041,25 @@ fn fp_guard_c_buffer_literal_src() { validate_expectations(&diags, &dir); } +/// FP guard, `cpp.memory.reinterpret_cast` over-fires on every +/// `reinterpret_cast(x)` syntactically — including the canonical +/// well-defined-by-aliasing-rules targets: byte-pointer family +/// (`char*`, `uint8_t*`, `std::byte*`), `void*`, the integer +/// round-trip types `uintptr_t` / `intptr_t`, and the BSD-socket +/// address family. 
These are exempt per [basic.lval]/11 and POSIX +/// socket-API contracts; suppressing them is a layer-2 structural fix +/// in `src/ast.rs::is_cpp_cast_target_type_safe`. Genuine +/// strict-aliasing UB casts (target is a user struct / class type) +/// keep firing. Distilled from bitcoin's leveldb / serialization / +/// IPC / netif shapes (109 → 55 findings on bitcoin in the +/// real-repo precision sweep). +#[test] +fn fp_guard_cpp_reinterpret_cast_byte_pointer() { + let dir = fixture_path("fp_guards/cpp_reinterpret_cast_byte_pointer"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + /// FP guard, `rs.auth.missing_ownership_check` over-fires on Rust /// helpers when (a) a parameter's TYPE annotation contains an /// identifier whose lower-case form matches the framework-request-name