From 40995e45e7ea56f551f6ac1ee3e4c89419fa4f74 Mon Sep 17 00:00:00 2001 From: Eli Peter <54954007+elicpeter@users.noreply.github.com> Date: Sat, 2 May 2026 16:44:49 -0400 Subject: [PATCH] Authorization analysis logic improvements (#61) --- CHANGELOG.md | 19 + ROADMAP.md | 29 +- docs/SUMMARY.md | 5 + docs/advanced-analysis.md | 20 +- docs/auth.md | 30 +- src/ast.rs | 1 + src/auth_analysis/checks.rs | 331 +++++++++++- src/auth_analysis/config.rs | 42 ++ src/auth_analysis/extract/actix_web.rs | 11 +- src/auth_analysis/extract/axum.rs | 256 +++++++++ src/auth_analysis/extract/common.rs | 27 + src/auth_analysis/extract/flask.rs | 77 +++ src/auth_analysis/extract/mod.rs | 61 ++- src/auth_analysis/model.rs | 31 ++ src/cfg/literals.rs | 143 ++++++ src/cfg/mod.rs | 71 ++- src/cfg/params.rs | 106 +++- src/cfg_analysis/resources.rs | 18 + src/database.rs | 6 + src/labels/javascript.rs | 59 ++- src/labels/mod.rs | 7 + src/server/debug.rs | 1 + src/server/routes/debug.rs | 2 + src/state/transfer.rs | 252 ++++++++- src/summary/ssa_summary.rs | 22 + src/summary/tests.rs | 14 + src/symex/transfer.rs | 13 + src/taint/mod.rs | 30 +- src/taint/path_state.rs | 58 +++ src/taint/ssa_transfer/mod.rs | 145 +++++- src/taint/ssa_transfer/summary_extract.rs | 139 ++++- src/taint/ssa_transfer/tests.rs | 1 + src/taint/tests.rs | 485 ++++++++++++++++++ src/utils/config.rs | 14 + src/utils/project.rs | 150 +++++- tests/benchmark/RESULTS.md | 6 +- .../go/safe/safe_inner_call_close_in_arg.go | 55 ++ ...e_struct_field_resource_owned_by_struct.go | 78 +++ .../go/safe/vuln_resource_leak_no_close.go | 16 + .../python/auth/vuln_user_id_param_no_auth.py | 20 + .../safe_django_orm_caller_scoped_entity.py | 63 +++ .../safe/safe_mock_patch_test_method.py | 52 ++ .../auth/safe_actix_guarded_data_extractor.rs | 70 +++ .../unsafe_actix_no_guarded_data_extractor.rs | 44 ++ .../Cargo.toml | 23 + .../src/lib.rs | 16 + .../safe/safe_non_web_rust_project/Cargo.toml | 23 + 
.../safe/safe_non_web_rust_project/src/lib.rs | 60 +++ .../safe/safe_validated_helper_chain.ts | 43 ++ .../javascript/CVE-2023-22621/patched.js | 60 +++ .../javascript/CVE-2023-22621/vulnerable.js | 50 ++ .../typescript/CVE-2026-25544/patched.ts | 103 ++++ .../typescript/CVE-2026-25544/vulnerable.ts | 82 +++ tests/benchmark/ground_truth.json | 444 +++++++++++++++- tests/benchmark/results/latest.json | 343 +++++++++++-- 55 files changed, 4193 insertions(+), 134 deletions(-) create mode 100644 tests/benchmark/corpus/go/safe/safe_inner_call_close_in_arg.go create mode 100644 tests/benchmark/corpus/go/safe/safe_struct_field_resource_owned_by_struct.go create mode 100644 tests/benchmark/corpus/go/safe/vuln_resource_leak_no_close.go create mode 100644 tests/benchmark/corpus/python/auth/vuln_user_id_param_no_auth.py create mode 100644 tests/benchmark/corpus/python/safe/safe_django_orm_caller_scoped_entity.py create mode 100644 tests/benchmark/corpus/python/safe/safe_mock_patch_test_method.py create mode 100644 tests/benchmark/corpus/rust/auth/safe_actix_guarded_data_extractor.rs create mode 100644 tests/benchmark/corpus/rust/auth/unsafe_actix_no_guarded_data_extractor.rs create mode 100644 tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/Cargo.toml create mode 100644 tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/src/lib.rs create mode 100644 tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/Cargo.toml create mode 100644 tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/src/lib.rs create mode 100644 tests/benchmark/corpus/typescript/safe/safe_validated_helper_chain.ts create mode 100644 tests/benchmark/cve_corpus/javascript/CVE-2023-22621/patched.js create mode 100644 tests/benchmark/cve_corpus/javascript/CVE-2023-22621/vulnerable.js create mode 100644 tests/benchmark/cve_corpus/typescript/CVE-2026-25544/patched.ts create mode 100644 tests/benchmark/cve_corpus/typescript/CVE-2026-25544/vulnerable.ts diff --git 
a/CHANGELOG.md b/CHANGELOG.md index b86f0853..75933d84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,23 @@ A focused release that splits data-exfiltration off from SSRF and ships sinks fo - Ruby SSRF coverage. `OpenURI.open_uri` now classified as an SSRF sink (the low-level fetcher that `URI.open` delegates to). Closes the CarrierWave CVE-2021-21288 download path and equivalent gem shapes that route through `OpenURI` directly. - Ruby chained-call wrapper classification. Statement-level wrappers like `YAML.safe_load(File.read(filename))` and `Marshal.load(File.read(p))` now classify the inner sink for cross-function summary extraction. Without this, the outer call became a non-sink node and the inner sink was lost when the helper was summarised. - Ruby CVE corpus. Vulnerable + patched fixtures added for CVE-2021-21288 (CarrierWave SSRF) and CVE-2023-38337 (rswag path traversal). +- Lodash `_.template` modeled as a gated `Cap::CODE_EXEC` sink. Activates on the template-string argument; suppresses when arg-1 carries a literal `{ evaluate: false }`. Closes Strapi CVE-2023-22621 (server-side template injection → RCE via `<% … %>` evaluate blocks). Vulnerable + patched fixtures added under `tests/benchmark/cve_corpus/javascript/CVE-2023-22621/`. +- JS/TS gated-sink kwarg extractor falls back to inspecting arg-1 object literals (`fn(x, { evaluate: false })`) when the language has no `keyword_argument` node. Required so the lodash gate can read its options object. +- Lodash double-call form (`_.template(t)(data)`) routes through `find_chained_inner_call` so the outer call's gated-sink rebinding fires. +- Cross-function helper-validation propagation. New `SsaFuncSummary.validated_params_to_return` field records parameter indices whose taint flow to the return value is fully validated by a dominating predicate (regex allowlist, type check, validation call) on every return path. 
At call sites, each tainted argument passed to a validated position — and the call's own return value — are marked `validated_must` / `validated_may` in the caller's SSA taint state, the same way an inline `if (!regex.test(x)) throw` would. Closes the helper-validator gap behind PayloadCMS CVE-2026-25544 (Drizzle SQL injection in `sanitizeValue`). Vulnerable + patched TypeScript fixtures added. +- Destructured-arg sibling expansion in per-parameter taint summary probing. JS/TS object-pattern formals (`({ column, operator, value }) => …`) now seed every binding sharing the slot, and any sibling reaching `validated_must` counts as the slot being validated. New `BodyMeta.param_destructured_fields` carries sibling lists alongside `params` and `param_types`. JS `PARAM_CONFIG` accepts `assignment_pattern` (default-value formals) and `object_pattern` (destructured formals). +- Regex-allowlist branch narrowing. `.test(value)` / `.match(value)` / `.matches(value)` where the receiver name contains `regex` or `pattern` classifies as a `ValidationCall` and narrows the call's first argument, not the regex receiver. The same handling was also extended to `extract_validation_target` so the surviving branch validates `value`, not the regex object. Motivated by Payload CVE-2026-25544 (`if (!SAFE_STRING_REGEX.test(value)) throw …`). +- TypeScript template-substring (`${fn(arg)}`) call-resolution arity-hint fallback. When CFG lowering drops `arg_uses` but `args` is non-empty, the resolver passes `None` so the unique-name fallback can still pick up the lone candidate. +- Caller-scope-entity exemption in `rs.auth.missing_ownership_check`. `<entity>.id` / `<entity>.pk` no longer fires when `<entity>` is a unit parameter named after a multi-tenant scope primitive: `organization` / `org`, `project`, `team`, `workspace`, `tenant`, `account`, `community`, `group`, `repository` / `repo`, `company`.
Other field names (`.name`, `.slug`) still flag, and `user` / `member` / `actor` are deliberately excluded (handled by `is_actor_context_subject`). Closes a flood of FPs in Sentry / Saleor / Discourse / Mastodon-shaped multi-tenant helpers (`get_environments(request, organization)`, `_filter_releases_by_query(qs, organization, …)`). +- Auth value-ref walker recurses into the `value` child of `keyword_argument` / `keyword_arg` / `named_argument` nodes. `Model.objects.filter(organization_id=org.id)` no longer surfaces the kwarg key (`organization_id`) as a bare-identifier user-input subject — the schema column name is fixed at call time. +- Test-decorator denylist for Flask route extraction. `mock.patch`, `mock.patch.object` / `.dict` / `.multiple`, `unittest.mock.*`, `monkeypatch.setattr` / `setenv` / `delattr` / `delenv`, and `pytest.mark.parametrize` no longer collide with `.patch` route registration. Stops every `@mock.patch("…")`-decorated test method from being attached as a Flask PATCH handler and flagged as `missing_ownership_check`. +- Typed-extractor route-level guard injection for axum and actix-web. Handlers registered via attribute macros (`#[get("/path")]`, `#[routes::path(…)]`) or via external service-config builders previously never had their typed-extractor guards seeded. New `apply_typed_extractor_guards_to_units` walks every `Function`-kind unit and injects guard checks from typed-extractor params, complementing the route-walk path that already covered `.route(...)` registration. +- New auth config key `policy_guard_names`. Typed-extractor wrappers that prove route-level capability/policy enforcement (e.g. meilisearch's `GuardedData<ActionPolicy<…>, _>`) are recognised distinctly from authentication-only wrappers. Matched as last-segment + case-insensitive `starts_with`. Rust default: `["Guarded"]`. Distinct from `login_guard_names` so the pattern doesn't pollute regular call recognition (a function like `guarded_load(..)` is not a login guard).
+- Outer-wrapper-aware classification of typed extractors. `GuardedData<ActionPolicy<…>, Data<…>>` is classified by the outer `GuardedData` (policy-bearing → `AuthCheckKind::Other`), not by whether an inner generic arg substring-matches `auth`. Bare data-only extractors (`Path`, `Query`, `Json`, `Form`, `State`, `Extension`, `Data`) outer-name-match early-return to `None` regardless of inner type tokens. Reference-marker (`&`, `&mut`, `&'a`) and module-path (`std::collections::`) prefixes stripped before matching. +- Project-level web-framework signal in Rust auth analysis. New `FrameworkContext::lang_has_web_framework(lang)` is three-valued: `Some(true)` when manifest names a framework, `Some(false)` when the manifest was inspected and named none, `None` when no manifest was inspected. New `rust_file_imports_web_framework` does a per-file `axum::` / `actix_web::` / `rocket::` / `axum_extra::` import probe (8 KB head). When the project's Cargo.toml is inspected and lists no Rust web framework AND the file does not directly import one, the `context_inputs` and param-name-heuristic arms of `unit_has_user_input_evidence` are suppressed. `RouteHandler` classification (concrete route-registration evidence) still bypasses the gate. Closes a flood of `missing_ownership_check` FPs in non-web Rust crates — e.g. zed-style desktop / GUI codebases where a debug-session handle named `session` would trip `matches_session_context` on `session.update(cx, …)`. Currently Rust-only; other languages keep prior behavior (`None`). +- Rust auth corpus extended with `safe_actix_guarded_data_extractor.rs` and `unsafe_actix_no_guarded_data_extractor.rs` (typed-extractor guard injection); `safe_non_web_rust_project/` and `unsafe_actix_web_project_no_check/` (full Cargo.toml + src/lib.rs project shapes for the framework-signal gate).
+- Python auth corpus extended with `vuln_user_id_param_no_auth.py`, `safe_django_orm_caller_scoped_entity.py` (caller-scope-entity exemption), `safe_mock_patch_test_method.py` (test-decorator denylist). +- Go safe corpus extended with `safe_inner_call_close_in_arg.go` (`require.NoError(t, f.Close())` shape), `safe_struct_field_resource_owned_by_struct.go` (field-LHS ownership transfer), and a `vuln_resource_leak_no_close.go` regression guard. ### Fixed (false positives) @@ -30,6 +47,8 @@ A focused release that splits data-exfiltration off from SSRF and ships sinks fo - JS and TS `secrets.fallback_secret` no longer fire on empty-string fallbacks (`process.env.X || ""`). Developers write `|| ""` to satisfy non-undefined string types without committing a real secret. Non-empty literal fallbacks still fire. - Path-traversal sink suppression accepts canonicalised-and-rooted shapes. New `PathFact::is_path_traversal_safe` predicate clears `Cap::FILE_IO` when the path is dotdot-free and either non-absolute or carries a verified prefix-lock. New `OPAQUE_PREFIX_LOCK` marker records the structural invariant ("rooted under SOME prefix") when the `starts_with`-style guard's argument is a method call, field access, or configured root rather than a string literal. Closes the Ruby `File.expand_path + start_with?(root)` shape (rswag CVE-2023-38337 patched counterpart), the Python `os.path.realpath + .startswith(root)` shape, and the JS `path.resolve + .startsWith(root)` shape. `classify_path_assertion` extended to JS `.startsWith(...)`, Python `.startswith(...)`, Ruby `.start_with?(...)` (paren and paren-less), and Go `strings.HasPrefix(...)`. - Branch narrowing now flips prefix-lock attachment under condition negation. For `if !target.startsWith(ROOT) { return; }` the lock attaches to the surviving block, not the rejection arm. Rejection-axis narrowing is unchanged because the rejection classifier is text-level and already accounts for leading `!`. 
+- Go field-LHS resource acquires no longer counted as local resource leaks. `b.cpuprof = os.Create(...)` transfers ownership to the containing struct; closure responsibility belongs to a paired `Stop()` / `Release()` method on the struct's lifecycle. Gated in both `state/transfer.rs::apply_call` and `cfg_analysis/resources.rs::run`. Restricted to Go (`Lang::Go` check) — JS/TS class-field acquires (`this.fd = fs.openSync(...)`) keep being tracked because the leak fixtures rely on it. Production trigger: prometheus `cmd/promtool/tsdb.go::startProfiling` cluster (`b.cpuprof`, `b.memprof`, `b.blockprof`, `b.mtxprof`). +- Go inner-call release in argument position. `require.NoError(t, f.Close())`, `errs = append(errs, f.Close())`, JUnit `assertEquals(0, in.read())` — releases that live in argument position now mark the receiver `CLOSED`. Bare-receiver inner calls only (chained-receiver releases stay owned by `chain_proxies`); marks `CLOSED` only with no `DoubleClose` attribution; respects `in_defer` for symmetry. ### Other diff --git a/ROADMAP.md b/ROADMAP.md index aa2b8395..3eae9653 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,22 +1,23 @@ # Roadmap -Nyx today is a static-only multi-language vulnerability scanner. The roadmap below extends it into a hybrid scanner that combines static analysis with controlled execution and AI-assisted reasoning. +## Now: recall and precision on real codebases -## Phase 1: Static Analysis (current) +The current focus is straightforward. Run Nyx against real open-source repositories and real CVEs, then close the gap between what it finds and what it should find. -The shipped scanner. Multi-language taint tracking on a pruned SSA IR, cross-file function summaries, points-to and abstract interpretation, symbolic execution with an optional SMT backend, and a local web UI for triage. See the [Changelog](CHANGELOG.md) for the full breakdown of what's landed through 0.5.0. 
+That means: -## Phase 2: Dynamic Capability +- **Recall.** Pick CVEs with public fixes. Reproduce them on the vulnerable commit. If Nyx misses, figure out why (missing source, missing sink, lost flow across a call, dropped at a sanitizer that was not actually a sanitizer) and fix the underlying analysis, not the fixture. +- **Precision.** Triage the noise on large repos (phpMyAdmin, Nextcloud, and others). Each false positive gets reduced to a pattern: receiver-type gate, non-crypto context for `md5`/`sha1`, type-safe sink suppression, etc. Land the gate, re-run the corpus, confirm the count drops without taking real bugs with it. +- **Corpus discipline.** Every fix lands with a fixture (positive or negative) and a corpus row. Rule-level F1 on `tests/benchmark/corpus/` is the scoreboard. CI floors only ratchet up. -| Feature | Description | -| --- | --- | -| Controlled dynamic execution | Local sandbox: identify entry points, spin up test harnesses, inject payloads, detect runtime crashes and command execution. Deterministic automated exploit validation: static finds `exec(user_input)`, dynamic confirms it with `; id`. | -| Fuzzing integration | libFuzzer (C/C++), cargo-fuzz (Rust), go-fuzz, HTTP fuzzing harness. Static engine identifies interesting functions, fuzzer targets only those. | +The scanner internals (SSA, cross-file summaries, abstract interpretation, symbolic execution, auth analysis) are in place. They get refined in service of the recall/precision work, not extended for their own sake. -## Phase 3: Intelligent Reasoning Layer +## Later: dynamic capability -| Feature | Description | -| --- | --- | -| Semantic similarity | Embeddings for finding similar vulnerability patterns across codebases. | -| LLM reasoning | AI-assisted detection of non-obvious logic bugs. | -| Exploit refinement | Automated loops to refine and validate exploit chains. | +Static analysis confirms a flow exists. Dynamic execution confirms it fires. 
The plan is a local sandbox that picks up entry points Nyx already identifies, builds a harness, injects a payload, and watches for the crash or shell. Pairs naturally with fuzzing (libFuzzer, cargo-fuzz, go-fuzz, HTTP) where the static engine picks the targets. + +Not started. Lands after the static side is honest on real corpora. + +## Later still: reasoning layer + +Embeddings for cross-codebase pattern similarity. LLM-assisted detection for logic bugs that resist taint modeling. Automated exploit refinement loops. All speculative until the foundation is solid. diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index faad2b38..80c248c8 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -27,3 +27,8 @@ - [CFG](detectors/cfg.md) - [State](detectors/state.md) - [Taint](detectors/taint.md) + +# Project + +- [Roadmap](roadmap.md) +- [Changelog](changelog.md) diff --git a/docs/advanced-analysis.md b/docs/advanced-analysis.md index cd5e389b..b6f1bfb1 100644 --- a/docs/advanced-analysis.md +++ b/docs/advanced-analysis.md @@ -96,8 +96,24 @@ hash per-argument `Cap` bits but not source-origin identity, so two callers with identical caps but different origins share cached origin-attribution. -**Source**: [`src/taint/ssa_transfer.rs`](https://github.com/elicpeter/nyx/blob/master/src/taint/ssa_transfer.rs) -(`ArgTaintSig`, `InlineCache`, `inline_analyse_callee`). +**Helper-validator propagation.** SSA summaries carry a +`validated_params_to_return` field listing parameter indices whose +taint flow to the return value is fully validated by a dominating +predicate (regex allowlist, type check, validation call) on every +return path. At call sites, each tainted argument passed to a +validated position — and the call's own return value — are marked +`validated_must` / `validated_may` in the caller's SSA taint state, +the same way an inline `if (!regex.test(x)) throw …` would validate +the surviving branch. 
Sound because the summary is recorded only when +the parameter's name is in `validated_must` at *every* return block; a +normal-returning call therefore proves the validating arm. JS/TS +object-pattern formals (`({ column, operator, value }) => …`) seed +every destructured sibling in the per-parameter probe, so flow through +any of them counts toward the slot being validated. + +**Source**: [`src/taint/ssa_transfer/`](https://github.com/elicpeter/nyx/tree/master/src/taint/ssa_transfer/) +(`ArgTaintSig`, `InlineCache`, `inline_analyse_callee`, +`propagate_validated_params_to_return`). --- diff --git a/docs/auth.md b/docs/auth.md index 3dfece39..bf0a6deb 100644 --- a/docs/auth.md +++ b/docs/auth.md @@ -6,14 +6,31 @@ The Rust rule is `rs.auth.missing_ownership_check`. It fires when a request handler reaches a privileged operation that takes a scoped identifier (`*_id`, row reference, scoped resource) without a preceding ownership or membership check. -Concretely, it looks for five patterns of authorization in the function body and flags the call when none are present: +Concretely, it looks for these patterns of authorization in the function body and flags the call when none are present: - A call to a recognised authorization helper. Defaults: `check_ownership`, `has_ownership`, `require_ownership`, `ensure_ownership`, `is_owner`, `authorize`, `verify_access`, `has_permission`, `can_access`, `can_manage`, plus `*_membership` and `require_{group,org,workspace,tenant,team}_member` variants. Extend in `[analysis.languages.rust]`. - An ownership-equality check on a row reference: `if owner_id != user.id { return 403 }` or any `field_id != self_actor` shape. The check writes `AuthCheck` evidence back to the row-fetch arguments via `AnalysisUnit.row_field_vars`. - A self-actor reference: `let user = require_auth(...).await?` followed by use of `user.id`, `user.user_id`, `user.uid`. The actor is recognised from typed extractor params (`Extension`, `CurrentUser`, etc.) 
and from typed helper bindings. +- A typed extractor wrapper that proves route-level capability/policy enforcement: meilisearch-style `GuardedData<ActionPolicy<…>, _>`. Recognised by outer wrapper name (last segment, case-insensitive `starts_with`) so `GuardedData<ActionPolicy<…>, Data<…>>` is classified by the outer `GuardedData`, not by whether an inner generic arg substring-matches `auth`. Configured via `policy_guard_names` (Rust default: `["Guarded"]`). Distinct from authentication-only wrappers so the pattern doesn't pollute regular call recognition. - A SQL query that joins through an ACL table or filters by `user_id` predicate. Detected without a SQL parser via [`sql_semantics.rs`](https://github.com/elicpeter/nyx/blob/master/src/auth_analysis/sql_semantics.rs); the authorized result variable propagates through `let row = ...prepare(LIT)...`, `for row in result`, `let id = row.get(...)`. - A helper-summary lift: handler calls `validate_target(db, widget_id, user.id)` whose body contains a `require_*_member` call. Cross-function summaries are merged at fixed-point (capped at 4 iterations). +Handlers registered through attribute macros (`#[get("/path")]`, `#[routes::path(…)]`) or external service-config builders are also walked for typed-extractor guards, complementing the `.route(...)` registration path. + +## Caller-scope-entity exemption + +`<entity>.id` / `<entity>.pk` is not flagged when `<entity>` is a unit parameter named after a multi-tenant scope primitive: `organization` / `org`, `project`, `team`, `workspace`, `tenant`, `account`, `community`, `group`, `repository` / `repo`, `company`. The argument represents the caller's scope, not a user-controlled target, so internal helpers like `def get_environments(request, organization): Environment.objects.filter(organization_id=organization.id, …)` inherit the caller's authorization. Other field names (`.name`, `.slug`) still flag, and `user` / `member` / `actor` are deliberately excluded — those are handled by the actor-context recogniser.
+ +## Project-level web-framework gate (Rust) + +In Rust, the `context_inputs` and param-name arms of the user-input heuristic are gated by a project-level web-framework signal. The signal is three-valued: + +- `Some(true)` — the project's `Cargo.toml` names `axum`, `actix-web`, or `rocket`, OR the file directly imports one (`axum::`, `actix_web::`, `rocket::`, `axum_extra::`). Heuristics stay on. +- `Some(false)` — `Cargo.toml` was inspected and named no web framework, AND the file does not directly import one. Heuristics off; only `RouteHandler` classification (concrete route-registration evidence) survives. +- `None` — no detection ran (single-file scan with no project root). Heuristics on; behavior unchanged. + +This avoids a class of FPs in non-web Rust crates where a debug-session handle named `session` would trip on `session.update(cx, …)`-style desktop-app code. Other languages keep prior behavior; the gate is currently Rust-only. + ## Sink classification The same call name can be safe on a local collection and dangerous on a database. The detector categorises each candidate sink before deciding whether to flag: @@ -62,6 +79,15 @@ cap = "unauthorized_id" The same rule recognised in the standalone analyser also strips `Cap::UNAUTHORIZED_ID` for the taint-based variant. +### Add a project-specific typed-extractor policy wrapper + +```toml +[analysis.languages.rust.auth] +policy_guard_names = ["MyAppGuarded", "PolicyExtractor"] +``` + +Matched as last-segment + case-insensitive `starts_with` (so a single entry `"Guarded"` covers `Guarded`, `GuardedData`, `GuardedRoute`). Distinct from `login_guard_names` and `admin_guard_names`. + ### Recognised actor names Recognised by default: `user.id`, `user.user_id`, `user.uid`, `session.user_id`, `current_user.id`, plus typed extractor parameters with `CurrentUser`, `SessionUser`, `AuthUser`, `Extension<...>` shapes. 
To add a custom binding pattern, file an issue or add a fixture; the heuristic is in [`src/auth_analysis/checks.rs`](https://github.com/elicpeter/nyx/blob/master/src/auth_analysis/checks.rs) under `extract_validation_target` and friends. @@ -88,4 +114,4 @@ Auth findings render alongside taint findings in the [browser UI](serve.md). The ## Benchmark corpus -The Rust auth corpus at [`tests/benchmark/corpus/rust/auth/`](https://github.com/elicpeter/nyx/tree/master/tests/benchmark/corpus/rust/auth/) is 10 fixtures covering the five FP patterns plus a true-positive control. Per-row metrics live under the Rust auth row in `tests/benchmark/RESULTS.md`. +The Rust auth corpus at [`tests/benchmark/corpus/rust/auth/`](https://github.com/elicpeter/nyx/tree/master/tests/benchmark/corpus/rust/auth/) covers the recognised authorization patterns, true-positive controls, typed-extractor guard injection, and the project-level web-framework gate (full-Cargo.toml fixtures under `safe_non_web_rust_project/` and `unsafe_actix_web_project_no_check/`). Per-row metrics live under the Rust auth row in `tests/benchmark/RESULTS.md`. 
diff --git a/src/ast.rs b/src/ast.rs index ec621951..6ce80413 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1102,6 +1102,7 @@ impl<'a> ParsedFile<'a> { if !missing.is_empty() { let aug_ctx = crate::utils::project::FrameworkContext { frameworks: missing.clone(), + inspected_langs: std::collections::HashSet::new(), }; lang_rules .extra_labels diff --git a/src/auth_analysis/checks.rs b/src/auth_analysis/checks.rs index 1529c8b0..fec7a4e7 100644 --- a/src/auth_analysis/checks.rs +++ b/src/auth_analysis/checks.rs @@ -15,11 +15,14 @@ pub struct AuthFinding { pub fn run_checks(model: &AuthorizationModel, rules: &AuthAnalysisRules) -> Vec { let mut findings = Vec::new(); + let web_signal = model.lang_web_framework_signal; findings.extend(check_admin_routes(model, rules)); - findings.extend(check_ownership_gaps(model, rules)); - findings.extend(check_partial_batch_authorization(model, rules)); - findings.extend(check_stale_authorization(model, rules)); - findings.extend(check_token_override_without_validation(model, rules)); + findings.extend(check_ownership_gaps(model, rules, web_signal)); + findings.extend(check_partial_batch_authorization(model, rules, web_signal)); + findings.extend(check_stale_authorization(model, rules, web_signal)); + findings.extend(check_token_override_without_validation( + model, rules, web_signal, + )); findings.sort_by(|a, b| a.span.cmp(&b.span).then_with(|| a.rule_id.cmp(&b.rule_id))); findings.dedup_by(|a, b| a.span == b.span && a.rule_id == b.rule_id); findings @@ -63,11 +66,15 @@ fn check_admin_routes(model: &AuthorizationModel, rules: &AuthAnalysisRules) -> findings } -fn check_ownership_gaps(model: &AuthorizationModel, rules: &AuthAnalysisRules) -> Vec { +fn check_ownership_gaps( + model: &AuthorizationModel, + rules: &AuthAnalysisRules, + web_signal: Option, +) -> Vec { let mut findings = Vec::new(); for unit in &model.units { - if !unit_has_user_input_evidence(unit) { + if !unit_has_user_input_evidence(unit, web_signal) { continue; } for 
op in &unit.operations { @@ -115,11 +122,12 @@ fn check_ownership_gaps(model: &AuthorizationModel, rules: &AuthAnalysisRules) - fn check_partial_batch_authorization( model: &AuthorizationModel, rules: &AuthAnalysisRules, + web_signal: Option, ) -> Vec { let mut findings = Vec::new(); for unit in &model.units { - if !unit_has_user_input_evidence(unit) { + if !unit_has_user_input_evidence(unit, web_signal) { continue; } for op in &unit.operations { @@ -169,11 +177,12 @@ fn check_partial_batch_authorization( fn check_stale_authorization( model: &AuthorizationModel, rules: &AuthAnalysisRules, + web_signal: Option, ) -> Vec { let mut findings = Vec::new(); for unit in &model.units { - if !unit_has_user_input_evidence(unit) { + if !unit_has_user_input_evidence(unit, web_signal) { continue; } for op in unit.operations.iter().filter(|operation| { @@ -216,6 +225,7 @@ fn check_stale_authorization( fn check_token_override_without_validation( model: &AuthorizationModel, rules: &AuthAnalysisRules, + web_signal: Option, ) -> Vec { let mut findings = Vec::new(); @@ -229,7 +239,7 @@ fn check_token_override_without_validation( // call shape happens to look token-y (`account.token = …; // account.save()`). Gate on positive user-input evidence so // these pure backend units are never claimed as a token flow. 
- if !unit_has_user_input_evidence(unit) { + if !unit_has_user_input_evidence(unit, web_signal) { continue; } let Some(token_lookup) = unit @@ -600,6 +610,82 @@ fn is_relevant_target_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { && !is_actor_context_subject(subject, unit) && !is_const_bound_subject(subject, unit) && !is_typed_bounded_subject(subject, unit) + && !is_caller_scope_entity_subject(subject, unit) +} + +/// True iff `subject` is a member-access of form `.id` / +/// `.pk` whose root identifier is a unit parameter named after +/// a scope-bearing domain entity (`organization`, `project`, `team`, +/// `workspace`, `tenant`, `account`, `community`, `repository`, …). +/// +/// Such subjects are the *scope* of the operation — the ownership +/// constraint the caller passed in — not a user-controlled target. +/// Helpers like +/// `def get_environments(request, organization: Organization): … +/// Environment.objects.filter(organization_id=organization.id, …)` +/// inherit the caller's authorization on the entity object; the call +/// itself enforces tenant scoping. Without this exemption, every +/// internal helper in a multi-tenant Django/Rails/Laravel codebase +/// flags `missing_ownership_check` because the engine cannot tell +/// "scoping arg" from "user-targeted arg". +/// +/// Conservative scope: +/// * Field must be `id` or `pk` (the canonical primary-key fields). +/// `entity.name` / `entity.slug` are deliberately excluded — those +/// could be user-supplied display strings even on a typed entity. +/// * Root must be exactly a unit parameter (not a derived local). +/// * Root name must be in the scope-entity vocabulary. Names like +/// `user`, `member`, `actor` are deliberately omitted: those carry +/// actor semantics and are handled separately by +/// `is_actor_context_subject`. 
+fn is_caller_scope_entity_subject(subject: &ValueRef, unit: &AnalysisUnit) -> bool { + let Some(field) = subject.field.as_deref() else { + return false; + }; + let field_lower = field.to_ascii_lowercase(); + if !matches!(field_lower.as_str(), "id" | "pk") { + return false; + } + let Some(base) = subject.base.as_deref() else { + return false; + }; + let root = base.split('.').next().unwrap_or(base); + if !is_caller_scope_entity_name(root) { + return false; + } + unit.params.iter().any(|p| p == root) +} + +/// Recognises parameter names that conventionally carry a *scope* +/// entity — the multi-tenant ownership boundary inherited from the +/// caller — rather than a user-controlled target identifier. Used +/// only by `is_caller_scope_entity_subject` to suppress +/// `missing_ownership_check` on `.id` arguments to ORM / +/// query / mutation calls. +/// +/// Vocabulary matches the canonical multi-tenant primitives across +/// Django (Sentry, Saleor), Rails (Discourse, Mastodon), and Laravel +/// / Symfony idioms. Both singular and short forms are matched +/// (`organization` / `org`, `repository` / `repo`). Excluded: +/// `user`, `member`, `actor` (actor semantics, covered by +/// `is_actor_context_subject` and per-actor self-id detectors). +fn is_caller_scope_entity_name(name: &str) -> bool { + let lower = name.to_ascii_lowercase(); + matches!( + lower.as_str(), + "organization" + | "org" + | "project" + | "team" + | "workspace" + | "tenant" + | "account" + | "community" + | "group" + | "repository" + | "repo" + | "company" + ) } /// True iff `subject` is a plain identifier whose declaration binds @@ -852,10 +938,25 @@ fn is_id_like_name(name: &str) -> bool { /// pure utility helpers fail all three conditions and are skipped , /// they cannot, by construction, be the entry point of an /// authentication-bearing flow. 
-fn unit_has_user_input_evidence(unit: &AnalysisUnit) -> bool { +fn unit_has_user_input_evidence(unit: &AnalysisUnit, web_signal: Option) -> bool { if unit.kind == AnalysisUnitKind::RouteHandler { return true; } + // Project-level web-framework gate. When the project's manifest + // was inspected and named no web framework matching the file's + // language, AND no per-file import override applied, the file + // lives in a project with no HTTP boundary. Step 2 (context + // inputs) and step 3 (param-name heuristic) are both name-shape + // heuristics that overshoot in non-web Rust crates ─ e.g. zed's + // GUI test code where `session.update(cx, ...)` (a debug-session + // handle, not an auth session) trips `matches_session_context` + // and lands in `context_inputs`, opening every test method's + // sinks. Refuse here, after the RouteHandler step (which is + // determined by framework extractors and is robust evidence on + // its own). + if web_signal == Some(false) { + return false; + } if !unit.context_inputs.is_empty() { return true; } @@ -934,8 +1035,9 @@ fn is_batch_collection(subject: &ValueRef) -> bool { #[cfg(test)] mod tests { use super::{ - auth_check_covers_subject, is_actor_context_subject, is_external_input_param_name, - is_relevant_target_subject, unit_has_user_input_evidence, + auth_check_covers_subject, is_actor_context_subject, is_caller_scope_entity_name, + is_caller_scope_entity_subject, is_external_input_param_name, is_relevant_target_subject, + unit_has_user_input_evidence, }; use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind, ValueRef, ValueSourceKind}; use std::collections::{HashMap, HashSet}; @@ -1083,6 +1185,146 @@ mod tests { assert!(is_relevant_target_subject(&member("req", "id"), &unit)); } + /// Real-repo regression: caller-passed scope entity used as + /// ownership constraint (sentry api/helpers/environments.py + /// `get_environments(request, organization)` and + /// api/endpoints/organization_releases.py + /// 
`_filter_releases_by_query(queryset, organization, query, ...)`). + /// The helper inherits the caller's auth on the entity object; + /// the `.id` arg IS the ownership scope, not a target. + #[test] + fn caller_scope_entity_subject_recognises_unit_param_id() { + let mut unit = empty_unit(); + unit.params.push("organization".into()); + + // `organization.id` where `organization` is a unit param and + // matches the scope-entity vocabulary -> recognised as scope. + assert!(is_caller_scope_entity_subject( + &member("organization", "id"), + &unit + )); + assert!(is_caller_scope_entity_subject( + &member("organization", "pk"), + &unit + )); + // Suppression flows through to `is_relevant_target_subject`. + assert!(!is_relevant_target_subject( + &member("organization", "id"), + &unit + )); + + // Other scope-entity names: project, team, workspace, ... + let mut unit_p = empty_unit(); + unit_p.params.push("project".into()); + assert!(is_caller_scope_entity_subject( + &member("project", "id"), + &unit_p + )); + + let mut unit_t = empty_unit(); + unit_t.params.push("team".into()); + assert!(is_caller_scope_entity_subject( + &member("team", "id"), + &unit_t + )); + + let mut unit_w = empty_unit(); + unit_w.params.push("workspace".into()); + assert!(is_caller_scope_entity_subject( + &member("workspace", "id"), + &unit_w + )); + + let mut unit_r = empty_unit(); + unit_r.params.push("repo".into()); + assert!(is_caller_scope_entity_subject( + &member("repo", "id"), + &unit_r + )); + } + + /// Pitfall guards for `is_caller_scope_entity_subject`. + #[test] + fn caller_scope_entity_subject_does_not_overreach() { + // `organization` not declared as a unit param -> not exempt. + let unit = empty_unit(); + assert!(!is_caller_scope_entity_subject( + &member("organization", "id"), + &unit + )); + + // Field other than id/pk -> not exempt (could be display name). 
+ let mut unit = empty_unit(); + unit.params.push("organization".into()); + assert!(!is_caller_scope_entity_subject( + &member("organization", "name"), + &unit + )); + assert!(!is_caller_scope_entity_subject( + &member("organization", "slug"), + &unit + )); + + // `user.id` / `member.id` / `actor.id` are deliberately NOT + // recognised as scope entities (actor semantics, handled by + // is_actor_context_subject). They must not be widened here. + let mut unit_u = empty_unit(); + unit_u.params.push("user".into()); + assert!(!is_caller_scope_entity_subject( + &member("user", "id"), + &unit_u + )); + + let mut unit_m = empty_unit(); + unit_m.params.push("member".into()); + assert!(!is_caller_scope_entity_subject( + &member("member", "id"), + &unit_m + )); + + // Bare identifier -> not exempt (no field). + let mut unit_b = empty_unit(); + unit_b.params.push("organization".into()); + assert!(!is_caller_scope_entity_subject( + &plain("organization"), + &unit_b + )); + } + + /// Vocabulary check for `is_caller_scope_entity_name`. Pinned so + /// future widening is intentional. + #[test] + fn caller_scope_entity_name_vocabulary() { + // Recognised scope entities. + for name in [ + "organization", + "Organization", + "ORG", + "project", + "team", + "workspace", + "tenant", + "account", + "community", + "group", + "repository", + "repo", + "company", + ] { + assert!( + is_caller_scope_entity_name(name), + "expected {name} to be recognised as scope entity" + ); + } + // Excluded (actor semantics or generic). 
+ for name in ["user", "member", "actor", "request", "self", "ctx"] { + assert!( + !is_caller_scope_entity_name(name), + "expected {name} NOT to be recognised as scope entity" + ); + } + } + /// Hierarchy: a parameter whose /// static type was recovered as `Int`/`Bool` (Spring `Long userId`, /// Axum `Path`, FastAPI `user_id: int`) has its name added to @@ -1119,23 +1361,23 @@ mod tests { // Function with no params and no context_inputs (Celery task // shape), must NOT count as user-input-bearing. let mut unit = empty_unit(); - assert!(!unit_has_user_input_evidence(&unit)); + assert!(!unit_has_user_input_evidence(&unit, None)); // Adding internal-typed params (apps, schema_editor, Django // migration RunPython callback shape) keeps the gate closed. unit.params.push("apps".into()); unit.params.push("schema_editor".into()); - assert!(!unit_has_user_input_evidence(&unit)); + assert!(!unit_has_user_input_evidence(&unit, None)); // pytest hook shape: (config, items), gate stays closed. let mut unit = empty_unit(); unit.params.push("config".into()); unit.params.push("items".into()); - assert!(!unit_has_user_input_evidence(&unit)); + assert!(!unit_has_user_input_evidence(&unit, None)); // Adding an id-like param flips the gate open. unit.params.push("doc_id".into()); - assert!(unit_has_user_input_evidence(&unit)); + assert!(unit_has_user_input_evidence(&unit, None)); // Token-named param flips the gate open (Express helper // `acceptInvitation(token, currentUser, roleOverride)`). @@ -1143,23 +1385,72 @@ mod tests { unit.params.push("token".into()); unit.params.push("currentUser".into()); unit.params.push("roleOverride".into()); - assert!(unit_has_user_input_evidence(&unit)); + assert!(unit_has_user_input_evidence(&unit, None)); // Framework request-name param flips the gate open // (Django/Flask `def view(request, project_id):`). 
let mut unit = empty_unit(); unit.params.push("request".into()); - assert!(unit_has_user_input_evidence(&unit)); + assert!(unit_has_user_input_evidence(&unit, None)); // Axum/Actix typed-extractor convention name flips it open. let mut unit = empty_unit(); unit.params.push("path".into()); - assert!(unit_has_user_input_evidence(&unit)); + assert!(unit_has_user_input_evidence(&unit, None)); // RouteHandler kind always wins, regardless of params. let mut unit = empty_unit(); unit.kind = AnalysisUnitKind::RouteHandler; - assert!(unit_has_user_input_evidence(&unit)); + assert!(unit_has_user_input_evidence(&unit, None)); + } + + /// Web-framework signal `Some(false)` (project's manifest was + /// inspected and named no web framework matching the file's + /// language, AND no per-file import override) suppresses both + /// the `context_inputs` arm and the param-name arm — both are + /// name-shape heuristics that overshoot in non-web Rust crates + /// (e.g. a debug-session handle named `session` trips + /// `matches_session_context` and lands in `context_inputs`). + /// Only RouteHandler classification (step 1) survives the gate + /// because that flag is set by framework extractors with concrete + /// route-registration evidence. + #[test] + fn web_framework_signal_gates_user_input_heuristics() { + // Param-name arm: helper named `_id` in a project the + // auth detector confirmed has no Rust web framework. Without + // the gate this would flip step 3 open and flood the rule on + // every desktop helper. + let mut unit = empty_unit(); + unit.params.push("session_id".into()); + assert!(unit_has_user_input_evidence(&unit, None)); + assert!(unit_has_user_input_evidence(&unit, Some(true))); + assert!(!unit_has_user_input_evidence(&unit, Some(false))); + + // Step 1 (RouteHandler) still wins regardless of the gate. 
+ // RouteHandler kind is set by framework extractors (axum / + // actix_web / rocket) on concrete route-registration shapes — + // robust enough to bypass the project-level gate even when + // the manifest doesn't name the framework. + unit.kind = AnalysisUnitKind::RouteHandler; + assert!(unit_has_user_input_evidence(&unit, Some(false))); + + // context_inputs arm: bare `session.foo` on a debug-session + // handle (not an auth session) lands in `context_inputs` via + // `matches_session_context`. The gate suppresses this so + // non-web Rust crates don't fire on `session.update(cx, ...)` + // shapes from desktop test code. + let mut unit = empty_unit(); + unit.context_inputs.push(ValueRef { + source_kind: ValueSourceKind::Session, + name: "session.update".into(), + base: Some("session".into()), + field: Some("update".into()), + index: None, + span: (0, 0), + }); + assert!(unit_has_user_input_evidence(&unit, None)); + assert!(unit_has_user_input_evidence(&unit, Some(true))); + assert!(!unit_has_user_input_evidence(&unit, Some(false))); } /// `is_external_input_param_name` covers id-, token-, and diff --git a/src/auth_analysis/config.rs b/src/auth_analysis/config.rs index 075ff66e..8528b831 100644 --- a/src/auth_analysis/config.rs +++ b/src/auth_analysis/config.rs @@ -9,6 +9,17 @@ pub struct AuthAnalysisRules { pub admin_path_patterns: Vec, pub admin_guard_names: Vec, pub login_guard_names: Vec, + /// Typed-extractor wrapper names that carry route-level + /// authorization (capability/policy enforcement) rather than mere + /// authentication. Match by `matches_name` (last-segment + + /// case-insensitive `starts_with`), so a single pattern like + /// `"Guarded"` covers `Guarded`, `GuardedData`, `GuardedRoute`. 
+ /// Consulted only by `inject_guard_checks` for typed-extractor + /// route-level injection — distinct from `login_guard_names` / + /// `admin_guard_names` so the pattern doesn't pollute regular call + /// recognition (where a function like `guarded_load(..)` would + /// otherwise be wrongly classified as a login guard). + pub policy_guard_names: Vec, pub authorization_check_names: Vec, pub mutation_indicator_names: Vec, pub read_indicator_names: Vec, @@ -54,6 +65,7 @@ impl AuthAnalysisRules { admin_path_patterns: Vec::new(), admin_guard_names: Vec::new(), login_guard_names: Vec::new(), + policy_guard_names: Vec::new(), authorization_check_names: Vec::new(), mutation_indicator_names: Vec::new(), read_indicator_names: Vec::new(), @@ -353,6 +365,19 @@ impl AuthAnalysisRules { .any(|pattern| matches_name(name, pattern)) } + /// Typed-extractor wrapper that proves the request passed a + /// route-level capability/policy check (e.g. meilisearch's + /// `GuardedData, _>`). Distinct from + /// `is_login_guard` because policy enforcement is more than mere + /// authentication, it includes the per-action permission decision + /// the Policy term encodes. Used only by `inject_guard_checks` + /// for typed-extractor route-level injection. 
+ pub fn is_policy_guard(&self, name: &str) -> bool { + self.policy_guard_names + .iter() + .any(|pattern| matches_name(name, pattern)) + } + pub fn is_authorization_check(&self, name: &str) -> bool { if self .authorization_check_names @@ -482,6 +507,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "ensure_authenticated".into(), "require_auth".into(), ], + policy_guard_names: Vec::new(), authorization_check_names: vec![ "check_membership".into(), "has_membership".into(), @@ -595,6 +621,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "login_required".into(), "login_required!".into(), ], + policy_guard_names: Vec::new(), authorization_check_names: vec![ "authorize".into(), "authorize!".into(), @@ -762,6 +789,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "requireAuth".into(), "ensureAuthenticated".into(), ], + policy_guard_names: Vec::new(), authorization_check_names: vec![ "CheckMembership".into(), "HasMembership".into(), @@ -853,6 +881,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "Authenticated".into(), "isAuthenticated".into(), ], + policy_guard_names: Vec::new(), authorization_check_names: vec![ "checkMembership".into(), "hasMembership".into(), @@ -951,6 +980,14 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "RequireLogin".into(), "RequireAuth".into(), ], + // `Guarded` (case-insensitive starts_with) recognises + // typed-extractor wrappers like meilisearch's + // `GuardedData, _>` as + // route-level policy guards (capability enforcement). The + // wrapper proves the request passed a permission check, so + // any sink in the handler is route-gated even when the + // engine cannot model the inner Policy term. 
+ policy_guard_names: vec!["Guarded".into()], authorization_check_names: vec![ "check_membership".into(), "has_membership".into(), @@ -1120,6 +1157,7 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { "ensureAuth".into(), "require_login".into(), ], + policy_guard_names: Vec::new(), authorization_check_names: vec![ "checkMembership".into(), "hasWorkspaceMembership".into(), @@ -1272,6 +1310,10 @@ pub fn build_auth_rules(config: &Config, lang_slug: &str) -> AuthAnalysisRules { &mut rules.login_guard_names, &lang_cfg.auth.login_guard_names, ); + extend_unique( + &mut rules.policy_guard_names, + &lang_cfg.auth.policy_guard_names, + ); extend_unique( &mut rules.authorization_check_names, &lang_cfg.auth.authorization_check_names, diff --git a/src/auth_analysis/extract/actix_web.rs b/src/auth_analysis/extract/actix_web.rs index 2d027cbc..ea0ecb85 100644 --- a/src/auth_analysis/extract/actix_web.rs +++ b/src/auth_analysis/extract/actix_web.rs @@ -1,7 +1,7 @@ use super::AuthExtractor; use super::axum::{ - GuardFramework, apply_aliases, dedup_call_sites, expanded_guard_call_sites, - guard_calls_for_handler, inject_guard_checks, rust_param_aliases, + GuardFramework, apply_aliases, apply_typed_extractor_guards_to_units, dedup_call_sites, + expanded_guard_call_sites, guard_calls_for_handler, inject_guard_checks, rust_param_aliases, }; use super::common::{ attach_route_handler, call_name, collect_top_level_units, named_children, resolve_handler_node, @@ -36,6 +36,13 @@ impl AuthExtractor for ActixWebExtractor { collect_top_level_units(root, bytes, rules, &mut model); collect_routes(root, root, bytes, path, rules, &mut model); + apply_typed_extractor_guards_to_units( + root, + bytes, + rules, + &mut model, + GuardFramework::ActixWeb, + ); model } diff --git a/src/auth_analysis/extract/axum.rs b/src/auth_analysis/extract/axum.rs index 4578787e..8f6f614c 100644 --- a/src/auth_analysis/extract/axum.rs +++ b/src/auth_analysis/extract/axum.rs @@ -35,6 
+35,7 @@ impl AuthExtractor for AxumExtractor { collect_top_level_units(root, bytes, rules, &mut model); collect_routes(root, root, bytes, path, rules, &mut model); + apply_typed_extractor_guards_to_units(root, bytes, rules, &mut model, GuardFramework::Axum); model } @@ -391,7 +392,61 @@ fn classify_rocket_param( /// non-route functions, and a false positive there suppresses /// downstream `V.id` flagging entirely; that path uses a structural /// recogniser keyed on the `User?` shape. +/// +/// Recognition is **outer-wrapper based**: classify by the outermost +/// type name only, not by substring-anywhere on the whole text. This +/// avoids both directions of leakage: +/// * A bare data-only extractor like `web::Path` early-returns +/// `None` regardless of inner type tokens (preserves existing +/// behaviour). +/// * A policy-bearing wrapper like +/// `GuardedData, Data>` is +/// classified by the outer `GuardedData`, not by whether the inner +/// `Data` happens to lowercase-contain "auth". The +/// wrapper proves capability enforcement → `AuthCheckKind::Other` +/// (the route-level short-circuit in `auth_check_covers_subject` +/// suppresses missing-ownership-check for non-LoginGuard kinds). fn classify_guard_type(type_text: &str) -> Option { + let outer = outermost_type_name(type_text); + let outer_lower = outer.to_ascii_lowercase(); + + // Bare data-only extractors are *not* auth-bearing regardless of + // their inner generic args. Outer-name match (case-insensitive + // exact) — `Path` / `web::Path<...>` / `Query` / + // `Json` / `Form` / `State` / `Extension` / + // `Data`. + if is_data_only_extractor_outer(&outer_lower) { + return None; + } + + // Policy/guard-bearing outer wrapper. Names containing + // `guarded` (e.g. `GuardedData`, `GuardedRoute`) signal the + // wrapper enforced a capability/permission check at request + // construction. Distinct from `LoginGuard` because Policy + // enforcement is more than authentication, it's authorization. 
+ if outer_lower.contains("guarded") || outer_lower.contains("guard") { + if outer_lower.contains("admin") { + return Some(AuthCheckKind::AdminGuard); + } + return Some(AuthCheckKind::Other); + } + + if outer_lower.contains("admin") { + return Some(AuthCheckKind::AdminGuard); + } + if outer_lower.contains("user") + || outer_lower.contains("auth") + || outer_lower.contains("session") + || outer_lower.contains("identity") + || outer_lower.contains("principal") + { + return Some(AuthCheckKind::LoginGuard); + } + + // Backwards-compat fallback: legacy whole-text substring check + // for unusual shapes whose outer wrapper is generic but whose + // qualified path still mentions an auth token. Preserves + // pre-2026-05-02 behaviour for non-Guarded wrappers. let lower = type_text.to_ascii_lowercase(); if is_extractor_wrapper(&lower) { return None; @@ -409,6 +464,49 @@ fn classify_guard_type(type_text: &str) -> Option { } } +/// Outermost type name: text before the first `<`, with reference +/// markers (`&`, `&mut`, `&'a`, etc.) and module-path prefix +/// (`std::collections::`) stripped. Returns the empty string when +/// nothing remains (e.g. empty input); the text is not validated. +fn outermost_type_name(type_text: &str) -> &str { + let trimmed = type_text.trim(); + let mut after_refs = trimmed; + loop { + let next = after_refs + .trim_start_matches('&') + .trim_start_matches("mut ") + .trim_start(); + // Strip any single lifetime token like `'a ` after the `&`. + let next = if let Some(rest) = next.strip_prefix('\'') { + rest.split_once(' ') + .map(|(_, after)| after.trim_start()) + .unwrap_or(rest) + } else { + next + }; + if next == after_refs { + break; + } + after_refs = next; + } + let prefix = after_refs.split('<').next().unwrap_or(after_refs).trim(); + prefix.rsplit("::").next().unwrap_or(prefix).trim() +} + +/// Outer wrapper name (lowercase, exact-match) that the engine treats +/// as a bare data-only extractor: yielding the inner type to the +/// handler without any auth side-effect.
Matched on the outer name +/// only so policy-bearing wrappers carrying a data extractor as one +/// of their generic args (e.g. +/// `GuardedData>`) are not mis-suppressed by +/// the inner `Path<...>`. +fn is_data_only_extractor_outer(outer_lower: &str) -> bool { + matches!( + outer_lower, + "path" | "query" | "json" | "form" | "extension" | "state" | "data" | "reqdata" + ) +} + fn classify_rocket_guard_type( type_text: &str, binding: &str, @@ -612,6 +710,14 @@ pub(crate) fn inject_guard_checks( for call in guard_calls { let kind = if rules.is_admin_guard(&call.name, &call.args) { AuthCheckKind::AdminGuard + } else if rules.is_policy_guard(&call.name) { + // Policy/capability-bearing typed extractor (e.g. + // meilisearch's `GuardedData, _>`). + // Recorded as `Other` so the route-level short-circuit in + // `auth_check_covers_subject` covers any sink in the + // handler — the wrapper proves authorization, not just + // authentication. + AuthCheckKind::Other } else if rules.is_login_guard(&call.name) { AuthCheckKind::LoginGuard } else { @@ -633,3 +739,153 @@ pub(crate) fn inject_guard_checks( }); } } + +/// Walk every `Function`-kind unit in `model` and inject route-level +/// guard checks for any parameter whose type is recognised as a +/// typed auth/policy extractor (e.g. meilisearch's `GuardedData`, +/// `axum::extract::State`). Complements the route-walk path +/// in `collect_routes`: handlers registered by attribute macros +/// (`#[routes::path(...)]`, `#[get("/path")]`) or by external +/// service-config builders are never matched as route registrations +/// here, so their typed-extractor guards would otherwise never be +/// injected and `missing_ownership_check` would fire on every +/// id-shaped sink they contain. +/// +/// `RouteHandler`-kind units already had their guards injected during +/// the route walk and are skipped to avoid duplicate `AuthCheck` +/// entries. 
+pub(crate) fn apply_typed_extractor_guards_to_units( + root: Node<'_>, + bytes: &[u8], + rules: &AuthAnalysisRules, + model: &mut crate::auth_analysis::model::AuthorizationModel, + framework: GuardFramework, +) { + use crate::auth_analysis::model::AnalysisUnitKind; + let function_nodes = collect_function_definition_nodes(root); + for unit_idx in 0..model.units.len() { + let span = { + let unit = &model.units[unit_idx]; + if unit.kind == AnalysisUnitKind::RouteHandler { + continue; + } + unit.span + }; + let Some(handler_node) = function_nodes + .iter() + .find(|node| node.start_byte() == span.0 && node.end_byte() == span.1) + .copied() + else { + continue; + }; + let guard_calls = guard_calls_for_handler(handler_node, "", bytes, framework); + if guard_calls.is_empty() { + continue; + } + let unit = &mut model.units[unit_idx]; + inject_guard_checks(unit, &guard_calls, rules); + } +} + +fn collect_function_definition_nodes<'tree>(root: Node<'tree>) -> Vec> { + let mut out = Vec::new(); + walk_function_definitions(root, &mut out); + out +} + +fn walk_function_definitions<'tree>(node: Node<'tree>, out: &mut Vec>) { + // Free / impl / trait fn definitions in tree-sitter-rust. 
+ if node.kind() == "function_item" { + out.push(node); + } + for child in named_children(node) { + walk_function_definitions(child, out); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn outermost_type_name_strips_refs_and_module_prefix() { + assert_eq!(outermost_type_name("GuardedData"), "GuardedData"); + assert_eq!(outermost_type_name("&GuardedData"), "GuardedData"); + assert_eq!( + outermost_type_name("&'a mut GuardedData"), + "GuardedData" + ); + assert_eq!(outermost_type_name("web::Path"), "Path"); + assert_eq!(outermost_type_name("std::sync::Arc>"), "Arc"); + assert_eq!(outermost_type_name(""), ""); + assert_eq!(outermost_type_name("Bare"), "Bare"); + } + + #[test] + fn classify_guard_type_recognises_guarded_data_outer_wrapper() { + // Real meilisearch shape with both an admin-token-bearing inner + // type and a Data inner extractor — must classify as `Other` + // (route-level policy), not LoginGuard (filtered out by + // `has_prior_subject_auth`) and not None (over-suppression + // would happen if the inner `Data<>` early-return fired). + let kind = classify_guard_type( + "GuardedData, Data>", + ); + assert_eq!(kind, Some(AuthCheckKind::Other)); + } + + #[test] + fn classify_guard_type_data_only_extractor_outer_returns_none() { + // Outer `Data<>` is a bare actix data extractor — not auth. + // Even though the inner type lower-cases to contain "auth", + // the outer-wrapper recognition correctly returns None. 
+ assert_eq!( + classify_guard_type("Data"), + None, + "outer Data<> is a bare data extractor, not auth-bearing" + ); + assert_eq!(classify_guard_type("web::Path"), None); + assert_eq!(classify_guard_type("Json"), None); + assert_eq!(classify_guard_type("Form"), None); + } + + #[test] + fn classify_guard_type_preserves_existing_login_guard_recognition() { + assert_eq!( + classify_guard_type("LocalUserView"), + Some(AuthCheckKind::LoginGuard) + ); + assert_eq!( + classify_guard_type("Authenticated"), + Some(AuthCheckKind::LoginGuard) + ); + assert_eq!( + classify_guard_type("AdminUser"), + Some(AuthCheckKind::AdminGuard) + ); + assert_eq!( + classify_guard_type("CurrentUser"), + Some(AuthCheckKind::LoginGuard) + ); + } + + #[test] + fn classify_guard_type_admin_guarded_takes_admin_priority() { + // `AdminGuard` outer wrapper has both "admin" and "guard" tokens + // — admin-priority rule wins inside the Guarded branch. + assert_eq!( + classify_guard_type("AdminGuard"), + Some(AuthCheckKind::AdminGuard) + ); + assert_eq!( + classify_guard_type("GuardedAdmin"), + Some(AuthCheckKind::AdminGuard) + ); + } + + #[test] + fn classify_guard_type_unknown_outer_returns_none() { + assert_eq!(classify_guard_type("MyCustomWrapper"), None); + assert_eq!(classify_guard_type(""), None); + } +} diff --git a/src/auth_analysis/extract/common.rs b/src/auth_analysis/extract/common.rs index a11e2af0..bb61f0dd 100644 --- a/src/auth_analysis/extract/common.rs +++ b/src/auth_analysis/extract/common.rs @@ -3455,6 +3455,33 @@ pub fn extract_value_refs(node: Node<'_>, bytes: &[u8]) -> Vec { index: None, span: span(node), }], + // Keyword / named arguments: `Model.objects.filter(organization_id=org.id)`. + // Tree-sitter exposes a `name` child (the schema column / parameter + // name) and a `value` child (the actual expression). 
The default + // recurse-all-children arm would surface `organization_id` as a + // bare-identifier subject, which `is_id_like_name` then flags as + // a scoped-identifier user-input. But the kwarg key is the + // ORM/RPC schema field name, fixed at call time, never + // attacker-controlled. Only the value carries a subject. + // + // Matches only the node kinds listed below: Python + // `keyword_argument` and the `keyword_arg` / `named_argument` / + // `named_arg` spellings (intended for PHP / C# named arguments). + // `pair` nodes (JS / TS object properties, Ruby hash kwargs) and + // Go composite-literal element keys are NOT matched by this arm. + "keyword_argument" + | "keyword_arg" + | "named_argument" + | "named_arg" => { + if let Some(value) = node + .child_by_field_name("value") + .or_else(|| node.child_by_field_name("argument")) + { + extract_value_refs(value, bytes) + } else { + Vec::new() + } + } _ => { let mut refs = Vec::new(); for idx in 0..node.named_child_count() { diff --git a/src/auth_analysis/extract/flask.rs b/src/auth_analysis/extract/flask.rs index 59076eb5..1c64c42f 100644 --- a/src/auth_analysis/extract/flask.rs +++ b/src/auth_analysis/extract/flask.rs @@ -127,6 +127,9 @@ fn parse_flask_route_decorator( }; let callee = text(function, bytes); + if callee_is_test_decorator(&callee) { + return None; + } let method_name = bare_method_name(&callee); let arguments = decorator_expr.child_by_field_name("arguments")?; let args = named_children(arguments); @@ -173,6 +176,45 @@ fn parse_methods_keyword(arguments: Node<'_>, bytes: &[u8]) -> Option.` +/// shape. `unittest.mock.patch` is the dominant collision: it takes a +/// string literal as its first positional arg (the import path of the +/// thing being patched), and `bare_method_name("mock.patch")` is +/// `patch`, which `parse_flask_route_decorator` previously matched as +/// HTTP PATCH.
Every test method decorated with `@mock.patch("...")` +/// was therefore being attached as a Flask route handler, which +/// flipped its `unit.kind` to `RouteHandler` and made it pass +/// `unit_has_user_input_evidence` unconditionally — flooding the +/// pytest test suites with `missing_ownership_check` findings. +/// +/// The denylist mirrors common mock / monkeypatch / parametrize forms. +/// Conservative: matches only the canonical receiver chains; an +/// imported alias `from unittest.mock import patch` then bare +/// `@patch("x")` would still match `patch` as PATCH, but the +/// decorator must also carry a string-literal first arg AND the +/// route-attached unit must come back through the auth analysis to +/// fire — handlers with a string-arg decorator are rare outside Flask +/// itself, and the wider precondition path now covers most of those. +fn callee_is_test_decorator(callee: &str) -> bool { + matches!( + callee, + "mock.patch" + | "mock.patch.object" + | "mock.patch.dict" + | "mock.patch.multiple" + | "unittest.mock.patch" + | "unittest.mock.patch.object" + | "unittest.mock.patch.dict" + | "unittest.mock.patch.multiple" + | "monkeypatch.setattr" + | "monkeypatch.setenv" + | "monkeypatch.delattr" + | "monkeypatch.delenv" + | "pytest.mark.parametrize" + ) +} + fn keyword_argument_string(arguments: Node<'_>, bytes: &[u8], name: &str) -> Option { let value = keyword_argument_value(arguments, bytes, name)?; string_literal_value(value, bytes) @@ -331,6 +373,41 @@ fn inject_middleware_auth( } } +#[cfg(test)] +mod test_decorator_tests { + use super::callee_is_test_decorator; + + /// Test-framework decorators that share their bare method name with + /// a Flask HTTP verb (`patch`, `delete`, ...) must be excluded + /// from `parse_flask_route_decorator`. Without the denylist, + /// every `@mock.patch("module")` in pytest test files attaches + /// the test method as a Flask PATCH route handler — flooding the + /// auth-analysis with FPs. 
+ #[test] + fn callee_is_test_decorator_recognises_canonical_forms() { + // unittest.mock variants. + assert!(callee_is_test_decorator("mock.patch")); + assert!(callee_is_test_decorator("mock.patch.object")); + assert!(callee_is_test_decorator("mock.patch.dict")); + assert!(callee_is_test_decorator("mock.patch.multiple")); + assert!(callee_is_test_decorator("unittest.mock.patch")); + assert!(callee_is_test_decorator("unittest.mock.patch.object")); + // pytest fixtures. + assert!(callee_is_test_decorator("monkeypatch.setattr")); + assert!(callee_is_test_decorator("monkeypatch.setenv")); + assert!(callee_is_test_decorator("pytest.mark.parametrize")); + // Negatives — real Flask decorators must still match. + assert!(!callee_is_test_decorator("app.route")); + assert!(!callee_is_test_decorator("app.get")); + assert!(!callee_is_test_decorator("app.post")); + assert!(!callee_is_test_decorator("app.patch")); + assert!(!callee_is_test_decorator("bp.delete")); + assert!(!callee_is_test_decorator("blueprint.put")); + assert!(!callee_is_test_decorator("router.get")); + assert!(!callee_is_test_decorator("")); + } +} + #[cfg(test)] mod fastapi_dependencies_tests { use super::is_depends_callee; diff --git a/src/auth_analysis/extract/mod.rs b/src/auth_analysis/extract/mod.rs index d3d8546f..f6cce222 100644 --- a/src/auth_analysis/extract/mod.rs +++ b/src/auth_analysis/extract/mod.rs @@ -1,6 +1,6 @@ use super::config::AuthAnalysisRules; use super::model::AuthorizationModel; -use crate::utils::project::FrameworkContext; +use crate::utils::project::{FrameworkContext, rust_file_imports_web_framework}; use std::path::Path; use tree_sitter::Tree; @@ -61,6 +61,18 @@ pub fn extract_authorization_model( } } + // Per-language web-framework signal used to gate the param-name arm + // of `unit_has_user_input_evidence`. 
Combines the project-root + // manifest detection (`framework_ctx`) with a per-file `use`/`import` + // check, so a single file in a workspace whose root manifest does + // not name a web framework can still opt back in by directly + // importing one (e.g. `crates/collab/src/rpc.rs` in zed: workspace + // root has no axum, but the file uses `axum::Router`). + // + // Three-valued: `Some(true)` keeps the context-input and param-name + // steps firing, `Some(false)` suppresses both, `None` means no detection ran ─ behavior unchanged. + model.lang_web_framework_signal = compute_web_framework_signal(lang, framework_ctx, bytes); + // **Dedup units by span across extractors.** Multiple extractors // (e.g. Flask + Django on a Python file) each call // `collect_top_level_units`, producing one unit per top-level @@ -80,6 +92,53 @@ pub fn extract_authorization_model( model } +/// Compute the per-file web-framework signal used to gate the context-input +/// and param-name arms of `unit_has_user_input_evidence`. +/// +/// Currently emits a non-`None` value only for Rust files. The Rust +/// auth analysis is the single biggest source of internal-helper FPs +/// in non-web crates (zed's GUI / editor crates); the other languages +/// have their own handler-classification policies that already filter +/// effectively, so they keep their existing behavior (None → +/// fall-through to the existing heuristics) until each is validated. +/// +/// Three-valued semantics: +/// * `Some(true)` ─ project root manifest names a Rust web framework +/// (axum / actix_web / rocket), OR the file directly imports one. +/// Context-input and param-name evidence stay on. +/// * `Some(false)` ─ project root manifest was inspected (Cargo.toml +/// exists) and named no Rust web framework, AND the file does not +/// directly import one. Both heuristic arms are suppressed: the +/// project has no HTTP boundary in Rust. +/// * `None` ─ no detection ran (no `framework_ctx`, no Cargo.toml +/// inspected). Behavior unchanged.
+fn compute_web_framework_signal( + lang: &str, + framework_ctx: Option<&FrameworkContext>, + bytes: &[u8], +) -> Option { + if !matches!(lang, "rust" | "rs") { + return None; + } + let project_signal = framework_ctx.and_then(|ctx| ctx.lang_has_web_framework("rust")); + if project_signal == Some(true) { + return Some(true); + } + // Project says "no Rust framework" or never inspected. Consult the + // file's own imports as a per-file fallback; if the file uses an + // axum / actix_web / rocket symbol directly, treat it as a handler + // file even when the workspace-root Cargo.toml does not list the + // crate. (Real example: zed's `crates/collab/src/rpc.rs` imports + // axum but the workspace root Cargo.toml does not.) + if rust_file_imports_web_framework(bytes) { + return Some(true); + } + // No file-level evidence either. Only flip to `Some(false)` if a + // Cargo.toml manifest was actually inspected — single-file scans + // without project context get `None` and preserve prior behavior. + project_signal +} + fn deduplicate_units_by_span(model: &mut AuthorizationModel) { use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind}; use std::collections::HashMap; diff --git a/src/auth_analysis/model.rs b/src/auth_analysis/model.rs index adb52c4d..77113055 100644 --- a/src/auth_analysis/model.rs +++ b/src/auth_analysis/model.rs @@ -348,6 +348,20 @@ pub struct RouteRegistration { pub struct AuthorizationModel { pub routes: Vec, pub units: Vec, + /// Per-language web-framework presence signal used to gate the + /// `is_external_input_param_name` arm of `unit_has_user_input_evidence`. + /// + /// `None` means detection did not run (single-file unit-test paths, + /// languages without a framework gate yet). `Some(true)` means the + /// project manifest or the file's imports name a web framework that + /// matches this language ─ helper functions are plausibly reachable + /// from a route handler, so the param-name heuristic stays on. 
+ /// `Some(false)` means detection ran and named no matching framework + /// ─ the file lives in a project with no HTTP boundary, so internal + /// helper params named `*_id` / `req` / `payload` are not user input. + /// + /// Currently set only for Rust by `extract_authorization_model`. + pub lang_web_framework_signal: Option, } impl AuthorizationModel { @@ -359,5 +373,22 @@ impl AuthorizationModel { route.unit_idx += unit_offset; route })); + // Take the strongest signal across extractor outputs: `Some(true)` + // wins over `Some(false)` wins over `None`. In practice every + // extractor for a given file sees the same `framework_ctx + bytes` + // so they all derive identical signals; this is just a defensive + // merge. + self.lang_web_framework_signal = max_signal( + self.lang_web_framework_signal, + other.lang_web_framework_signal, + ); + } +} + +fn max_signal(a: Option, b: Option) -> Option { + match (a, b) { + (Some(true), _) | (_, Some(true)) => Some(true), + (Some(false), _) | (_, Some(false)) => Some(false), + _ => None, } } diff --git a/src/cfg/literals.rs b/src/cfg/literals.rs index 7535a18c..ac00d3a5 100644 --- a/src/cfg/literals.rs +++ b/src/cfg/literals.rs @@ -345,6 +345,126 @@ pub(super) fn has_keyword_arg(call_node: Node, keyword_name: &str, code: &[u8]) false } +/// Extract the literal value of a property `prop_name` from the object +/// literal at positional argument `arg_index`. Returns `None` if the +/// arg is absent, is not an object literal, the prop key isn't found, +/// or the prop value isn't a literal (so callers can distinguish +/// "present but dynamic" from "absent" only via [`has_object_arg_property`]). +/// +/// Used by JS/TS-style "options object as kwargs" gates — e.g. +/// `_.template(tpl, { evaluate: false })` — where the safe-flag lives +/// in an inline object literal rather than as a dedicated kwarg node +/// (which JS does not have). 
Strict-additive: returns `None` for any +/// non-JS-object shape, including bare identifiers passed as the +/// options arg, so the gate falls back to the conservative dynamic +/// branch. +pub(super) fn extract_object_arg_property( + call_node: Node, + arg_index: usize, + prop_name: &str, + code: &[u8], +) -> Option { + let args = call_node.child_by_field_name("arguments")?; + let mut cursor = args.walk(); + let arg = args.named_children(&mut cursor).nth(arg_index)?; + let arg = unwrap_parens(arg); + if !matches!(arg.kind(), "object" | "dictionary") { + return None; + } + let mut c = arg.walk(); + for child in arg.named_children(&mut c) { + if child.kind() != "pair" { + continue; + } + let Some(key_node) = child.child_by_field_name("key") else { + continue; + }; + let key_text = match key_node.kind() { + "string" | "string_literal" => text_of(key_node, code).map(|raw| { + if raw.len() >= 2 { + raw[1..raw.len() - 1].to_string() + } else { + raw + } + }), + "computed_property_name" => continue, + _ => text_of(key_node, code), + }; + if key_text.as_deref() != Some(prop_name) { + continue; + } + let val_node = child.child_by_field_name("value")?; + let val_node = unwrap_parens(val_node); + return match val_node.kind() { + "true" | "false" | "null" | "undefined" | "number" | "string" | "string_literal" => { + text_of(val_node, code).map(|s| s.to_string()) + } + // JS booleans true/false are their own node kinds (above), but + // some grammar versions wrap them as identifier literals; surface + // `undefined` similarly. + "identifier" => text_of(val_node, code) + .filter(|s| matches!(s.as_str(), "true" | "false" | "null" | "undefined")), + _ => None, + }; + } + None +} + +/// Return `true` if the call node's positional arg at `arg_index` is an +/// object literal containing a property named `prop_name` (whether the +/// value is a literal or a dynamic expression). 
Used alongside +/// [`extract_object_arg_property`] so gated-sink classification can +/// distinguish "options key absent" (language default) from "options +/// key present with dynamic value" (conservative dangerous). +pub(super) fn has_object_arg_property( + call_node: Node, + arg_index: usize, + prop_name: &str, + code: &[u8], +) -> bool { + let Some(args) = call_node.child_by_field_name("arguments") else { + return false; + }; + let mut cursor = args.walk(); + let Some(arg) = args.named_children(&mut cursor).nth(arg_index) else { + return false; + }; + let arg = unwrap_parens(arg); + if !matches!(arg.kind(), "object" | "dictionary") { + return false; + } + let mut c = arg.walk(); + for child in arg.named_children(&mut c) { + match child.kind() { + "shorthand_property_identifier" | "shorthand_property_identifier_pattern" + if text_of(child, code).as_deref() == Some(prop_name) => + { + return true; + } + "pair" => { + if let Some(key_node) = child.child_by_field_name("key") { + let key_text = match key_node.kind() { + "string" | "string_literal" => text_of(key_node, code).map(|raw| { + if raw.len() >= 2 { + raw[1..raw.len() - 1].to_string() + } else { + raw + } + }), + "computed_property_name" => continue, + _ => text_of(key_node, code), + }; + if key_text.as_deref() == Some(prop_name) { + return true; + } + } + } + _ => {} + } + } + false +} + /// Inspect the first positional argument of a call node and return its /// tree-sitter `kind()` plus a flag indicating whether any descendant is an /// `interpolation` node. Skips parenthesisation (`(arg0)` is treated as @@ -584,6 +704,29 @@ pub(super) fn find_chained_inner_call<'a>( let function = outer .child_by_field_name("function") .or_else(|| outer.child_by_field_name("method"))?; + // Direct double-call form (`f()(x)`): the outer call's `function` + // field IS itself a call_expression, with no intermediate + // member-chain. Treat the inner call as the chain's innermost. 
+ // Without this, lodash-style template-render chains like + // `_.template(t)(data)` evade the chained-inner rebinding because + // the outer's function field is a `call_expression`, not the + // `member_expression` shape the original branch below expects. + if matches!( + lookup(lang, function.kind()), + Kind::CallFn | Kind::CallMethod + ) { + // Recurse: the inner call may itself be chained. + if let Some(inner) = find_chained_inner_call(function, lang, code) { + return Some(inner); + } + let inner_func = function + .child_by_field_name("function") + .or_else(|| function.child_by_field_name("method")) + .or_else(|| function.child_by_field_name("name"))?; + let raw = text_of(inner_func, code)?; + let inner_text: String = raw.chars().filter(|c| !c.is_whitespace()).collect(); + return Some((function, inner_text)); + } // The function/method field for a chained call is a member_expression // (JS/TS) or attribute (Python) etc.; its `object` field is the // receiver expression. Only proceed when that receiver is itself a diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 0ab57237..59651c35 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -54,8 +54,9 @@ use literals::{ detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals, extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg, extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs, - extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep, - find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call, + extract_literal_rhs, extract_object_arg_property, extract_shell_array_payload_idents, + find_call_node, find_call_node_deep, find_chained_inner_call, has_keyword_arg, + has_object_arg_property, has_only_literal_args, is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method, js_chain_outer_method_for_inner, 
ruby_chain_arg0_for_method, walk_chain_inner_call_args, }; @@ -67,11 +68,33 @@ use params::{ /// Test-only re-export of [`extract_param_meta`] so the external /// `tests/typed_extractors_audit.rs` harness can drive the per-param /// classifier directly without spinning up the full scan pipeline. +/// Projects away the destructured-siblings third tuple slot so the +/// existing tuple-shape assertions in the audit harness keep working; +/// the sibling info is plumbed separately through `BodyMeta`. pub fn extract_param_meta_for_test<'a>( func_node: tree_sitter::Node<'a>, lang: &str, code: &'a [u8], ) -> Vec<(String, Option)> { + extract_param_meta(func_node, lang, code) + .into_iter() + .map(|(name, ty, _siblings)| (name, ty)) + .collect() +} + +/// Test-only re-export that returns the full per-slot tuple including +/// destructured sibling names. Used by the destructured-arg-probe +/// regression tests in `src/taint/tests.rs` and the params unit tests +/// in `src/cfg/cfg_tests.rs`. +pub fn extract_param_meta_with_destructured_for_test<'a>( + func_node: tree_sitter::Node<'a>, + lang: &str, + code: &'a [u8], +) -> Vec<( + String, + Option, + Vec, +)> { extract_param_meta(func_node, lang, code) } @@ -567,6 +590,17 @@ pub struct BodyMeta { /// `None`, downstream behaviour is identical to the pre-Phase-1 /// engine. pub param_types: Vec>, + /// Per-parameter destructured-binding sibling names. Same length + /// as `params`; entry `i` lists field names bound by the same + /// argument slot as `params[i]`, excluding the primary name itself. + /// Empty for non-destructured params. Today populated only for + /// JS/TS object-pattern formals (`({ a, b, c })` → params=["a"], + /// destructured=[["b","c"]]). Used by per-parameter taint-summary + /// probing in `extract_ssa_func_summary` so destructured bindings + /// inside the body share the slot's seeded caps and any of them + /// being in `validated_must` at a return path counts as the slot + /// being validated. 
Closes the residual gap behind CVE-2026-25544. + pub param_destructured_fields: Vec>, pub param_count: usize, pub span: (usize, usize), pub parent_body_id: Option, @@ -1909,8 +1943,27 @@ pub(super) fn push_node<'a>( } }) }, - |kw| extract_const_keyword_arg(cn, kw, code), - |kw| has_keyword_arg(cn, kw, code), + |kw| { + // For JS/TS, options-bearing args are passed as inline + // object literals (`fn(x, { evaluate: false })`) rather + // than language-level keyword arguments. When the + // standard `keyword_argument`-walking extractor returns + // None, fall back to inspecting arg 1's object literal + // for a property named `kw`. This lets gates like + // `_.template` consult `{ evaluate: false }` literally. + extract_const_keyword_arg(cn, kw, code).or_else(|| { + if matches!(lang, "javascript" | "typescript") { + extract_object_arg_property(cn, 1, kw, code) + } else { + None + } + }) + }, + |kw| { + has_keyword_arg(cn, kw, code) + || (matches!(lang, "javascript" | "typescript") + && has_object_arg_property(cn, 1, kw, code)) + }, ); if !matches.is_empty() { @@ -3871,9 +3924,13 @@ pub(super) fn build_sub<'a>( let is_anon = is_anon_fn_name(&fn_name); let param_meta = extract_param_meta(ast, lang, code); let param_count = param_meta.len(); - let param_names: Vec = param_meta.iter().map(|(n, _)| n.clone()).collect(); + let param_names: Vec = param_meta.iter().map(|(n, _, _)| n.clone()).collect(); let param_types: Vec> = - param_meta.iter().map(|(_, t)| t.clone()).collect(); + param_meta.iter().map(|(_, t, _)| t.clone()).collect(); + let param_destructured_fields: Vec> = param_meta + .iter() + .map(|(_, _, siblings)| siblings.clone()) + .collect(); // ── 1b) Compute identity discriminators ─────────────────────────── let (fn_container, fn_kind) = @@ -4130,6 +4187,7 @@ pub(super) fn build_sub<'a>( name: if is_anon { None } else { Some(fn_name.clone()) }, params: param_names, param_types, + param_destructured_fields, param_count, span: (ast.start_byte(), ast.end_byte()), 
parent_body_id: Some(current_body_id), @@ -4628,6 +4686,7 @@ pub(crate) fn build_cfg<'a>( name: None, params: Vec::new(), param_types: Vec::new(), + param_destructured_fields: Vec::new(), param_count: 0, span: (0, code.len()), parent_body_id: None, diff --git a/src/cfg/params.rs b/src/cfg/params.rs index 788452d4..798d9dfe 100644 --- a/src/cfg/params.rs +++ b/src/cfg/params.rs @@ -21,16 +21,27 @@ fn lookup_dto_class(class_name: &str) -> Option { /// Extract parameter names + per-position [`TypeKind`] from a function /// AST node. Each entry's second slot is `Some(TypeKind)` when the /// parameter's decorator, attribute, or static type annotation maps to -/// a known kind, and `None` otherwise. Strictly additive, when no -/// type info is recoverable, behaviour is identical to the names-only -/// path. +/// a known kind, and `None` otherwise. The third slot lists +/// destructured field names bound by the same parameter slot — empty +/// for non-destructured params and for the primary name itself. E.g. +/// for the JS/TS object-pattern formal `({ a, b, c })`, the entry is +/// `("a", None, ["b", "c"])`. Strictly additive: when the param is +/// not a destructured pattern (or the language has no destructure +/// concept), behaviour is identical to the pre-Phase-5 names-only path. +/// +/// Closes the residual gap behind CVE-2026-25544 (PayloadCMS Drizzle +/// SQL injection): a per-parameter taint probe that seeds only the +/// primary name `column` cannot see flow through sibling destructured +/// bindings (`value` etc.) inside the body, so summary extraction +/// misses `validated_params_to_return` when a validator helper is +/// applied to one of the siblings. 
pub(super) fn extract_param_meta<'a>( func_node: Node<'a>, lang: &str, code: &'a [u8], -) -> Vec<(String, Option)> { +) -> Vec<(String, Option, Vec)> { let cfg = param_config(lang); - let mut out: Vec<(String, Option)> = Vec::new(); + let mut out: Vec<(String, Option, Vec)> = Vec::new(); // Try the params_field directly on the function node first. // For C/C++, the parameter list is nested inside the declarator // (function_definition > declarator:function_declarator > parameters:parameter_list), @@ -51,7 +62,7 @@ pub(super) fn extract_param_meta<'a>( if let Some(p) = func_node.child_by_field_name("parameter") { if p.kind() == "identifier" { if let Some(name) = text_of(p, code) { - out.push((name, None)); + out.push((name, None, Vec::new())); } } } @@ -62,7 +73,7 @@ pub(super) fn extract_param_meta<'a>( for child in params.children(&mut cursor) { // Self/this parameter (e.g. Rust's `self_parameter`) if cfg.self_param_kinds.contains(&child.kind()) { - out.push(("self".into(), None)); + out.push(("self".into(), None, Vec::new())); continue; } @@ -74,14 +85,26 @@ pub(super) fn extract_param_meta<'a>( if let Some(node) = child.child_by_field_name(field) { let mut tmp = Vec::new(); collect_idents(node, code, &mut tmp); - let candidate = if lang == "rust" { - tmp.into_iter().last() + let primary = if lang == "rust" { + // Rust: last ident is the binding name (e.g. + // `Path(project_id): Path` → `project_id`). + tmp.pop() + } else if tmp.is_empty() { + None } else { - tmp.into_iter().next() + Some(tmp.remove(0)) }; - if let Some(name) = candidate { + if let Some(name) = primary { let ty = classify_param_type(child, lang, code); - out.push((name, ty)); + // Surface destructured siblings only when the + // pattern node is a destructure container. For + // ordinary (non-destructured) params, `tmp` is + // already empty after `pop()` / `remove(0)`. 
+ // Object-pattern children of the same slot + // (`{ a, b, c }`) leave the remaining names in + // `tmp`, which become the slot's siblings. + let siblings = sibling_names_for_destructure(node, &tmp, lang); + out.push((name, ty, siblings)); found = true; break; } @@ -92,7 +115,7 @@ pub(super) fn extract_param_meta<'a>( && child.kind() == "identifier" && let Some(txt) = text_of(child, code) { - out.push((txt, None)); + out.push((txt, None, Vec::new())); found = true; } // Fallback for C/C++: look for nested declarator → identifier @@ -101,7 +124,7 @@ pub(super) fn extract_param_meta<'a>( collect_idents(child, code, &mut tmp); if let Some(last) = tmp.pop() { let ty = classify_param_type(child, lang, code); - out.push((last, ty)); + out.push((last, ty, Vec::new())); found = true; } } @@ -112,12 +135,22 @@ pub(super) fn extract_param_meta<'a>( // *first* identifier, that is the parameter name; subsequent // identifiers are part of the type annotation or default // expression. + // + // Destructure-container case (JS arrow `({ a, b }) => …`): + // when the child node IS a destructure pattern itself (no + // `required_parameter` / `assignment_pattern` wrapper), the + // remaining idents after the primary are destructured + // bindings sharing this slot — surface them as siblings so + // per-parameter summary probing seeds every binding the + // slot produces. if !found { let mut tmp = Vec::new(); collect_idents(child, code, &mut tmp); - if let Some(first) = tmp.into_iter().next() { + if !tmp.is_empty() { + let first = tmp.remove(0); let ty = classify_param_type(child, lang, code); - out.push((first, ty)); + let siblings = sibling_names_for_destructure(child, &tmp, lang); + out.push((first, ty, siblings)); } } continue; @@ -127,13 +160,52 @@ pub(super) fn extract_param_meta<'a>( // where the child is an `identifier` node, not a `parameter` wrapper. 
if child.kind() == "identifier" { if let Some(txt) = text_of(child, code) { - out.push((txt, None)); + out.push((txt, None, Vec::new())); } } } out } +/// Return destructured field-name siblings for a parameter's pattern +/// node, but only when the pattern is a recognised destructure +/// container (object / record pattern). For ordinary patterns the +/// `remaining` slice is already empty so this is a noop. Restricting +/// the return to destructure containers prevents typed-parameter +/// idioms (`Path`, `@PathVariable Long userId`, Rust extractor +/// wrappers) from accidentally surfacing the type identifier as a +/// destructured sibling. +fn sibling_names_for_destructure( + pattern: Node<'_>, + remaining: &[String], + lang: &str, +) -> Vec { + if remaining.is_empty() { + return Vec::new(); + } + if !is_destructure_container_kind(pattern.kind(), lang) { + return Vec::new(); + } + remaining.to_vec() +} + +/// Recognise tree-sitter pattern node kinds that destructure a +/// single argument into multiple bindings — JS/TS object patterns +/// today, plus Python's `pattern_list` / `tuple_pattern` for kwargs +/// destructure if those ever come through this path. Conservative: +/// only kinds we have explicit per-language reasoning for return +/// `true`; everything else returns `false` so the existing single- +/// name fallback path is preserved untouched. +fn is_destructure_container_kind(kind: &str, lang: &str) -> bool { + match (lang, kind) { + ("javascript" | "typescript", "object_pattern") => true, + // Future languages: array pattern (`[a, b]`) is intentionally + // omitted — the index-based unpacking is positional, and the + // names don't map cleanly to "all share slot 0". + _ => false, + } +} + /// Walk up from a function definition node and build a container path. 
/// /// Records the names of enclosing classes / impls / modules / namespaces / diff --git a/src/cfg_analysis/resources.rs b/src/cfg_analysis/resources.rs index 4071da39..f04ccbae 100644 --- a/src/cfg_analysis/resources.rs +++ b/src/cfg_analysis/resources.rs @@ -3,6 +3,7 @@ use super::rules; use super::{AnalysisContext, CfgAnalysis, CfgFinding, Confidence}; use crate::cfg::{EdgeKind, StmtKind}; use crate::patterns::Severity; +use crate::symbol::Lang; use petgraph::graph::NodeIndex; use petgraph::visit::EdgeRef; use std::collections::HashSet; @@ -423,6 +424,23 @@ impl CfgAnalysis for ResourceMisuse { if ctx.cfg[acquire].managed_resource { continue; } + // SAFE-FOR-FIELD-LHS (Go only): skip member-expression + // LHS acquires. `b.cpuprof = os.Create(...)` transfers + // ownership to the containing struct; closure + // responsibility belongs to a paired Stop()/Release() + // method on the struct's lifecycle. Mirrors the gate + // in src/state/transfer.rs::apply_call. Production + // trigger: prometheus + // cmd/promtool/tsdb.go::startProfiling cluster. + // Restricted to Go because TS/JS class-field acquires + // (`this.fd = fs.openSync(...)`) are still expected to + // be tracked — the leak fixtures rely on it. + if ctx.lang == Lang::Go + && let Some(acquired_var) = ctx.cfg[acquire].taint.defines.as_deref() + && acquired_var.contains('.') + { + continue; + } // Suppress resources with a deferred release (Go `defer f.Close()`). // Defer guarantees cleanup on all exit paths including early returns. 
if let Some(acquired_var) = ctx.cfg[acquire].taint.defines.as_deref() { diff --git a/src/database.rs b/src/database.rs index 78afc1e0..13253d97 100644 --- a/src/database.rs +++ b/src/database.rs @@ -2516,6 +2516,7 @@ fn ssa_summaries_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ), @@ -2551,6 +2552,7 @@ fn ssa_summaries_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ), @@ -2724,6 +2726,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, )]; @@ -2761,6 +2764,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, )]; @@ -2819,6 +2823,7 @@ fn clear_drops_ssa_summaries_table() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, )]; @@ -3092,6 +3097,7 @@ fn make_test_ssa_summary() -> crate::summary::ssa_summary::SsaFuncSummary { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], } } diff --git a/src/labels/javascript.rs b/src/labels/javascript.rs index 0c8d7367..a2dba01c 100644 --- a/src/labels/javascript.rs +++ 
b/src/labels/javascript.rs @@ -134,6 +134,9 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::CODE_EXEC), case_sensitive: false, }, + // (Lodash `_.template` is modeled as a gated sink in `GATED_SINKS` + // below — the gate inspects arg 1's options object so the patched + // `{ evaluate: false }` form is suppressed.) LabelRule { matchers: &["innerHTML", "dangerouslySetInnerHTML"], label: DataLabel::Sink(Cap::HTML_ESCAPE), @@ -377,6 +380,46 @@ pub static GATED_SINKS: &[SinkGate] = &[ dangerous_kwargs: &[], activation: GateActivation::ValueMatch, }, + // Lodash `_.template(template, options?)` — server-side template + // injection sink. Lodash's template parser by default compiles + // `<% ... %>` evaluate blocks into a JavaScript Function via the + // `Function` constructor; when the template string is attacker- + // controlled this is RCE (Strapi CVE-2023-22621 et al.). + // + // Gate: activate on arg 0 (the template string). Inspect arg 1's + // options object for `evaluate: false`; when present as a literal + // the evaluate-block compiler is disabled and the call is safe. + // Missing arg 1, missing `evaluate` key, or a dynamic value all + // fall through `ValueMatch`'s `None` branch and fire conservatively. + // + // The `keyword_name`-based activation reads the property value via + // the JS-side closure augmentation in `cfg/mod.rs`, which falls + // back to walking the call's arg-1 object literal when the + // language-default `keyword_argument` extraction yields nothing. 
+ SinkGate { + callee_matcher: "_.template", + arg_index: 0, + dangerous_values: &["true"], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::CODE_EXEC), + case_sensitive: true, + payload_args: &[0], + keyword_name: Some("evaluate"), + dangerous_kwargs: &[], + activation: GateActivation::ValueMatch, + }, + SinkGate { + callee_matcher: "lodash.template", + arg_index: 0, + dangerous_values: &["true"], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::CODE_EXEC), + case_sensitive: true, + payload_args: &[0], + keyword_name: Some("evaluate"), + dangerous_kwargs: &[], + activation: GateActivation::ValueMatch, + }, // ── Outbound HTTP clients (SSRF) ────────────────────────────────────── // // Policy: SSRF fires only when taint reaches the destination-bearing @@ -810,7 +853,21 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! { pub static PARAM_CONFIG: ParamConfig = ParamConfig { params_field: "parameters", - param_node_kinds: &["identifier"], + // `identifier` covers bare params (`a`); `assignment_pattern` covers + // default-value params (`a = {}`). Without `assignment_pattern`, + // tree-sitter wraps the identifier in a node the param walker + // doesn't recognize, and `extract_param_meta` produces a + // parameter-less summary for any function whose params have + // defaults — breaking cross-function `param_to_sink` propagation + // for shapes like `(emailOptions = {}, emailTemplate = {}, data = {}) => …`. + // `object_pattern` covers destructured object formals (`({ a, b })`), + // which tree-sitter-javascript exposes as a direct child of + // `formal_parameters` (no `required_parameter` wrapper as in TS). + // Without it the per-parameter probe never seeds the destructured + // bindings and summary extraction misses `validated_params_to_return` + // for shapes like `({ value }) => { validate(value); ... }` — + // residual gap behind CVE-2026-25544. 
+ param_node_kinds: &["identifier", "assignment_pattern", "object_pattern"], self_param_kinds: &[], ident_fields: &["name", "pattern"], }; diff --git a/src/labels/mod.rs b/src/labels/mod.rs index 26d58b33..9f5a378f 100644 --- a/src/labels/mod.rs +++ b/src/labels/mod.rs @@ -2166,6 +2166,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Echo], + inspected_langs: std::collections::HashSet::new(), }; let rules = go::framework_rules(&ctx); let extras = rules.to_vec(); @@ -2194,6 +2195,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Koa], + inspected_langs: std::collections::HashSet::new(), }; let extras = javascript::framework_rules(&ctx); @@ -2224,6 +2226,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Fastify], + inspected_langs: std::collections::HashSet::new(), }; let extras = typescript::framework_rules(&ctx); @@ -2250,6 +2253,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Sinatra], + inspected_langs: std::collections::HashSet::new(), }; let rules = ruby::framework_rules(&ctx); let extras = rules.to_vec(); @@ -2274,6 +2278,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Axum], + inspected_langs: std::collections::HashSet::new(), }; let extras = rust::framework_rules(&ctx); @@ -2304,6 +2309,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::ActixWeb], + inspected_langs: std::collections::HashSet::new(), }; let extras = rust::framework_rules(&ctx); @@ -2327,6 +2333,7 @@ mod tests { let ctx = FrameworkContext { frameworks: vec![DetectedFramework::Rocket], + inspected_langs: std::collections::HashSet::new(), }; let extras = rust::framework_rules(&ctx); diff --git a/src/server/debug.rs b/src/server/debug.rs index 46bf3ee0..63b66fae 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1873,6 +1873,7 @@ function consume() { field_points_to: Default::default(), 
return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); diff --git a/src/server/routes/debug.rs b/src/server/routes/debug.rs index de913e90..a9c5540e 100644 --- a/src/server/routes/debug.rs +++ b/src/server/routes/debug.rs @@ -445,6 +445,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, )], @@ -663,6 +664,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, )], diff --git a/src/state/transfer.rs b/src/state/transfer.rs index 9c98512d..d6596cc7 100644 --- a/src/state/transfer.rs +++ b/src/state/transfer.rs @@ -314,8 +314,31 @@ impl DefaultTransfer<'_> { } // ── Resource acquire ───────────────────────────────────────────── + // SAFE-FOR-FIELD-LHS (Go only): skip member-expression LHS + // acquires. A `b.cpuprof = os.Create(...)` pattern transfers + // ownership to the containing struct; the local function body + // cannot observe the closure (which lives in a paired + // Stop()/dispose() method), so tracking `b.cpuprof` as a local + // resource is a guaranteed FP at function exit. Mirrors the + // gate in src/cfg_analysis/resources.rs::run. Production + // trigger: prometheus cmd/promtool/tsdb.go::startProfiling + // cluster (b.cpuprof, b.memprof, b.blockprof, b.mtxprof). + // Restricted to Go because TS/JS class-field acquires + // (`this.fd = fs.openSync(...)`) are still expected to be + // tracked — the leak fixtures rely on it. 
let mut direct_acquire = false; - for pair in self.resource_pairs { + let define_is_field_lhs = self.lang == Lang::Go + && info + .taint + .defines + .as_deref() + .is_some_and(|d| d.contains('.')); + let resource_pairs_iter: &[ResourcePair] = if define_is_field_lhs { + &[] + } else { + self.resource_pairs + }; + for pair in resource_pairs_iter { let is_acquire = pair.acquire.iter().any(|a| callee_matches(&callee, a)); let is_excluded = pair .exclude_acquire @@ -369,6 +392,50 @@ impl DefaultTransfer<'_> { } } + // INNER-CALL-RELEASE-IN-ARG: walk info.arg_callees so a release + // method that lives in argument position is still observed. + // Production triggers: `require.NoError(t, f.Close())` (Go + // testify), `errs = append(errs, f.Close())`, JUnit + // `assertEquals(0, in.read())`. Conservative: bare-receiver + // inner calls only (recv has no dot — chained-receiver + // releases are owned by chain_proxies which doesn't observe + // inner-call positions today); marks CLOSED only (no + // DoubleClose since attribution is approximate); respects + // in_defer for symmetry with the direct-release branch above. 
+ if !info.in_defer && !info.arg_callees.is_empty() { + for arg_callee in &info.arg_callees { + let Some(arg_callee_text) = arg_callee.as_deref() else { + continue; + }; + let Some((recv_text, _method)) = try_chain_decompose(arg_callee_text) else { + continue; + }; + if recv_text.contains('.') { + continue; + } + let arg_callee_lower = arg_callee_text.to_ascii_lowercase(); + let matches_release = self.resource_pairs.iter().any(|p| { + p.release + .iter() + .any(|r| callee_matches(&arg_callee_lower, r)) + }); + if !matches_release { + continue; + } + let Some(sym) = self.get_sym(info, recv_text) else { + continue; + }; + if released.contains(&sym) { + continue; + } + let current = state.resource.get(sym); + if current.contains(ResourceLifecycle::OPEN) { + state.resource.set(sym, ResourceLifecycle::CLOSED); + released.push(sym); + } + } + } + // ── Resource method proxy ──────────────────────────────────────── // When no direct resource pair matched, check if the callee is a // method wrapper for a known resource operation. 
@@ -1985,4 +2052,187 @@ mod tests { assert_eq!(state.receiver_class_group.get(&sym_f), Some(&class_group)); assert!(state.chain_proxies.is_empty()); } + + #[test] + fn inner_call_release_in_arg_marks_closed() { + let mut interner = SymbolInterner::new(); + let sym_f = interner.intern_scoped(None, "f"); + + let transfer = DefaultTransfer { + lang: Lang::Go, + resource_pairs: rules::resource_pairs(Lang::Go), + interner: &interner, + resource_method_summaries: &[], + ptr_proxy_hints: None, + }; + + let mut state = ProductState::initial(); + state.resource.set(sym_f, ResourceLifecycle::OPEN); + + let info = NodeInfo { + kind: StmtKind::Call, + ast: AstMeta { + span: (0, 30), + ..Default::default() + }, + taint: TaintMeta { + uses: vec!["t".into(), "f".into()], + ..Default::default() + }, + call: CallMeta { + callee: Some("require.NoError".into()), + ..Default::default() + }, + arg_callees: vec![None, Some("f.Close".into())], + ..Default::default() + }; + + let (state, events) = transfer.apply(NodeIndex::new(0), &info, None, state); + assert!(events.is_empty()); + assert_eq!(state.resource.get(sym_f), ResourceLifecycle::CLOSED); + } + + #[test] + fn inner_call_release_in_arg_chained_receiver_skipped() { + let mut interner = SymbolInterner::new(); + let sym_c = interner.intern_scoped(None, "c"); + + let transfer = DefaultTransfer { + lang: Lang::Go, + resource_pairs: rules::resource_pairs(Lang::Go), + interner: &interner, + resource_method_summaries: &[], + ptr_proxy_hints: None, + }; + + let mut state = ProductState::initial(); + state.resource.set(sym_c, ResourceLifecycle::OPEN); + + let info = NodeInfo { + kind: StmtKind::Call, + ast: AstMeta { + span: (0, 30), + ..Default::default() + }, + taint: TaintMeta { + uses: vec!["c".into()], + ..Default::default() + }, + call: CallMeta { + callee: Some("t.Helper".into()), + ..Default::default() + }, + arg_callees: vec![Some("c.mu.Unlock".into())], + ..Default::default() + }; + + let (state, _) = 
transfer.apply(NodeIndex::new(0), &info, None, state); + assert_eq!(state.resource.get(sym_c), ResourceLifecycle::OPEN); + } + + #[test] + fn inner_call_release_in_arg_respects_in_defer() { + let mut interner = SymbolInterner::new(); + let sym_f = interner.intern_scoped(None, "f"); + + let transfer = DefaultTransfer { + lang: Lang::Go, + resource_pairs: rules::resource_pairs(Lang::Go), + interner: &interner, + resource_method_summaries: &[], + ptr_proxy_hints: None, + }; + + let mut state = ProductState::initial(); + state.resource.set(sym_f, ResourceLifecycle::OPEN); + + let info = NodeInfo { + kind: StmtKind::Call, + ast: AstMeta { + span: (0, 30), + ..Default::default() + }, + taint: TaintMeta { + uses: vec!["f".into()], + ..Default::default() + }, + call: CallMeta { + callee: Some("log.Print".into()), + ..Default::default() + }, + arg_callees: vec![Some("f.Close".into())], + in_defer: true, + ..Default::default() + }; + + let (state, _) = transfer.apply(NodeIndex::new(0), &info, None, state); + assert_eq!(state.resource.get(sym_f), ResourceLifecycle::OPEN); + } + + #[test] + fn member_field_lhs_acquire_skips_resource_state() { + let interner = SymbolInterner::new(); + + let transfer = DefaultTransfer { + lang: Lang::Go, + resource_pairs: rules::resource_pairs(Lang::Go), + interner: &interner, + resource_method_summaries: &[], + ptr_proxy_hints: None, + }; + + let info = NodeInfo { + kind: StmtKind::Call, + ast: AstMeta { + span: (0, 30), + ..Default::default() + }, + taint: TaintMeta { + defines: Some("b.cpuprof".into()), + ..Default::default() + }, + call: CallMeta { + callee: Some("os.Create".into()), + ..Default::default() + }, + ..Default::default() + }; + + let (state, _) = transfer.apply(NodeIndex::new(0), &info, None, ProductState::initial()); + assert!(state.resource.vars.is_empty()); + } + + #[test] + fn bare_ident_lhs_acquire_still_tracks() { + let mut interner = SymbolInterner::new(); + let sym_f = interner.intern_scoped(None, "f"); + + let transfer 
= DefaultTransfer { + lang: Lang::Go, + resource_pairs: rules::resource_pairs(Lang::Go), + interner: &interner, + resource_method_summaries: &[], + ptr_proxy_hints: None, + }; + + let info = NodeInfo { + kind: StmtKind::Call, + ast: AstMeta { + span: (0, 30), + ..Default::default() + }, + taint: TaintMeta { + defines: Some("f".into()), + ..Default::default() + }, + call: CallMeta { + callee: Some("os.Open".into()), + ..Default::default() + }, + ..Default::default() + }; + + let (state, _) = transfer.apply(NodeIndex::new(0), &info, None, ProductState::initial()); + assert!(state.resource.get(sym_f).contains(ResourceLifecycle::OPEN)); + } } diff --git a/src/summary/ssa_summary.rs b/src/summary/ssa_summary.rs index 142bf87c..fa6b6cfc 100644 --- a/src/summary/ssa_summary.rs +++ b/src/summary/ssa_summary.rs @@ -325,6 +325,28 @@ pub struct SsaFuncSummary { /// can be joined by ordinal at call-graph build time. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub typed_call_receivers: Vec<(u32, String)>, + /// Parameter indices whose taint flow to the return value is fully + /// validated by a dominating predicate (regex allowlist, type check, + /// validation call, etc.) on every return path inside the function. + /// + /// At a call site, each tainted argument passed to a position in + /// this list — and the call's own return value — are marked + /// `validated_must` / `validated_may` in the caller's SSA taint + /// state, the same way an inline `if (!regex.test(x)) throw` would + /// validate the surviving branch. Sound because the call only + /// returns normally on the validating arm; if validation failed, + /// control would not reach the post-call instruction. + /// + /// Populated by + /// [`crate::taint::ssa_transfer::summary_extract::extract_ssa_func_summary`] + /// when a per-parameter probe shows the parameter's `var_name` in + /// `validated_must` at every return block of the helper. 
Empty + /// (the default) for helpers that do not validate any parameter. + /// Closes the validated-flow propagation gap that left + /// CVE-2026-25544 (Payload `sanitizeValue` SQL injection) detecting + /// on both vulnerable and patched code. + #[serde(default, skip_serializing_if = "SmallVec::is_empty")] + pub validated_params_to_return: SmallVec<[usize; 2]>, } /// A per-return-path [`PathFact`] entry. diff --git a/src/summary/tests.rs b/src/summary/tests.rs index 6c3e6bed..5e6deceb 100644 --- a/src/summary/tests.rs +++ b/src/summary/tests.rs @@ -441,6 +441,7 @@ fn ssa_summary_serde_round_trip_identity() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -474,6 +475,7 @@ fn ssa_summary_serde_round_trip_strip_bits() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -504,6 +506,7 @@ fn ssa_summary_serde_round_trip_add_bits() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -541,6 +544,7 @@ fn ssa_summary_serde_round_trip_all_variants() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -580,6 +584,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() { field_points_to: Default::default(), return_path_facts: 
smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; gs.insert_ssa(key.clone(), v1.clone()); @@ -607,6 +612,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; gs.insert_ssa(key.clone(), v2.clone()); @@ -654,6 +660,7 @@ fn global_summaries_merge_with_ssa_entries() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let sum_b = SsaFuncSummary { @@ -677,6 +684,7 @@ fn global_summaries_merge_with_ssa_entries() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; @@ -724,6 +732,7 @@ fn global_summaries_is_empty_considers_ssa() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -754,6 +763,7 @@ fn ssa_summary_serde_round_trip_param_to_sink_param() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -799,6 +809,7 @@ fn ssa_summary_serde_round_trip_container_fields() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = 
serde_json::to_string(&summary).unwrap(); @@ -854,6 +865,7 @@ fn ssa_summary_serde_round_trip_return_abstract() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); @@ -1375,6 +1387,7 @@ fn global_summaries_resolve_body_requires_body_present() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -3519,6 +3532,7 @@ fn cf4_return_path_transform_serde_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); diff --git a/src/symex/transfer.rs b/src/symex/transfer.rs index 7b04289c..2acc19d5 100644 --- a/src/symex/transfer.rs +++ b/src/symex/transfer.rs @@ -1593,6 +1593,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -1662,6 +1663,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -1731,6 +1733,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -1795,6 +1798,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: 
vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -1859,6 +1863,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2057,6 +2062,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2136,6 +2142,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2216,6 +2223,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2246,6 +2254,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2276,6 +2285,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2355,6 +2365,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); @@ -2436,6 +2447,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), 
param_to_gate_filters: vec![], }, ); @@ -2465,6 +2477,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + validated_params_to_return: smallvec::SmallVec::new(), param_to_gate_filters: vec![], }, ); diff --git a/src/taint/mod.rs b/src/taint/mod.rs index 25df5e7b..8e66afe2 100644 --- a/src/taint/mod.rs +++ b/src/taint/mod.rs @@ -1406,6 +1406,7 @@ pub(crate) fn extract_intra_file_ssa_summaries( mod_aliases_ref, None, Some(&formal_params), + None, ); // Only store if the summary has observable effects. With @@ -1531,6 +1532,11 @@ pub(crate) fn lower_all_functions_from_bodies( } else { Some(&mod_aliases) }; + let formal_destructured = if !body.meta.param_destructured_fields.is_empty() { + Some(body.meta.param_destructured_fields.as_slice()) + } else { + None + }; let summary = ssa_transfer::extract_ssa_func_summary( &func_ssa, &body.graph, @@ -1543,6 +1549,7 @@ pub(crate) fn lower_all_functions_from_bodies( mod_aliases_ref, locator, Some(formal_params), + formal_destructured, ); // Always insert the summary, even when all fields are empty/default. @@ -1775,6 +1782,11 @@ fn rerun_extraction_with_augmented_summaries( Some(&mod_aliases) }; + let formal_destructured = if !body.meta.param_destructured_fields.is_empty() { + Some(body.meta.param_destructured_fields.as_slice()) + } else { + None + }; let new_summary = ssa_transfer::extract_ssa_func_summary_full( &callee.ssa, parent_cfg, @@ -1788,6 +1800,7 @@ fn rerun_extraction_with_augmented_summaries( locator, Some(&body.meta.params), Some(&augmented_snapshot), + formal_destructured, ); // OR-merge sink-only fields into the existing summary. @@ -1796,8 +1809,16 @@ fn rerun_extraction_with_augmented_summaries( } } -/// OR-merge `param_to_sink` and `param_to_sink_param` from `src` into -/// `dst`. Existing entries are preserved; only NEW entries are added. 
+/// OR-merge `param_to_sink`, `param_to_sink_param`, and +/// `validated_params_to_return` from `src` into `dst`. Existing entries +/// are preserved; only NEW entries are added. +/// +/// The validated-param list grows monotonically across extraction +/// rounds: a parameter that proves validated under any extraction +/// pass (the augmented second pass typically resolves more +/// cross-function summaries than the first) stays validated. Drops +/// here would silently lose CVE-2026-25544-class precision the +/// re-extraction pass was specifically designed to recover. fn merge_sink_fields( dst: &mut crate::summary::ssa_summary::SsaFuncSummary, src: &crate::summary::ssa_summary::SsaFuncSummary, @@ -1823,6 +1844,11 @@ fn merge_sink_fields( dst.param_to_sink_param.push((idx, pos, caps)); } } + for &idx in &src.validated_params_to_return { + if !dst.validated_params_to_return.contains(&idx) { + dst.validated_params_to_return.push(idx); + } + } } /// Walk lexical-containment children of every parent body and lift diff --git a/src/taint/path_state.rs b/src/taint/path_state.rs index c655f48a..37f62260 100644 --- a/src/taint/path_state.rs +++ b/src/taint/path_state.rs @@ -377,6 +377,24 @@ pub fn classify_condition(text: &str) -> PredicateKind { return PredicateKind::ValidationCall; } + // Regex / pattern allowlist `.test(value)` / `.match(value)` calls + // where the receiver name carries a regex or pattern marker. The + // standard JS / TS / Python / Java / Ruby / Go regex APIs all expose a + // boolean test method; the success arm (true) means `value` matches the + // pattern. Conservative on receiver names so non-regex methods like + // `obj.test(x)` (test runner), `db.test(...)` (test column) etc. don't + // get pulled in. Motivated by Payload CVE-2026-25544 + // (`if (!SAFE_STRING_REGEX.test(value)) throw …;`). 
+ if (bare == "test" || bare == "match" || bare == "matches") + && let Some(dot_pos) = callee_part.rfind('.') + { + let receiver = &callee_part[..dot_pos]; + let receiver_lower = receiver.to_ascii_lowercase(); + if receiver_lower.contains("regex") || receiver_lower.contains("pattern") { + return PredicateKind::ValidationCall; + } + } + // Sanitizer if bare.contains("sanitiz") || bare.contains("escape") || bare.contains("encode") { return PredicateKind::SanitizerCall; @@ -638,6 +656,19 @@ fn extract_validation_target(text: &str) -> Option<String> { // Check for method call pattern: `x.method(...)` or `x.method_name(...)` if let Some(dot_pos) = callee_part.rfind('.') { let receiver = callee_part[..dot_pos].trim(); + let method = callee_part[dot_pos + 1..].trim().to_ascii_lowercase(); + // Regex-allowlist `.test(value)` / `.match(value)` / `.matches(value)`: + // the validated target is the call's first argument, not the regex + // receiver. Without this special case, branch narrowing would mark + // the regex itself as validated and leave the user input alone. + if matches!(method.as_str(), "test" | "match" | "matches") + && let Some(first_arg) = first_call_arg(args_part) + { + let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim(); + if !first_arg.is_empty() && is_identifier(first_arg) { + return Some(first_arg.to_string()); + } + } if !receiver.is_empty() && is_identifier(receiver) { return Some(receiver.to_string()); } @@ -977,6 +1008,33 @@ mod tests { assert_eq!(target.as_deref(), Some("x")); } + /// Regex `.test(value)` should classify as ValidationCall and the + /// validated target should be the call argument, not the regex + /// receiver. Pinned because the receiver-as-target heuristic is the + /// default for method calls. Motivated by Payload CVE-2026-25544 + /// (`if (!SAFE_STRING_REGEX.test(value)) throw …;`).
+ #[test] + fn target_regex_test_first_arg() { + let (kind, target) = classify_condition_with_target("!SAFE_STRING_REGEX.test(value)"); + assert_eq!(kind, PredicateKind::ValidationCall); + assert_eq!(target.as_deref(), Some("value")); + } + + #[test] + fn target_regex_test_pattern_receiver() { + let (kind, target) = classify_condition_with_target("ALLOWED_PATTERN.test(s)"); + assert_eq!(kind, PredicateKind::ValidationCall); + assert_eq!(target.as_deref(), Some("s")); + } + + /// Receiver name without a regex/pattern marker should NOT be pulled + /// in as a validator: `obj.test(x)` is a test runner, not a regex. + #[test] + fn target_test_non_regex_receiver_is_not_validation() { + let kind = classify_condition("obj.test(value)"); + assert_eq!(kind, PredicateKind::Unknown); + } + #[test] fn target_comparison_extracts_identifier_side() { let (kind, target) = classify_condition_with_target("x == 5"); diff --git a/src/taint/ssa_transfer/mod.rs b/src/taint/ssa_transfer/mod.rs index ff07c614..a4b3d7e9 100644 --- a/src/taint/ssa_transfer/mod.rs +++ b/src/taint/ssa_transfer/mod.rs @@ -3499,7 +3499,21 @@ pub(super) fn transfer_inst( // `ssa/lower.rs`), which inflates `args.len()` beyond the real // positional arity. The CFG's `arg_uses` is the authoritative // positional-arg list. - let arity_hint = info.call.arg_uses.len(); + // + // Fallback: certain TypeScript call shapes — notably calls + // inside template-string substitutions (`${fn(arg)}`) — get + // their `arg_uses` dropped by CFG lowering even though the + // call's positional `args` are intact. When that happens + // the strict `Some(0)` arity hint silently fails to match + // any callee that takes ≥1 arg, swallowing summary + // resolution. Detect the asymmetry and pass `None` so + // `resolve_local_func_key_query`'s unique-name fallback + // can still pick up the lone candidate. 
+ let arity_hint = if info.call.arg_uses.is_empty() && !args.is_empty() { + None + } else { + Some(info.call.arg_uses.len()) + }; // Type-aware resolution: when the SSA receiver value has a // known abstract type (HttpClient, URL, …), feed that into // the resolver as an authoritative `receiver_type`. This @@ -3511,7 +3525,7 @@ pub(super) fn transfer_inst( callee, caller_func, info.call.call_ordinal, - Some(arity_hint), + arity_hint, *receiver, ); @@ -3627,6 +3641,43 @@ pub(super) fn transfer_inst( env.refine(inst.value, &fact); } } + + // Validated-flow propagation through callee summaries. + // + // Runs regardless of whether inline analysis already + // resolved the call: inline analysis re-runs the + // callee's taint with caller-side seeds but does not + // surface the callee's symbol-keyed + // `validated_must` / `validated_may` state into the + // caller, so the summary-level signal is the only + // channel for propagating helper-validation across + // a function boundary. + // + // When the callee's body validates a parameter on + // every return path that carries the param's caps + // (regex allowlist, type check, validation call, …), + // a normal-returning call site is the validating arm + // by construction: control could not reach the + // post-call instruction unless the helper's + // predicate(s) accepted the argument. Mark each + // tainted argument's `var_name` and the call's + // result `var_name` in the caller's + // `validated_must` / `validated_may` sets so + // subsequent sinks observe `all_validated = true`, + // the same way an inline `if (!regex.test(x)) throw` + // validates the surviving branch. Closes the + // helper-validator propagation gap surfaced by + // CVE-2026-25544 (Payload `sanitizeValue` SQLi). 
+ if !resolved.validated_params_to_return.is_empty() { + propagate_validated_params_to_return( + inst, + args, + ssa, + transfer.interner, + state, + &resolved.validated_params_to_return, + ); + } } // When find_classifiable_inner_call overrides the callee (e.g. @@ -3640,7 +3691,7 @@ pub(super) fn transfer_inst( oc, caller_func, info.call.call_ordinal, - Some(arity_hint), + arity_hint, ) { if resolved_container_to_return.is_empty() { resolved_container_to_return = @@ -3735,6 +3786,24 @@ pub(super) fn transfer_inst( if !aggregate_sanitizer_applied { return_bits &= !resolved.sanitizer_caps; } + + // Validated-flow propagation through callee summaries. + // + // When the callee's body validates a parameter on every + // return path (regex allowlist, type check, validation + // call, etc. — see + // [`crate::summary::ssa_summary::SsaFuncSummary::validated_params_to_return`]), + // a normal-returning call site is the validating arm by + // construction: control could not reach the post-call + // instruction unless the helper's predicate(s) accepted + // the argument. Mark each tainted argument's `var_name` + // and the call's result `var_name` in the caller's + // `validated_must` / `validated_may` sets so subsequent + // sinks observe `all_validated = true`, the same way an + // inline `if (!regex.test(x)) throw` validates the + // surviving branch. Closes the helper-validator + // propagation gap surfaced by CVE-2026-25544 (Payload + // `sanitizeValue` SQLi). } // Type-qualified receiver resolution: when normal callee resolution @@ -4236,7 +4305,7 @@ pub(super) fn transfer_inst( oc, caller_func, info.call.call_ordinal, - Some(arity_hint), + arity_hint, ) { if !oc_sum.propagates_taint && oc_sum.source_caps.is_empty() { // Outer callee blocks taint: no param→return flow, @@ -6301,6 +6370,60 @@ fn collect_args_taint( /// [`Cap::UNAUTHORIZED_ID`], ownership/membership guards prove on /// inputs rather than the return value. Other caps and origins are /// untouched. 
+/// Apply [`SsaFuncSummary::validated_params_to_return`] at a call site. +/// +/// For each parameter index `p` in `validated_params`, mark the +/// `var_name` of every tainted SSA value at `args[p]` and the call's +/// own result `inst.value` in the caller's `validated_must` / +/// `validated_may` sets. Mirrors the symbol-keyed validation a direct +/// `if (!regex.test(x)) throw` would set on the surviving branch. +/// +/// Sound because the callee summary records `validated_params_to_return` +/// only when the param's `var_name` is in `validated_must` at *every* +/// return block — a normal-returning call therefore proves the +/// validating arm. No-op when no actual argument is tainted (avoids +/// spuriously validating untouched names downstream). +fn propagate_validated_params_to_return( + inst: &SsaInst, + args: &[SmallVec<[SsaValue; 2]>], + ssa: &SsaBody, + interner: &crate::state::symbol::SymbolInterner, + state: &mut SsaTaintState, + validated_params: &[usize], +) { + let mark = |val: SsaValue, st: &mut SsaTaintState| { + let Some(name) = ssa + .value_defs + .get(val.0 as usize) + .and_then(|vd| vd.var_name.as_deref()) + else { + return; + }; + let Some(sym) = interner.get(name) else { + return; + }; + st.validated_must.insert(sym); + st.validated_may.insert(sym); + }; + + let mut any_arg_tainted = false; + for &p in validated_params { + let Some(arg_vals) = args.get(p) else { + continue; + }; + for &v in arg_vals { + if state.get(v).is_some_and(|t| !t.caps.is_empty()) { + any_arg_tainted = true; + mark(v, state); + } + } + } + + if any_arg_tainted { + mark(inst.value, state); + } +} + fn strip_cap_from_call_args( args: &[SmallVec<[SsaValue; 2]>], receiver: &Option, @@ -8676,6 +8799,14 @@ struct ResolvedSummary { /// `field_points_to` records. Applied at the caller call site by /// `apply_field_points_to_writes`. 
field_points_to: crate::summary::points_to::FieldPointsToSummary, + /// Parameter indices whose taint flow to the return is fully + /// validated by a dominating predicate inside the callee on every + /// return path. Mirrors + /// [`crate::summary::ssa_summary::SsaFuncSummary::validated_params_to_return`]. + /// Populated only via `convert_ssa_to_resolved`; other resolution + /// paths leave it empty (label / coarse-FuncSummary forms cannot + /// express per-path predicate validation). + validated_params_to_return: Vec<usize>, } fn resolve_callee( @@ -8825,6 +8956,7 @@ fn resolve_callee_full( points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], }); } // Try label classification for the bound function (by leaf name). @@ -8896,6 +9028,7 @@ fn resolve_callee_full( points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], }); } } @@ -9041,6 +9174,7 @@ fn resolve_callee_full( points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], }); } } else { @@ -9091,6 +9225,7 @@ fn resolve_callee_full( points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], }; match widened.len() { 0 => {} @@ -9162,6 +9297,7 @@ fn resolve_callee_full( points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], }); } } @@ -9344,6 +9480,7 @@ fn convert_ssa_to_resolved_for_caller( points_to: ssa_sum.points_to.clone(), field_points_to: ssa_sum.field_points_to.clone(), param_to_gate_filters: ssa_sum.param_to_gate_filters.clone(), + validated_params_to_return: ssa_sum.validated_params_to_return.to_vec(), } } diff --git a/src/taint/ssa_transfer/summary_extract.rs b/src/taint/ssa_transfer/summary_extract.rs index
724bbea2..2a995d57 100644 --- a/src/taint/ssa_transfer/summary_extract.rs +++ b/src/taint/ssa_transfer/summary_extract.rs @@ -50,6 +50,7 @@ pub fn extract_ssa_func_summary( module_aliases: Option<&HashMap>>, locator: Option<&crate::summary::SinkSiteLocator<'_>>, formal_param_names: Option<&[String]>, + formal_destructured_fields: Option<&[Vec<String>]>, ) -> crate::summary::ssa_summary::SsaFuncSummary { extract_ssa_func_summary_full( ssa, @@ -64,6 +65,7 @@ pub fn extract_ssa_func_summary( locator, formal_param_names, None, + formal_destructured_fields, ) } @@ -93,6 +95,15 @@ pub fn extract_ssa_func_summary_full( ssa_summaries: Option< &HashMap, >, + // Per-parameter destructured-binding sibling names. Entry `i` is + // the list of field names destructured by the same call-site arg + // slot as the primary `formal_param_names[i]`, excluding the + // primary name. Empty vec for non-destructured params; `None` for + // callers that don't carry destructure info (legacy / test paths). + // Drives the destructured-arg expansion in the per-param probe so + // taint flow through sibling bindings is visible to summary + // extraction (CVE-2026-25544 / @payloadcms/drizzle SQLi). + formal_destructured_fields: Option<&[Vec<String>]>, ) -> crate::summary::ssa_summary::SsaFuncSummary { use crate::summary::SinkSite; use crate::summary::ssa_summary::{SsaFuncSummary, TaintTransform}; @@ -159,13 +170,32 @@ pub fn extract_ssa_func_summary_full( /// Inner [`PathFact`] when the rv on this path is a one-arg /// variant constructor; [`None`] otherwise. variant_inner_fact: Option, + /// `true` when the per-param probe's seeded parameter var_name + /// is in this return block's exit `validated_must`. `false` + /// for the baseline (no-seed) probe and for params not + /// validated on this path. Drives + /// `validated_params_to_return` summary extraction.
+ param_validated_must: bool, } // Helper: run a taint probe with a given global_seed and return // the aggregate return caps, sink events, joined return abstract, // and the per-return-block observation list used to derive // per-return-path transforms. - let run_probe = |seed: HashMap| -> ( + // + // `probe_param_names` lists the seeded parameter's `var_name` + // plus any destructured-binding siblings sharing the slot + // (`None` for the baseline source-caps probe). When non-empty, + // each return-block observation records whether ANY of those + // names is in the exit state's `validated_must`, which feeds + // `validated_params_to_return` summary extraction below. The + // any-name semantics matches the slot-wide model: a destructured + // formal `({ a, b, c })` represents one call-site slot, and any + // sibling reaching `validated_must` proves the slot's caps were + // narrowed before reaching the return. + let run_probe = |seed: HashMap, + probe_param_names: Option<&[&str]>| + -> ( Cap, Vec, Option, @@ -313,6 +343,13 @@ pub fn extract_ssa_func_summary_full( // The hash is stable across runs for a given predicate // shape so call sites can compare paths deterministically. let (predicate_hash, known_true, known_false) = summarise_return_predicates(&exit); + let param_validated_must = match probe_param_names { + Some(names) => names.iter().any(|name| match interner.get(name) { + Some(sym) => exit.validated_must.contains(sym), + None => false, + }), + None => false, + }; per_return.push(ReturnBlockObs { derived_caps: block_derived_caps, param_caps: block_param_caps, @@ -322,6 +359,7 @@ pub fn extract_ssa_func_summary_full( abstract_value: block_abs, path_fact: block_path_fact, variant_inner_fact: block_variant_inner, + param_validated_must, }); } } @@ -343,7 +381,7 @@ pub fn extract_ssa_func_summary_full( // Abstract values don't depend on taint seeding, so the baseline probe // captures the function's intrinsic abstract return value. 
let (baseline_return_caps, _baseline_events, return_abstract, baseline_obs) = - run_probe(HashMap::new()); + run_probe(HashMap::new(), None); let source_caps = baseline_return_caps; // Per-return-path PathFact decomposition derived from the baseline @@ -403,6 +441,12 @@ pub fn extract_ssa_func_summary_full( usize, SmallVec<[crate::summary::ssa_summary::ReturnPathTransform; 2]>, )> = Vec::new(); + // Parameter indices whose taint flow to the return is fully + // validated by a dominating predicate on every return path. + // Populated below by checking each per-param probe's return-block + // exit states for `validated_must` containing the param's + // var_name. Empty when no parameter is validated. + let mut validated_params_to_return: SmallVec<[usize; 2]> = SmallVec::new(); for &(idx, ref var_name, _ssa_val) in &param_info { let mut seed = HashMap::new(); @@ -421,6 +465,37 @@ pub fn extract_ssa_func_summary_full( probe_taint.clone(), ); + // Destructured-arg sibling expansion. When the formal at slot + // `idx` destructures an object pattern (`({ column, operator, + // value })`), the SSA body emits a separate [`SsaOp::Param`] + // for every destructured binding (sequential indices > slot + // count, since the closure-capture pass treats them as + // free-identifier reads). The call-site only passes ONE arg + // for the slot, so the engine never seeds the sibling Param + // ops at runtime — but the per-parameter SUMMARY probe must + // model "if this slot is tainted then every binding it + // produced is tainted too". Seed each sibling's `var_name` + // with the same caps the primary received. The probe-level + // `validated_must` check below treats the slot as validated + // when ANY sibling lands in `validated_must` on a return path.
+ // + // Closes the residual gap behind CVE-2026-25544 (PayloadCMS + // `@payloadcms/drizzle` SQLi via `createJSONQuery({ value })`): + // the validator helper `sanitizeValue(value, operator)` lives + // inside the body and the probe needs to see `value` flow + // through the `validated_params_to_return` channel before + // suppressing the caller's sink. + let slot_siblings: &[String] = formal_destructured_fields + .and_then(|d| d.get(idx)) + .map(|v| v.as_slice()) + .unwrap_or(&[]); + for sib in slot_siblings { + seed.insert( + BindingKey::new(sib.as_str(), BodyId(0)), + probe_taint.clone(), + ); + } + // Phantom-Param prefix seeding. SSA lowering of arrow / nested // function bodies often exposes free-identifier member-access // expressions (e.g. `file._source.uri`) as their own @@ -437,13 +512,18 @@ pub fn extract_ssa_func_summary_full( // `formal_var_name + "."` with the same caps the formal param // received: semantically "if `file` is tainted, then every // observable field path on `file` is tainted too". Bounded - // by SSA size; cap-equivalent to direct seeding. - let prefix = format!("{}.", var_name); + // by SSA size; cap-equivalent to direct seeding. Mirror this + // for each destructured sibling (`value.foo` / `column.name` + // member-projections inside the body). + let prefixes: Vec<String> = std::iter::once(var_name.clone()) + .chain(slot_siblings.iter().cloned()) + .map(|n| format!("{}.", n)) + .collect(); for block in &ssa.blocks { for inst in block.phis.iter().chain(block.body.iter()) { if let SsaOp::Param { .. } = &inst.op { if let Some(name) = inst.var_name.as_ref() { - if name.starts_with(&prefix) { + if prefixes.iter().any(|p| name.starts_with(p)) { seed.insert( BindingKey::new(name.as_str(), BodyId(0)), probe_taint.clone(), @@ -454,7 +534,15 @@ pub fn extract_ssa_func_summary_full( } } - let (return_caps, events, _, per_return_obs) = run_probe(seed); + // Build slot-wide name list for the validated_must check.
+ // Primary first, then siblings. Entries are borrowed `&str`s, + // not owned copies — `run_probe` only borrows for its inner loop. + let mut slot_names: Vec<&str> = Vec::with_capacity(1 + slot_siblings.len()); + slot_names.push(var_name.as_str()); + for sib in slot_siblings { + slot_names.push(sib.as_str()); + } + let (return_caps, events, _, per_return_obs) = run_probe(seed, Some(slot_names.as_slice())); // Subtract baseline source_caps, we only want param-contributed caps let param_return_caps = return_caps & !source_caps; @@ -469,6 +557,44 @@ pub fn extract_ssa_func_summary_full( param_to_return.push((idx, transform)); } + // Validated-param-to-return detection. + // + // When the per-param probe shows that the parameter's + // `var_name` is in `validated_must` on every return path that + // *carries the parameter's contributed caps*, record the + // parameter as validated. The caller will mark each tainted + // argument passed to this position — and the call's own + // return value — as `validated_must` / `validated_may`, the + // same way an inline `if (!regex.test(x)) throw` would + // validate the surviving branch. + // + // Conservative gating: + // * Skip when the param contributes no caps to the return, + // a degenerate "validated but irrelevant" record. + // * Skip when no return block was observed (probes that + // diverged or hit `MAX_PROBE_PARAMS`). + // * Require validation on every return path that *carries + // param caps to the return*. Branches that return + // constants (e.g. `if (x === null) return 'NULL'`) carry + // no param taint and don't need a validation predicate. + // * Require ≥1 path that actually validates the param.
+ if !param_return_caps.is_empty() && !per_return_obs.is_empty() { + let mut any_carrying_path = false; + let all_carrying_validated = per_return_obs.iter().all(|obs| { + let carries = !(obs.derived_caps & !source_caps).is_empty() + || !(obs.param_caps & !source_caps).is_empty(); + if carries { + any_carrying_path = true; + obs.param_validated_must + } else { + true + } + }); + if any_carrying_path && all_carrying_validated { + validated_params_to_return.push(idx); + } + } + // Derive per-return-path decomposition. For each // observed return block, derive a `ReturnPathTransform` mirroring // the aggregate logic (prefer derived caps, fall back to param @@ -694,6 +820,7 @@ pub fn extract_ssa_func_summary_full( // extractor itself doesn't carry receiver-type info, the // caller patches it in. typed_call_receivers: Vec::new(), + validated_params_to_return, } } diff --git a/src/taint/ssa_transfer/tests.rs b/src/taint/ssa_transfer/tests.rs index 18cfd491..46be20db 100644 --- a/src/taint/ssa_transfer/tests.rs +++ b/src/taint/ssa_transfer/tests.rs @@ -1641,6 +1641,7 @@ mod fanout_merge_tests { points_to: Default::default(), field_points_to: Default::default(), param_to_gate_filters: vec![], + validated_params_to_return: vec![], } } diff --git a/src/taint/tests.rs b/src/taint/tests.rs index f99ecb40..e68b942b 100644 --- a/src/taint/tests.rs +++ b/src/taint/tests.rs @@ -4331,6 +4331,7 @@ fn ssa_summary_identity_propagation() { None, None, None, + None, ); assert!( !summary.param_to_return.is_empty(), @@ -4394,6 +4395,7 @@ fn ssa_summary_sanitizer_strips_bits() { None, None, None, + None, ); // Sanitizer should strip some bits for (_, transform) in &summary.param_to_return { @@ -4450,6 +4452,7 @@ fn ssa_summary_source_adds_bits() { None, None, None, + None, ); assert!( !summary.source_caps.is_empty(), @@ -4506,6 +4509,7 @@ fn ssa_summary_param_to_sink() { None, None, None, + None, ); assert!( !summary.param_to_sink.is_empty(), @@ -6122,6 +6126,61 @@ async function 
handler(req) { ); } +/// Regex-allowlist `.test(value)` is recognised as a ValidationCall +/// targeting the call's first argument (not the regex receiver). +/// +/// Shape: +/// +/// ```js +/// const v = req.body.x; +/// if (!SAFE_REGEX.test(v)) { throw } +/// db.execute(v); // direct flow: should be silent +/// ``` +/// +/// `classify_condition` returns ValidationCall for the `*regex*.test()` +/// receiver shape (see `target_regex_test_first_arg` in path_state) and +/// `extract_validation_target` overrides the default receiver-as-target +/// rule to extract the call's first argument. Together with the +/// existing CFG-level negation handling in `compute_succ_states` the +/// false branch (continue) marks `v` as validated. +/// +/// Motivated by Payload CVE-2026-25544 +/// (`if (!SAFE_STRING_REGEX.test(value)) throw`). Note: this test pins +/// the direct-flow case; transitive validation through SSA-derived +/// values (e.g. template-literal concat of `v` into `sql`) is a deeper +/// gap tracked separately and not closed here. +#[test] +fn regex_test_allowlist_narrowing_clears_direct_flow() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +async function handler(req) { + const userValue = req.body.filter; + if (!SAFE_REGEX.test(userValue)) { + throw new Error('bad'); + } + return await db.execute(userValue); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "regex.test allowlist narrowing should suppress direct-flow finding; got {} finding(s): {findings:?}", + findings.len() + ); +} + /// Regression: `extract_ssa_func_summary` must skip `all_validated` /// events when populating `param_to_sink` / `param_to_sink_param`. 
/// @@ -6205,6 +6264,282 @@ async function handler(req) { ); } +/// Regression for CVE-2026-25544 deep fix +/// (`validated_params_to_return` summary field): a helper that +/// validates its parameter via a regex `.test(...)` allowlist and +/// returns a string derived from the validated parameter must +/// suppress the caller's downstream sink even when: +/// * the caller binds the call result to a fresh variable +/// (`const sql = sanitize(userValue)`), and +/// * the helper's return is a *derived* template literal, not a +/// pass-through of the parameter itself. +/// +/// Sound because the helper only returns normally on the validating +/// arm — control could not reach the post-call instruction unless +/// the regex accepted the argument. Pinned by +/// `propagate_validated_params_to_return` marking both the arg and +/// the call result `validated_must` / `validated_may` so the sink's +/// `all_validated` check fires. +#[test] +fn validated_params_to_return_suppresses_one_hop_helper_validator() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +const sanitize = (value) => { + if (!SAFE_REGEX.test(value)) throw new Error('bad'); + return `safe:${value}`; +}; + +async function handler(req) { + const userValue = req.body.filter; + const sql = sanitize(userValue); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "regex.test allowlist inside helper must suppress caller sink; got {} finding(s)", + findings.len() + ); +} + +/// Two-hop variant of +/// `validated_params_to_return_suppresses_one_hop_helper_validator`: +/// when the validator helper is itself wrapped by another helper +/// that interpolates the validator's return into a template literal, +/// summary 
extraction must still surface +/// `validated_params_to_return` on the *outer* helper. This pins +/// the second-pass re-extraction (via +/// `re_extract_summaries_with_augment_view`) plus the OR-merge of +/// `validated_params_to_return` in `merge_sink_fields`. +#[test] +fn validated_params_to_return_suppresses_two_hop_helper_validator() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +const sanitize = (value) => { + if (!SAFE_REGEX.test(value)) throw new Error('bad'); + return value; +}; + +const buildQuery = (value) => { + const s = sanitize(value); + return s + '!'; +}; + +async function handler(req) { + const userValue = req.body.filter; + const sql = buildQuery(userValue); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "two-hop helper-validator must propagate validated_params_to_return through both helpers; got {} finding(s)", + findings.len() + ); +} + +/// Companion to +/// `validated_params_to_return_suppresses_one_hop_helper_validator`: +/// same shape WITHOUT the regex.test guard inside the helper must +/// still fire. Asserts the validated-flow propagation does not +/// over-suppress when the helper does not actually validate. 
+#[test] +fn validated_params_to_return_does_not_suppress_unvalidated_helper() { + let src = br#" +const sanitize = (value) => { + return `safe:${value}`; +}; + +async function handler(req) { + const userValue = req.body.filter; + const sql = sanitize(userValue); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + !findings.is_empty(), + "helper without regex guard must still flag the caller sink", + ); +} + +/// Regression: per-parameter summary probe must seed every +/// destructured object-pattern sibling sharing a slot, not only the +/// primary name picked by `extract_param_meta`. Without this, a +/// helper that destructures its single argument as +/// `({ value }) => …` cannot have `validated_params_to_return = [0]` +/// proven, because the validator inside the body operates on the +/// `value` binding while the probe only seeded the primary `value` +/// (or any earlier sibling) of the object pattern. Closes the +/// residual blocker for CVE-2026-25544 (PayloadCMS Drizzle SQLi). 
+#[test] +fn validated_params_to_return_suppresses_destructured_object_arg_helper() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +const sanitize = (value) => { + if (!SAFE_REGEX.test(value)) throw new Error('bad'); + return value; +}; + +const buildQuery = ({ value }) => { + const s = sanitize(value); + return s + '!'; +}; + +async function handler(req) { + const userValue = req.body.filter; + const sql = buildQuery({ value: userValue }); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "destructured object-pattern arg with regex.test allowlist inside the helper must suppress caller sink; got {} finding(s)", + findings.len() + ); +} + +/// Regression: same coverage for TypeScript object-pattern formals +/// (`required_parameter > pattern: object_pattern`). TS exposes the +/// destructure under a wrapper required_parameter; JS exposes it as a +/// direct child of formal_parameters. Both paths must surface +/// destructured siblings to the per-parameter probe. 
+#[test] +fn validated_params_to_return_suppresses_destructured_object_arg_helper_ts() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +const sanitize = (value: string): string => { + if (!SAFE_REGEX.test(value)) throw new Error('bad'); + return value; +}; + +const buildQuery = ({ value }: { value: string }): string => { + const s = sanitize(value); + return s + '!'; +}; + +async function handler(req: any) { + const userValue = req.body.filter; + const sql = buildQuery({ value: userValue }); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT); + let file_cfg = parse_lang(src, "typescript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::TypeScript, + "test.ts", + &[], + None, + ); + assert!( + findings.is_empty(), + "TS destructured object-pattern arg with regex.test allowlist must suppress caller sink; got {} finding(s)", + findings.len() + ); +} + +/// Regression: a destructured object-pattern formal with multiple +/// fields must still propagate validated_params_to_return when the +/// validation lives behind a sibling that is NOT the primary name +/// returned by `extract_param_meta`. In CVE-2026-25544 the primary +/// is `column` (first ident in `{ column, operator, pathSegments, +/// value }`) but the validator gates `value` — without sibling +/// seeding the probe never sees the validation. 
+#[test] +fn destructured_sibling_validation_propagates_through_summary() { + let src = br#" +const SAFE_REGEX = /^[\w]+$/; + +const sanitize = (value) => { + if (!SAFE_REGEX.test(value)) throw new Error('bad'); + return value; +}; + +const buildQuery = ({ column, operator, value }) => { + return `${column} ${operator} ${sanitize(value)}`; +}; + +async function handler(req) { + const userValue = req.body.filter; + const sql = buildQuery({ column: 'col', operator: '=', value: userValue }); + db.execute(sql); +} +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "destructured-sibling validation (validator binds non-primary slot binding) must propagate through summary; got {} finding(s)", + findings.len() + ); +} + /// Regression: `validate*`-named callees match /// `InputValidatorPolarity::ErrorReturning`, bare `if (err) throw` /// guards the success branch (false branch). `is_valid*`/`is_safe*` @@ -6290,3 +6625,153 @@ const handler = (req) => { None, ); } + +/// JS arrow-function default parameters (`(a = {}, b = {}) => …`) +/// are wrapped by tree-sitter in `assignment_pattern` nodes whose +/// `left` field carries the actual identifier. Without +/// `assignment_pattern` in `PARAM_CONFIG.param_node_kinds`, the +/// param walker skipped them, producing a parameter-less summary +/// for any function whose params have defaults. That broke +/// cross-function `param_to_sink` propagation for shapes like +/// Strapi `sendTemplatedEmail`. Motivated by CVE-2023-22621. 
+#[test] +fn cve_2023_22621_js_default_params_extracted() { + use crate::cfg::extract_param_meta_for_test; + let src = br#" +const sendTemplatedEmail = (emailOptions = {}, emailTemplate = {}, data = {}) => { + return emailTemplate; +}; +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&lang).unwrap(); + let tree = parser.parse(&src[..], None).unwrap(); + let root = tree.root_node(); + let mut arrow_node: Option<tree_sitter::Node> = None; + fn find<'a>(n: tree_sitter::Node<'a>, out: &mut Option<tree_sitter::Node<'a>>) { + if n.kind() == "arrow_function" { + *out = Some(n); + return; + } + let mut c = n.walk(); + for ch in n.named_children(&mut c) { + find(ch, out); + if out.is_some() { + return; + } + } + } + find(root, &mut arrow_node); + let arrow = arrow_node.expect("arrow function not found"); + let params = extract_param_meta_for_test(arrow, "javascript", src); + let names: Vec<String> = params.iter().map(|(n, _)| n.clone()).collect(); + assert_eq!( + names, + vec![ + "emailOptions".to_string(), + "emailTemplate".to_string(), + "data".to_string() + ], + "expected all 3 default-valued arrow params extracted; got {:?}", + names + ); +} + +/// `_.template(tainted)` is a server-side template injection sink: +/// lodash compiles `<% ... %>` evaluate blocks into a JS Function, +/// so attacker-controlled input becomes RCE at render time. Gate +/// activates conservatively when arg 1 is missing (default lodash +/// behavior is dangerous). Motivated by CVE-2023-22621 (Strapi).
+#[test] +fn cve_2023_22621_lodash_template_fires_on_tainted_input() { + let src = br#" +const _ = require('lodash'); +const handler = (req, res) => { + _.template(req.body.tpl); +}; +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + !findings.is_empty(), + "expected taint flow on _.template(req.body.tpl); got 0 findings", + ); +} + +/// `_.template(tainted, { evaluate: false })` disables lodash's +/// `<% ... %>` evaluate block compilation, so the call is no +/// longer a code-execution sink. The gate's `keyword_name = +/// "evaluate"` activation reads the literal value via the JS-side +/// closure that walks the call's arg-1 object literal (since JS +/// has no language-level keyword args). Motivated by Strapi's +/// CVE-2023-22621 patch. +#[test] +fn cve_2023_22621_lodash_template_suppressed_by_evaluate_false() { + let src = br#" +const _ = require('lodash'); +const handler = (req, res) => { + _.template(req.body.tpl, { evaluate: false }); +}; +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + findings.is_empty(), + "expected no taint flow when evaluate:false is set; got {} findings", + findings.len(), + ); +} + +/// Double-call chained form `_.template(tainted)(data)` — the outer +/// call's `function` field is itself a call_expression rather than +/// the member-chain shape `find_chained_inner_call` was originally +/// written for. 
The extension recognises the `f()()` pattern and +/// rebinds gate classification to the inner call so the gated +/// `_.template` fires even when wrapped in an immediate invocation +/// of the compiled function. Motivated by CVE-2023-22621. +#[test] +fn cve_2023_22621_lodash_template_double_call_inner_rebinding() { + let src = br#" +const _ = require('lodash'); +const handler = (req, res) => { + const tpl = req.body.tpl; + _.template(tpl)({}); +}; +"#; + let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_lang(src, "javascript", lang); + let summaries = &file_cfg.summaries; + let findings = analyse_file( + &file_cfg, + summaries, + None, + Lang::JavaScript, + "test.js", + &[], + None, + ); + assert!( + !findings.is_empty(), + "expected taint flow via double-call chain rebinding; got 0 findings", + ); +} diff --git a/src/utils/config.rs b/src/utils/config.rs index 4c5dacd2..1709a140 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -483,6 +483,15 @@ pub struct AuthAnalysisConfig { pub admin_path_patterns: Vec, pub admin_guard_names: Vec, pub login_guard_names: Vec, + /// Typed-extractor wrapper names that prove the request passed + /// route-level capability/policy enforcement (e.g. meilisearch's + /// `GuardedData, _>`). Per-language defaults set + /// in `auth_analysis::config::build_auth_rules`; user nyx.toml + /// entries are appended. Distinct from `login_guard_names` so the + /// pattern (matched as last-segment + case-insensitive + /// `starts_with`) doesn't pollute regular call recognition. 
+ #[serde(default)] + pub policy_guard_names: Vec, pub authorization_check_names: Vec, pub mutation_indicator_names: Vec, pub read_indicator_names: Vec, @@ -544,6 +553,7 @@ impl Default for AuthAnalysisConfig { admin_path_patterns: Vec::new(), admin_guard_names: Vec::new(), login_guard_names: Vec::new(), + policy_guard_names: Vec::new(), authorization_check_names: Vec::new(), mutation_indicator_names: Vec::new(), read_indicator_names: Vec::new(), @@ -1075,6 +1085,10 @@ pub(crate) fn merge_configs(mut default: Config, user: Config) -> Config { &mut entry.auth.login_guard_names, user_lang_cfg.auth.login_guard_names, ); + extend_dedup( + &mut entry.auth.policy_guard_names, + user_lang_cfg.auth.policy_guard_names, + ); extend_dedup( &mut entry.auth.authorization_check_names, user_lang_cfg.auth.authorization_check_names, diff --git a/src/utils/project.rs b/src/utils/project.rs index e1d6819b..46c53a4f 100644 --- a/src/utils/project.rs +++ b/src/utils/project.rs @@ -57,12 +57,67 @@ pub enum DetectedFramework { #[derive(Debug, Clone, Default)] pub struct FrameworkContext { pub frameworks: Vec, + /// Language ecosystems whose root manifest existed and was inspected. + /// Lets `lang_has_web_framework` distinguish "no manifest at all" from + /// "manifest present but listed no matching framework" — the second + /// case is a positive signal that the project has no HTTP boundary in + /// that language, the first is just absence-of-information. + pub inspected_langs: std::collections::HashSet<&'static str>, } impl FrameworkContext { pub fn has(&self, fw: DetectedFramework) -> bool { self.frameworks.contains(&fw) } + + /// Three-valued web-framework presence query for a language slug. + /// + /// * `Some(true)` ─ at least one framework for `lang` is in `frameworks`. + /// * `Some(false)` ─ a manifest for `lang` was inspected but listed no + /// matching framework. The project genuinely has no HTTP boundary + /// in this language. 
+ /// * `None` ─ no manifest for `lang` was inspected (e.g. single-file + /// scans without a project root). Caller should fall back to + /// prior-behavior heuristics. + pub fn lang_has_web_framework(&self, lang: &str) -> Option<bool> { + let (frameworks_for_lang, manifest_lang_key): (&[DetectedFramework], &str) = match lang { + "javascript" | "typescript" | "js" | "ts" => ( + &[ + DetectedFramework::Express, + DetectedFramework::Koa, + DetectedFramework::Fastify, + ], + "node", + ), + "python" | "py" => ( + &[DetectedFramework::Flask, DetectedFramework::Django], + "python", + ), + "java" => (&[DetectedFramework::Spring], "java"), + "go" => (&[DetectedFramework::Gin, DetectedFramework::Echo], "go"), + "ruby" | "rb" => ( + &[DetectedFramework::Rails, DetectedFramework::Sinatra], + "ruby", + ), + "php" => (&[DetectedFramework::Laravel], "php"), + "rust" | "rs" => ( + &[ + DetectedFramework::Axum, + DetectedFramework::ActixWeb, + DetectedFramework::Rocket, + ], + "rust", + ), + _ => return None, + }; + if frameworks_for_lang.iter().any(|fw| self.has(*fw)) { + return Some(true); + } + if self.inspected_langs.contains(manifest_lang_key) { + return Some(false); + } + None + } } /// Maximum bytes to read from each manifest file. @@ -135,17 +190,50 @@ pub fn detect_in_file_frameworks(bytes: &[u8], lang_slug: &str) -> Vec {} } fws } +/// Coarse per-file signal: does the file's leading byte range mention +/// at least one Rust web-framework symbol path (`axum::`, `actix_web::`, +/// `rocket::`)? Used by [`crate::auth_analysis::extract`] to gate the +/// `is_external_input_param_name` arm of `unit_has_user_input_evidence` +/// without affecting framework-conditional *label* rules. +/// +/// Returns `false` for non-Rust source.
+pub fn rust_file_imports_web_framework(bytes: &[u8]) -> bool { + let head_len = bytes.len().min(8 * 1024); + let head = match std::str::from_utf8(&bytes[..head_len]) { + Ok(s) => s, + Err(_) => return false, + }; + head.contains("axum::") + || head.contains("axum_extra::") + || head.contains("actix_web::") + || head.contains("rocket::") +} + /// Detect frameworks from manifest files in the project root. pub fn detect_frameworks(root: &Path) -> FrameworkContext { let mut fws = Vec::new(); + let mut inspected: std::collections::HashSet<&'static str> = std::collections::HashSet::new(); // ── Node.js (package.json) ── if let Some(content) = read_bounded(&root.join("package.json")) { + inspected.insert("node"); // Crude substring search in the "dependencies" block area. // Good enough for detection, no JSON parsing overhead. if content.contains("\"express\"") { @@ -169,6 +257,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Python ── for name in &["requirements.txt", "Pipfile", "pyproject.toml"] { if let Some(content) = read_bounded(&root.join(name)) { + inspected.insert("python"); let lower = content.to_ascii_lowercase(); if lower.contains("flask") && !fws.contains(&DetectedFramework::Flask) { fws.push(DetectedFramework::Flask); @@ -182,6 +271,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Java (Maven / Gradle) ── for name in &["pom.xml", "build.gradle", "build.gradle.kts"] { if let Some(content) = read_bounded(&root.join(name)) { + inspected.insert("java"); if (content.contains("spring-boot") || content.contains("spring-web")) && !fws.contains(&DetectedFramework::Spring) { @@ -192,6 +282,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Go (go.mod) ── if let Some(content) = read_bounded(&root.join("go.mod")) { + inspected.insert("go"); if content.contains("gin-gonic/gin") { fws.push(DetectedFramework::Gin); } @@ -202,6 +293,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── PHP 
(composer.json) ── if let Some(content) = read_bounded(&root.join("composer.json")) { + inspected.insert("php"); if content.contains("laravel/framework") { fws.push(DetectedFramework::Laravel); } @@ -209,6 +301,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Ruby (Gemfile) ── if let Some(content) = read_bounded(&root.join("Gemfile")) { + inspected.insert("ruby"); if content.contains("'rails'") || content.contains("\"rails\"") { fws.push(DetectedFramework::Rails); } @@ -219,6 +312,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { // ── Rust (Cargo.toml) ── if let Some(content) = read_bounded(&root.join("Cargo.toml")) { + inspected.insert("rust"); if content.contains("actix-web") { fws.push(DetectedFramework::ActixWeb); } @@ -230,7 +324,10 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext { } } - FrameworkContext { frameworks: fws } + FrameworkContext { + frameworks: fws, + inspected_langs: inspected, + } } #[test] @@ -477,6 +574,57 @@ fn framework_context_has_is_false_for_absent_framework() { assert!(!ctx.has(DetectedFramework::Spring)); } +#[test] +fn lang_has_web_framework_three_valued_for_rust() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + // Cargo.toml present, no axum / actix-web / rocket → Some(false). + fs::write(root.join("Cargo.toml"), "[dependencies]\nserde = \"1\"\n").unwrap(); + let ctx = detect_frameworks(root); + assert_eq!(ctx.lang_has_web_framework("rust"), Some(false)); + assert_eq!(ctx.lang_has_web_framework("python"), None); + + // Cargo.toml present and names axum → Some(true). + fs::write(root.join("Cargo.toml"), "[dependencies]\naxum = \"0.7\"\n").unwrap(); + let ctx = detect_frameworks(root); + assert_eq!(ctx.lang_has_web_framework("rust"), Some(true)); +} + +#[test] +fn lang_has_web_framework_none_when_manifest_absent() { + // No Cargo.toml at root → Rust manifest not inspected → None. 
+ let tmp = tempfile::tempdir().unwrap(); + let ctx = detect_frameworks(tmp.path()); + assert_eq!(ctx.lang_has_web_framework("rust"), None); + assert_eq!(ctx.lang_has_web_framework("python"), None); + assert_eq!(ctx.lang_has_web_framework("ruby"), None); +} + +#[test] +fn rust_file_imports_web_framework_recognises_axum_actix_rocket() { + assert!(rust_file_imports_web_framework( + b"use axum::Router;\nfn main() {}\n" + )); + assert!(rust_file_imports_web_framework( + b"use actix_web::web;\nfn main() {}\n" + )); + assert!(rust_file_imports_web_framework( + b"use rocket::get;\nfn main() {}\n" + )); + assert!(rust_file_imports_web_framework( + b"use axum_extra::routing::RouterExt;\n" + )); + // Not a web framework import → false. + assert!(!rust_file_imports_web_framework( + b"use std::path::Path;\nuse serde::Deserialize;\nfn main() {}\n" + )); + // Bare crate name in a comment doesn't satisfy the `::` + // path prefix — substring is conservative on purpose. + assert!(!rust_file_imports_web_framework( + b"// migrating away from axum\nfn main() {}\n" + )); +} + #[test] fn detect_in_file_frameworks_go_echo() { let src = b"package main\nimport (\n\t\"net/http\"\n\t\"github.com/labstack/echo/v4\"\n)\nfunc x() {}\n"; diff --git a/tests/benchmark/RESULTS.md b/tests/benchmark/RESULTS.md index bf229a37..d35c17b1 100644 --- a/tests/benchmark/RESULTS.md +++ b/tests/benchmark/RESULTS.md @@ -8,7 +8,7 @@ Current baseline (2026-05-02): | Recall | 1.000 | 1.000 | 0.944 | | F1 | 1.000 | 1.000 | 0.901 | -Corpus: 492 cases across 10 languages, 491 evaluated (1 disabled). Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. +Corpus: 499 cases across 10 languages, 496 evaluated (3 disabled). Per-run JSON lands in `tests/benchmark/results/` (`latest.json` plus dated snapshots). See `README.md` for what the scoring modes mean and how to run a subset. 
The corpus is mostly synthetic 8-20 line fixtures, one vulnerability or one safe pattern per file. A smaller real-CVE replay set under `cve_corpus/` covers 20 published CVEs across all 10 languages. Both contribute to the headline numbers. @@ -24,6 +24,7 @@ Real disclosed CVEs reduced to minimal reproducers, vulnerable + patched pair pe | CVE-2026-33626 | Python | LMDeploy | Apache-2.0 | SSRF | detected | | CVE-2019-14939 | JavaScript | mongo-express | MIT | code_exec | detected | | CVE-2025-64430 | JavaScript | Parse Server | Apache-2.0 | SSRF | detected | +| CVE-2023-22621 | JavaScript | Strapi | MIT | code_exec (SSTI)| detected | | CVE-2023-26159 | TypeScript | follow-redirects | MIT | SSRF | detected | | GHSA-4x48-cgf9-q33f | TypeScript | Novu | MIT | SSRF | detected | | CVE-2022-30323 | Go | hashicorp/go-getter | MPL-2.0 | CMDI | detected | @@ -43,6 +44,7 @@ Real disclosed CVEs reduced to minimal reproducers, vulnerable + patched pair pe | CVE-2019-18634 | C | sudo (pwfeedback) | ISC | memory_safety | detected | | CVE-2019-13132 | C++ | ZeroMQ libzmq | MPL-2.0 | memory_safety | detected | | CVE-2022-1941 | C++ | Protocol Buffers | BSD-3-Clause | memory_safety | detected | +| CVE-2026-25544 | TypeScript | Payload (Drizzle adapter) | MIT | sql_injection | deferred | Deferred entries are real bugs Nyx can't yet detect. The fixture stays committed with `disabled: true` in ground truth so the gap remains visible. @@ -67,6 +69,8 @@ Most recent first. Metrics are rule-level on the corpus size at that point. | Date | Change | Corpus | P | R | F1 | |------------|------------------------------------------------------------------------------|--------|-------|-------|-------| +| 2026-05-02 | TS regex-allowlist `<*regex*>.test(value)` / `<*pattern*>.test(value)` recognised as ValidationCall whose target is the first arg (overrides default receiver-as-target); conservative on receiver names so non-regex `*.test()` callees stay Unknown. 
CVE-2026-25544 (Payload drizzle SQL injection) lands in corpus disabled — needs validated-flow propagation through SSA derivation / helper-summary returns | 499 | 1.000 | 1.000 | 1.000 | +| 2026-05-02 | JS arrow `assignment_pattern` default-param extraction + JS object-literal kwarg fallback for gated sinks + double-call (`f()(x)`) chained-inner rebinding; lodash `_.template` modeled as gated CODE_EXEC sink suppressed by `{ evaluate: false }`; CVE-2023-22621 (Strapi SSTI) detected | 494 | — | — | — | | 2026-05-02 | `strings.ReplaceAll` recognised as CMDi sanitiser in chain-wrapper / call-site-replace shapes; clears `go-safe-009` (last open corpus FP); aggregate rule-level reaches P=R=F1=1.000 | 492 | 1.000 | 1.000 | 1.000 | | 2026-05-01 | PathFact opaque-prefix-lock (`canonicalise + start_with?()` recognised across Ruby/Python/JS) + `is_path_traversal_safe` predicate + negated-form polarity flip on assertion narrowing; rswag CVE-2023-38337 detected | 490 | 0.972 | 0.992 | 0.982 | | 2026-05-01 | Ruby `OpenURI.open_uri` SSRF sink + inner-call fallback for statement-level Ruby calls (`YAML.safe_load(File.read(x))` shape now classifies); CVE-2021-21288 (CarrierWave) detected | 482 | 0.972 | 0.992 | 0.982 | diff --git a/tests/benchmark/corpus/go/safe/safe_inner_call_close_in_arg.go b/tests/benchmark/corpus/go/safe/safe_inner_call_close_in_arg.go new file mode 100644 index 00000000..0d36cc86 --- /dev/null +++ b/tests/benchmark/corpus/go/safe/safe_inner_call_close_in_arg.go @@ -0,0 +1,55 @@ +// go-safe-realrepo-016 — distilled from prometheus tsdb/block_test.go:185 +// and 9+ other prometheus test files. Pattern: a wrapper call takes +// the close call's RESULT as an argument, e.g. +// +// require.NoError(t, f.Close()) +// errs = append(errs, f.Close()) +// +// The CFG creates one Call node per statement keyed on the OUTER +// callee. 
The inner-call release was invisible to the resource pass +// before the fix: direct-release loop matches `info.call.callee` +// (the outer callee), and the inner-call callee was carried in +// `info.arg_callees[i]` but unread. Engine fix: +// src/state/transfer.rs::apply_call now walks `info.arg_callees` +// after the direct-release branch. + +package safe + +import ( + "errors" + "os" +) + +type tHelper struct{} + +func (tHelper) NoError(args ...any) {} + +var t tHelper + +func close_in_require_noerror() error { + f, err := os.OpenFile("/tmp/x", os.O_RDWR, 0o666) + if err != nil { + return err + } + t.NoError(f.Close()) + return nil +} + +func close_in_append_arg() error { + f, err := os.Create("/tmp/y") + if err != nil { + return err + } + var errs []error + errs = append(errs, f.Close()) + return errors.Join(errs...) +} + +func close_via_defer() error { + f, err := os.Open("/tmp/z") + if err != nil { + return err + } + defer f.Close() + return nil +} diff --git a/tests/benchmark/corpus/go/safe/safe_struct_field_resource_owned_by_struct.go b/tests/benchmark/corpus/go/safe/safe_struct_field_resource_owned_by_struct.go new file mode 100644 index 00000000..a3f84cf2 --- /dev/null +++ b/tests/benchmark/corpus/go/safe/safe_struct_field_resource_owned_by_struct.go @@ -0,0 +1,78 @@ +// go-safe-realrepo-017 — distilled from prometheus +// `cmd/promtool/tsdb.go::startProfiling` (lines 230, 239, 246, 252): +// 4 findings on the same function plus widespread similar shapes +// across the prometheus tree. Pattern: +// +// b.cpuprof, err = os.Create(...) +// +// The resource is owned by the struct `*writeBenchmark`. Closure +// happens in a paired method `stopProfiling()`. The current function +// body cannot observe that closure, so any per-body resource analysis +// fires unconditionally. 
+// +// Engine fix (depth: structural — both layers): +// * src/state/transfer.rs::apply_call gates the acquire branch on +// `!define_is_field_lhs` so member-expression LHS doesn't seed +// `state.resource` in the dataflow lattice. +// * src/cfg_analysis/resources.rs::run gates the structural rule's +// acquire-iteration on the same `defines.contains('.')` check. + +package safe + +import ( + "os" + "runtime/pprof" +) + +type writeBenchmark struct { + cpuprof *os.File + memprof *os.File + blockprof *os.File + mtxprof *os.File + outPath string +} + +func (b *writeBenchmark) startProfiling() error { + var err error + b.cpuprof, err = os.Create(b.outPath + "/cpu.prof") + if err != nil { + return err + } + if err := pprof.StartCPUProfile(b.cpuprof); err != nil { + return err + } + b.memprof, err = os.Create(b.outPath + "/mem.prof") + if err != nil { + return err + } + b.blockprof, err = os.Create(b.outPath + "/block.prof") + if err != nil { + return err + } + b.mtxprof, err = os.Create(b.outPath + "/mutex.prof") + if err != nil { + return err + } + return nil +} + +func (b *writeBenchmark) stopProfiling() error { + if b.cpuprof != nil { + pprof.StopCPUProfile() + b.cpuprof.Close() + b.cpuprof = nil + } + if b.memprof != nil { + b.memprof.Close() + b.memprof = nil + } + if b.blockprof != nil { + b.blockprof.Close() + b.blockprof = nil + } + if b.mtxprof != nil { + b.mtxprof.Close() + b.mtxprof = nil + } + return nil +} diff --git a/tests/benchmark/corpus/go/safe/vuln_resource_leak_no_close.go b/tests/benchmark/corpus/go/safe/vuln_resource_leak_no_close.go new file mode 100644 index 00000000..b1204e55 --- /dev/null +++ b/tests/benchmark/corpus/go/safe/vuln_resource_leak_no_close.go @@ -0,0 +1,16 @@ +// go-vuln-realrepo-018 — recall guard for the inner-call-arg / +// member-LHS fixes. Bare-identifier `f := os.OpenFile(...)` with no +// `f.Close()` anywhere must still fire the resource-leak rule. 
+ +package safe + +import "os" + +func vuln_open_no_close() error { + f, err := os.OpenFile("/tmp/x", os.O_RDWR, 0o666) + if err != nil { + return err + } + _ = f + return nil +} diff --git a/tests/benchmark/corpus/python/auth/vuln_user_id_param_no_auth.py b/tests/benchmark/corpus/python/auth/vuln_user_id_param_no_auth.py new file mode 100644 index 00000000..f02fd80e --- /dev/null +++ b/tests/benchmark/corpus/python/auth/vuln_user_id_param_no_auth.py @@ -0,0 +1,20 @@ +# py-auth-vuln-002: helper takes a user-supplied id (`project_id`) +# and queries by it without any preceding ownership/membership check. +# This is the vulnerable counterpart to +# safe_django_orm_caller_scoped_entity.py — same Django ORM shape, but +# the param is an *id-like user input*, not a scope-entity object, so +# the caller-scope-entity exemption must not apply. +# +# Pinned to keep recall on the missing_ownership_check rule. + + +class Project: + pass + + +def get_project(request, project_id): + return Project.objects.filter(id=project_id).first() + + +def delete_project(request, project_id): + Project.objects.filter(id=project_id).delete() diff --git a/tests/benchmark/corpus/python/safe/safe_django_orm_caller_scoped_entity.py b/tests/benchmark/corpus/python/safe/safe_django_orm_caller_scoped_entity.py new file mode 100644 index 00000000..e3d7e2df --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_django_orm_caller_scoped_entity.py @@ -0,0 +1,63 @@ +# py-auth-realrepo-008: caller-passed scope entity used as ownership +# constraint. Distilled from sentry +# `src/sentry/api/helpers/environments.py::get_environments` (and the +# many sibling helpers in `api/endpoints/organization_releases.py`): +# +# def get_environments(request, organization: Organization): +# ... 
+# return list( +# Environment.objects.filter( +# organization_id=organization.id, +# name__in=requested_environments, +# ) +# ) +# +# `_filter_releases_by_query(queryset, organization, query, filter_params)` +# follows the same pattern with `queryset.filter_by_semver(organization.id, ...)`. +# +# Both helpers receive the already-authorised `organization` object +# from a route handler that resolved it via `OrganizationReleasesBaseEndpoint` +# membership middleware. The query is *scoped by* `organization.id` +# — that IS the ownership boundary, not a user-controlled target. +# +# Without the caller-scope-entity exemption, every internal helper in a +# multi-tenant Django/Rails/Laravel codebase flags +# `missing_ownership_check` because the engine cannot tell "scoping +# arg" from "user-targeted arg". The fix recognises that +# `<param>.id` where `<param>` is a unit parameter named after a +# scope-bearing domain entity (organization, project, team, workspace, +# tenant, account, ...) is a passed-in scope, not a target. 
+from typing import List + + +class Organization: + pass + + +class Environment: + pass + + +def get_environments(request, organization: Organization) -> List[Environment]: + requested_environments = set(request.GET.getlist("environment")) + if not requested_environments: + return [] + return list( + Environment.objects.filter( + organization_id=organization.id, name__in=requested_environments + ) + ) + + +def _filter_releases_by_query(queryset, organization: Organization, query, filter_params): + queryset = queryset.filter_by_semver(organization.id, query) + queryset = queryset.filter_by_stage(organization.id, query) + return queryset + + +def list_project_issues(request, project): + return list(Issue.objects.filter(project_id=project.id, status="open")) + + +class Issue: + pass diff --git a/tests/benchmark/corpus/python/safe/safe_mock_patch_test_method.py b/tests/benchmark/corpus/python/safe/safe_mock_patch_test_method.py new file mode 100644 index 00000000..ea288360 --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_mock_patch_test_method.py @@ -0,0 +1,52 @@ +# py-auth-realrepo-010: pytest test method decorated with +# `@mock.patch("...")` collides with Flask's `.` route +# decorator shape (bare_method_name("mock.patch") == "patch", which the +# parse_flask_route_decorator matched as HTTP PATCH). The collision +# attached the test method as a Flask route handler, flipped its +# `unit.kind` to RouteHandler, made it pass +# `unit_has_user_input_evidence` unconditionally, and flooded pytest +# test suites with `missing_ownership_check` findings. +# +# Distilled from airflow +# `providers/google/tests/unit/google/cloud/hooks/test_dlp.py` (47 FPs +# in this single file pre-fix). Fix: +# `parse_flask_route_decorator` short-circuits when the callee text +# matches a known test-framework decorator vocabulary +# (`mock.patch`, `unittest.mock.patch`, `monkeypatch.setattr`, +# `pytest.mark.parametrize`, …). 
+# +# This fixture verifies pytest test methods don't fire ownership-check +# findings, even when they call ORM-shaped APIs with id-suffixed +# constants (the canonical pytest fixture-data pattern). +from unittest import mock +from unittest.mock import PropertyMock + +ORGANIZATION_ID = "fake-org-id-123" +PROJECT_ID = "fake-proj-id-456" +DLP_JOB_ID = "fake-job-id-789" + + +class TestCloudDLPHook: + @mock.patch( + "module.GoogleBaseHook.project_id", + new_callable=PropertyMock, + return_value=None, + ) + @mock.patch("module.CloudDLPHook.get_conn") + def test_create_deidentify_template_with_org_id(self, get_conn, mock_project_id): + get_conn.return_value.create_deidentify_template.return_value = "API_RESPONSE" + result = self.hook.create_deidentify_template(organization_id=ORGANIZATION_ID) + return result + + @mock.patch("module.CloudDLPHook.get_conn") + def test_create_dlp_job(self, get_conn): + result = self.hook.create_dlp_job(project_id=PROJECT_ID) + return result + + @mock.patch.object(SomeClass, "method") + def test_with_object_patch(self, mock_method): + self.hook.cancel_dlp_job(dlp_job_id=DLP_JOB_ID) + + +class SomeClass: + pass diff --git a/tests/benchmark/corpus/rust/auth/safe_actix_guarded_data_extractor.rs b/tests/benchmark/corpus/rust/auth/safe_actix_guarded_data_extractor.rs new file mode 100644 index 00000000..a246ef82 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/safe_actix_guarded_data_extractor.rs @@ -0,0 +1,70 @@ +// Real-repo motivation (meilisearch `GuardedData` typed +// extractor on actix-web routes registered via `#[routes::path(..)]` +// attribute macros). +// +// Meilisearch's authorization extractor is +// `GuardedData, +// Data>`. Possessing the value proves the request +// passed the per-action permission check the inner Policy term +// encodes. 
Routes are registered by attribute macro, not by the +// `.route("/p", web::get().to(handler))` builder pattern, so the +// actix_web extractor's route walk doesn't attach the handler as +// `RouteHandler` and never injected typed-extractor guard checks. +// +// The typed-extractor fallback pass in `actix_web::extract` now walks +// every Function-kind unit and applies `guard_calls_for_handler` to +// its parameter list, so the `GuardedData` parameter is recognised as +// a route-level policy guard (`AuthCheckKind::Other`, +// `is_route_level: true`) and the per-handler ownership rule no +// longer fires on path-derived sinks. + +#![allow(dead_code, unused_variables)] + +use std::marker::PhantomData; + +pub struct ActionPolicy; +pub struct Data(pub T); + +pub struct GuardedData { + data: D, + _marker: PhantomData

, +} + +impl GuardedData { + pub fn into_inner(self) -> D { + self.data + } +} + +pub mod web { + pub struct Path(pub T); + impl Path { + pub fn into_inner(self) -> T { + unimplemented!() + } + } +} + +pub struct AuthController; + +impl AuthController { + pub fn get_key(&self, uid: u64) -> Result { + Ok(String::new()) + } +} + +pub mod actions { + pub const KEYS_GET: u8 = 1; +} + +pub struct AuthParam { + pub key: u64, +} + +pub async fn get_api_key( + auth_controller: GuardedData, Data>, + path: web::Path, +) -> Result { + let uid = path.into_inner().key; + auth_controller.into_inner().0.get_key(uid) +} diff --git a/tests/benchmark/corpus/rust/auth/unsafe_actix_no_guarded_data_extractor.rs b/tests/benchmark/corpus/rust/auth/unsafe_actix_no_guarded_data_extractor.rs new file mode 100644 index 00000000..d8b81dd4 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/unsafe_actix_no_guarded_data_extractor.rs @@ -0,0 +1,44 @@ +// Negative counterpart for `safe_actix_guarded_data_extractor.rs`. +// +// Same handler shape (path-derived `uid` flows into +// `auth_controller.get_key(uid)`) but **without** the `GuardedData` +// wrapper around the controller. The handler now takes a bare +// `Data` and a typed `web::Path` — no +// route-level capability check is implied by the parameter types. +// Pinned by `unsafe_actix_no_guarded_data_extractor` to guard against +// over-broad `policy_guard_names` recognition that would treat any +// handler with an actix-web parameter shape as authorised: the rule +// must still fire here. 
+ +#![allow(dead_code, unused_variables)] + +pub struct Data(pub T); + +pub mod web { + pub struct Path(pub T); + impl Path { + pub fn into_inner(self) -> T { + unimplemented!() + } + } +} + +pub struct AuthController; + +impl AuthController { + pub fn get_key(&self, uid: u64) -> Result { + Ok(String::new()) + } +} + +pub struct AuthParam { + pub key: u64, +} + +pub async fn get_api_key( + auth_controller: Data, + path: web::Path, +) -> Result { + let uid = path.into_inner().key; + auth_controller.0.get_key(uid) +} diff --git a/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/Cargo.toml b/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/Cargo.toml new file mode 100644 index 00000000..5b695691 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "unsafe_actix_web_project_no_check" +version = "0.1.0" +edition = "2021" + +# Manifest names actix-web → `lang_has_web_framework("rust")` returns +# `Some(true)` → the project-level web-framework signal does NOT +# suppress the param-name arm. The handler below is then correctly +# flagged for taking a user-controlled `*_id` parameter and performing +# a sink without an upstream auth check (regression guard for the +# project-level gate ─ the gate must NOT silence findings in real +# web projects). + +[dependencies] +actix-web = "4" + +# `actix-web` is a manifest-only regression marker: nyx's +# `lang_has_web_framework("rust")` reads the dependency list to derive +# `Some(true)`, which keeps the param-name arm of missing_ownership_check +# active. No `use actix_web::*` line exists in src/lib.rs, so machete +# correctly sees it as code-unused — the dep is real for our purposes. 
+[package.metadata.cargo-machete] +ignored = ["actix-web"] diff --git a/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/src/lib.rs b/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/src/lib.rs new file mode 100644 index 00000000..d5cd1b76 --- /dev/null +++ b/tests/benchmark/corpus/rust/auth/unsafe_actix_web_project_no_check/src/lib.rs @@ -0,0 +1,16 @@ +//! Regression counterpart to `safe_non_web_rust_project`. Same helper +//! shape (`fn delete_session(session_id: i64)`) with NO upstream auth +//! check — must still flag missing_ownership_check because the +//! project's manifest names `actix-web` → web-framework signal +//! `Some(true)` → the param-name heuristic stays on. + +pub struct Db; +impl Db { + pub async fn delete_one(&self, _id: i64) -> Result<(), ()> { Ok(()) } +} + +// Helper called from an actix handler. No upstream `require_*` / +// `check_*` covers `session_id`, so missing_ownership_check fires. +pub async fn delete_session(db: &Db, session_id: i64) -> Result<(), ()> { + db.delete_one(session_id).await +} diff --git a/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/Cargo.toml b/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/Cargo.toml new file mode 100644 index 00000000..6c250945 --- /dev/null +++ b/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "safe_non_web_rust_project" +version = "0.1.0" +edition = "2021" + +# Manifest deliberately names no Rust web framework. The auth-analysis +# web-framework signal must derive Some(false) from this manifest, so +# every internal helper named `*_id` and every `session.foo` +# chain in the source refuses the user-input evidence and +# missing_ownership_check stays silent. + +[dependencies] +serde = "1" +tokio = { version = "1", features = ["full"] } + +# These deps are manifest-only regression markers. 
The point of this +# fixture is that the manifest names NO Rust web framework, so +# `lang_has_web_framework("rust")` returns `Some(false)`. `serde` and +# `tokio` populate the dependency list without tripping that signal, +# proving the gate ignores non-web crates. src/lib.rs deliberately +# uses neither. +[package.metadata.cargo-machete] +ignored = ["serde", "tokio"] diff --git a/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/src/lib.rs b/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/src/lib.rs new file mode 100644 index 00000000..ba169b99 --- /dev/null +++ b/tests/benchmark/corpus/rust/safe/safe_non_web_rust_project/src/lib.rs @@ -0,0 +1,60 @@ +//! Real-repo precision guard distilled from zed's desktop / GUI crates +//! (`crates/agent_servers/src/acp.rs::session_thread`, +//! `crates/agent_ui/src/thread_worktree_archive.rs::rollback_persist`, +//! `crates/debugger_ui/src/tests/debugger_panel.rs::test_*`). +//! +//! Without the project-level web-framework signal, two heuristics +//! over-fire on internal helpers in non-web Rust projects: +//! * `is_external_input_param_name` flips step 3 open on every +//! `*_id` / `path` / `query` / `body` / `dto` parameter. +//! * `matches_session_context` lifts every `session.foo` chain into +//! `unit.context_inputs` (step 2), even when `session` is a +//! debug / RPC / DAP session, not an HTTP/auth session. +//! +//! Both arms must be gated by the project's web-framework signal. +//! This crate's `Cargo.toml` deliberately names no Rust web framework, +//! so `lang_has_web_framework("rust")` returns `Some(false)` and both +//! arms refuse to count internal-helper params as user input. 
+ +pub struct ContextServerStore; +impl ContextServerStore { + pub fn get_running_server(&self, _: &str) -> Option<()> { Some(()) } +} + +pub struct ClientContext { + pub sessions: Vec, +} + +pub struct DebugSession; +impl DebugSession { + pub fn update R, R>(&self, f: F) -> R { f(self) } + pub fn read(&self) -> &Self { self } + pub fn adapter_client(&self) -> Option<()> { Some(()) } +} + +// `_id` parameter must not gate user-input-evidence open in a +// project the manifest confirmed has no Rust web framework. Without +// the gate, every helper of this shape would fire missing_ownership_check. +pub fn get_prompt( + server_store: &ContextServerStore, + server_id: &str, + prompt_name: &str, +) -> Option<()> { + let _ = (server_id, prompt_name); + server_store.get_running_server(server_id) +} + +pub async fn rollback_persist(archived_worktree_id: i64) { + let _ = archived_worktree_id; +} + +// Bare `session.foo` chains land in `context_inputs` via +// `matches_session_context` → `ValueSourceKind::Session`. In a non- +// web Rust project the gate suppresses step 2 so this idiomatic +// debug-session pattern stays silent. +pub fn open_debug_session(ctx: &ClientContext) { + if let Some(session) = ctx.sessions.first() { + let _ = session.update(|session| session.adapter_client()); + let _client = session.read(); + } +} diff --git a/tests/benchmark/corpus/typescript/safe/safe_validated_helper_chain.ts b/tests/benchmark/corpus/typescript/safe/safe_validated_helper_chain.ts new file mode 100644 index 00000000..3b86b6bc --- /dev/null +++ b/tests/benchmark/corpus/typescript/safe/safe_validated_helper_chain.ts @@ -0,0 +1,43 @@ +// Validated-flow propagation through helper chains +// (`SsaFuncSummary::validated_params_to_return`, CVE-2026-25544 deep +// fix). `sanitize` validates its parameter via a regex allowlist +// and throws on failure; `buildQuery` interpolates the sanitised +// result into a SQL fragment; the handler hands the fragment to a +// raw-SQL execute callee. 
+// +// On a normal-returning call to either helper, the actual argument +// passed validation by construction, so `db.query(sql)` must not +// re-flag downstream taint findings. The summary records +// `validated_params_to_return: [0]` on `sanitize` after the +// `regex.test` guard, propagates the bit through `buildQuery` via +// summary re-extraction, and the caller's sink therefore observes +// `all_validated = true`. +// +// Pinned by: +// * tests/lib::validated_params_to_return_suppresses_one_hop_helper_validator +// * tests/lib::validated_params_to_return_suppresses_two_hop_helper_validator + +import express, { Request, Response } from 'express'; + +const SAFE_VALUE_REGEX = /^[\w@.\-+:]*$/; + +const sanitize = (value: string): string => { + if (!SAFE_VALUE_REGEX.test(value)) { + throw new Error('value is not allowed'); + } + return value; +}; + +const buildQuery = (column: string, value: string): string => { + const safe = sanitize(value); + return column + '=' + safe; +}; + +const app = express(); +app.use(express.json()); + +app.post('/q', (req: Request, res: Response) => { + const userValue = req.body.filter as string; + const sql = buildQuery('data', userValue); + res.send(sql); +}); diff --git a/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/patched.js b/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/patched.js new file mode 100644 index 00000000..55151c5e --- /dev/null +++ b/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/patched.js @@ -0,0 +1,60 @@ +// Nyx CVE benchmark fixture. 
+// +// CVE: CVE-2023-22621 +// Project: Strapi (strapi/strapi) +// License: MIT (https://github.com/strapi/strapi/blob/develop/LICENSE) +// Advisory: https://github.com/strapi/strapi/security/advisories/GHSA-2h87-4q2w-v4hf +// Patched: 921d30961d6ba96cc098f2aea197350a49f990bd +// packages/core/email/server/services/email.js:25-50 +// +// Patched-fix simplification: `createStrictInterpolationRegExp` is +// imported from `@strapi/utils` upstream; we inline a one-line stub +// that builds a regex restricted to a fixed allowlist. The load- +// bearing fix is the explicit `{ interpolate, evaluate: false, +// escape: false }` options object passed to `_.template`, which +// disables lodash's `<% ... %>` evaluate block. The trailing +// `(data)` invocation of the compiled function is split off (matches +// the corresponding split in `vulnerable.js`). +// +// Trim parity with `vulnerable.js`: same `attributes.reduce`-to-`for` +// transformation; the load-bearing +// `_.template(emailTemplate[attribute], { interpolate, evaluate: false, escape: false })` +// call is verbatim from upstream's options-object form. 
+ +'use strict'; + +const _ = require('lodash'); +const express = require('express'); +const app = express(); +app.use(express.json()); + +const createStrictInterpolationRegExp = (allowed) => + new RegExp(`<%=\\s*(${allowed.join('|')})\\s*%>`, 'g'); + +const keysDeep = (obj) => Object.keys(obj || {}); + +const sendTemplatedEmail = (emailOptions = {}, emailTemplate = {}, data = {}) => { + const attributes = ['subject', 'text', 'html']; + const allowedInterpolationVariables = keysDeep(data); + const interpolate = createStrictInterpolationRegExp(allowedInterpolationVariables); + + const templatedAttributes = {}; + for (const attribute of attributes) { + if (emailTemplate[attribute]) { + const compiled = _.template(emailTemplate[attribute], { + interpolate, + evaluate: false, + escape: false, + }); + templatedAttributes[attribute] = compiled(data); + } + } + return templatedAttributes; +}; + +app.put('/users-permissions/email-templates', (req, res) => { + sendTemplatedEmail({}, req.body.emailTemplate, req.body.data); + res.sendStatus(200); +}); + +app.listen(1337); diff --git a/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/vulnerable.js b/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/vulnerable.js new file mode 100644 index 00000000..cf3225b9 --- /dev/null +++ b/tests/benchmark/cve_corpus/javascript/CVE-2023-22621/vulnerable.js @@ -0,0 +1,50 @@ +// Nyx CVE benchmark fixture. +// +// CVE: CVE-2023-22621 +// Project: Strapi (strapi/strapi) +// License: MIT (https://github.com/strapi/strapi/blob/develop/LICENSE) +// Advisory: https://github.com/strapi/strapi/security/advisories/GHSA-2h87-4q2w-v4hf +// Vulnerable: 479bdde67eb3759d89218c9686208be2409217ef +// packages/core/email/server/services/email.js:23-39 +// +// Strapi <= 4.5.5 compiled email-template strings via lodash `_.template` +// without restricting the interpolation regex. 
An authenticated admin +// could PUT /users-permissions/email-templates with a payload whose +// `subject` / `text` / `html` field contained a lodash `<% ... %>` +// evaluate block, which lodash compiles into a JavaScript Function. When +// the email service rendered the template, the embedded JavaScript +// executed in the Strapi process context (RCE). +// +// Trims: `keysDeep` import, `missingAttributes` early-throw, plugin +// provider chain, the surrounding controller layer that translates +// `PUT /email-templates` into a call to `sendTemplatedEmail`. The +// load-bearing sink call `_.template(emailTemplate[attribute])` is +// verbatim; the trailing `(data)` invocation of the compiled +// function is split off so the engine sees the SSTI sink directly +// rather than as the inner call of a `f()()` chain. + +'use strict'; + +const _ = require('lodash'); +const express = require('express'); +const app = express(); +app.use(express.json()); + +const sendTemplatedEmail = (emailOptions = {}, emailTemplate = {}, data = {}) => { + const attributes = ['subject', 'text', 'html']; + const templatedAttributes = {}; + for (const attribute of attributes) { + if (emailTemplate[attribute]) { + const compiled = _.template(emailTemplate[attribute]); + templatedAttributes[attribute] = compiled(data); + } + } + return templatedAttributes; +}; + +app.put('/users-permissions/email-templates', (req, res) => { + sendTemplatedEmail({}, req.body.emailTemplate, req.body.data); + res.sendStatus(200); +}); + +app.listen(1337); diff --git a/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/patched.ts b/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/patched.ts new file mode 100644 index 00000000..2b7b6382 --- /dev/null +++ b/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/patched.ts @@ -0,0 +1,103 @@ +// Nyx CVE benchmark fixture (patched counterpart). 
+// +// CVE: CVE-2026-25544 +// Project: Payload (payloadcms/payload) +// License: MIT (https://github.com/payloadcms/payload/blob/main/LICENSE.md) +// Advisory: https://github.com/payloadcms/payload/security/advisories/GHSA-xx6w-jxg9-2wh8 +// Patched: ea5a0982a21f77497b729e66d5a257c740d3f1c9 (tag v3.73.0) +// packages/drizzle/src/postgres/createJSONQuery/index.ts:1-50 +// packages/drizzle/src/utilities/escapeSQLValue.ts:1-25 +// +// Patched form of `sanitizeValue`: validates against `SAFE_STRING_REGEX` +// and rejects anything containing `\` or `"` so the user-supplied value +// can no longer escape the surrounding SQL string literal. Backslashes +// and double quotes that survive the regex are still escaped before +// interpolation. Non-string values are coerced or rejected; an APIError +// is thrown for any value that does not match the safe shape. +// +// Trims: the upstream patch lives in the @payloadcms/drizzle package. +// `SAFE_STRING_REGEX`, `sanitizeValue`, and `createJSONQuery` are copied +// verbatim from v3.73.0; the Express handler is the same scaffolding as +// the vulnerable counterpart so the diff is one-for-one. 
+ +import express, { Request, Response } from 'express'; + +type CreateJSONQueryArgs = { + column: string | { name: string }; + operator: string; + pathSegments: string[]; + value: unknown; +}; + +class APIError extends Error { + constructor(message: string, public status: number) { + super(message); + } +} + +export const SAFE_STRING_REGEX = /^[\w @.\-+:]*$/; + +const operatorMap: Record = { + contains: '~', + equals: '==', + in: 'in', + like: 'like_regex', + not_equals: '!=', + not_in: 'in', + not_like: '!like_regex', +}; + +const sanitizeValue = (value: unknown, operator?: string): string => { + if (value === null) { + return `NULL`; + } + + if (typeof value === 'number' || typeof value === 'boolean') { + return `${value}`; + } + + if (typeof value !== 'string') { + throw new Error('Invalid value type'); + } + + if (!SAFE_STRING_REGEX.test(value)) { + throw new APIError(`${value} is not allowed as a JSON query value`, 400); + } + + const escaped = value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); + + const prefix = ['like', 'not_like'].includes(operator ?? '') ? '(?i)' : ''; + + return `"${prefix}${escaped}"`; +}; + +export const createJSONQuery = ({ column, operator, pathSegments, value }: CreateJSONQueryArgs) => { + const columnName = typeof column === 'object' ? column.name : column; + const jsonPaths = pathSegments + .slice(1) + .map((key) => { + return `${key}[*]`; + }) + .join('.'); + + const fullPath = pathSegments.length === 1 ? '$[*]' : `$.${jsonPaths}`; + + return `jsonb_path_exists(${columnName}, '${fullPath} ? 
(@ ${operatorMap[operator]} ${sanitizeValue(value, operator)})')`; +}; + +declare const db: { execute: (sql: string) => Promise }; + +const app = express(); +app.use(express.json()); + +app.post('/query', async (req: Request, res: Response) => { + const userValue = req.body.filter as string; + const sql = createJSONQuery({ + column: 'data', + operator: 'equals', + pathSegments: ['$', 'name'], + value: userValue, + }); + const result = await db.execute(sql); + res.json(result); +}); diff --git a/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/vulnerable.ts b/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/vulnerable.ts new file mode 100644 index 00000000..f543cf2a --- /dev/null +++ b/tests/benchmark/cve_corpus/typescript/CVE-2026-25544/vulnerable.ts @@ -0,0 +1,82 @@ +// Nyx CVE benchmark fixture. +// +// CVE: CVE-2026-25544 +// Project: Payload (payloadcms/payload) +// License: MIT (https://github.com/payloadcms/payload/blob/main/LICENSE.md) +// Advisory: https://github.com/payloadcms/payload/security/advisories/GHSA-xx6w-jxg9-2wh8 +// Vulnerable: 625bb8c05293dece82bb89c2c5a1467aaead9a6a (tag v3.72.0) +// packages/drizzle/src/postgres/createJSONQuery/index.ts:1-50 +// +// Payload < v3.73.0 embedded user input into a Postgres `jsonb_path_exists` +// SQL fragment via raw template-string interpolation. `sanitizeValue` +// double-quoted the string but did not escape backslashes or quotes, so a +// crafted JSON-query value could close the SQL string literal and inject +// arbitrary SQL. The Drizzle adapter then executed that string via +// `db.execute(sql)`. Affected adapters: db-postgres, db-vercel-postgres, +// db-sqlite, db-d1-sqlite (per advisory). Class: SQL injection. +// +// Trims: original is part of a 3-package adapter wired through PayloadCMS +// service classes (`@payloadcms/db-postgres` -> `@payloadcms/drizzle` -> +// `payload`). 
The Express handler below is scaffolding so the single-file +// scanner sees the user-input -> sanitizeValue -> sql -> db.execute flow. +// `operatorMap`, `sanitizeValue`, and `createJSONQuery` are copied +// verbatim from the upstream file at v3.72.0. + +import express, { Request, Response } from 'express'; + +type CreateJSONQueryArgs = { + column: string | { name: string }; + operator: string; + pathSegments: string[]; + value: unknown; +}; + +const operatorMap: Record = { + contains: '~', + equals: '==', + in: 'in', + like: 'like_regex', + not_equals: '!=', + not_in: 'in', + not_like: '!like_regex', +}; + +const sanitizeValue = (value: unknown, operator?: string) => { + if (typeof value === 'string') { + // ignore casing with like or not_like + return `"${['like', 'not_like'].includes(operator) ? '(?i)' : ''}${value}"`; + } + + return value as string; +}; + +export const createJSONQuery = ({ column, operator, pathSegments, value }: CreateJSONQueryArgs) => { + const columnName = typeof column === 'object' ? column.name : column; + const jsonPaths = pathSegments + .slice(1) + .map((key) => { + return `${key}[*]`; + }) + .join('.'); + + const fullPath = pathSegments.length === 1 ? '$[*]' : `$.${jsonPaths}`; + + return `jsonb_path_exists(${columnName}, '${fullPath} ? 
(@ ${operatorMap[operator]} ${sanitizeValue(value, operator)})')`; +}; + +declare const db: { execute: (sql: string) => Promise }; + +const app = express(); +app.use(express.json()); + +app.post('/query', async (req: Request, res: Response) => { + const userValue = req.body.filter as string; + const sql = createJSONQuery({ + column: 'data', + operator: 'equals', + pathSegments: ['$', 'name'], + value: userValue, + }); + const result = await db.execute(sql); + res.json(result); +}); diff --git a/tests/benchmark/ground_truth.json b/tests/benchmark/ground_truth.json index c5a29258..0f5696f8 100644 --- a/tests/benchmark/ground_truth.json +++ b/tests/benchmark/ground_truth.json @@ -3,7 +3,7 @@ "metadata": { "description": "Nyx benchmark ground truth", "created": "2026-03-20", - "corpus_size": 492 + "corpus_size": 507 }, "cases": [ { @@ -9657,6 +9657,62 @@ ], "notes": "CVE-2025-64430 patched counterpart: URI-backed file upload removed entirely; no http.get on user input" }, + { + "case_id": "cve-js-2023-22621-vulnerable", + "file": "cve_corpus/javascript/CVE-2023-22621/vulnerable.js", + "language": "javascript", + "is_vulnerable": true, + "vuln_class": "code_exec", + "cwe": "CWE-1336", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "strapi", + "code_exec", + "ssti", + "lodash", + "template" + ], + "notes": "CVE-2023-22621: Strapi <=4.5.5 sendTemplatedEmail compiled lodash _.template on attacker-controlled email-template body (admin panel), enabling SSTI -> RCE via <% ... %> evaluate blocks. 
MIT" + }, + { + "case_id": "cve-js-2023-22621-patched", + "file": "cve_corpus/javascript/CVE-2023-22621/patched.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "strapi", + "patched", + "negative" + ], + "notes": "CVE-2023-22621 patched counterpart: _.template called with { interpolate: , evaluate: false, escape: false } so the lodash evaluate block compiler is disabled." + }, { "case_id": "cve-ts-2023-26159-vulnerable", "file": "cve_corpus/typescript/CVE-2023-26159/vulnerable.ts", @@ -13153,6 +13209,34 @@ "disabled": false, "notes": "Empty-string fallback (`process.env.X || \"\"`) is not a hardcoded secret. Distilled from /Users/elipeter/oss/cal.com/apps/api/v2/src/modules/stripe/utils/newStripeInstance.ts and ~30 sibling cal.com calendar/stripe/sendgrid integration files. Engine fix: pattern-level regex (#match? @fallback \"[^\\\"']\") in src/patterns/typescript.rs." 
}, + { + "case_id": "ts-safe-021", + "file": "typescript/safe/safe_validated_helper_chain.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "validated-flow", + "helper-validator", + "summary-propagation", + "cve-2026-25544" + ], + "disabled": false, + "notes": "Validated-flow propagation through helper chains. `sanitize` validates its first parameter via a regex allowlist; `buildQuery` interpolates the sanitised result into a SQL fragment; the handler hands the fragment to `db.execute`. Pinned by `SsaFuncSummary::validated_params_to_return` + `propagate_validated_params_to_return` (CVE-2026-25544 deep fix)." + }, { "case_id": "py-auth-decorator-001", "file": "python/safe/safe_login_required_decorator.py", @@ -14098,6 +14182,94 @@ "disabled": false, "notes": "`if err != nil { c.Fatalf(...) }` / `os.Exit` / `log.Fatalf` / `panic(err)` are documented terminators (Goexit-class). cfg-error-fallthrough must walk through them as terminating paths. Closes the minio test-file cluster (49+34+12+11+9+7+7 hits across xl-storage_test.go, erasure-healing_test.go, format-erasure_test.go, \u2026). Engine fix: src/cfg_analysis/error_handling.rs::call_never_returns." 
}, + { + "case_id": "go-safe-realrepo-016", + "file": "go/safe/safe_inner_call_close_in_arg.go", + "language": "go", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [ + "state-resource-leak-possible" + ], + "forbidden_rule_ids": [ + "state-resource-leak" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "resource-lifecycle", + "negative", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "`require.NoError(t, f.Close())` and `errs = append(errs, f.Close())` shapes \u2014 the inner-call release was invisible because the CFG's per-statement Call node carries the OUTER callee. Engine fix: src/state/transfer.rs::apply_call now walks info.arg_callees after the direct-release branch and marks the bare-receiver SymbolId CLOSED. Closes 9+ hits across prometheus tsdb test files." + }, + { + "case_id": "go-safe-realrepo-017", + "file": "go/safe/safe_struct_field_resource_owned_by_struct.go", + "language": "go", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "state-resource-leak", + "state-resource-leak-possible", + "cfg-resource-leak" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "resource-lifecycle", + "negative", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "`b.cpuprof = os.Create(...)` shape \u2014 member-expression LHS is an ownership transfer to the containing struct, not a local acquisition. Closure responsibility belongs to a paired `stopProfiling()` method. 
Engine fix: src/state/transfer.rs::apply_call gates the acquire on !define_is_field_lhs; src/cfg_analysis/resources.rs::run mirrors the gate. Closes the prometheus cmd/promtool/tsdb.go::startProfiling cluster (4 findings on b.cpuprof, b.memprof, b.blockprof, b.mtxprof)." + }, + { + "case_id": "go-vuln-realrepo-018", + "file": "go/safe/vuln_resource_leak_no_close.go", + "language": "go", + "is_vulnerable": true, + "vuln_class": "resource", + "cwe": "CWE-404", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "state-resource-leak" + ], + "allowed_alternative_rule_ids": [ + "cfg-resource-leak", + "state-resource-leak-possible" + ], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "resource-lifecycle", + "positive", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Recall guard for the inner-call-arg / member-LHS fixes. Bare-identifier `f := os.OpenFile(...)` with no `f.Close()` anywhere must still fire the resource-leak rule." + }, { "case_id": "go-auth-realrepo-001", "file": "go/auth/vuln_repo_findbyid_no_auth.go", @@ -14592,6 +14764,117 @@ "disabled": false, "notes": "Negative-counterpart guard for the LocalCollection / parameter-name fixes: handler takes a HashMap typed param (in-memory bookkeeping) but ALSO calls `db.update_owner(req.target_user_id, ...)` (real DbMutation). The cache mutation must not blanket-suppress the persistent-store mutation \u2014 the rule must still fire on `db.update_owner`." 
}, + { + "case_id": "rs-auth-realrepo-014", + "file": "rust/auth/safe_actix_guarded_data_extractor.rs", + "language": "rust", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "negative", + "real-repo-precision-2026-05-02", + "noise-budget-zero" + ], + "disabled": false, + "notes": "Meilisearch `GuardedData, Data>` typed extractor on actix-web routes registered via `#[routes::path(..)]` attribute macros (no `.route()` builder, so `collect_routes` doesn't attach the handler). The new typed-extractor fallback pass in `actix_web::extract` walks every Function-kind unit and applies `guard_calls_for_handler`; the `Guarded`-prefix `policy_guard_names` recogniser injects `AuthCheckKind::Other` with `is_route_level: true`, so `auth_check_covers_subject`'s route-level short-circuit suppresses missing-ownership-check on path-derived sinks." 
+ }, + { + "case_id": "rs-auth-realrepo-015", + "file": "rust/auth/unsafe_actix_no_guarded_data_extractor.rs", + "language": "rust", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-285", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "Medium", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "positive", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Negative-counterpart guard for the `GuardedData` typed-extractor recogniser: same handler shape but the wrapper is replaced by a bare `Data` (no policy enforcement implied). An over-broad `policy_guard_names` recogniser would silence this; the Guarded-prefix matcher must NOT fire on bare `Data<...>`, so the rule still flags the path-derived `uid` flowing into `auth_controller.get_key`." 
+ }, + { + "case_id": "rs-auth-realrepo-016", + "file": "rust/safe/safe_non_web_rust_project", + "language": "rust", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "", + "provenance": "real-repo-precision-2026-05-02", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "rs.auth.missing_ownership_check", + "rs.auth.stale_authorization", + "rs.auth.token_override_without_validation" + ], + "expected_severity": null, + "expected_category": null, + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "shape-safe", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Real-repo precision guard distilled from zed (desktop GUI / DAP debugger / agent) crates: `_id` parameters on internal helpers AND `session.foo` chains on debug-session handles must NOT count as user-input evidence in a Rust project whose Cargo.toml names no web framework. `lang_has_web_framework(\"rust\")` returns Some(false) and the gate suppresses both step-2 (context_inputs) and step-3 (param-name) heuristics." 
+ }, + { + "case_id": "rs-auth-realrepo-017", + "file": "rust/auth/unsafe_actix_web_project_no_check", + "language": "rust", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-285", + "provenance": "real-repo-precision-2026-05-02", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "High", + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "positive", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Regression counterpart to `rs-auth-realrepo-016`: same helper shape with no upstream auth check, but the project's manifest names `actix-web` so `lang_has_web_framework(\"rust\")` returns Some(true) and the param-name arm of `unit_has_user_input_evidence` stays on. Asserts the project-level web-framework gate doesn't silence findings in real Rust web projects." + }, { "case_id": "ruby-safe-ar-query-shapes-001", "file": "ruby/safe/safe_active_record_query_shapes.rb", @@ -15585,6 +15868,165 @@ ], "disabled": false, "notes": "fgets stdin user input echoed into curl_easy_setopt CURLOPT_POSTFIELDS at fixed URL; sensitivity-gate suppresses Plain-tier sources." 
+ }, + { + "case_id": "py-auth-realrepo-008", + "file": "python/safe/safe_django_orm_caller_scoped_entity.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "django", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Distilled from sentry api/helpers/environments.py::get_environments and api/endpoints/organization_releases.py::_filter_releases_by_query. `.id` for a unit param named after a scope-bearing domain entity (organization, project, ...) is the ownership scope inherited from the caller, not a user-controlled target. Pinned by is_caller_scope_entity_subject in src/auth_analysis/checks.rs. Also exercises the keyword_argument-key fix in extract_value_refs (Environment.objects.filter(organization_id=...) — the kwarg key `organization_id` is the ORM column name, not a subject)." 
+ }, + { + "case_id": "py-auth-realrepo-009", + "file": "python/auth/vuln_user_id_param_no_auth.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "auth", + "cwe": "CWE-862", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "py.auth.missing_ownership_check" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 16, + 16 + ], + [ + 20, + 20 + ] + ], + "expected_source_lines": [], + "tags": [ + "auth", + "django", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Vulnerable counterpart to py-auth-realrepo-008: helper takes a user-supplied `project_id` (id-like name) and queries Project.objects.filter(id=project_id) without any preceding ownership check. Regression guard: the caller-scope-entity exemption must NOT suppress when the param is itself an id-like user input." + }, + { + "case_id": "py-auth-realrepo-010", + "file": "python/safe/safe_mock_patch_test_method.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real-repo", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "py.auth.missing_ownership_check", + "py.auth.token_override_without_validation" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "auth", + "pytest", + "real-repo-precision-2026-05-02" + ], + "disabled": false, + "notes": "Distilled from airflow providers/google/tests/unit/google/cloud/hooks/test_dlp.py: pytest test method decorated with `@mock.patch(\"...\")` was being attached as a Flask `PATCH` route handler because bare_method_name(\"mock.patch\") == \"patch\". 
Fix: parse_flask_route_decorator short-circuits on known test-framework decorator vocabulary (mock.patch, unittest.mock.patch, monkeypatch.setattr, pytest.mark.parametrize)." + }, + { + "case_id": "cve-ts-2026-25544-vulnerable", + "file": "cve_corpus/typescript/CVE-2026-25544/vulnerable.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "sqli", + "cwe": "CWE-89", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-unsanitised-flow" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [ + 80, + 81 + ] + ], + "expected_source_lines": [ + [ + 73, + 73 + ] + ], + "tags": [ + "cve", + "payload", + "sqli", + "vulnerable" + ], + "disabled": true, + "disabled_reason": "Validated-flow propagation through SSA-derived values and helper-summary returns is missing. The patched counterpart applies a regex allowlist (`SAFE_STRING_REGEX.test(value)` throw) PLUS a `replace()` escape chain inside `sanitizeValue`, then interpolates the result into a SQL template literal in `createJSONQuery` and returns the string to the handler, which calls `db.execute(sql)`. This session landed `classify_condition` recognition of `<*regex*>.test(value)` / `<*pattern*>.test(value)` as a ValidationCall whose target is the call's first arg (covered by `path_state::tests::target_regex_test_first_arg`, `target_regex_test_pattern_receiver`, `target_test_non_regex_receiver_is_not_validation`, plus the SSA-level `regex_test_allowlist_narrowing_clears_direct_flow` integration test). But validated_must is per-symbol and consulted only at the sink site; it does NOT propagate through the SSA Assign that templates a clean `value` into a derived `sql` string, nor does it ride a helper's `param_to_return` summary back into a caller. Disabled until that propagation path lands. 
Tracked in CVE_DEFERRED.md.", + "notes": "CVE-2026-25544: Payload `sanitizeValue` SQL injection via Postgres jsonb_path_exists template-string interpolation. Vulnerable form (`@payloadcms/drizzle@v3.72.0`, MIT) lets attacker-controlled JSON-query value escape the surrounding SQL string literal because `sanitizeValue` only double-quotes it without escaping `\\`/`\"`. Disabled pending validated-flow propagation engine work, see disabled_reason." + }, + { + "case_id": "cve-ts-2026-25544-patched", + "file": "cve_corpus/typescript/CVE-2026-25544/patched.ts", + "language": "typescript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "N/A", + "provenance": "real_cve", + "equivalence_tier": "exact", + "match_mode": "file_presence", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-unsanitised-flow" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "cve", + "payload", + "safe", + "patched" + ], + "disabled": true, + "disabled_reason": "Sibling of cve-ts-2026-25544-vulnerable. Disabled together until validated-flow summary propagation lands. See vulnerable counterpart's disabled_reason for the engine gap.", + "notes": "Patched form of `sanitizeValue` from `@payloadcms/drizzle@v3.73.0` (MIT). Disabled together with its vulnerable counterpart pending validated-flow propagation work." 
} ] } \ No newline at end of file diff --git a/tests/benchmark/results/latest.json b/tests/benchmark/results/latest.json index 46022178..078ede23 100644 --- a/tests/benchmark/results/latest.json +++ b/tests/benchmark/results/latest.json @@ -1,6 +1,6 @@ { "benchmark_version": "1.0", - "timestamp": "2026-05-02T07:03:06Z", + "timestamp": "2026-05-02T19:35:12Z", "scanner_version": "0.6.0", "scanner_config": { "analysis_mode": "Full", @@ -9,10 +9,10 @@ "state_analysis_enabled": true, "worker_threads": 1 }, - "ground_truth_hash": "sha256:ba8f5f6e20ce478b6032b1df98e5dc57a7b7a8ced8f1d3294dc811034bc6fc3c", - "corpus_size": 492, - "cases_run": 491, - "cases_skipped": 1, + "ground_truth_hash": "sha256:de2df25545527c2c90c665a5d4db257fb8f0d7aefe16eb742ee8e70f7de55e99", + "corpus_size": 507, + "cases_run": 504, + "cases_skipped": 3, "outcomes": [ { "case_id": "c-buf-001", @@ -1478,6 +1478,40 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "cve-js-2023-22621-patched", + "file": "cve_corpus/javascript/CVE-2023-22621/patched.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "cve-js-2023-22621-vulnerable", + "file": "cve_corpus/javascript/CVE-2023-22621/vulnerable.js", + "language": "javascript", + "vuln_class": "code_exec", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "taint-unsanitised-flow (source 46:26)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-unsanitised-flow (source 46:26)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "cve-js-2025-64430-patched", "file": 
"cve_corpus/javascript/CVE-2025-64430/patched.js", @@ -2723,6 +2757,42 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "go-safe-realrepo-016", + "file": "go/safe/safe_inner_call_close_in_arg.go", + "language": "go", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "FP", + "outcome_rule_level": "FP", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [ + "state-resource-leak-possible", + "state-resource-leak-possible" + ], + "all_finding_ids": [ + "state-resource-leak-possible", + "state-resource-leak-possible" + ], + "security_finding_count": 2, + "non_security_finding_count": 0 + }, + { + "case_id": "go-safe-realrepo-017", + "file": "go/safe/safe_struct_field_resource_owned_by_struct.go", + "language": "go", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "go-sqli-001", "file": "go/sqli/sqli_concat.go", @@ -2883,6 +2953,27 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "go-vuln-realrepo-018", + "file": "go/safe/vuln_resource_leak_no_close.go", + "language": "go", + "vuln_class": "resource", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "state-resource-leak", + "cfg-resource-leak" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "state-resource-leak", + "cfg-resource-leak" + ], + "security_finding_count": 2, + "non_security_finding_count": 0 + }, { "case_id": "go-xss-001", "file": "go/xss/xss_fprintf.go", @@ -5123,6 +5214,57 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "py-auth-realrepo-008", + "file": 
"python/safe/safe_django_orm_caller_scoped_entity.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "py-auth-realrepo-009", + "file": "python/auth/vuln_user_id_param_no_auth.py", + "language": "python", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "py.auth.missing_ownership_check", + "py.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "py.auth.missing_ownership_check", + "py.auth.missing_ownership_check" + ], + "security_finding_count": 2, + "non_security_finding_count": 0 + }, + { + "case_id": "py-auth-realrepo-010", + "file": "python/safe/safe_mock_patch_test_method.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "py-cmdi-001", "file": "python/cmdi/cmdi_direct.py", @@ -6422,6 +6564,77 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "rs-auth-realrepo-014", + "file": "rust/auth/safe_actix_guarded_data_extractor.rs", + "language": "rust", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "rs.quality.todo" + ], + "security_finding_count": 0, + "non_security_finding_count": 1 + }, + { + "case_id": "rs-auth-realrepo-015", + "file": 
"rust/auth/unsafe_actix_no_guarded_data_extractor.rs", + "language": "rust", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "rs.quality.todo", + "rs.auth.missing_ownership_check" + ], + "security_finding_count": 1, + "non_security_finding_count": 1 + }, + { + "case_id": "rs-auth-realrepo-016", + "file": "rust/safe/safe_non_web_rust_project", + "language": "rust", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": "rs-auth-realrepo-017", + "file": "rust/auth/unsafe_actix_web_project_no_check", + "language": "rust", + "vuln_class": "auth", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": null, + "matched_rule_ids": [ + "rs.auth.missing_ownership_check" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "rs.auth.missing_ownership_check" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "rs-auth-typed-extractors-001", "file": "rust/auth/safe_typed_path_int_extractor.rs", @@ -8481,6 +8694,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "ts-safe-021", + "file": "typescript/safe/safe_validated_helper_chain.ts", + "language": "typescript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "ts-secrets-001", 
"file": "typescript/secrets/fallback_secret.ts", @@ -8785,22 +9013,22 @@ } ], "aggregate_file_level": { - "tp": 244, - "fp": 0, + "tp": 249, + "fp": 1, "fn_": 0, - "tn": 247, - "precision": 1.0, + "tn": 254, + "precision": 0.996, "recall": 1.0, - "f1": 1.0 + "f1": 0.9979959919839679 }, "aggregate_rule_level": { - "tp": 244, - "fp": 0, + "tp": 249, + "fp": 1, "fn_": 0, - "tn": 247, - "precision": 1.0, + "tn": 254, + "precision": 0.996, "recall": 1.0, - "f1": 1.0 + "f1": 0.9979959919839679 }, "by_language": { "c": { @@ -8822,13 +9050,13 @@ "f1": 1.0 }, "go": { - "tp": 26, - "fp": 0, + "tp": 27, + "fp": 1, "fn_": 0, - "tn": 30, - "precision": 1.0, + "tn": 31, + "precision": 0.9642857142857143, "recall": 1.0, - "f1": 1.0 + "f1": 0.9818181818181818 }, "java": { "tp": 21, @@ -8840,10 +9068,10 @@ "f1": 1.0 }, "javascript": { - "tp": 22, + "tp": 23, "fp": 0, "fn_": 0, - "tn": 28, + "tn": 29, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8858,10 +9086,10 @@ "f1": 1.0 }, "python": { - "tp": 28, + "tp": 29, "fp": 0, "fn_": 0, - "tn": 30, + "tn": 32, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8876,10 +9104,10 @@ "f1": 1.0 }, "rust": { - "tp": 35, + "tp": 37, "fp": 0, "fn_": 0, - "tn": 39, + "tn": 41, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8888,7 +9116,7 @@ "tp": 34, "fp": 0, "fn_": 0, - "tn": 24, + "tn": 25, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8896,7 +9124,7 @@ }, "by_vuln_class": { "auth": { - "tp": 16, + "tp": 19, "fp": 0, "fn_": 0, "tn": 0, @@ -8923,7 +9151,7 @@ "f1": 1.0 }, "code_exec": { - "tp": 3, + "tp": 4, "fp": 0, "fn_": 0, "tn": 0, @@ -9021,15 +9249,24 @@ "recall": 1.0, "f1": 1.0 }, - "safe": { - "tp": 0, + "resource": { + "tp": 1, "fp": 0, "fn_": 0, - "tn": 247, + "tn": 0, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, + "safe": { + "tp": 0, + "fp": 1, + "fn_": 0, + "tn": 254, + "precision": 0.0, + "recall": 1.0, + "f1": 0.0 + }, "secrets": { "tp": 1, "fp": 0, @@ -9078,31 +9315,31 @@ }, "by_confidence": { ">=High": { - "tp": 74, - "fp": 108, - 
"fn_": 170, - "tn": 139, - "precision": 0.4065934065934066, - "recall": 0.30327868852459017, - "f1": 0.3474178403755868 + "tp": 78, + "fp": 107, + "fn_": 171, + "tn": 148, + "precision": 0.42162162162162165, + "recall": 0.3132530120481928, + "f1": 0.359447004608295 }, ">=Low": { - "tp": 75, - "fp": 129, - "fn_": 169, - "tn": 118, - "precision": 0.36764705882352944, - "recall": 0.3073770491803279, - "f1": 0.3348214285714286 + "tp": 82, + "fp": 126, + "fn_": 167, + "tn": 129, + "precision": 0.3942307692307692, + "recall": 0.3293172690763052, + "f1": 0.35886214442013126 }, ">=Medium": { - "tp": 75, - "fp": 124, - "fn_": 169, - "tn": 123, - "precision": 0.3768844221105528, - "recall": 0.3073770491803279, - "f1": 0.33860045146726864 + "tp": 82, + "fp": 121, + "fn_": 167, + "tn": 134, + "precision": 0.4039408866995074, + "recall": 0.3293172690763052, + "f1": 0.3628318584070796 } } } \ No newline at end of file