Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers * feat: Implement cross-file data exfiltration detection with parameter-specific gate filters * feat: Add calibration tests and refine DATA_EXFIL severity scoring logic * feat: Introduce per-detector configuration for data exfiltration suppression * feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output * feat: Add tainted body and URL handling for data exfiltration detection * feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go * feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients * feat: Add synthetic externals handling for closure-captured variables in SSA * feat: Implement closure-based suppression for resource leak findings * feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns * feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders * feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt * feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests * feat: Add data exfiltration sinks for various languages and enhance documentation * refactor: Simplify formatting and improve readability in various files * refactor: Improve readability by simplifying conditional statements and adding clippy linting * docs: Update CHANGELOG and comments for data exfiltration features and configuration * docs: Clarify configuration instructions for data exfiltration trusted destinations * docs: Enhance comments for evidence routing logic in data exfiltration
2026-06-21 20:18:06 +02:00 · 2026-05-01 10:59:52 -04:00 · 2026-05-01 10:59:52 -04:00 · 58f1794a4e
commit 58f1794a4e
parent a438886217
189 changed files with 8421 additions and 383 deletions
--- a/tests/fetch_data_exfil_integration_tests.rs
+++ b/tests/fetch_data_exfil_integration_tests.rs
@ -5,6 +5,12 @@
 //! headers / json flow), and a tainted body must not surface as SSRF and
 //! vice versa.  Also sanity-checks the SARIF output so the new finding
 //! class produces a distinct rule id.
+//!
+//! `DATA_EXFIL` is gated on source sensitivity: only `Sensitive`-tier
+//! sources (cookies, headers, env, db rows, file reads) trigger the cap.
+//! Plain user input echoed back into a body is *not* data exfiltration —
+//! the user already controls the value.  See
+//! `fetch_body_user_input_silenced.js` for the negative regression.

 mod common;

@ -79,6 +85,87 @@ fn fetch_ssrf_url_tainted_emits_ssrf_not_data_exfil() {
    );
 }

+#[test]
+fn fetch_body_plain_user_input_does_not_emit_data_exfil() {
+    // Negative regression for the source-sensitivity gate: the fixture
+    // sends plain attacker-controlled input (`req.body.message`) as the
+    // body of a fixed-URL `fetch`.  The sender already controls that value,
+    // so no cross-boundary disclosure occurs and `Cap::DATA_EXFIL` stays off.
+    let findings = diags_for("fetch_body_user_input_silenced.js");
+    let all_ids: Vec<_> = findings.iter().map(|diag| &diag.id).collect();
+    let exfil = all_ids
+        .iter()
+        .filter(|id| id.starts_with("taint-data-exfiltration"))
+        .count();
+    assert_eq!(
+        exfil,
+        0,
+        "plain user input echoed into a fetch body must NOT emit \
+         taint-data-exfiltration, got {exfil}.\n\
+         Diags: {:#?}",
+        all_ids,
+    );
+}
+
+#[test]
+fn fetch_body_data_exfil_witness_mentions_session_token() {
+    // Regression guard on the symex witness for the cookie → fetch-body
+    // fixture.  For `Cap::DATA_EXFIL` the cap-specific payload selector
+    // (`src/symex/witness.rs::witness_payload`) yields `<SESSION_TOKEN>`,
+    // and the witness rendered through `get_sink_witness` substitutes it
+    // into the string-renderable expression.  An analyst reading the
+    // witness should therefore see a leaked credential-bearing payload
+    // rather than an injection, whether the verdict is `Confirmed` or
+    // Inconclusive-but-witness-bearing.
+    //
+    // The check is deliberately one-sided.  If symex produces no witness
+    // for this flow (e.g. an opaque expression tree) the loop body never
+    // runs and the test passes: witness shape is locked, witness absence
+    // is left to the calibration suite's no-witness coverage.
+    let diags = diags_for("fetch_body_data_exfil.js");
+    let exfil_witnesses: Vec<&String> = diags
+        .iter()
+        .filter(|diag| diag.id.starts_with("taint-data-exfiltration"))
+        .filter_map(|diag| {
+            let evidence = diag.evidence.as_ref()?;
+            let symbolic = evidence.symbolic.as_ref()?;
+            symbolic.witness.as_ref()
+        })
+        .collect();
+    for w in &exfil_witnesses {
+        let names_leak = ["<SESSION_TOKEN>", "body", "payload"]
+            .iter()
+            .any(|needle| w.contains(*needle));
+        assert!(
+            names_leak,
+            "DATA_EXFIL witness must mention the leaked payload \
+             (<SESSION_TOKEN>) or body/payload context.  Got: {w:?}",
+        );
+    }
+}
+
+#[test]
+fn fetch_body_int_value_does_not_emit_data_exfil() {
+    // A body proven numeric (e.g. `parseInt(req.cookies.session_count)`)
+    // cannot carry a session token, header secret, or other credential
+    // material.  `is_type_safe_for_sink` includes `DATA_EXFIL` in its
+    // type-suppressible cap mask, so a proven-Int value stays silent.
+    let diags = diags_for("fetch_body_int_suppressed.js");
+    let mut exfil = 0usize;
+    for diag in diags.iter() {
+        if diag.id.starts_with("taint-data-exfiltration") {
+            exfil += 1;
+        }
+    }
+    assert_eq!(
+        exfil,
+        0,
+        "int-typed body must NOT emit taint-data-exfiltration, got {exfil}.\n\
+         Diags: {:#?}",
+        diags.iter().map(|d| &d.id).collect::<Vec<_>>(),
+    );
+}
+
 #[test]
 fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() {
    use nyx_scanner::output::build_sarif;
@ -106,20 +193,35 @@ fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() {
    let results = sarif["runs"][0]["results"]
        .as_array()
        .expect("SARIF results array");
-    let exfil_results = results
+    let exfil_results: Vec<&serde_json::Value> = results
        .iter()
        .filter(|r| r["ruleId"].as_str() == Some("taint-data-exfiltration"))
-        .count();
+        .collect();
    let ssrf_results = results
        .iter()
        .filter(|r| r["ruleId"].as_str() == Some("taint-unsanitised-flow"))
        .count();
    assert!(
-        exfil_results >= 1,
-        "expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {exfil_results}",
+        !exfil_results.is_empty(),
+        "expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {}",
+        exfil_results.len(),
    );
    assert!(
        ssrf_results >= 1,
        "expected >= 1 SARIF result with ruleId taint-unsanitised-flow, got {ssrf_results}",
    );
+
+    // Every DATA_EXFIL finding from the fixture set targets the request body
+    // (`fetch('/endpoint', { body: payload })`), so SARIF must surface the
+    // destination field via `properties.data_exfil_field`.  At least one
+    // result has to advertise `body`; fixtures that reach `headers` /
+    // `json` are out of scope for this assertion but must not be silenced.
+    let body_field_seen = exfil_results
+        .iter()
+        .any(|r| r["properties"]["data_exfil_field"].as_str() == Some("body"));
+    assert!(
+        body_field_seen,
+        "expected at least one taint-data-exfiltration SARIF result with \
+         properties.data_exfil_field == \"body\". Results: {exfil_results:#?}",
+    );
 }