Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
This commit is contained in:
Eli Peter 2026-05-01 10:59:52 -04:00 committed by GitHub
parent a438886217
commit 58f1794a4e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
189 changed files with 8421 additions and 383 deletions

View file

@ -5,6 +5,12 @@
//! headers / json flow), and a tainted body must not surface as SSRF and
//! vice versa. Also sanity-checks the SARIF output so the new finding
//! class produces a distinct rule id.
//!
//! `DATA_EXFIL` is gated on source sensitivity: only `Sensitive`-tier
//! sources (cookies, headers, env, db rows, file reads) trigger the cap.
//! Plain user input echoed back into a body is *not* data exfiltration —
//! the user already controls the value. See
//! `fetch_body_user_input_silenced.js` for the negative regression.
mod common;
@ -79,6 +85,87 @@ fn fetch_ssrf_url_tainted_emits_ssrf_not_data_exfil() {
);
}
#[test]
fn fetch_body_plain_user_input_does_not_emit_data_exfil() {
    // Negative regression for the source-sensitivity gate: the fixture
    // routes plain attacker-controlled input (`req.body.message`) into the
    // body of a fixed-URL `fetch`. Because the user already controls that
    // value, sending it back out in the outbound payload is not a
    // cross-boundary disclosure, so `Cap::DATA_EXFIL` must stay silent.
    let findings = diags_for("fetch_body_user_input_silenced.js");

    // Tally every diagnostic whose id falls under the data-exfiltration rule.
    let mut exfil = 0usize;
    for finding in findings.iter() {
        if finding.id.starts_with("taint-data-exfiltration") {
            exfil += 1;
        }
    }

    // Full id list, shown only in the failure message to aid triage.
    let ids = findings.iter().map(|f| &f.id).collect::<Vec<_>>();
    assert_eq!(
        exfil,
        0,
        "plain user input echoed into a fetch body must NOT emit \
         taint-data-exfiltration, got {exfil}.\n\
         Diags: {:#?}",
        ids,
    );
}
#[test]
fn fetch_body_data_exfil_witness_mentions_session_token() {
    // Symex-witness regression guard for the cookie → fetch body fixture.
    // Whenever a DATA_EXFIL finding carries a witness string (a `Confirmed`
    // verdict, or an Inconclusive one that still bears a witness), that
    // string must reference the leaked payload. Per the cap-specific
    // selector in `src/symex/witness.rs::witness_payload`, the payload for
    // `Cap::DATA_EXFIL` is `<SESSION_TOKEN>`; the rendered witness (via
    // `get_sink_witness`) substitutes it into the string-renderable
    // expression, so the analyst sees a credential-bearing *leak* rather
    // than an injection.
    //
    // The check is deliberately one-sided: if symex produced no witness for
    // this flow (e.g. the expression tree was opaque), nothing is asserted
    // and the test passes. This locks the witness shape without promoting
    // witness absence to a hard failure; the calibration suite already
    // covers the no-witness path.
    let diags = diags_for("fetch_body_data_exfil.js");

    // Substrings that count as "mentions the leaked payload", checked in
    // this order with short-circuiting.
    let accepted_markers = ["<SESSION_TOKEN>", "body", "payload"];

    for diag in diags
        .iter()
        .filter(|d| d.id.starts_with("taint-data-exfiltration"))
    {
        // Walk evidence → symbolic verdict → witness; any missing layer
        // means there is nothing to check for this finding.
        let symbolic = diag.evidence.as_ref().and_then(|e| e.symbolic.as_ref());
        if let Some(w) = symbolic.and_then(|sv| sv.witness.as_ref()) {
            let mentions_payload = accepted_markers
                .iter()
                .any(|marker| w.contains(*marker));
            assert!(
                mentions_payload,
                "DATA_EXFIL witness must mention the leaked payload \
                 (<SESSION_TOKEN>) or body/payload context. Got: {w:?}",
            );
        }
    }
}
#[test]
fn fetch_body_int_value_does_not_emit_data_exfil() {
    // Type-based suppression guard. A numeric-typed body such as
    // `parseInt(req.cookies.session_count)` is payload-incompatible: an int
    // cannot carry a session token, a header secret, or any other
    // credential material that would make the flow a cross-boundary
    // disclosure. `is_type_safe_for_sink` includes `DATA_EXFIL` in its
    // type-suppressible cap mask, so a proven-Int SSA value at the gate
    // must silence the finding.
    let diags = diags_for("fetch_body_int_suppressed.js");

    // Project the ids once; they feed both the count and the failure dump.
    let ids: Vec<_> = diags.iter().map(|d| &d.id).collect();
    let exfil = ids
        .iter()
        .filter(|id| id.starts_with("taint-data-exfiltration"))
        .count();

    assert_eq!(
        exfil,
        0,
        "int-typed body must NOT emit taint-data-exfiltration, got {exfil}.\n\
         Diags: {:#?}",
        ids,
    );
}
#[test]
fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() {
use nyx_scanner::output::build_sarif;
@ -106,20 +193,35 @@ fn sarif_distinguishes_data_exfil_rule_id_from_ssrf() {
let results = sarif["runs"][0]["results"]
.as_array()
.expect("SARIF results array");
let exfil_results = results
let exfil_results: Vec<&serde_json::Value> = results
.iter()
.filter(|r| r["ruleId"].as_str() == Some("taint-data-exfiltration"))
.count();
.collect();
let ssrf_results = results
.iter()
.filter(|r| r["ruleId"].as_str() == Some("taint-unsanitised-flow"))
.count();
assert!(
exfil_results >= 1,
"expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {exfil_results}",
!exfil_results.is_empty(),
"expected >= 1 SARIF result with ruleId taint-data-exfiltration, got {}",
exfil_results.len(),
);
assert!(
ssrf_results >= 1,
"expected >= 1 SARIF result with ruleId taint-unsanitised-flow, got {ssrf_results}",
);
// Every DATA_EXFIL finding from the fixture set targets the request body
// (`fetch('/endpoint', { body: payload })`), so SARIF must surface the
// destination field via `properties.data_exfil_field`. At least one
// result has to advertise `body`; fixtures that reach `headers` /
// `json` are out of scope for this assertion but must not be silenced.
let body_field_seen = exfil_results
.iter()
.any(|r| r["properties"]["data_exfil_field"].as_str() == Some("body"));
assert!(
body_field_seen,
"expected at least one taint-data-exfiltration SARIF result with \
properties.data_exfil_field == \"body\". Results: {exfil_results:#?}",
);
}