Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
This commit is contained in:
Eli Peter 2026-05-01 10:59:52 -04:00 committed by GitHub
parent a438886217
commit 58f1794a4e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
189 changed files with 8421 additions and 383 deletions

View file

@ -145,6 +145,11 @@ fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -
/// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it,
/// the parsed tree (for byte→line/col conversion) and the file path.
///
/// Returns `None` when source-sensitivity gating fully suppresses the
/// finding (the canonical case is a multi-gate `DATA_EXFIL` event whose
/// contributing source is plain user input — see the
/// `effective_caps` strip below).
fn build_taint_diag(
finding: &crate::taint::Finding,
cfg_graph: &crate::cfg::Cfg,
@ -152,7 +157,7 @@ fn build_taint_diag(
path: &Path,
src: &[u8],
scan_root: Option<&Path>,
) -> Diag {
) -> Option<Diag> {
let call_site_byte = cfg_graph[finding.sink].classification_span().0;
let call_site_point = byte_offset_to_point(tree, call_site_byte);
// `finding.source` should be a NodeIndex valid in this body's CFG, but
@ -373,16 +378,63 @@ fn build_taint_diag(
// SSA dispatch) when populated; fall back to the union of all sink-label
// caps on the CFG node so legacy paths that build findings without
// setting `effective_sink_caps` still pick the right rule id.
let effective_caps = if finding.effective_sink_caps.is_empty() {
let mut effective_caps = if finding.effective_sink_caps.is_empty() {
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
} else {
finding.effective_sink_caps
};
// Source-sensitivity gate for `DATA_EXFIL`. Plain attacker input echoed
// back into an outbound request body / headers / json is not data
// exfiltration, the user already controls the value, surfacing it as a
// leak is noise (the canonical false-positive class for API gateways
// and telemetry forwarders that proxy `req.body`). A `DATA_EXFIL`
// finding requires the contributing source to be at least `Sensitive`
// (cookies, headers, env, db rows, file reads). Plain user-input
// sources have the cap stripped so the finding either drops entirely
// or downgrades to whatever non-`DATA_EXFIL` cap also applies (e.g.
// SSRF on the URL position of the same `fetch` call).
if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& finding.source_kind.sensitivity() < crate::labels::Sensitivity::Sensitive
{
effective_caps.remove(crate::labels::Cap::DATA_EXFIL);
// The multi-gate dispatch produces one finding per (source, sink-cap)
// pair, a body-flow finding's `effective_sink_caps` is exactly the
// cap that fired (e.g. `DATA_EXFIL`). When that single cap is the
// sensitivity-stripped one, the finding has no surviving rationale
// and we drop it entirely rather than reroute it to the generic
// `taint-unsanitised-flow` bucket (which would just re-emit the same
// false positive under a different rule id). Findings with a
// multi-cap `effective_sink_caps` keep their non-DATA_EXFIL caps and
// are routed normally below.
if finding.effective_sink_caps == crate::labels::Cap::DATA_EXFIL {
return None;
}
}
// DATA_EXFIL routing.
//
// Multi-gate dispatch (JS / Go) emits one event per cap, so by this
// point each finding's `effective_sink_caps` carries exactly one bit
// and the simple `DATA_EXFIL && !SSRF` test routes correctly. Flat-
// rule paths (Java HTTP clients where type-qualified resolution
// attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call,
// e.g. `client.send(req)` covering both URL and body channels of the
// request value) produce a single dual-cap event. In that case the
// source's sensitivity tier disambiguates: a Sensitive source
// (cookie, header, env, db, session) leaking into an outbound
// request is canonically DATA_EXFIL even if the sink also carries
// an SSRF label, because operator-bound state is not URL-shaped
// attacker input. Plain user input keeps SSRF routing (the typical
// user-controlled-URL pattern).
let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID)
&& (!effective_caps.contains(crate::labels::Cap::SSRF)
|| finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive);
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
"rs.auth.missing_ownership_check.taint".to_string()
} else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::SSRF)
{
} else if is_data_exfil_rule {
format!(
"taint-data-exfiltration (source {}:{})",
source_point.row + 1,
@ -396,18 +448,86 @@ fn build_taint_diag(
)
};
// For `DATA_EXFIL` rules, look up which destination object-literal field
// (`body` / `headers` / `json`) the tainted value reached. Each
// [`crate::cfg::GateFilter`] carries `destination_uses` (var names) in
// parallel with `destination_fields` (the field each var was bound to),
// so we walk the gate filter whose `label_caps` includes `DATA_EXFIL`
// and match the tainted var name from the last flow step. Falls back
// to the first non-empty destination field on the matching filter when
// the var-name match fails (e.g. the SSA sink event is reported on a
// copy-propagated value whose name no longer matches the original
// destination ident). `None` when the sink wasn't a destination-aware
// gate (no object literal, or non-fetch sink).
let data_exfil_field: Option<String> = if is_data_exfil_rule {
let last_var = finding
.flow_steps
.last()
.and_then(|s| s.var_name.as_deref());
let filters = &cfg_graph[finding.sink].call.gate_filters;
filters
.iter()
.find(|f| f.label_caps.contains(crate::labels::Cap::DATA_EXFIL))
.and_then(|f| {
if let (Some(uses), Some(var)) = (f.destination_uses.as_ref(), last_var)
&& let Some(idx) = uses.iter().position(|u| u == var)
{
return f.destination_fields.get(idx).cloned();
}
f.destination_fields.first().cloned()
})
} else {
None
};
// DATA_EXFIL severity calibration (Phase: detector ranking).
//
// Generic taint severity comes from `severity_for_source_kind`, which
// maps Cookie/Header/Env to High because those sources are spicy
// *as taint roots*. For `DATA_EXFIL` we are scoring the leak class,
// not the source itself: not every Sensitive-tier source is a Secret.
// Cookies and env carry credential / session material whose leakage
// is an immediate disclosure (Secret-tier); request headers, file
// reads, db rows, and caught exceptions are Sensitive but not
// automatically secret, so they downgrade to Medium. Plain user
// input is already stripped above by the source-sensitivity gate, so
// the `_` arm here is reached only by Sensitive sources that are not
// explicit secrets.
let severity = if is_data_exfil_rule {
match finding.source_kind {
crate::labels::SourceKind::Cookie | crate::labels::SourceKind::EnvironmentConfig => {
crate::patterns::Severity::High
}
_ => crate::patterns::Severity::Medium,
}
} else {
severity_for_source_kind(finding.source_kind)
};
// DATA_EXFIL: surface the destination field in the message so analysts
// see at a glance whether the leak reached the request body, headers,
// or json payload. Generic taint findings stay on the existing
// "unsanitised … flows from … → …" template.
let message = if is_data_exfil_rule {
let suffix = data_exfil_field
.as_deref()
.map(|f| format!(" ({f} field)"))
.unwrap_or_default();
format!("sensitive data flows from {short_source} \u{2192} {sink_display}{suffix}")
} else {
format!("unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}")
};
let mut diag = Diag {
path: primary_path.clone(),
line: primary_line,
col: primary_col,
severity: severity_for_source_kind(finding.source_kind),
severity,
id: diag_id,
category: FindingCategory::Security,
path_validated: finding.path_validated,
guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
message: Some(format!(
"unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}"
)),
message: Some(message),
labels,
confidence: None,
evidence: Some(Evidence {
@ -448,6 +568,7 @@ fn build_taint_diag(
symbolic: finding.symbolic.clone(),
sink_caps: sink_caps_bits,
engine_notes: finding.engine_notes.clone(),
data_exfil_field,
..Default::default()
}),
rank_score: None,
@ -467,7 +588,7 @@ fn build_taint_diag(
ev.confidence_limiters = limiters;
}
diag
Some(diag)
}
/// Resolve a file extension to a language slug (e.g. `"rust"`,
@ -622,6 +743,8 @@ fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str {
use crate::labels::SourceKind;
match sk {
SourceKind::UserInput => "user input",
SourceKind::Cookie => "cookie value",
SourceKind::Header => "request header",
SourceKind::EnvironmentConfig => "environment config",
SourceKind::FileSystem => "file system data",
SourceKind::Database => "database result",
@ -1198,18 +1321,31 @@ impl<'a> ParsedFile<'a> {
continue;
}
out.push(build_taint_diag(
if let Some(diag) = build_taint_diag(
finding,
body_cfg,
&self.source.tree,
self.source.path,
self.source.bytes,
scan_root,
));
) {
out.push(diag);
}
}
// ── CFG structural analyses (per body) ─────────────────────────
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
// Pre-compute, per body, the set of variable names whose
// release / close calls live in a NESTED closure body inside
// that body (e.g. `socket.on("close", () => ws.close())`).
// Both the structural ResourceMisuse pass and the state-model
// leak pass consult it to suppress findings whose cleanup is
// registered as a callback the per-body CFG can't follow.
// Only descendants count — sibling methods on the same class
// don't share resource ownership.
let closure_released_per_body =
state::collect_closure_released_var_names(&self.file_cfg.bodies, caller_lang);
let empty_set: std::collections::HashSet<String> = std::collections::HashSet::new();
for body in &self.file_cfg.bodies {
let body_taint: Vec<_> = taint_results
.iter()
@ -1231,6 +1367,11 @@ impl<'a> ParsedFile<'a> {
body_const_facts: body_const_facts.as_ref(),
type_facts: body_const_facts.as_ref().map(|f| &f.type_facts),
auth_decorators: &body.meta.auth_decorators,
closure_released_var_names: Some(
closure_released_per_body
.get(&body.meta.id)
.unwrap_or(&empty_set),
),
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&self.source.tree, cf.span.0);
@ -1307,6 +1448,11 @@ impl<'a> ParsedFile<'a> {
&body.meta.auth_decorators,
&path_safe_suppressed_spans,
body_pointer_hints.as_ref(),
Some(
closure_released_per_body
.get(&body.meta.id)
.unwrap_or(&empty_set),
),
);
for sf in &state_findings {