Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers
* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters
* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic
* feat: Introduce per-detector configuration for data exfiltration suppression
* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output
* feat: Add tainted body and URL handling for data exfiltration detection
* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go
* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients
* feat: Add synthetic externals handling for closure-captured variables in SSA
* feat: Implement closure-based suppression for resource leak findings
* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns
* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders
* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt
* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests
* feat: Add data exfiltration sinks for various languages and enhance documentation
* refactor: Simplify formatting and improve readability in various files
* refactor: Improve readability by simplifying conditional statements and adding clippy linting
* docs: Update CHANGELOG and comments for data exfiltration features and configuration
* docs: Clarify configuration instructions for data exfiltration trusted destinations
* docs: Enhance comments for evidence routing logic in data exfiltration
2026-06-09 19:45:13 +02:00 · 2026-05-01 10:59:52 -04:00 · 2026-05-01 10:59:52 -04:00 · 58f1794a4e
commit 58f1794a4e
parent a438886217
189 changed files with 8421 additions and 383 deletions
--- a/src/ast.rs
+++ b/src/ast.rs
@ -145,6 +145,11 @@ fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -

 /// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it,
 /// the parsed tree (for byte→line/col conversion) and the file path.
+///
+/// Returns `None` when source-sensitivity gating fully suppresses the
+/// finding (the canonical case is a multi-gate `DATA_EXFIL` event whose
+/// contributing source is plain user input — see the
+/// `effective_caps` strip below).
 fn build_taint_diag(
    finding: &crate::taint::Finding,
    cfg_graph: &crate::cfg::Cfg,
@ -152,7 +157,7 @@ fn build_taint_diag(
    path: &Path,
    src: &[u8],
    scan_root: Option<&Path>,
-) -> Diag {
+) -> Option<Diag> {
    let call_site_byte = cfg_graph[finding.sink].classification_span().0;
    let call_site_point = byte_offset_to_point(tree, call_site_byte);
    // `finding.source` should be a NodeIndex valid in this body's CFG, but
@ -373,16 +378,63 @@ fn build_taint_diag(
    // SSA dispatch) when populated; fall back to the union of all sink-label
    // caps on the CFG node so legacy paths that build findings without
    // setting `effective_sink_caps` still pick the right rule id.
-    let effective_caps = if finding.effective_sink_caps.is_empty() {
+    let mut effective_caps = if finding.effective_sink_caps.is_empty() {
        crate::labels::Cap::from_bits_truncate(sink_caps_bits)
    } else {
        finding.effective_sink_caps
    };
+
+    // Source-sensitivity gate for `DATA_EXFIL`.  Plain attacker input echoed
+    // back into an outbound request body / headers / json is not data
+    // exfiltration: the user already controls the value, so surfacing it as a
+    // leak is noise (the canonical false-positive class for API gateways
+    // and telemetry forwarders that proxy `req.body`).  A `DATA_EXFIL`
+    // finding requires the contributing source to be at least `Sensitive`
+    // (cookies, headers, env, db rows, file reads).  Plain user-input
+    // sources have the cap stripped so the finding either drops entirely
+    // or downgrades to whatever non-`DATA_EXFIL` cap also applies (e.g.
+    // SSRF on the URL position of the same `fetch` call).
+    if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
+        && finding.source_kind.sensitivity() < crate::labels::Sensitivity::Sensitive
+    {
+        effective_caps.remove(crate::labels::Cap::DATA_EXFIL);
+        // The multi-gate dispatch produces one finding per (source, sink-cap)
+        // pair, so a body-flow finding's `effective_sink_caps` is exactly the
+        // cap that fired (e.g. `DATA_EXFIL`).  When that single cap is the
+        // sensitivity-stripped one, the finding has no surviving rationale
+        // and we drop it entirely rather than reroute it to the generic
+        // `taint-unsanitised-flow` bucket (which would just re-emit the same
+        // false positive under a different rule id).  Findings with a
+        // multi-cap `effective_sink_caps` keep their non-DATA_EXFIL caps and
+        // are routed normally below.
+        if finding.effective_sink_caps == crate::labels::Cap::DATA_EXFIL {
+            return None;
+        }
+    }
+
+    // DATA_EXFIL routing.
+    //
+    // Multi-gate dispatch (JS / Go) emits one event per cap, so by this
+    // point each finding's `effective_sink_caps` carries exactly one bit
+    // and the simple `DATA_EXFIL && !SSRF` test routes correctly.  Flat-
+    // rule paths (Java HTTP clients where type-qualified resolution
+    // attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call,
+    // e.g. `client.send(req)` covering both URL and body channels of the
+    // request value) produce a single dual-cap event.  In that case the
+    // source's sensitivity tier disambiguates: a Sensitive source
+    // (cookie, header, env, db, session) leaking into an outbound
+    // request is canonically DATA_EXFIL even if the sink also carries
+    // an SSRF label, because operator-bound state is not URL-shaped
+    // attacker input.  Plain user input keeps SSRF routing (the typical
+    // user-controlled-URL pattern).
+    let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
+        && !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID)
+        && (!effective_caps.contains(crate::labels::Cap::SSRF)
+            || finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive);
+
    let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
        "rs.auth.missing_ownership_check.taint".to_string()
-    } else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
-        && !effective_caps.contains(crate::labels::Cap::SSRF)
-    {
+    } else if is_data_exfil_rule {
        format!(
            "taint-data-exfiltration (source {}:{})",
            source_point.row + 1,
@ -396,18 +448,86 @@ fn build_taint_diag(
        )
    };

+    // For `DATA_EXFIL` rules, look up which destination object-literal field
+    // (`body` / `headers` / `json`) the tainted value reached.  Each
+    // [`crate::cfg::GateFilter`] carries `destination_uses` (var names) in
+    // parallel with `destination_fields` (the field each var was bound to),
+    // so we walk the gate filter whose `label_caps` includes `DATA_EXFIL`
+    // and match the tainted var name from the last flow step.  Falls back
+    // to the first non-empty destination field on the matching filter when
+    // the var-name match fails (e.g. the SSA sink event is reported on a
+    // copy-propagated value whose name no longer matches the original
+    // destination ident).  `None` when the sink wasn't a destination-aware
+    // gate (no object literal, or non-fetch sink).
+    let data_exfil_field: Option<String> = if is_data_exfil_rule {
+        let last_var = finding
+            .flow_steps
+            .last()
+            .and_then(|s| s.var_name.as_deref());
+        let filters = &cfg_graph[finding.sink].call.gate_filters;
+        filters
+            .iter()
+            .find(|f| f.label_caps.contains(crate::labels::Cap::DATA_EXFIL))
+            .and_then(|f| {
+                if let (Some(uses), Some(var)) = (f.destination_uses.as_ref(), last_var)
+                    && let Some(idx) = uses.iter().position(|u| u == var)
+                {
+                    return f.destination_fields.get(idx).cloned();
+                }
+                f.destination_fields.first().cloned()
+            })
+    } else {
+        None
+    };
+
+    // DATA_EXFIL severity calibration (Phase: detector ranking).
+    //
+    // Generic taint severity comes from `severity_for_source_kind`, which
+    // maps Cookie/Header/Env to High because those sources are high-risk
+    // *as taint roots*.  For `DATA_EXFIL` we are scoring the leak class,
+    // not the source itself: not every Sensitive-tier source is a Secret.
+    // Cookies and env carry credential / session material whose leakage
+    // is an immediate disclosure (Secret-tier); request headers, file
+    // reads, db rows, and caught exceptions are Sensitive but not
+    // automatically secret, so they downgrade to Medium.  Plain user
+    // input is already stripped above by the source-sensitivity gate, so
+    // the `_` arm here is reached only by Sensitive sources that are not
+    // explicit secrets.
+    let severity = if is_data_exfil_rule {
+        match finding.source_kind {
+            crate::labels::SourceKind::Cookie | crate::labels::SourceKind::EnvironmentConfig => {
+                crate::patterns::Severity::High
+            }
+            _ => crate::patterns::Severity::Medium,
+        }
+    } else {
+        severity_for_source_kind(finding.source_kind)
+    };
+
+    // DATA_EXFIL: surface the destination field in the message so analysts
+    // see at a glance whether the leak reached the request body, headers,
+    // or json payload.  Generic taint findings stay on the existing
+    // "unsanitised … flows from … → …" template.
+    let message = if is_data_exfil_rule {
+        let suffix = data_exfil_field
+            .as_deref()
+            .map(|f| format!(" ({f} field)"))
+            .unwrap_or_default();
+        format!("sensitive data flows from {short_source} \u{2192} {sink_display}{suffix}")
+    } else {
+        format!("unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}")
+    };
+
    let mut diag = Diag {
        path: primary_path.clone(),
        line: primary_line,
        col: primary_col,
-        severity: severity_for_source_kind(finding.source_kind),
+        severity,
        id: diag_id,
        category: FindingCategory::Security,
        path_validated: finding.path_validated,
        guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
-        message: Some(format!(
-            "unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}"
-        )),
+        message: Some(message),
        labels,
        confidence: None,
        evidence: Some(Evidence {
@ -448,6 +568,7 @@ fn build_taint_diag(
            symbolic: finding.symbolic.clone(),
            sink_caps: sink_caps_bits,
            engine_notes: finding.engine_notes.clone(),
+            data_exfil_field,
            ..Default::default()
        }),
        rank_score: None,
@ -467,7 +588,7 @@ fn build_taint_diag(
        ev.confidence_limiters = limiters;
    }

-    diag
+    Some(diag)
 }

 /// Resolve a file extension to a language slug (e.g. `"rust"`,
@ -622,6 +743,8 @@ fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str {
    use crate::labels::SourceKind;
    match sk {
        SourceKind::UserInput => "user input",
+        SourceKind::Cookie => "cookie value",
+        SourceKind::Header => "request header",
        SourceKind::EnvironmentConfig => "environment config",
        SourceKind::FileSystem => "file system data",
        SourceKind::Database => "database result",
@ -1198,18 +1321,31 @@ impl<'a> ParsedFile<'a> {
                continue;
            }

-            out.push(build_taint_diag(
+            if let Some(diag) = build_taint_diag(
                finding,
                body_cfg,
                &self.source.tree,
                self.source.path,
                self.source.bytes,
                scan_root,
-            ));
+            ) {
+                out.push(diag);
+            }
        }

        // ── CFG structural analyses (per body) ─────────────────────────
        let taint_active = global_summaries.is_some() || !taint_results.is_empty();
+        // Pre-compute, per body, the set of variable names whose
+        // release / close calls live in a NESTED closure body inside
+        // that body (e.g. `socket.on("close", () => ws.close())`).
+        // Both the structural ResourceMisuse pass and the state-model
+        // leak pass consult it to suppress findings whose cleanup is
+        // registered as a callback the per-body CFG can't follow.
+        // Only descendants count — sibling methods on the same class
+        // don't share resource ownership.
+        let closure_released_per_body =
+            state::collect_closure_released_var_names(&self.file_cfg.bodies, caller_lang);
+        let empty_set: std::collections::HashSet<String> = std::collections::HashSet::new();
        for body in &self.file_cfg.bodies {
            let body_taint: Vec<_> = taint_results
                .iter()
@ -1231,6 +1367,11 @@ impl<'a> ParsedFile<'a> {
                body_const_facts: body_const_facts.as_ref(),
                type_facts: body_const_facts.as_ref().map(|f| &f.type_facts),
                auth_decorators: &body.meta.auth_decorators,
+                closure_released_var_names: Some(
+                    closure_released_per_body
+                        .get(&body.meta.id)
+                        .unwrap_or(&empty_set),
+                ),
            };
            for cf in cfg_analysis::run_all(&cfg_ctx) {
                let point = byte_offset_to_point(&self.source.tree, cf.span.0);
@ -1307,6 +1448,11 @@ impl<'a> ParsedFile<'a> {
                    &body.meta.auth_decorators,
                    &path_safe_suppressed_spans,
                    body_pointer_hints.as_ref(),
+                    Some(
+                        closure_released_per_body
+                            .get(&body.meta.id)
+                            .unwrap_or(&empty_set),
+                    ),
                );

                for sf in &state_findings {