Added Cap::DATA_EXFIL and taint false-positive and false-negative fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers
* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters
* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic
* feat: Introduce per-detector configuration for data exfiltration suppression
* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output
* feat: Add tainted body and URL handling for data exfiltration detection
* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go
* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients
* feat: Add synthetic externals handling for closure-captured variables in SSA
* feat: Implement closure-based suppression for resource leak findings
* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns
* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders
* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt
* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests
* feat: Add data exfiltration sinks for various languages and enhance documentation
* refactor: Simplify formatting and improve readability in various files
* refactor: Improve readability by simplifying conditional statements and adding clippy linting
* docs: Update CHANGELOG and comments for data exfiltration features and configuration
* docs: Clarify configuration instructions for data exfiltration trusted destinations
* docs: Enhance comments for evidence routing logic in data exfiltration
2026-06-12 19:55:14 +02:00 · 2026-05-01 10:59:52 -04:00 · 2026-05-01 10:59:52 -04:00 · 58f1794a4e
commit 58f1794a4e
parent a438886217
189 changed files with 8421 additions and 383 deletions
--- a/src/state/facts.rs
+++ b/src/state/facts.rs
@ -19,19 +19,29 @@ fn sanitize_desc(s: &str) -> String {
 /// convergence node where all execution paths join before leaving the function.
 ///
 /// **Invariant:** Only terminal exits carry the complete merged lifecycle state
-/// needed for leak analysis.  Return nodes are intermediate (they flow into the
-/// terminal exit) and must NOT be analyzed for terminal resource state.
-///
-/// Detection is purely topological: a node inside a function is terminal when
-/// it has no successor within the same function scope.  This works for both
-/// per-body graphs (Exit node is a sink) and legacy supergraphs (the
-/// synthesized Return's successor is the file-level Exit with
+/// needed for leak analysis.  Return nodes are intermediate in per-body graphs
+/// (they flow into the synthetic Exit node) but become terminal in legacy
+/// supergraphs (their successor is the file-level Exit with
 /// `enclosing_func = None`).
+///
+/// Detection combines a kind filter with a topological check.  Only nodes
+/// whose `StmtKind` actually terminates execution (`Exit`, `Return`, `Throw`)
+/// are considered, then we require that they have no successor in the same
+/// function scope.  Without the kind filter, dangling Seq nodes left behind
+/// when nested function literals (e.g. `obj.fn = () => {...}`) get a
+/// placeholder in the parent graph would be misclassified as terminal exits
+/// and produce spurious resource-leak findings at the function-literal span.
 fn is_terminal_function_exit(
    idx: petgraph::graph::NodeIndex,
    info: &crate::cfg::NodeInfo,
    cfg: &Cfg,
 ) -> bool {
+    if !matches!(
+        info.kind,
+        StmtKind::Exit | StmtKind::Return | StmtKind::Throw
+    ) {
+        return false;
+    }
    info.ast.enclosing_func.is_some()
        && !cfg
            .neighbors_directed(idx, petgraph::Direction::Outgoing)
@ -62,6 +72,7 @@ pub struct StateFinding {
 /// `state-unauthed-access` finding is suppressed on those spans because
 /// the user-controlled input has already been proved unable to escape
 /// into a privileged location.
+#[allow(clippy::too_many_arguments)]
 pub fn extract_findings(
    result: &DataflowResult<ProductState, TransferEvent>,
    cfg: &Cfg,
@ -70,6 +81,7 @@ pub fn extract_findings(
    func_summaries: &crate::cfg::FuncSummaries,
    enable_auth: bool,
    path_safe_suppressed_sink_spans: &std::collections::HashSet<(usize, usize)>,
+    closure_released_var_names: Option<&std::collections::HashSet<String>>,
 ) -> Vec<StateFinding> {
    let mut findings = Vec::new();

@ -195,6 +207,23 @@ pub fn extract_findings(
                continue;
            }

+            // Suppress leaks for variables whose release call lives in a
+            // nested closure (callback / event handler) outside this
+            // body's CFG.  Common JS/TS shape:
+            //   const ws = new WebSocket(url);
+            //   socket.on("close", () => ws.close());
+            // The per-body resource analysis cannot observe the close
+            // inside the registered handler body; without this gate the
+            // handle reads as a definite leak.  Match by variable name —
+            // closure-captured handles share the binding name with the
+            // handle in the outer scope.
+            if closure_released_var_names
+                .map(|s| s.contains(var_name))
+                .unwrap_or(false)
+            {
+                continue;
+            }
+
            // Prefer direct acquire node span; fall back to proxy span
            // from ResourceMethodSummary (cross-body resource tracking).
            let acquire_span = acquire_node
@ -557,6 +586,7 @@ mod tests {
            &HashMap::new(),
            false,
            &std::collections::HashSet::new(),
+            None,
        );

        assert_eq!(findings.len(), 1);
@ -617,6 +647,7 @@ mod tests {
            &HashMap::new(),
            false,
            &std::collections::HashSet::new(),
+            None,
        );

        assert!(findings.is_empty());
@ -751,6 +782,7 @@ mod tests {
            &HashMap::new(),
            false,
            &std::collections::HashSet::new(),
+            None,
        );

        assert!(
@ -816,6 +848,7 @@ mod tests {
            &HashMap::new(),
            false,
            &std::collections::HashSet::new(),
+            None,
        );

        assert_eq!(
--- a/src/state/mod.rs
+++ b/src/state/mod.rs
@ -77,6 +77,13 @@ pub fn run_state_analysis(
    // m.Lock()`) and routes them through `chain_proxies` instead.  Pass
    // `None` to disable, strict-additive.
    ptr_proxy_hints: Option<&std::collections::HashMap<String, crate::pointer::PtrProxyHint>>,
+    // Names of variables whose `.close()`/release calls live in a nested
+    // closure (event handler, deferred callback) that the per-body CFG
+    // can't observe directly.  Used to suppress resource-leak findings
+    // for handles whose cleanup is registered as a callback, e.g.
+    // `socket.on("close", () => ws.close())`.  Pass `None` for languages
+    // or shapes that don't need this.
+    closure_released_var_names: Option<&std::collections::HashSet<String>>,
 ) -> Vec<StateFinding> {
    let _span = tracing::debug_span!("run_state_analysis").entered();

@ -116,9 +123,99 @@ pub fn run_state_analysis(
        func_summaries,
        enable_auth,
        path_safe_suppressed_sink_spans,
+        closure_released_var_names,
    )
 }

+/// Build a per-body map of variable names whose release calls
+/// (`.close`, `.destroy`, `.end`, `.release`, …) appear inside a
+/// **descendant** body (a closure / event handler nested inside the
+/// body that opens the handle).
+///
+/// Returned: `body_id → set of var names released somewhere inside
+/// that body's nested-closure subtree`.  Used by the structural
+/// ResourceMisuse pass and the state-model leak pass to suppress
+/// findings whose cleanup lives in a callback the per-body CFG can't
+/// follow (`socket.on("close", () => ws.close())`).
+///
+/// Restricted to descendants — sibling methods on the same class
+/// don't share resource ownership, so a release in `queryAndClose`
+/// must NOT silence a leak in sibling `queryAndLeak`.  Only true
+/// nested-closure parent / child relationships participate.
+pub fn collect_closure_released_var_names(
+    bodies: &[crate::cfg::BodyCfg],
+    lang: Lang,
+) -> std::collections::HashMap<crate::cfg::BodyId, std::collections::HashSet<String>> {
+    use crate::cfg::{BodyId, StmtKind};
+    use petgraph::visit::IntoNodeReferences;
+
+    // Step 1: collect releases per body.  Only bodies that have a parent
+    // (`parent_body_id` is set) are scanned — top-level bodies' own
+    // releases are already tracked by the dataflow.
+    let pairs = rules::resource_pairs(lang);
+    let mut per_body: std::collections::HashMap<BodyId, std::collections::HashSet<String>> =
+        std::collections::HashMap::new();
+    for body in bodies {
+        if body.meta.parent_body_id.is_none() {
+            continue;
+        }
+        let mut local = std::collections::HashSet::new();
+        for (_idx, info) in body.graph.node_references() {
+            if info.kind != StmtKind::Call {
+                continue;
+            }
+            let Some(callee) = info.call.callee.as_deref() else {
+                continue;
+            };
+            let cl = callee.to_ascii_lowercase();
+            let is_release = pairs.iter().any(|p| {
+                p.release.iter().any(|r| {
+                    let rl = r.to_ascii_lowercase();
+                    if let Some(method) = rl.strip_prefix('.') {
+                        cl.ends_with(&format!(".{method}"))
+                    } else {
+                        cl == rl || cl.ends_with(&format!(".{rl}"))
+                    }
+                })
+            });
+            if !is_release {
+                continue;
+            }
+            if let Some(rcv) = info.call.receiver.as_deref() {
+                local.insert(rcv.to_string());
+            } else if let Some((rcv, _)) = callee.rsplit_once('.')
+                && !rcv.is_empty()
+            {
+                local.insert(rcv.to_string());
+            }
+        }
+        if !local.is_empty() {
+            per_body.insert(body.meta.id, local);
+        }
+    }
+
+    // Step 2: roll up into ancestor bodies.  Walk each non-top body's
+    // parent chain and union its release set into every ancestor's
+    // entry.  Class methods at the same nesting level (siblings under a
+    // class body) do not roll up into each other — they have distinct
+    // BodyId entries and the chain only flows through `parent_body_id`.
+    let mut rollup: std::collections::HashMap<BodyId, std::collections::HashSet<String>> =
+        std::collections::HashMap::new();
+    let by_id: std::collections::HashMap<BodyId, &crate::cfg::BodyCfg> =
+        bodies.iter().map(|b| (b.meta.id, b)).collect();
+    for body in bodies {
+        let Some(local) = per_body.get(&body.meta.id) else {
+            continue;
+        };
+        let mut cur = body.meta.parent_body_id;
+        while let Some(pid) = cur {
+            rollup.entry(pid).or_default().extend(local.iter().cloned());
+            cur = by_id.get(&pid).and_then(|b| b.meta.parent_body_id);
+        }
+    }
+    rollup
+}
+
 /// Build resource method summaries by pre-scanning all method bodies for known
 /// resource acquire/release operations. Only creates summaries for methods whose
 /// bodies actually contain matching operations, never infers from names alone.
--- a/src/state/transfer.rs
+++ b/src/state/transfer.rs
@ -635,6 +635,19 @@ impl DefaultTransfer<'_> {
    fn apply_assignment(&self, _node_idx: NodeIndex, info: &NodeInfo, state: &mut ProductState) {
        // Ownership transfer: if `defines` reassigns a tracked resource
        // variable from a `uses` variable, transfer the lifecycle.
+        //
+        // Skip when the RHS is a function or lambda literal: storing a
+        // closure into a property (`ws.onclose = () => { ... }`,
+        // `obj.handler = function(){...}`) does not move ownership of the
+        // resources the closure body references — those identifiers appear
+        // in `info.taint.uses` only because `def_use` walks the literal's
+        // body, not because the assignment itself reads them.  Without this
+        // gate, the first OPEN-tracked capture inside the closure body gets
+        // marked MOVED and the property's symbol becomes the new OPEN
+        // owner, which then surfaces as a spurious leak on the property.
+        if info.rhs_is_function_literal {
+            return;
+        }
        if let Some(ref def) = info.taint.defines
            && let Some(def_sym) = self.get_sym(info, def)
        {