Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
This commit is contained in:
Eli Peter 2026-05-01 10:59:52 -04:00 committed by GitHub
parent a438886217
commit 58f1794a4e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
189 changed files with 8421 additions and 383 deletions

View file

@ -145,6 +145,11 @@ fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -
/// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it,
/// the parsed tree (for byte→line/col conversion) and the file path.
///
/// Returns `None` when source-sensitivity gating fully suppresses the
/// finding (the canonical case is a multi-gate `DATA_EXFIL` event whose
/// contributing source is plain user input — see the
/// `effective_caps` strip below).
fn build_taint_diag(
finding: &crate::taint::Finding,
cfg_graph: &crate::cfg::Cfg,
@ -152,7 +157,7 @@ fn build_taint_diag(
path: &Path,
src: &[u8],
scan_root: Option<&Path>,
) -> Diag {
) -> Option<Diag> {
let call_site_byte = cfg_graph[finding.sink].classification_span().0;
let call_site_point = byte_offset_to_point(tree, call_site_byte);
// `finding.source` should be a NodeIndex valid in this body's CFG, but
@ -373,16 +378,63 @@ fn build_taint_diag(
// SSA dispatch) when populated; fall back to the union of all sink-label
// caps on the CFG node so legacy paths that build findings without
// setting `effective_sink_caps` still pick the right rule id.
let effective_caps = if finding.effective_sink_caps.is_empty() {
let mut effective_caps = if finding.effective_sink_caps.is_empty() {
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
} else {
finding.effective_sink_caps
};
// Source-sensitivity gate for `DATA_EXFIL`. Plain attacker input echoed
// back into an outbound request body / headers / json is not data
// exfiltration: the user already controls the value, so surfacing it as
// a leak is noise (the canonical false-positive class for API gateways
// and telemetry forwarders that proxy `req.body`). A `DATA_EXFIL`
// finding requires the contributing source to be at least `Sensitive`
// (cookies, headers, env, db rows, file reads). Plain user-input
// sources have the cap stripped so the finding either drops entirely
// or downgrades to whatever non-`DATA_EXFIL` cap also applies (e.g.
// SSRF on the URL position of the same `fetch` call).
if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& finding.source_kind.sensitivity() < crate::labels::Sensitivity::Sensitive
{
effective_caps.remove(crate::labels::Cap::DATA_EXFIL);
// The multi-gate dispatch produces one finding per (source, sink-cap)
// pair, so a body-flow finding's `effective_sink_caps` is exactly the
// cap that fired (e.g. `DATA_EXFIL`). When that single cap is the
// sensitivity-stripped one, the finding has no surviving rationale
// and we drop it entirely rather than reroute it to the generic
// `taint-unsanitised-flow` bucket (which would just re-emit the same
// false positive under a different rule id). Findings with a
// multi-cap `effective_sink_caps` keep their non-DATA_EXFIL caps and
// are routed normally below.
if finding.effective_sink_caps == crate::labels::Cap::DATA_EXFIL {
return None;
}
}
// DATA_EXFIL routing.
//
// Multi-gate dispatch (JS / Go) emits one event per cap, so by this
// point each finding's `effective_sink_caps` carries exactly one bit
// and the simple `DATA_EXFIL && !SSRF` test routes correctly. Flat-
// rule paths (Java HTTP clients where type-qualified resolution
// attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call,
// e.g. `client.send(req)` covering both URL and body channels of the
// request value) produce a single dual-cap event. In that case the
// source's sensitivity tier disambiguates: a Sensitive source
// (cookie, header, env, db, session) leaking into an outbound
// request is canonically DATA_EXFIL even if the sink also carries
// an SSRF label, because operator-bound state is not URL-shaped
// attacker input. Plain user input keeps SSRF routing (the typical
// user-controlled-URL pattern).
let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID)
&& (!effective_caps.contains(crate::labels::Cap::SSRF)
|| finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive);
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
"rs.auth.missing_ownership_check.taint".to_string()
} else if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
&& !effective_caps.contains(crate::labels::Cap::SSRF)
{
} else if is_data_exfil_rule {
format!(
"taint-data-exfiltration (source {}:{})",
source_point.row + 1,
@ -396,18 +448,86 @@ fn build_taint_diag(
)
};
// For `DATA_EXFIL` rules, look up which destination object-literal field
// (`body` / `headers` / `json`) the tainted value reached. Each
// [`crate::cfg::GateFilter`] carries `destination_uses` (var names) in
// parallel with `destination_fields` (the field each var was bound to),
// so we walk the gate filter whose `label_caps` includes `DATA_EXFIL`
// and match the tainted var name from the last flow step. Falls back
// to the first non-empty destination field on the matching filter when
// the var-name match fails (e.g. the SSA sink event is reported on a
// copy-propagated value whose name no longer matches the original
// destination ident). `None` when the sink wasn't a destination-aware
// gate (no object literal, or non-fetch sink).
let data_exfil_field: Option<String> = if is_data_exfil_rule {
let last_var = finding
.flow_steps
.last()
.and_then(|s| s.var_name.as_deref());
let filters = &cfg_graph[finding.sink].call.gate_filters;
filters
.iter()
.find(|f| f.label_caps.contains(crate::labels::Cap::DATA_EXFIL))
.and_then(|f| {
if let (Some(uses), Some(var)) = (f.destination_uses.as_ref(), last_var)
&& let Some(idx) = uses.iter().position(|u| u == var)
{
return f.destination_fields.get(idx).cloned();
}
f.destination_fields.first().cloned()
})
} else {
None
};
// DATA_EXFIL severity calibration (Phase: detector ranking).
//
// Generic taint severity comes from `severity_for_source_kind`, which
// maps Cookie/Header/Env to High because those sources are spicy
// *as taint roots*. For `DATA_EXFIL` we are scoring the leak class,
// not the source itself: not every Sensitive-tier source is a Secret.
// Cookies and env carry credential / session material whose leakage
// is an immediate disclosure (Secret-tier); request headers, file
// reads, db rows, and caught exceptions are Sensitive but not
// automatically secret, so they downgrade to Medium. Plain user
// input is already stripped above by the source-sensitivity gate, so
// the `_` arm here is reached only by Sensitive sources that are not
// explicit secrets.
let severity = if is_data_exfil_rule {
match finding.source_kind {
crate::labels::SourceKind::Cookie | crate::labels::SourceKind::EnvironmentConfig => {
crate::patterns::Severity::High
}
_ => crate::patterns::Severity::Medium,
}
} else {
severity_for_source_kind(finding.source_kind)
};
// DATA_EXFIL: surface the destination field in the message so analysts
// see at a glance whether the leak reached the request body, headers,
// or json payload. Generic taint findings stay on the existing
// "unsanitised … flows from … → …" template.
let message = if is_data_exfil_rule {
let suffix = data_exfil_field
.as_deref()
.map(|f| format!(" ({f} field)"))
.unwrap_or_default();
format!("sensitive data flows from {short_source} \u{2192} {sink_display}{suffix}")
} else {
format!("unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}")
};
let mut diag = Diag {
path: primary_path.clone(),
line: primary_line,
col: primary_col,
severity: severity_for_source_kind(finding.source_kind),
severity,
id: diag_id,
category: FindingCategory::Security,
path_validated: finding.path_validated,
guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
message: Some(format!(
"unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}"
)),
message: Some(message),
labels,
confidence: None,
evidence: Some(Evidence {
@ -448,6 +568,7 @@ fn build_taint_diag(
symbolic: finding.symbolic.clone(),
sink_caps: sink_caps_bits,
engine_notes: finding.engine_notes.clone(),
data_exfil_field,
..Default::default()
}),
rank_score: None,
@ -467,7 +588,7 @@ fn build_taint_diag(
ev.confidence_limiters = limiters;
}
diag
Some(diag)
}
/// Resolve a file extension to a language slug (e.g. `"rust"`,
@ -622,6 +743,8 @@ fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str {
use crate::labels::SourceKind;
match sk {
SourceKind::UserInput => "user input",
SourceKind::Cookie => "cookie value",
SourceKind::Header => "request header",
SourceKind::EnvironmentConfig => "environment config",
SourceKind::FileSystem => "file system data",
SourceKind::Database => "database result",
@ -1198,18 +1321,31 @@ impl<'a> ParsedFile<'a> {
continue;
}
out.push(build_taint_diag(
if let Some(diag) = build_taint_diag(
finding,
body_cfg,
&self.source.tree,
self.source.path,
self.source.bytes,
scan_root,
));
) {
out.push(diag);
}
}
// ── CFG structural analyses (per body) ─────────────────────────
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
// Pre-compute, per body, the set of variable names whose
// release / close calls live in a NESTED closure body inside
// that body (e.g. `socket.on("close", () => ws.close())`).
// Both the structural ResourceMisuse pass and the state-model
// leak pass consult it to suppress findings whose cleanup is
// registered as a callback the per-body CFG can't follow.
// Only descendants count — sibling methods on the same class
// don't share resource ownership.
let closure_released_per_body =
state::collect_closure_released_var_names(&self.file_cfg.bodies, caller_lang);
let empty_set: std::collections::HashSet<String> = std::collections::HashSet::new();
for body in &self.file_cfg.bodies {
let body_taint: Vec<_> = taint_results
.iter()
@ -1231,6 +1367,11 @@ impl<'a> ParsedFile<'a> {
body_const_facts: body_const_facts.as_ref(),
type_facts: body_const_facts.as_ref().map(|f| &f.type_facts),
auth_decorators: &body.meta.auth_decorators,
closure_released_var_names: Some(
closure_released_per_body
.get(&body.meta.id)
.unwrap_or(&empty_set),
),
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&self.source.tree, cf.span.0);
@ -1307,6 +1448,11 @@ impl<'a> ParsedFile<'a> {
&body.meta.auth_decorators,
&path_safe_suppressed_spans,
body_pointer_hints.as_ref(),
Some(
closure_released_per_body
.get(&body.meta.id)
.unwrap_or(&empty_set),
),
);
for sf in &state_findings {

View file

@ -1118,6 +1118,7 @@ fn clone_preserves_all_sub_structs() {
arg_string_literals: vec![Some("lit".into())],
destination_uses: None,
gate_filters: Vec::new(),
is_constructor: false,
},
taint: TaintMeta {
labels: {

View file

@ -373,11 +373,26 @@ pub(crate) fn first_member_label(
if let Some(full) = member_expr_text(n, code) {
// Try the full text first, then progressively strip the last segment
// to match rules like "process.env" from "process.env.CMD".
//
// The strip-and-retry only ever yields a sound label for Sources:
// `process.env.CMD` → strip → `process.env` makes sense because
// the receiver itself IS the source. Sinks and Sanitizers, by
// contrast, name the *operation* — `connection.query`, `eval`,
// `exec` — and stripping a trailing segment to match them is
// not semantically valid (e.g. `exec.start` should never be
// treated as a SHELL_ESCAPE sink because of bare `exec`). We
// accept any label on a full-text match (the behaviour callers
// already depend on for Source/Sink labels alike), but only
// accept Source labels after segment stripping.
let mut candidate = full.as_str();
let mut first = true;
loop {
if let Some(lbl) = classify(lang, candidate, extra_labels) {
return Some(lbl);
if first || matches!(lbl, DataLabel::Source(_)) {
return Some(lbl);
}
}
first = false;
match candidate.rsplit_once('.') {
Some((prefix, _)) => candidate = prefix,
None => break,

View file

@ -38,25 +38,27 @@ pub(super) fn find_call_node<'a>(n: Node<'a>, lang: &str) -> Option<Node<'a>> {
}
}
/// Extract identifiers from specified fields of an object-literal argument.
/// Extract `(field_name, ident_name)` pairs from specified fields of an
/// object-literal argument.
///
/// Returns:
/// * `Some(names)` if the positional argument at `index` IS an object literal
/// (JS `object`, TS `object`, Python `dictionary`). `names` contains
/// identifiers lifted from pair values whose key matches any entry in
/// `fields` (case-sensitive; JS/TS identifiers). When no destination-field
/// pairs are present, returns `Some(vec![])`, the sink is effectively
/// silenced because no destination identifier exists.
/// * `Some(pairs)` if the positional argument at `index` IS an object literal
/// (JS `object`, TS `object`, Python `dictionary`). Each pair is
/// `(field_name, ident_name)` where `field_name` is the matched key from
/// `fields` and `ident_name` is an identifier lifted from that pair's
/// value expression. When no destination-field pairs are present, returns
/// `Some(vec![])`, the sink is effectively silenced because no destination
/// identifier exists.
/// * `None` if the arg is absent, is not an object literal (plain string
/// / ident / expression), or has splat/spread children that break static
/// per-field reasoning. Callers fall back to the whole-arg positional
/// filter in this case.
pub(super) fn extract_destination_field_idents(
pub(super) fn extract_destination_field_pairs(
call_node: Node,
arg_index: usize,
fields: &[&str],
code: &[u8],
) -> Option<Vec<String>> {
) -> Option<Vec<(String, String)>> {
if fields.is_empty() {
return None;
}
@ -71,7 +73,7 @@ pub(super) fn extract_destination_field_idents(
return None;
}
let mut out: Vec<String> = Vec::new();
let mut out: Vec<(String, String)> = Vec::new();
let mut c = arg.walk();
for child in arg.named_children(&mut c) {
match child.kind() {
@ -88,8 +90,8 @@ pub(super) fn extract_destination_field_idents(
let Some(name) = text_of(child, code) else {
continue;
};
if fields.iter().any(|&f| f == name) && !out.contains(&name) {
out.push(name);
if fields.iter().any(|&f| f == name) && !out.iter().any(|(_, v)| v == &name) {
out.push((name.clone(), name));
}
}
"pair" => {
@ -124,8 +126,8 @@ pub(super) fn extract_destination_field_idents(
let mut paths: Vec<String> = Vec::new();
collect_idents_with_paths(val_node, code, &mut idents, &mut paths);
for name in paths.into_iter().chain(idents) {
if !out.contains(&name) {
out.push(name);
if !out.iter().any(|(_, v)| v == &name) {
out.push((key.clone(), name));
}
}
}
@ -135,6 +137,62 @@ pub(super) fn extract_destination_field_idents(
Some(out)
}
/// Collect `(field_name, ident_name)` pairs from the keyword arguments of a
/// call.
///
/// Only `keyword_argument` / `named_argument` children of the call's
/// `arguments` node are inspected, and only those whose keyword name is
/// listed in `fields`. This covers calls that pass destination-bearing
/// values as direct kwargs instead of inside a dict literal, e.g. Python
/// `requests.post(url, data=tainted, json=safe)`.
///
/// For each matching kwarg, every dotted path and bare identifier found in
/// its value expression yields one pair tagged with the kwarg name (paths
/// first, then identifiers). An identifier is recorded at most once across
/// the whole call, under the first kwarg in which it appears.
///
/// Returns an empty vector when `fields` is empty, when the call has no
/// `arguments` field, or when no kwarg matches.
pub(super) fn extract_destination_kwarg_pairs(
    call_node: Node,
    fields: &[&str],
    code: &[u8],
) -> Vec<(String, String)> {
    let mut pairs: Vec<(String, String)> = Vec::new();
    if fields.is_empty() {
        return pairs;
    }
    let Some(arguments) = call_node.child_by_field_name("arguments") else {
        return pairs;
    };

    let mut walker = arguments.walk();
    for kwarg in arguments.named_children(&mut walker) {
        if !matches!(kwarg.kind(), "keyword_argument" | "named_argument") {
            continue;
        }

        // Prefer the grammar's `name` / `value` fields; fall back to the
        // first and last named children when those fields are absent.
        let child_count = kwarg.named_child_count();
        let key_node = kwarg
            .child_by_field_name("name")
            .or_else(|| kwarg.named_child(0));
        let val_node = kwarg
            .child_by_field_name("value")
            .or_else(|| kwarg.named_child(child_count.saturating_sub(1) as u32));
        let (Some(key_node), Some(val_node)) = (key_node, val_node) else {
            continue;
        };

        let Some(field) = text_of(key_node, code) else {
            continue;
        };
        let is_destination = fields.iter().any(|&wanted| wanted == field);
        if !is_destination {
            continue;
        }

        let mut bare_idents = Vec::new();
        let mut dotted_paths = Vec::new();
        collect_idents_with_paths(val_node, code, &mut bare_idents, &mut dotted_paths);
        for candidate in dotted_paths.into_iter().chain(bare_idents) {
            let already_recorded = pairs.iter().any(|(_, seen)| seen == &candidate);
            if !already_recorded {
                pairs.push((field.clone(), candidate));
            }
        }
    }
    pairs
}
/// Extract the string-literal content at argument position `index` (0-based).
/// Returns `None` if the argument is not a string literal or the index is out of range.
pub(super) fn extract_const_string_arg(
@ -144,7 +202,14 @@ pub(super) fn extract_const_string_arg(
) -> Option<String> {
let args = call_node.child_by_field_name("arguments")?;
let mut cursor = args.walk();
let arg = args.named_children(&mut cursor).nth(index)?;
let mut arg = args.named_children(&mut cursor).nth(index)?;
// PHP / Go wrap each positional argument in an `argument` node; unwrap so
// the kind-match below sees the inner literal.
if arg.kind() == "argument" && arg.named_child_count() == 1 {
if let Some(inner) = arg.named_child(0) {
arg = inner;
}
}
match arg.kind() {
// `string` / `string_literal` cover JS/TS, Python, Java, PHP, C/C++, Ruby, Rust;
// `interpreted_string_literal` / `raw_string_literal` cover Go's
@ -177,6 +242,39 @@ pub(super) fn extract_const_string_arg(
}
}
/// Return the identifier text of the positional argument at `index`
/// (0-based) when that argument is a bare constant name.
///
/// This serves languages whose activation values are preprocessor or
/// `define`-style symbols rather than string literals (C, C++, PHP), e.g.
/// `CURLOPT_POSTFIELDS`, which parses as an `identifier` / `name` node
/// instead of a `string`.
///
/// A wrapping `argument` node with exactly one named child is unwrapped
/// first. Accepted node kinds are `identifier`, `name`, `qualified_name`
/// and `scoped_identifier`, so scoped C++ names and PHP namespaced names
/// are returned as written.
///
/// Returns `None` when the call has no `arguments` field, the index is out
/// of range, or the argument has any other shape (variable expression,
/// call, literal, ...), leaving dynamic-activation handling to the caller.
pub(super) fn extract_const_macro_arg(
    call_node: Node,
    index: usize,
    code: &[u8],
) -> Option<String> {
    let arguments = call_node.child_by_field_name("arguments")?;
    let mut walker = arguments.walk();
    let positional = arguments.named_children(&mut walker).nth(index)?;

    // Some grammars wrap each positional argument in an `argument` node;
    // look through it so the kind test sees the inner expression.
    let target = if positional.kind() == "argument" && positional.named_child_count() == 1 {
        positional.named_child(0).unwrap_or(positional)
    } else {
        positional
    };

    let is_constant_name = matches!(
        target.kind(),
        "identifier" | "name" | "qualified_name" | "scoped_identifier"
    );
    if !is_constant_name {
        return None;
    }
    text_of(target, code).map(|s| s.to_string())
}
/// Extract the value of a keyword argument from a call node (e.g. Python `shell=True`).
/// Walks argument children looking for `keyword_argument` nodes, matches the keyword
/// name, and extracts the value node text for literals.
@ -1546,6 +1644,59 @@ pub(super) fn def_use(
(None, uses, vec![])
}
// for-in / for-of / Python `for x in iter:` ─────────────────────────
//
// Tree-sitter classifies these as `Kind::For` with a `left`/`right`
// field pair (binding pattern + iterable). Without an explicit
// arm here, the default branch collects every ident as a `use` and
// never registers the iteration binding as a `define`, so taint
// entering the iterable does not propagate into the body's
// references to the binding (`for (const [a, b] of obj) { sink(a) }`
// would lose the flow at `a`).
//
// C-style `for_statement` has no `left`/`right` fields (it uses
// `initializer`/`condition`/`increment`), so this path falls through
// to the default-collecting behaviour for those, preserving today's
// semantics.
Kind::For => {
let left = ast.child_by_field_name("left");
let right = ast.child_by_field_name("right");
if left.is_none() && right.is_none() {
// C-style for, defer to default ident collection.
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(ast, code, &mut idents, &mut paths);
let mut uses = paths;
uses.extend(idents);
return (None, uses, vec![]);
}
let mut defs: Option<String> = None;
let mut extra_defs: Vec<String> = Vec::new();
let mut uses: Vec<String> = Vec::new();
if let Some(pat) = left {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(pat, code, &mut idents, &mut paths);
let first = paths.pop().or_else(|| idents.first().cloned());
for ident in &idents {
if first.as_ref() != Some(ident) {
extra_defs.push(ident.clone());
}
}
defs = first;
}
if let Some(val) = right {
let mut idents = Vec::new();
let mut paths = Vec::new();
collect_idents_with_paths(val, code, &mut idents, &mut paths);
uses.extend(paths);
uses.extend(idents);
}
(defs, uses, extra_defs)
}
// everything else no definition, but may read vars
_ => {
let mut idents = Vec::new();
@ -1557,3 +1708,225 @@ pub(super) fn def_use(
}
}
}
/// One match from [`extract_shell_array_payload_idents`].
///
/// Describes a single shell-execution array literal (for example
/// `["sh", "-c", <cmd>]` or `["cmd", "/c", <cmd>]`) found at a call site.
///
/// As produced by [`extract_shell_array_payload_idents`], `payload_idents`
/// is never empty: an array whose payload references no identifier at all
/// (a fully constant command) is not reported as a match.
#[derive(Debug, Clone)]
pub(super) struct ShellArrayMatch {
    /// Index of the argument holding the array literal, counted over the
    /// named children of the call's `arguments` node. For the
    /// object-literal form (`exec({Cmd: [...]})`) this is the index of the
    /// enclosing object argument.
    pub arg_position: usize,
    /// Dotted paths and bare identifiers referenced by the array elements
    /// at positions 2 and later, de-duplicated in first-seen order.
    pub payload_idents: Vec<String>,
}
/// Find inline shell-execution array literals among a call's arguments.
///
/// A shell-execution array has the shape `[<shell>, <exec-flag>, <payload>..]`
/// where element 0 is a constant string naming a known shell and element 1
/// is that shell's "run this command string" switch (e.g. `-c` for POSIX
/// shells, `/c` for `cmd.exe`). Such an array is recognised when it is
/// either:
/// * a direct positional argument of `call_node`, or
/// * the value of a `pair` inside an `object` / `dictionary` positional
///   argument (the `container.exec({Cmd: ["bash", "-c", x]})` form). The
///   field name is deliberately not restricted: the array shape is the gate.
///
/// One [`ShellArrayMatch`] is returned per detected array, carrying the
/// argument index and the identifiers referenced by the payload elements.
/// Arrays whose payload references no identifier are not reported. Benign
/// arrays such as `["ls", "-la"]` or `["git", "rev-parse", "HEAD"]` never
/// match because element 0 is not a shell.
///
/// Keyword / named arguments are skipped. If any splat or spread argument
/// is present the whole call yields an empty result, since positional
/// indices can no longer be trusted. The result is also empty when the call
/// has no `arguments` field.
pub(super) fn extract_shell_array_payload_idents(
    call_node: Node,
    code: &[u8],
) -> Vec<ShellArrayMatch> {
    let Some(arguments) = call_node.child_by_field_name("arguments") else {
        return Vec::new();
    };

    let mut found = Vec::new();
    let mut arg_cursor = arguments.walk();
    for (arg_position, arg) in arguments.named_children(&mut arg_cursor).enumerate() {
        match arg.kind() {
            // Splats break positional indexing; bail out on the whole call.
            "spread_element"
            | "dictionary_splat"
            | "list_splat"
            | "splat_argument"
            | "hash_splat_argument" => return Vec::new(),
            "keyword_argument" | "named_argument" => continue,
            _ => {}
        }

        // Case 1: the argument itself is the shell array.
        if let Some(payload_idents) = shell_array_payload_idents_of(arg, code) {
            found.push(ShellArrayMatch {
                arg_position,
                payload_idents,
            });
            continue;
        }

        // Case 2: an object literal with one or more shell-array values.
        if !matches!(arg.kind(), "object" | "dictionary") {
            continue;
        }
        let mut pair_cursor = arg.walk();
        let nested = arg
            .named_children(&mut pair_cursor)
            .filter(|entry| entry.kind() == "pair")
            .filter_map(|entry| entry.child_by_field_name("value"))
            .filter_map(|value| shell_array_payload_idents_of(unwrap_parens(value), code))
            .map(|payload_idents| ShellArrayMatch {
                arg_position,
                payload_idents,
            });
        found.extend(nested);
    }
    found
}
/// Return the identifiers referenced by the payload of a shell-execution
/// array literal.
///
/// `node` (after stripping parentheses) must be an `array` with at least
/// three named elements, whose element 0 is a constant string naming a
/// known shell and whose element 1 is a constant string holding that
/// shell's command-execution switch. The payload is every element from
/// position 2 onwards.
///
/// On a match, returns the dotted paths followed by the bare identifiers
/// found in the payload, de-duplicated in first-seen order.
///
/// Returns `None` when the node is not such an array, and also when the
/// payload references no identifier at all: a fully static payload cannot
/// carry taint, so no match is produced for it.
fn shell_array_payload_idents_of(node: Node, code: &[u8]) -> Option<Vec<String>> {
    let array = unwrap_parens(node);
    if array.kind() != "array" {
        return None;
    }

    // Named children only, which skips commas and other trivia.
    let mut walker = array.walk();
    let elements: Vec<Node> = array.named_children(&mut walker).collect();
    let [shell_node, flag_node, payload @ ..] = elements.as_slice() else {
        return None;
    };
    if payload.is_empty() {
        return None;
    }

    let shell = const_string_value(*shell_node, code)?;
    if !is_known_shell(&shell) {
        return None;
    }
    let flag = const_string_value(*flag_node, code)?;
    if !is_shell_command_flag(&shell, &flag) {
        return None;
    }

    // Constants contribute nothing here; only identifier references are
    // lifted out of the payload elements.
    let mut bare_idents: Vec<String> = Vec::new();
    let mut dotted_paths: Vec<String> = Vec::new();
    for element in payload {
        collect_idents_with_paths(*element, code, &mut bare_idents, &mut dotted_paths);
    }

    let mut seen = std::collections::HashSet::new();
    let referenced: Vec<String> = dotted_paths
        .into_iter()
        .chain(bare_idents)
        .filter(|name| seen.insert(name.clone()))
        .collect();

    if referenced.is_empty() {
        None
    } else {
        Some(referenced)
    }
}
/// Return the contents of a constant string literal, without its
/// delimiters.
///
/// Parentheses around `node` are stripped first. Accepted node kinds are
/// `string`, `string_literal`, `interpreted_string_literal`,
/// `raw_string_literal`, and a `template_string` that contains no
/// `template_substitution` child. The result is the node's source text
/// with its first and last byte removed.
///
/// Returns `None` for any other node kind (identifiers, expressions),
/// for interpolated template strings, when the node text is unavailable,
/// or when the text is shorter than two bytes.
fn const_string_value(node: Node, code: &[u8]) -> Option<String> {
    let literal = unwrap_parens(node);
    let kind = literal.kind();

    let is_plain_string = matches!(
        kind,
        "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal"
    );
    if !is_plain_string {
        if kind != "template_string" {
            return None;
        }
        // A template string is constant only when nothing is interpolated.
        let mut walker = literal.walk();
        let interpolated = literal
            .named_children(&mut walker)
            .any(|part| part.kind() == "template_substitution");
        if interpolated {
            return None;
        }
    }

    let raw = text_of(literal, code)?;
    if raw.len() < 2 {
        return None;
    }
    // Drop the opening and closing delimiter.
    Some(raw[1..raw.len() - 1].to_string())
}
/// Return the final path component of a shell executable name.
///
/// Both `/` and `\` are treated as separators so that POSIX paths
/// (`/bin/bash` → `bash`) and Windows paths
/// (`C:\Windows\System32\cmd.exe` → `cmd.exe`) reduce to the bare
/// executable name. A name without any separator is returned unchanged.
fn shell_executable_leaf(name: &str) -> &str {
    name.rsplit(|c: char| c == '/' || c == '\\')
        .next()
        .unwrap_or(name)
}

/// Known shell executable names that activate the shell-array detector.
///
/// Scoped narrowly to POSIX shells and Windows command interpreters,
/// listing only canonical names so benign arrays like `["ls", ...]`,
/// `["git", ...]`, or `["python", ...]` do not match. Any directory prefix
/// (POSIX or Windows style) is ignored; the comparison against the list is
/// case-sensitive.
fn is_known_shell(name: &str) -> bool {
    matches!(
        shell_executable_leaf(name),
        "bash"
            | "sh"
            | "zsh"
            | "dash"
            | "ksh"
            | "fish"
            | "ash"
            | "tcsh"
            | "csh"
            | "cmd"
            | "cmd.exe"
            | "powershell"
            | "powershell.exe"
            | "pwsh"
            | "pwsh.exe"
    )
}

/// True when `flag` is the "execute the following string as a shell command"
/// switch for the given `shell`.
///
/// * `cmd` / `cmd.exe`: `/c` or `/k`, in either case.
/// * PowerShell (`powershell`, `pwsh`, with or without `.exe`): `-c`,
///   `-Command` or `-EncodedCommand`, compared case-insensitively because
///   PowerShell parameter names are not case-sensitive.
/// * Anything else is treated as a POSIX shell and must be exactly `-c`
///   (the comparison stays case-sensitive: `-C` is a different option).
///
/// The shell is identified by its final path component, so a full path
/// such as `C:\Windows\System32\cmd.exe` selects the `cmd.exe` rules
/// instead of falling through to the POSIX rule.
fn is_shell_command_flag(shell: &str, flag: &str) -> bool {
    match shell_executable_leaf(shell) {
        "cmd" | "cmd.exe" => matches!(flag, "/c" | "/C" | "/k" | "/K"),
        "powershell" | "powershell.exe" | "pwsh" | "pwsh.exe" => {
            ["-c", "-command", "-encodedcommand"]
                .iter()
                .any(|known| flag.eq_ignore_ascii_case(known))
        }
        // POSIX shells.
        _ => flag == "-c",
    }
}

View file

@ -52,10 +52,11 @@ use literals::has_sql_placeholders;
use literals::{
arg0_kind_and_interpolation, call_ident_of, def_use, detect_go_replace_call_sanitizer,
detect_rust_replace_chain_sanitizer, extract_arg_callees, extract_arg_string_literals,
extract_arg_uses, extract_const_keyword_arg, extract_const_string_arg,
extract_destination_field_idents, extract_kwargs, extract_literal_rhs, find_call_node,
find_call_node_deep, find_chained_inner_call, has_keyword_arg, has_only_literal_args,
is_parameterized_query_call, java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
extract_arg_uses, extract_const_keyword_arg, extract_const_macro_arg, extract_const_string_arg,
extract_destination_field_pairs, extract_destination_kwarg_pairs, extract_kwargs,
extract_literal_rhs, extract_shell_array_payload_idents, find_call_node, find_call_node_deep,
find_chained_inner_call, has_keyword_arg, has_only_literal_args, is_parameterized_query_call,
java_chain_arg0_kind_for_method, js_chain_arg0_kind_for_method,
js_chain_outer_method_for_inner, ruby_chain_arg0_for_method, walk_chain_inner_call_args,
};
use params::{
@ -312,6 +313,15 @@ pub struct CallMeta {
/// [`Self::destination_uses`]).
#[serde(default)]
pub gate_filters: Vec<GateFilter>,
/// True when this call expression is a constructor invocation
/// (e.g. JS/TS `new Stripe(key)`, PHP `new PDO(...)`). The SSA Call
/// transfer uses this to narrow the constructed value's caps: a wrapper
/// object instance is structurally not a path string, format string,
/// URL component, or JSON input, so out-of-process side-effect bits
/// (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on the arguments
/// must not survive into the constructed object.
#[serde(default)]
pub is_constructor: bool,
}
/// One gate's contribution at a call site whose callee matches multiple
@ -329,6 +339,15 @@ pub struct GateFilter {
/// considers SSA values whose `var_name` matches one of `names` (object-
/// literal destination fields lifted at CFG time). `None` ⇒ whole arg.
pub destination_uses: Option<Vec<String>>,
/// Parallel to [`Self::destination_uses`]: for each entry, the
/// destination object-literal field name (e.g. `"body"`, `"headers"`,
/// `"json"`) where the corresponding ident was bound. Empty when
/// `destination_uses` is `None` or the gate had no
/// `object_destination_fields` configured. Consumed by diag rendering
/// to embed the destination field in `DATA_EXFIL` messages and SARIF
/// `properties.data_exfil_field`.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub destination_fields: Vec<String>,
}
/// Taint-classification and variable-flow metadata.
@ -450,6 +469,13 @@ pub struct NodeInfo {
/// up the field's declared `TypeKind`. Strictly additive, when
/// `None`, the legacy copy-prop semantics apply.
pub member_field: Option<String>,
/// True when this assignment / declaration's RHS is a function or
/// lambda literal (`obj.handler = (e) => {...}`, `let f = function(){}`).
/// State analysis uses this to suppress resource-ownership transfer:
/// storing a function reference into a property does not move the
/// resources captured by the closure body, so the lifecycle of those
/// captures must remain unchanged on the assignment node.
pub rhs_is_function_literal: bool,
}
impl NodeInfo {
@ -1564,6 +1590,92 @@ pub(super) fn push_node<'a>(
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
let mut labels = classify_all(lang, &text, extra);
// Rust chain-text classification. The default `text` for a Rust
// CallMethod is `{root_receiver}.{method}`, where `root_receiver`
// is the leftmost identifier after walking through every nested
// call/method receiver. That convention loses the intermediate
// chain methods, so a body-binding chain like
// `Client::post(url).body(payload).send()` reduces to
// `Client::post.send` and rules keyed on `body.send` /
// `RequestBuilder.body` cannot fire.
//
// Reclassify against the call-AST's source text (with paren groups
// stripped) so suffix matchers covering chain shapes
// (`body.send`, `body_string`, `Request::builder.body`, ...) attach.
// Strictly additive: we union new labels with the existing ones,
// never override. Limited to Rust to avoid disturbing the other
// languages' chain conventions.
if lang == "rust" {
if let Some(cn) = find_call_node(ast, lang) {
if let Some(chain_raw) = text_of(cn, code) {
// Multi-line Rust chains (`Client::new()\n .post(url)\n
// .body(p)\n .send()`) preserve interior whitespace in
// the source slice, which would prevent suffix matchers
// like `body.send` from firing. Strip whitespace before
// normalizing paren groups, mirroring the same trick
// used by `find_chained_inner_call` for JS/TS chains.
let chain_compact: String =
chain_raw.chars().filter(|c| !c.is_whitespace()).collect();
let chain_text = crate::labels::normalize_chained_call_for_classify(&chain_compact);
if chain_text != text {
let chain_labels = classify_all(lang, &chain_text, extra);
for l in chain_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Also try classification against the chain with
// trailing identity methods peeled. Rust chains often
// end in `.unwrap()` / `.expect("...")` / `.await` /
// `.clone()` etc., which obscure the body-bind verb
// for suffix matchers. E.g. hyper's
// `Request::builder().method(..).uri(..).body(p).unwrap()`
// peels to `...body`, allowing a simpler `body` /
// `Request::builder.body` matcher to fire.
let peeled = crate::ssa::type_facts::peel_identity_suffix(&chain_text);
if peeled != chain_text && peeled != text {
let peeled_labels = classify_all(lang, &peeled, extra);
for l in peeled_labels {
if !labels.contains(&l) {
labels.push(l);
}
}
}
// Pattern synthesis: the hyper request-builder chain
// (`hyper::Request::builder().method(..).uri(..).body(p)`)
// can interleave `.method`, `.uri`, `.header`, `.version`
// etc. between `Request::builder` and the body-bind step.
// Suffix matchers can't span those, so synthesise a
// DATA_EXFIL sink whenever the chain begins with
// `Request::builder` and ends in a body-binding verb.
// Strictly additive: no labels are removed, only added,
// and the synthesis only fires when an explicit Sink
// hasn't already attached.
let chain_for_synth = if peeled != chain_text {
&peeled
} else {
&chain_text
};
if !labels
.iter()
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(crate::labels::Cap::DATA_EXFIL)))
&& (chain_for_synth.contains("Request::builder.")
|| chain_for_synth.contains("hyper::Request::builder."))
{
let last_seg =
chain_for_synth.rsplit('.').next().unwrap_or(chain_for_synth);
if matches!(
last_seg,
"body" | "body_mut" | "body_string" | "body_json" | "body_bytes"
) {
labels.push(DataLabel::Sink(crate::labels::Cap::DATA_EXFIL));
}
}
}
}
}
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))`, `str` is not a sink, but `eval` is.
// When the callee is overridden, save the original for container ops
@ -1727,7 +1839,23 @@ pub(super) fn push_node<'a>(
let mut sink_payload_args: Option<Vec<usize>> = None;
let mut destination_uses: Option<Vec<String>> = None;
let mut gate_filters: Vec<GateFilter> = Vec::new();
if labels.is_empty() {
// Gates run when no flat `Sink` label is already present, OR when a
// matching gate restricts the payload-arg set on top of an existing flat
// sink. Source / Sanitizer labels are orthogonal — a callee like
// Python's `requests.post` is a `Source` for its response object AND a
// gated `Sink` for its URL/body argument positions; both should attach.
//
// Payload-arg refinement: when a flat sink matches a callee that ALSO
// has a gate entry restricting `payload_args`, the gate's `payload_args`
// are propagated to `sink_payload_args` so only those positions are
// taint-checked. Example: `execSync(cmd, { env: process.env })` matches
// the bare `execSync` flat `Sink(SHELL_ESCAPE)` AND the gate `=execSync`
// with `payload_args: &[0]`; without the refinement, the flat rule's
// implicit "all args" would flag `process.env` flowing into the options
// object's `env` field. The gate's labels themselves are deduped so a
// single capability never double-attributes.
let has_sink_label = labels.iter().any(|l| matches!(l, DataLabel::Sink(_)));
{
let gate_call = call_ast.or_else(|| find_call_node_deep(ast, lang, 4));
if let Some(cn) = gate_call {
let gate_callee_text = if call_ast.is_some() {
@ -1746,7 +1874,22 @@ pub(super) fn push_node<'a>(
let matches = classify_gated_sink(
lang,
&gate_callee_text,
|idx| extract_const_string_arg(cn, idx, code),
|idx| {
extract_const_string_arg(cn, idx, code).or_else(|| {
// C/C++ preprocessor macros and PHP `define`d constants
// surface as identifier nodes, not string literals.
// Falling back to the macro-arg extractor for those
// languages lets gates like `curl_easy_setopt` /
// `curl_setopt` activate on a `CURLOPT_POSTFIELDS`
// ident match instead of firing conservatively on
// every positional arg.
if matches!(lang, "c" | "cpp" | "c++" | "php") {
extract_const_macro_arg(cn, idx, code)
} else {
None
}
})
},
|kw| extract_const_keyword_arg(cn, kw, code),
|kw| has_keyword_arg(cn, kw, code),
);
@ -1758,11 +1901,23 @@ pub(super) fn push_node<'a>(
// * a `GateFilter` carrying that gate's specific
// `(label_caps, payload_args, destination_uses)` so
// the SSA sink scan can attribute taint per-cap.
//
// When a flat sink already matches, gate labels are deduped
// so the same capability isn't attributed twice (once flat,
// once gated). Their `payload_args` still flow into
// `sink_payload_args` so the gate's arg-position restriction
// applies on top of the flat sink.
let mut union_payload: Vec<usize> = Vec::new();
for gm in &matches {
labels.push(gm.label);
if has_sink_label {
if !labels.contains(&gm.label) {
labels.push(gm.label);
}
} else {
labels.push(gm.label);
}
let payload_vec: Vec<usize> =
let mut payload_vec: Vec<usize> =
if gm.payload_args == crate::labels::ALL_ARGS_PAYLOAD {
// Dynamic-activation sentinel: every positional arg is
// conservatively a payload. Expand using the actual
@ -1780,19 +1935,57 @@ pub(super) fn push_node<'a>(
// checks to identifiers under those fields. Non-object
// arg forms return `None` from the extractor and the gate
// falls back to whole-arg positional filtering.
//
// The pair form preserves which object-literal field each
// ident was bound to (e.g. `body` vs `headers` vs `json`)
// so diag rendering can attribute `DATA_EXFIL` findings to
// a specific destination field.
let mut dest_uses: Option<Vec<String>> = None;
let mut dest_fields: Vec<String> = Vec::new();
if !gm.object_destination_fields.is_empty() {
let mut all_pairs: Vec<(String, String)> = Vec::new();
let mut had_object_match = false;
for &pos in gm.payload_args {
if let Some(names) = extract_destination_field_idents(
if let Some(pairs) = extract_destination_field_pairs(
cn,
pos,
gm.object_destination_fields,
code,
) {
dest_uses = Some(names);
all_pairs.extend(pairs);
had_object_match = true;
break;
}
}
// Direct kwargs: languages where destination-bearing
// fields are passed as `keyword_argument` siblings of
// the positional args (Python `data=`, Ruby kwargs).
// SSA lowering folds kwarg idents into the implicit
// args group at index `arity`, so we expand
// `payload_vec` to include that position; the
// `destination_filter` then narrows to the kwarg
// ident's `var_name`.
let kwarg_pairs =
extract_destination_kwarg_pairs(cn, gm.object_destination_fields, code);
if !kwarg_pairs.is_empty() {
let arity = extract_arg_uses(cn, code).len();
if !payload_vec.contains(&arity) {
payload_vec.push(arity);
}
for pair in kwarg_pairs {
if !all_pairs.iter().any(|(_, v)| v == &pair.1) {
all_pairs.push(pair);
}
}
}
if had_object_match || !all_pairs.is_empty() {
let (fields, vars): (Vec<String>, Vec<String>) =
all_pairs.into_iter().unzip();
dest_uses = Some(vars);
dest_fields = fields;
}
}
let label_caps = match gm.label {
@ -1809,6 +2002,7 @@ pub(super) fn push_node<'a>(
label_caps,
payload_args: payload_vec,
destination_uses: dest_uses,
destination_fields: dest_fields,
});
}
if !union_payload.is_empty() {
@ -1826,6 +2020,65 @@ pub(super) fn push_node<'a>(
}
}
// ── Inline shell-array sink synthesis ────────────────────────────────
//
// Recognise `[<shell>, "-c", <payload>]` (and `cmd /c <payload>`)
// appearing as an argument to *any* call. The shell-array shape itself
// is the gate, regardless of callee, so this fires through user-defined
// wrappers like `execInContainer(id, ["bash", "-c", `echo ${tainted}`])`
// without needing per-wrapper summary annotations. Only fires for JS/TS
// because the array-literal grammar (`array` node) and shell-form usage
// are JS/TS conventions; other languages use different shapes for
// shell-exec wrappers.
//
// The inner array also covers Dockerode's
// `container.exec({Cmd: [shell, "-c", payload]})`: the helper looks
// inside object-literal args for shell-array values under any field.
//
// Existing FP carve-outs are preserved. `["ls", "-la"]` doesn't match
// (element 0 is not a known shell). `untaintedArrayVariable` doesn't
// match (variable, not literal). `execSync(cmd, { env: process.env })`
// doesn't match (string + object args, no shell-array literal). When
// the payload elements are constant strings the helper returns no
// match, so a literal `["bash", "-c", "ls -la"]` doesn't fire either.
if matches!(lang, "javascript" | "js" | "typescript" | "ts") {
if let Some(cn) = call_ast.or_else(|| find_call_node_deep(ast, lang, 4)) {
let shell_matches = extract_shell_array_payload_idents(cn, code);
if !shell_matches.is_empty() {
let shell_label = DataLabel::Sink(Cap::SHELL_ESCAPE);
let already_has_shell_sink = labels.iter().any(|l| match l {
DataLabel::Sink(c) => c.contains(Cap::SHELL_ESCAPE),
_ => false,
});
if !already_has_shell_sink {
labels.push(shell_label);
}
let mut union_payload: Vec<usize> = sink_payload_args.clone().unwrap_or_default();
for sm in shell_matches {
if !union_payload.contains(&sm.arg_position) {
union_payload.push(sm.arg_position);
}
gate_filters.push(GateFilter {
label_caps: Cap::SHELL_ESCAPE,
payload_args: vec![sm.arg_position],
destination_uses: Some(sm.payload_idents),
destination_fields: Vec::new(),
});
}
if !union_payload.is_empty() {
sink_payload_args = Some(union_payload);
}
// Legacy single-gate path: when this is the only gate filter,
// populate the top-level destination_uses too so the SSA
// fast-path stays consistent with the multi-gate behaviour.
if gate_filters.len() == 1 {
destination_uses = gate_filters[0].destination_uses.clone();
}
}
}
}
// Pattern-based sanitizer synthesis: recognise a Rust
// `param.replace(LIT, LIT)[.replace(LIT, LIT)]*` chain that provably strips
// path-traversal or HTML metacharacters. The CFG collapses the whole
@ -2296,6 +2549,20 @@ pub(super) fn push_node<'a>(
// just bloat every labeled Call node.
let callee_span = inner_callee_span.or(inner_text_span).filter(|s| *s != span);
// Constructor detection: a `new X(...)` call carries different cap
// semantics than a plain function call. The SSA Call transfer uses
// this flag to narrow the constructed value's caps so out-of-process
// side-effect bits (FILE_IO, FMT_STRING, URL_ENCODE, JSON_PARSE) on
// the arguments don't survive into a wrapper-object instance.
// Recognised node kinds (matched regardless of `lang`):
// * `new_expression` (JS/TS, C++)
// * `object_creation_expression` (Java, PHP)
let is_constructor = ast.kind() == "new_expression"
|| ast.kind() == "object_creation_expression"
|| call_ast
.is_some_and(|cn| matches!(cn.kind(), "new_expression" | "object_creation_expression"));
let idx = g.add_node(NodeInfo {
kind,
call: CallMeta {
@ -2311,6 +2578,7 @@ pub(super) fn push_node<'a>(
arg_string_literals,
destination_uses,
gate_filters,
is_constructor,
},
taint: TaintMeta {
labels,
@ -2339,6 +2607,7 @@ pub(super) fn push_node<'a>(
is_eq_with_const: detect_eq_with_const(ast, lang),
is_numeric_length_access: detect_numeric_length_access(ast, lang, code),
member_field: detect_member_field_assignment(ast, code),
rhs_is_function_literal: rhs_is_function_literal(ast, lang),
});
debug!(
@ -2404,7 +2673,10 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
if candidate.is_none() {
// Walk one level into declarations whose direct child is the
// declarator (variable_declaration → variable_declarator →
// value).
// value), or expression-statement wrappers whose direct child is
// an assignment_expression / assignment with a `right` field
// (JS `expression_statement > assignment_expression`, Python
// `expression_statement > assignment`).
let mut cursor = ast.walk();
for c in ast.children(&mut cursor) {
if matches!(
@ -2417,6 +2689,11 @@ fn rhs_is_function_literal(ast: Node, lang: &str) -> bool {
if candidate.is_some() {
break;
}
} else if matches!(lookup(lang, c.kind()), Kind::Assignment) {
candidate = c.child_by_field_name("right");
if candidate.is_some() {
break;
}
}
}
}
@ -4417,7 +4694,23 @@ fn apply_promisify_labels(
let Some(alias) = aliases.get(&callee) else {
continue;
};
let wrapped_labels = classify_all(lang, &alias.wrapped, extra);
// Inherit both flat and gated labels from the wrapped callee.
// Gated sinks (e.g. `child_process.exec`) carry the same
// capability semantics as flat sinks, just with arg-position
// filtering at the call site; the promisify alias should
// surface the wrapped function's sink class regardless of
// which arm originally classified it.
let mut wrapped_labels: Vec<crate::labels::DataLabel> =
classify_all(lang, &alias.wrapped, extra)
.into_iter()
.collect();
for gm in
classify_gated_sink(lang, &alias.wrapped, |_| None, |_| None, |_| false).iter()
{
if !wrapped_labels.contains(&gm.label) {
wrapped_labels.push(gm.label);
}
}
if wrapped_labels.is_empty() {
continue;
}

View file

@ -678,12 +678,30 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
if info.kind == StmtKind::If {
if let Some(cond_text) = &info.condition_text {
let kind = classify_condition(cond_text);
// For `AllowlistCheck`, also confirm a target identifier was
// extractable. When the receiver-method form carries a
// string-literal arg (`filePath.includes("/")`,
// `path.contains("..")`), `extract_allowlist_target` returns
// `None` because the argument isn't an identifier. Those
// shapes are presence-checks, not real allowlist tests against
// a collection variable, and shouldn't dominate every
// downstream sink as a structural guard with `Cap::all()`.
// `classify_condition` itself stays unchanged (an existing
// test locks in its broad return for the receiver-method form,
// and the SSA branch-narrowing layer reads the kind for its
// own purposes).
let allowlist_has_target = if kind == PredicateKind::AllowlistCheck {
crate::taint::path_state::classify_condition_with_target(cond_text)
.1
.is_some()
} else {
true
};
if matches!(
kind,
PredicateKind::AllowlistCheck
| PredicateKind::TypeCheck
| PredicateKind::ValidationCall
) {
PredicateKind::TypeCheck | PredicateKind::ValidationCall,
) || (kind == PredicateKind::AllowlistCheck && allowlist_has_target)
{
result.push((idx, Cap::all()));
} else if cond_indirect_validator_callee(info, ctx).is_some() {
// Indirect-validator pattern:
@ -995,7 +1013,25 @@ impl CfgAnalysis for UnguardedSink {
// is the only other operand. The simpler `is_all_args_constant`
// check above rejects that mixed shape because it forbids real
// parameters in operand position.
if !has_taint && ssa_all_sink_operands_const_or_param(ctx, *sink) {
//
// Exemption: shell-array gate filters. The
// `extract_shell_array_payload_idents` detector recognises
// `[<shell>, "-c", <payload>]` arrays at any call site and emits a
// `Sink(SHELL_ESCAPE)` label with `destination_uses` narrowed to
// the payload-element idents. When the array shape itself is the
// gate, an unrelated reassign-to-const elsewhere in the body
// (`const flag = true; if (flag) {}`) does not erase the
// shell-exec intent — the construction of `[bash, -c, x]` is by
// itself the dangerous operation. Skip this suppression so the
// structural finding survives in closed-world contexts where no
// taint source has been resolved yet.
let has_shell_array_gate = sink_info.call.gate_filters.iter().any(|gf| {
gf.label_caps.contains(Cap::SHELL_ESCAPE) && gf.destination_uses.is_some()
});
if !has_taint
&& !has_shell_array_gate
&& ssa_all_sink_operands_const_or_param(ctx, *sink)
{
continue;
}

View file

@ -125,6 +125,13 @@ pub struct AnalysisContext<'a> {
/// the function-declaration level, the gap only matters when the
/// auth call has to live inside the body.
pub auth_decorators: &'a [String],
/// Names of variables whose `.close()` / release calls live in a
/// nested closure body somewhere else in the file (e.g.
/// `socket.on("close", () => ws.close())`). ResourceMisuse uses this
/// to suppress `cfg-resource-leak` for handles whose cleanup happens
/// in a callback the per-body CFG can't observe. When `None`, no
/// closure-based suppression is applied.
pub closure_released_var_names: Option<&'a std::collections::HashSet<String>>,
}
pub trait CfgAnalysis {

View file

@ -442,6 +442,23 @@ impl CfgAnalysis for ResourceMisuse {
if pair.resource_name == "mutex" && !has_explicit_lock_acquire(ctx, acquire) {
continue;
}
// Suppress when a sibling closure / event handler in
// this file releases the same variable. Common JS/TS
// shape: `const ws = new WebSocket(url);
// socket.on("close", () => ws.close())`. The release
// node lives in a nested body the per-body CFG can't
// see, so the structural "no release on this exit
// path" check fires erroneously. Match by acquired
// variable name; closure captures share the binding
// name with the outer handle.
if let Some(acq_var) = ctx.cfg[acquire].taint.defines.as_deref()
&& ctx
.closure_released_var_names
.map(|s| s.contains(acq_var))
.unwrap_or(false)
{
continue;
}
let info = &ctx.cfg[acquire];
let callee_desc = info.call.callee.as_deref().unwrap_or("(acquire)");

View file

@ -33,6 +33,7 @@ fn parse_and_analyse<A: CfgAnalysis>(
body_const_facts: None,
type_facts: None,
auth_decorators: &[],
closure_released_var_names: None,
};
analysis.run(&ctx)
}
@ -61,6 +62,7 @@ fn parse_and_run_all(src: &[u8], lang_str: &str, ts_lang: Language) -> Vec<CfgFi
body_const_facts: None,
type_facts: None,
auth_decorators: &[],
closure_released_var_names: None,
};
run_all(&ctx)
}
@ -94,6 +96,7 @@ fn parse_and_run_all_with_taint(
body_const_facts: None,
type_facts: None,
auth_decorators: &[],
closure_released_var_names: None,
};
run_all(&ctx)
}
@ -211,6 +214,7 @@ fn parse_and_analyse_with_ssa<A: CfgAnalysis>(
body_const_facts: facts.as_ref(),
type_facts: facts.as_ref().map(|f| &f.type_facts),
auth_decorators: &[],
closure_released_var_names: None,
};
analysis.run(&ctx)
}
@ -1225,6 +1229,7 @@ fn config_sanitizer_suppresses_unguarded_sink() {
body_const_facts: None,
type_facts: None,
auth_decorators: &[],
closure_released_var_names: None,
};
let findings = run_all(&ctx);
@ -1703,6 +1708,7 @@ fn cfg_only_no_taint_produces_low_severity() {
body_const_facts: None,
type_facts: None,
auth_decorators: &[],
closure_released_var_names: None,
};
let findings = guards::UnguardedSink.run(&ctx);

View file

@ -32,6 +32,7 @@ pub fn handle_command(
);
}
let _ = crate::utils::analysis_options::install(config.analysis.engine);
let _ = crate::utils::detector_options::install(config.detectors.clone());
};
match command {
@ -293,6 +294,9 @@ pub fn handle_command(
"analysis-engine runtime already installed; CLI engine flags ignored"
);
}
// Detector knobs (currently `[detectors.data_exfil]`) are
// resolved straight from config; no CLI overrides yet.
let _ = crate::utils::detector_options::install(config.detectors.clone());
// ── --explain-engine: print resolved config and exit ────────
if explain_engine {

View file

@ -184,6 +184,7 @@ fn type_kind_index(kind: &TypeKind) -> u32 {
TypeKind::Url => 10,
TypeKind::HttpClient => 11,
TypeKind::LocalCollection => 12,
TypeKind::RequestBuilder => 13,
// the analysis DTO types carry per-field structural info that the
// bitset domain can't represent. Collapse to Unknown so callers
// still see "any type possible" rather than crashing on an
@ -208,6 +209,7 @@ fn type_kind_from_index(idx: u32) -> Option<TypeKind> {
10 => Some(TypeKind::Url),
11 => Some(TypeKind::HttpClient),
12 => Some(TypeKind::LocalCollection),
13 => Some(TypeKind::RequestBuilder),
_ => None,
}
}

View file

@ -610,6 +610,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}

View file

@ -2516,6 +2516,7 @@ fn ssa_summaries_round_trip() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
),
(
@ -2550,6 +2551,7 @@ fn ssa_summaries_round_trip() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
),
];
@ -2722,6 +2724,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
)];
idx.replace_ssa_summaries_for_file(&f, &hash_v1, &sums_v1)
@ -2758,6 +2761,7 @@ fn ssa_summaries_hash_rescan_replaces_stale() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
)];
idx.replace_ssa_summaries_for_file(&f, &hash_v2, &sums_v2)
@ -2815,6 +2819,7 @@ fn clear_drops_ssa_summaries_table() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
)];
idx.replace_ssa_summaries_for_file(&f, &hash, &sums)
@ -2871,6 +2876,7 @@ fn make_test_callee_body(
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::new(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
opt: crate::ssa::OptimizeResult {
const_values: std::collections::HashMap::new(),
@ -3086,6 +3092,7 @@ fn make_test_ssa_summary() -> crate::summary::ssa_summary::SsaFuncSummary {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
}
}
@ -3847,3 +3854,59 @@ fn ssa_summaries_pre_phase5_blob_decodes_with_empty_field_points_to() {
"missing field_points_to must default to empty",
);
}
/// Backward-compatibility guard for summaries persisted before the
/// `param_to_gate_filters` field existed.
///
/// The JSON payload below deliberately omits the `param_to_gate_filters`
/// key. Decoding must still succeed and leave the field at its empty
/// default, which is what `#[serde(default)]` on the field provides. Because
/// the field lives inside the JSON `summary` blob, older SQLite rows keep
/// decoding without a schema migration and SQL-level columns are unchanged.
#[test]
fn ssa_summaries_pre_gate_filters_blob_decodes_with_empty_param_to_gate_filters() {
    use crate::summary::ssa_summary::SsaFuncSummary;

    // Legacy-shaped payload: no `param_to_gate_filters` key is present.
    const LEGACY_BLOB: &str = r#"{
"param_to_return": [],
"param_to_sink": [],
"source_caps": 0,
"param_to_sink_param": [],
"param_container_to_return": [],
"param_to_container_store": [],
"return_type": null,
"return_abstract": null,
"source_to_callback": [],
"receiver_to_return": null,
"receiver_to_sink": 0,
"abstract_transfer": [],
"param_return_paths": [],
"return_path_facts": [],
"typed_call_receivers": []
}"#;

    let decoded = serde_json::from_str::<SsaFuncSummary>(LEGACY_BLOB).unwrap();
    let gate_filters = &decoded.param_to_gate_filters;
    assert!(
        gate_filters.is_empty(),
        "missing param_to_gate_filters must default to empty",
    );
}
/// JSON round-trip for a summary whose `param_to_gate_filters` is populated.
///
/// Two `(position, cap-mask)` entries are stored, the summary is serialised
/// and parsed back, and the restored list must equal the input exactly. The
/// per-position cap masks are what keep the SSRF-vs-DATA_EXFIL split intact
/// across the function-summary boundary.
#[test]
fn ssa_summaries_param_to_gate_filters_round_trip() {
    use crate::labels::Cap;
    use crate::summary::ssa_summary::SsaFuncSummary;

    let mut original = SsaFuncSummary::default();
    original
        .param_to_gate_filters
        .extend([(0, Cap::SSRF), (1, Cap::DATA_EXFIL)]);

    let encoded = serde_json::to_string(&original).expect("serialize");
    let decoded = serde_json::from_str::<SsaFuncSummary>(&encoded).expect("deserialize");

    assert_eq!(
        decoded.param_to_gate_filters,
        vec![(0, Cap::SSRF), (1, Cap::DATA_EXFIL)],
        "per-position cap masks must round-trip exactly",
    );
}

View file

@ -218,6 +218,14 @@ pub struct Evidence {
/// under-budget findings and skipped during serialization in that case.
#[serde(default, skip_serializing_if = "smallvec::SmallVec::is_empty")]
pub engine_notes: smallvec::SmallVec<[crate::engine_notes::EngineNote; 2]>,
/// For `Cap::DATA_EXFIL` findings, the destination object-literal field
/// the tainted value reached (e.g. `"body"`, `"headers"`, `"json"`).
/// `None` for non-exfil findings, for exfil findings whose payload arg
/// was not an object literal, or when the sink was resolved through a
/// summary path that did not preserve destination metadata.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub data_exfil_field: Option<String>,
}
fn is_zero_u16(v: &u16) -> bool {
@ -301,7 +309,15 @@ pub fn compute_confidence(diag: &Diag) -> Confidence {
let id = &diag.id;
let base = if id.starts_with("taint-") {
let base = if id.starts_with("taint-data-exfiltration") {
// DATA_EXFIL is calibrated independently from the generic taint path:
// the value at risk is the leak of an *already-sensitive* source, not
// the construction of an attacker payload, so the points-based scoring
// tuned for code-exec / SSRF / SQLi over-credits these findings. Route
// to a narrower decision tree that asks "did we corroborate a real
// string body leaving the process?" instead.
compute_data_exfil_confidence(diag)
} else if id.starts_with("taint-") {
compute_taint_confidence(diag)
} else if id.starts_with("state-") {
match id.as_str() {
@ -458,13 +474,71 @@ fn compute_taint_confidence(diag: &Diag) -> Confidence {
}
}
/// Confidence routing for `taint-data-exfiltration` findings.
///
/// DATA_EXFIL is scored separately from the generic taint path: a Sensitive
/// source plus a sink call is not enough on its own, because the leak class
/// needs corroboration that a real string body actually leaves the process.
/// The result is therefore capped at Medium, and Medium is only reached when
/// the symbolic-execution verdict confirms the path. Abstract interpretation
/// contributes only as a sink-suppression filter inside SSA taint and does
/// not surface a separate verdict here.
///
/// Routing, in order:
/// * No evidence attached → Low.
/// * Source kind missing or below `Sensitive` → Low (callers already strip
///   DATA_EXFIL for Plain sources; this is a defensive floor).
/// * Symbolic verdict `Confirmed` → Medium.
/// * Symbolic verdict `Infeasible`, `Inconclusive`, `NotAttempted`, or no
///   symbolic analysis at all → Low (`Confidence` has no dedicated
///   "inconclusive" tier, so these floor to Low).
///
/// After routing, a validated path (`diag.path_validated`) drops the result
/// one tier (Medium → Low; Low stays Low), and `apply_engine_notes_cap`
/// applies the standard engine-notes cap to whatever remains.
fn compute_data_exfil_confidence(diag: &Diag) -> Confidence {
    // Without evidence there is nothing to corroborate.
    let Some(evidence) = diag.evidence.as_ref() else {
        return Confidence::Low;
    };

    let source_is_sensitive = evidence
        .source_kind
        .is_some_and(|kind| kind.sensitivity() >= crate::labels::Sensitivity::Sensitive);
    if !source_is_sensitive {
        return Confidence::Low;
    }

    // Only a confirmed symbolic witness lifts the finding above Low.
    let verdict = evidence.symbolic.as_ref().map(|sym| sym.verdict);
    let routed = match verdict {
        Some(Verdict::Confirmed) => Confidence::Medium,
        Some(Verdict::Infeasible | Verdict::Inconclusive | Verdict::NotAttempted) | None => {
            Confidence::Low
        }
    };

    // Guarded flow: a validation predicate on the path means the leak may be
    // unreachable in practice, so a corroborated witness loses one tier.
    let guarded = if diag.path_validated && routed > Confidence::Low {
        Confidence::Low
    } else {
        routed
    };

    apply_engine_notes_cap(diag, guarded)
}
/// Score a structured `SourceKind` value.
///
/// UserInput=+3, EnvironmentConfig=+2, Unknown/FileSystem=+1, Database/CaughtException=0.
fn structured_source_kind_score(kind: crate::labels::SourceKind) -> i32 {
use crate::labels::SourceKind;
match kind {
SourceKind::UserInput => 3,
// Cookie / Header carry auth material, score them at the same
// ranking weight as direct user input rather than the lower
// FileSystem/Database tiers.
SourceKind::UserInput | SourceKind::Cookie | SourceKind::Header => 3,
SourceKind::EnvironmentConfig => 2,
SourceKind::Unknown | SourceKind::FileSystem => 1,
SourceKind::Database | SourceKind::CaughtException => 0,
@ -538,6 +612,8 @@ pub fn generate_explanation(diag: &Diag) -> Option<String> {
use crate::labels::SourceKind;
match kind {
SourceKind::UserInput => "user input",
SourceKind::Cookie => "cookie",
SourceKind::Header => "request header",
SourceKind::EnvironmentConfig => "environment/config",
SourceKind::Database => "database",
SourceKind::FileSystem => "file system",

View file

@ -1,4 +1,4 @@
use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig};
use crate::labels::{Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, SinkGate};
use phf::{Map, phf_map};
pub static RULES: &[LabelRule] = &[
@ -69,6 +69,33 @@ pub static RULES: &[LabelRule] = &[
},
];
/// Gated sinks for C.
///
/// libcurl binds request options through
/// `curl_easy_setopt(handle, option, payload)`: the option identifier at
/// arg 1 selects which slot the payload at arg 2 fills. Only
/// `CURLOPT_POSTFIELDS` and `CURLOPT_COPYPOSTFIELDS` carry the request body.
/// Other `CURLOPT_*` constants designate URL / auth / TLS behaviour and are
/// not DATA_EXFIL-relevant. Gating on the option identifier keeps the rule
/// from over-firing on `curl_easy_setopt(h, CURLOPT_URL, url)`, which is
/// covered separately by the `curl_easy_perform` SSRF flat sink.
///
/// The option is a preprocessor macro, so it reaches the gate as an
/// identifier rather than a string literal. Identifier-based activation
/// relies on the macro-arg fallback (`extract_const_macro_arg`) that the CFG
/// builder supplies to `classify_gated_sink` when `lang == "c"`.
///
/// Header-parsing libraries (e.g. libmicrohttpd, mongoose) lack a stable
/// surface and are left to project-specific config.
pub static GATED_SINKS: &[SinkGate] = &[SinkGate {
    // Which call this gate watches, and the label it attaches on activation.
    callee_matcher: "curl_easy_setopt",
    label: DataLabel::Sink(Cap::DATA_EXFIL),
    // Activation: arg 1 must name one of the body-carrying options.
    activation: GateActivation::ValueMatch,
    arg_index: 1,
    dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"],
    dangerous_prefixes: &[],
    case_sensitive: true,
    // Keyword-argument matching is not used by this gate.
    keyword_name: None,
    dangerous_kwargs: &[],
    // Only the payload slot (arg 2) is listed as a payload position.
    payload_args: &[2],
}];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
// control-flow
"if_statement" => Kind::If,

View file

@ -1,4 +1,4 @@
use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig};
use crate::labels::{Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, SinkGate};
use phf::{Map, phf_map};
pub static RULES: &[LabelRule] = &[
@ -91,6 +91,28 @@ pub static RULES: &[LabelRule] = &[
},
];
/// Gated sink table for C++.
///
/// Mirrors the C gate set: a `curl_easy_setopt` call whose argument 1 is
/// `CURLOPT_POSTFIELDS` or `CURLOPT_COPYPOSTFIELDS` binds the request body
/// passed as argument 2, and that body position is the DATA_EXFIL payload.
///
/// Matching on an identifier relies on the macro-arg fallback in
/// `cfg::mod::classify_gated_sink` for `lang == "cpp" / "c++"`. Modern C++
/// HTTP wrappers (cpr, Boost.Beast) layer over libcurl or directly over the
/// socket; their ergonomic surfaces differ enough that adding per-library
/// gates is left for a follow-up driven by the corpus.
pub static GATED_SINKS: &[SinkGate] = &[
    SinkGate {
        // What fires, and on which callee.
        label: DataLabel::Sink(Cap::DATA_EXFIL),
        callee_matcher: "curl_easy_setopt",
        case_sensitive: true,
        // Activation: the option identifier at argument 1 must be one of
        // the body-binding options listed below.
        activation: GateActivation::ValueMatch,
        arg_index: 1,
        dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"],
        dangerous_prefixes: &[],
        // Payload: argument 2 is the value bound to the selected option.
        payload_args: &[2],
        // No keyword-argument handling for this gate.
        keyword_name: None,
        dangerous_kwargs: &[],
    },
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
// control-flow
"if_statement" => Kind::If,

View file

@ -1,11 +1,13 @@
use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig, RuntimeLabelRule};
use crate::labels::{
Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, RuntimeLabelRule, SinkGate,
};
use crate::utils::project::{DetectedFramework, FrameworkContext};
use phf::{Map, phf_map};
pub static RULES: &[LabelRule] = &[
// ─────────── Sources ───────────
LabelRule {
matchers: &["os.Getenv"],
matchers: &["os.Getenv", "os.LookupEnv", "os.Environ"],
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
@ -16,8 +18,12 @@ pub static RULES: &[LabelRule] = &[
"r.URL",
"r.Body",
"r.Header",
"r.Header.Get",
"r.Header.Values",
"r.URL.Query",
"r.URL.Query.Get",
"r.Cookie",
"r.Cookies",
"Request.FormValue",
"Request.URL",
],
@ -97,27 +103,20 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// ── Outbound HTTP clients (SSRF) ───────────────────────────────────
//
// These are modeled as destination-aware gated sinks in `GATED_SINKS`
// below. Flat Sink rules would over-flag every positional argument as
// SSRF (so a tainted body in `http.Post(url, contentType, body)` would
// fire SSRF on the body), and the gate machinery short-circuits when a
// flat Sink label is already attached to the callee, blocking DATA_EXFIL
// body-flow gates from running.
//
// `net.Dial` / `net.DialTimeout` keep their flat-sink modeling: the
// first positional arg is the network address with no body / payload
// companion, so the over-flag concern does not apply.
LabelRule {
matchers: &[
"http.Get",
"http.Post",
"http.Head",
"http.NewRequest",
"http.NewRequestWithContext",
"net.Dial",
"net.DialTimeout",
// `http.DefaultClient` is the package-level default `*http.Client`.
// Idiomatic Go SSRF sinks (Owncast CVE-2023-3188) use the
// `http.DefaultClient.Get(url)` form rather than the bare
// `http.Get(url)` helper, so the suffix-matched callee text needs
// an explicit entry here, bare `Get/Post/Do/Head` would
// over-match unrelated method names.
"http.DefaultClient.Get",
"http.DefaultClient.Post",
"http.DefaultClient.Head",
"http.DefaultClient.Do",
"http.DefaultClient.PostForm",
],
matchers: &["net.Dial", "net.DialTimeout"],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
@ -135,6 +134,343 @@ pub static RULES: &[LabelRule] = &[
},
];
/// Argument-role-aware Go sinks. Two classes coexist on the outbound HTTP
/// surface, mirroring the JS/TS modeling:
///
/// * SSRF on the URL-bearing position of a one-shot request (`http.Get`,
/// `http.Head`, `http.Post`, `http.PostForm`, `http.NewRequest`,
/// `http.NewRequestWithContext`, `http.DefaultClient.*`).
/// * `Cap::DATA_EXFIL` on the body / payload position when the source is
/// Sensitive (cookies, headers, env, db reads). Gates fire only when
/// taint reaches the body argument, so a tainted URL alone never
/// activates DATA_EXFIL and a tainted body alone never activates SSRF.
///
/// `http.NewRequest` / `http.NewRequestWithContext` carry an SSRF gate on
/// their URL position only. In Go's two-step idiom the actual network
/// call happens at `client.Do(req)`; body taint flows from the body
/// argument through the returned `*http.Request` via default arg → return
/// propagation, and then activates the `http.DefaultClient.Do` DATA_EXFIL
/// gate below. Modeling NewRequest as a body propagator (rather than a
/// body sink) avoids duplicate findings on the idiomatic
/// `req, _ := http.NewRequest(...); client.Do(req)` shape.
///
/// Every gate in this table is case-insensitive and uses
/// `GateActivation::Destination` with no object destination fields. The
/// standard-library gates list their gated position as the single entry of
/// `payload_args`; the `req.*` gates list args 1-3, and the `resty.*` gates
/// leave `payload_args` empty.
pub static GATED_SINKS: &[SinkGate] = &[
// ── SSRF gates (URL-bearing position) ────────────────────────────────
// `http.Get(url)` — url is arg 0.
SinkGate {
callee_matcher: "http.Get",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.Head(url)` — url is arg 0.
SinkGate {
callee_matcher: "http.Head",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.Post(url, contentType, body)` — url is arg 0.
SinkGate {
callee_matcher: "http.Post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.PostForm(url, data)` — url is arg 0.
SinkGate {
callee_matcher: "http.PostForm",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.NewRequest(method, url, body)` — url is arg 1.
SinkGate {
callee_matcher: "http.NewRequest",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.NewRequestWithContext(ctx, method, url, body)` — url is arg 2.
SinkGate {
callee_matcher: "http.NewRequestWithContext",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.Get(url)` — url is arg 0.
SinkGate {
callee_matcher: "http.DefaultClient.Get",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.Head(url)` — url is arg 0.
SinkGate {
callee_matcher: "http.DefaultClient.Head",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.Post(url, contentType, body)` — url is arg 0.
SinkGate {
callee_matcher: "http.DefaultClient.Post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.PostForm(url, data)` — url is arg 0.
SinkGate {
callee_matcher: "http.DefaultClient.PostForm",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// ── DATA_EXFIL gates (body-bearing position) ─────────────────────────
// `http.Post(url, contentType, body)` — body is arg 2.
SinkGate {
callee_matcher: "http.Post",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.PostForm(url, data)` — `data` (arg 1) is `url.Values`. Form
// bodies serialize the same operator state cookies / headers do, so a
// tainted Sensitive value reaching the form payload is DATA_EXFIL.
SinkGate {
callee_matcher: "http.PostForm",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.Do(req)` — `req` (arg 0) is the `*http.Request`
// value. Body taint introduced via either `http.NewRequest(_, _, body)`
// (default arg → return propagation) or a later `req.Body = body` field
// write reaches this sink through the request value.
SinkGate {
callee_matcher: "http.DefaultClient.Do",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.PostForm(url, data)` — same as `http.PostForm`
// but invoked through the package-level default `*http.Client`.
SinkGate {
callee_matcher: "http.DefaultClient.PostForm",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `http.DefaultClient.Post(url, contentType, body)` — body is arg 2.
SinkGate {
callee_matcher: "http.DefaultClient.Post",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// ── Common third-party HTTP clients ─────────────────────────────────
//
// `go-resty/resty`: `client.R().SetBody(body).Post(url)` style.
// `SetBody(body)` carries the body into the chained request; the
// network call happens at the verb method. The verb methods gated
// here are Post / Put / Patch only; Get / Delete / Send / Execute
// have no gate in this table. Each is a DATA_EXFIL gate with
// `payload_args: &[]` (empty), which engages the receiver-tainted
// fallback in `collect_tainted_sink_vars`. A builder receiver
// carrying body taint from `SetBody` activates the sink without us
// needing a positional body arg.
// `resty.Request.Post(url)` — payload comes from the receiver.
SinkGate {
callee_matcher: "resty.Request.Post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `resty.Request.Put(url)` — payload comes from the receiver.
SinkGate {
callee_matcher: "resty.Request.Put",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `resty.Request.Patch(url)` — payload comes from the receiver.
SinkGate {
callee_matcher: "resty.Request.Patch",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `imroc/req`: `req.Post(url, req.BodyJSON(payload))`, the `BodyJSON`
// / `BodyXML` helpers wrap a tainted payload and pass it as arg 1+ of
// the verb call. Since the helper return value carries the body
// taint, gating the verb on every payload arg is sufficient.
// Args 1-3 are listed as payload positions; arg 0 (the URL) is not.
SinkGate {
callee_matcher: "req.Post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1, 2, 3],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// `req.Put(url, ...)` — same payload positions as `req.Post`.
SinkGate {
callee_matcher: "req.Put",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1, 2, 3],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
// control-flow
"if_statement" => Kind::If,

View file

@ -31,6 +31,15 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// Sensitive operator state: HTTP session attributes commonly carry
// auth tokens / CSRF tokens / signed user ids. Routed through the
// `Cookie` source-kind heuristic so DATA_EXFIL fires when these
// values leave the process via an outbound request body.
LabelRule {
matchers: &["HttpSession.getAttribute", "session.getAttribute"],
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// ───────── Sanitizers ──────────
LabelRule {
matchers: &["HtmlUtils.htmlEscape", "StringEscapeUtils.escapeHtml4"],
@ -121,6 +130,79 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// ── Cross-boundary data exfiltration ──────────────────────────────────
//
// Outbound HTTP egress points where a Sensitive source (cookie, header,
// env, session attribute, db read) reaching the request body / payload
// is a cross-boundary disclosure distinct from SSRF. The flat-rule
// model relies on default arg → return propagation through builder
// chains: `HttpRequest.newBuilder().uri(u).POST(BodyPublishers.ofString(p)).build()`
// smears `p`-taint into the returned request, which then activates the
// sink at `client.send(req)`.
//
// Type-qualified resolution maps `restTemplate.postForObject(...)` →
// `HttpClient.postForObject` via the JAVA_HIERARCHY (RestTemplate,
// OkHttpClient, WebClient, CloseableHttpClient all subtype HttpClient),
// so a single set of `HttpClient.<method>` rules covers every framework
// in scope. Plain user input is silenced by the source-sensitivity
// gate in `effective_sink_caps`, so this fires only on cookies / headers
// / env / session / db.
LabelRule {
matchers: &[
// java.net.http: client.send(req) consumes a request that
// carries body-taint via BodyPublishers.ofString/ofByteArray/
// ofInputStream through the builder chain.
"HttpClient.send",
"HttpClient.sendAsync",
// Spring RestTemplate verbs that take a body / entity.
"postForObject",
"postForEntity",
"RestTemplate.exchange",
"RestTemplate.put",
"RestTemplate.patchForObject",
// Apache HttpClient: httpClient.execute(req) where req is an
// HttpPost / HttpPut / HttpPatch with .setEntity(StringEntity(p)).
// CloseableHttpClient subtypes HttpClient so type-qualified
// resolution rewrites client.execute → HttpClient.execute.
"HttpClient.execute",
// Spring WebClient body-binding step:
// webClient.post().uri(u).bodyValue(payload).retrieve().
// bodyValue is the explicit body-bind verb; default propagation
// carries the tainted body into the chain return so the sink
// attaches at the body-bind site itself (no cross-call needed).
"bodyValue",
// Apache HttpClient body-binding: the `setEntity` step on
// HttpPost / HttpPut / HttpPatch mutates the request rather
// than returning the builder, so the receiver's SSA value at
// the later `httpClient.execute(req)` does not carry body
// taint via the default smear (which threads through return
// values, not field mutations). Firing DATA_EXFIL at the
// setEntity call itself catches the body-binding directly.
// The matcher is specific enough to avoid collisions —
// `setEntity` is Apache-HttpClient-specific.
"setEntity",
// OkHttp builder body-binding shortcut: when the chain
// doesn't roll through `.post(body).build()` (e.g. a helper
// function returns the Builder mid-chain), `RequestBody`
// is bound via `.post(body)` / `.put(body)` / `.patch(body)`
// / `.delete(body)` directly on the Builder. These methods
// also exist on unrelated classes (NIO, Streams) but in the
// OkHttp idiom the receiver type is `Request.Builder`; the
// receiver-type widening from `Request.Builder` → HttpClient
// isn't currently modeled, so we fall back to suffix-name
// matchers and accept some receiver-agnostic firing risk.
// Conservative: omit these for v1 to avoid over-fire on
// non-OkHttp `post`/`put`/`patch` calls.
// OkHttp two-step: client.newCall(req).execute() / .enqueue().
// Chain normalization strips `()` between dots so the tree-
// sitter callee text `client.newCall(req).execute` matches the
// suffix `newCall.execute` after normalization.
"newCall.execute",
"newCall.enqueue",
],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
LabelRule {
matchers: &[
"readObject",

View file

@ -98,6 +98,26 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// Conventional forwarding wrappers, telemetry / analytics / metrics dispatch.
// Treating these as Sanitizer(DATA_EXFIL) encodes the project convention
// that a payload routed through a named forwarding boundary is an
// explicit, expected egress (the developer named the function), not the
// accidental cross-boundary leak DATA_EXFIL is meant to catch. Users who
// do not follow this convention can override per-project via
// [analysis.languages.javascript] custom rules; the convention is
// documented in docs/detectors/taint.md so projects can extend it.
LabelRule {
matchers: &[
"serializeForUpstream",
"forwardPayload",
"tracker.send",
"analytics.track",
"metrics.report",
"logEvent",
],
label: DataLabel::Sanitizer(Cap::DATA_EXFIL),
case_sensitive: false,
},
// Conventional project-local HTML escapers. Suffix word-boundary match
// fires on bare calls to locally defined helpers (`function escapeHtml(x)`
// invoked as `escapeHtml(x)`) across codebases that follow the common
@ -128,6 +148,23 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::URL_ENCODE),
case_sensitive: false,
},
// Shell-exec sinks. Qualified `child_process.*` and bare destructured-
// import forms (`exec`, `execSync`, `execFile`, ...) are both modeled as
// flat sinks here so module-aliased call sites like `cp.exec(...)`
// (where `cp = require('child_process')`) still fire via suffix match.
// The bare-form FPs that motivated tightening are addressed elsewhere:
//
// * `container.exec(...)` (Dockerode) and `exec.start(...)` (the
// resulting `exec` handle) — `container.exec` is excluded via the
// EXCLUDES list below; `exec.start` is suppressed by restricting
// `first_member_label`'s suffix-strip-and-retry to `Source` labels
// only (see `cfg/helpers.rs`).
// * `execSync(cmd, { env: process.env })` flagging `process.env`
// flowing into the options arg — addressed by the
// `=exec`/`=execSync`/`=execFile`/... gates in `GATED_SINKS` below
// which set `payload_args: &[0]`. The cfg pass propagates a gate's
// payload_args restriction onto the matching flat sink so only arg
// 0 (the command string) is taint-checked at the call site.
LabelRule {
matchers: &[
"child_process.exec",
@ -136,8 +173,9 @@ pub static RULES: &[LabelRule] = &[
"child_process.execFile",
// Bare forms from destructured imports:
// const { exec, execSync } = require('child_process')
// Note: bare `exec` suffix-matches RegExp.prototype.exec() too,
// but in practice tainted data rarely flows to regexp.exec().
// and module-aliased calls like `cp.exec(...)`. Receiver-name
// collisions (`container.exec`, etc.) are suppressed via
// EXCLUDES; arg-position restriction comes from the `=*` gates.
"exec",
"execSync",
"execFile",
@ -250,16 +288,22 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: false,
},
// ORM / query builder raw-SQL entry points
// ORM / query builder raw-SQL entry points.
//
// `$queryRaw` / `$executeRaw` are tagged-template forms; the SQL is
// assembled from a template literal so taint reaching arg 0 is the
// injection vector and modeling them as flat sinks is correct.
//
// `$queryRawUnsafe` / `$executeRawUnsafe` accept positional bind
// parameters: `tx.$queryRawUnsafe(sqlTemplate, p1, p2, ...)` binds
// p1..pN as `$1..$N` (PostgreSQL prepared-statement params) and the SQL
// template at arg 0 is the only injection point. These are modeled as
// gated sinks below (`payload_args: &[0]`) so taint flowing only into
// the bind params no longer fires. `sequelize.query` and `knex.raw`
// also accept a separate bind-params object/array but the bind-params
// interface is non-positional in those APIs, so they stay flat for now.
LabelRule {
matchers: &[
"sequelize.query",
"knex.raw",
"$queryRaw",
"$queryRawUnsafe",
"$executeRaw",
"$executeRawUnsafe",
],
matchers: &["sequelize.query", "knex.raw", "$queryRaw", "$executeRaw"],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
},
@ -295,6 +339,17 @@ pub static EXCLUDES: &[&str] = &[
"req.session.regenerate",
"req.session.save",
"req.session.reload",
// Dockerode container API: `container.exec({ Cmd: [...] })` is the
// canonical non-shell exec path (the Cmd array is passed directly to
// the kernel via `execve`, no shell parsing). `exec.start(...)` is
// the follow-on stream attach. Suffix-matching the bare `exec` rule
// would otherwise classify every `<receiver>.exec(...)` method call
// — including these — as a SHELL_ESCAPE sink. These patterns name
// the Dockerode SDK methods specifically; if a project happens to
// also expose its own `container.exec` shell wrapper, override via
// [analysis.languages.javascript] custom rules.
"container.exec",
"exec.start",
];
pub static GATED_SINKS: &[SinkGate] = &[
@ -577,6 +632,128 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["body", "headers", "json"],
},
},
// ── Shell-exec sinks (SHELL_ESCAPE) ──────────────────────────────────
//
// Only arg 0 (the command string) is a shell-injection payload.
// `options.env` / `options.cwd` / etc. at arg 1+ are not. Bare forms
// (`exec`, `execSync`, `execFile`, `execAsync`, `execPromise`) use the
// `=` exact-only sigil so they match the destructured-import shape
// (`const { exec } = require('child_process'); exec(cmd)`) without
// colliding with any `<receiver>.exec` method (Dockerode's
// `container.exec`, `RegExp.prototype.exec`, etc.).
// Qualified `child_process.*` forms stay as flat sinks (see RULES above);
// gates run only when no flat sink already classifies the call, so adding
// them here would never fire. The bare destructured-import forms below
// are the only place where shell-exec needs gating, since `classify_all`
// can't safely register a bare `exec` rule without colliding with every
// `<receiver>.exec` method (Dockerode `container.exec`,
// `RegExp.prototype.exec`, etc.).
SinkGate {
callee_matcher: "=exec",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execSync",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execFile",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execAsync",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execPromise",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// ── Prisma raw-SQL with positional bind params (SQL_QUERY) ───────────
//
// `tx.$queryRawUnsafe(sqlTemplate, p1, p2, ...)` binds `p1..pN` as
// PostgreSQL `$1..$N` prepared-statement parameters; only arg 0 (the
// SQL template) is the injection vector. Flat sinks here flagged taint
// flowing only into bind params, which is equivalent to a parameterised
// query and not exploitable. Suffix-match (no `=` sigil) so
// `tx.$queryRawUnsafe`, `prisma.$queryRawUnsafe`, etc. all qualify.
SinkGate {
callee_matcher: "$queryRawUnsafe",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "$executeRawUnsafe",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -320,6 +320,11 @@ static GATED_REGISTRY: Lazy<HashMap<&'static str, &'static [SinkGate]>> = Lazy::
m.insert("ts", typescript::GATED_SINKS);
m.insert("python", python::GATED_SINKS);
m.insert("py", python::GATED_SINKS);
m.insert("go", go::GATED_SINKS);
m.insert("php", php::GATED_SINKS);
m.insert("c", c::GATED_SINKS);
m.insert("cpp", cpp::GATED_SINKS);
m.insert("c++", cpp::GATED_SINKS);
m
});
@ -473,6 +478,10 @@ pub fn lookup(lang: &str, raw: &str) -> Kind {
pub enum SourceKind {
/// Direct user input (request params, argv, stdin, form data)
UserInput,
/// HTTP cookie value (carries session / auth material)
Cookie,
/// HTTP request header (may carry auth tokens, user-agent fingerprints)
Header,
/// Environment variables and configuration
EnvironmentConfig,
/// File system reads
@ -485,10 +494,81 @@ pub enum SourceKind {
Unknown,
}
/// How sensitive the data produced by a taint source is.
///
/// Detector classes such as `DATA_EXFIL` fire only when the source carries
/// information the operator did not intend to leak. Plain user input echoed
/// back into an outbound request is not data exfiltration: the user already
/// controls that data, so reporting it as a leak is noise.
///
/// `DATA_EXFIL` requires a tier of at least [`Sensitivity::Sensitive`], so
/// plain user input is suppressed. Projects that legitimately treat a
/// request body as sensitive (e.g. an API gateway forwarding
/// pre-authenticated user tokens out of a request body) can override this
/// via custom rules in `nyx.conf`, either by re-classifying the source or by
/// adding a Sanitizer rule for `Cap::DATA_EXFIL` on the legitimate
/// forwarding path.
///
/// Variants are declared from least to most sensitive; the derived
/// ordering follows declaration order, so threshold comparisons such as
/// `tier >= Sensitivity::Sensitive` depend on it. Do not reorder.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Sensitivity {
    /// Attacker-controlled but not secret in itself: request bodies, query
    /// strings, form fields, argv. Echoing this to an outbound request is
    /// not data exfiltration.
    Plain,
    /// Operator state that must not leak out: cookies, auth headers, env,
    /// file system reads, database rows.
    Sensitive,
    /// Reserved for future explicit secret classifications (API keys,
    /// credential stores, key material). No source currently produces
    /// this tier, but the threshold check in `effective_sink_caps` already
    /// handles it monotonically.
    Secret,
}
impl SourceKind {
    /// Map this source kind to its [`Sensitivity`] tier.
    ///
    /// The result drives the `Cap::DATA_EXFIL` cap-suppression decision in
    /// `ast.rs`. Only `UserInput` is `Plain`; every other kind is
    /// `Sensitive`. No kind currently maps to `Secret`.
    pub fn sensitivity(self) -> Sensitivity {
        // The match stays exhaustive (no `_` arm) so that adding a new
        // `SourceKind` variant forces an explicit sensitivity decision.
        match self {
            // Plain user-controlled input: the user already has the data,
            // so sending it back out via an outbound request is not a
            // disclosure.
            Self::UserInput => Sensitivity::Plain,
            // Operator-bound state (cookies, headers, env/config, file
            // system, database): leaking it via an outbound request is a
            // real cross-boundary disclosure.
            Self::Cookie
            | Self::Header
            | Self::EnvironmentConfig
            | Self::FileSystem
            | Self::Database
            // Caught exceptions can carry stack traces, db errors and
            // internal paths, so they are sensitive by default.
            | Self::CaughtException
            // Unclassified sources default to the conservative tier:
            // surface findings rather than silently drop them.
            | Self::Unknown => Sensitivity::Sensitive,
        }
    }
}
/// Infer the source kind from capabilities and callee name.
pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
let cl = callee.to_ascii_lowercase();
// Cookie / Header are checked *before* the generic user-input bucket
// because they imply higher sensitivity (auth material, session ids).
// The generic UserInput substrings (`request`, `header`, `cookie`)
// would otherwise swallow these.
//
// Session stores carry auth material (CSRF tokens, signed user ids) of
// the same sensitivity tier as raw cookies, so route them through the
// `Cookie` arm.
// NOTE(review): `cl` is lower-cased above, so this substring check also
// matches a capitalised `Session` (e.g. a `Session` constructor name);
// nothing in this function excludes it. Presumably such callees are
// never labelled as sources and so never reach this point — confirm
// against the callers.
if cl.contains("cookie") || cl.contains("session") {
return SourceKind::Cookie;
}
if cl.contains("header") {
return SourceKind::Header;
}
// User input patterns
if cl.contains("argv")
|| cl.contains("stdin")
@ -498,11 +578,23 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
|| cl.contains("params")
|| cl.contains("input")
|| cl.contains("body")
|| cl.contains("header")
|| cl.contains("cookie")
|| cl.contains("location")
|| cl.contains("document.url")
|| cl.contains("document.referrer")
// PHP superglobals: the AST text preserves the `$` (member-text
// extraction reads the `variable_name` node verbatim) so we match
// both `$_POST` and the `_POST` form some collectors emit.
// `$_REQUEST` already matches via the `request` substring above;
// `$_COOKIE` / `$_SESSION` route through the Cookie tier earlier in
// the function. `$_SERVER` is operator-state-bearing (auth headers
// etc.) so it stays Sensitive by falling through to the Unknown
// bucket.
|| cl == "$_get"
|| cl == "$_post"
|| cl == "$_files"
|| cl == "_get"
|| cl == "_post"
|| cl == "_files"
{
return SourceKind::UserInput;
}
@ -542,6 +634,8 @@ pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
pub fn severity_for_source_kind(kind: SourceKind) -> crate::patterns::Severity {
match kind {
SourceKind::UserInput => crate::patterns::Severity::High,
SourceKind::Cookie => crate::patterns::Severity::High,
SourceKind::Header => crate::patterns::Severity::High,
SourceKind::EnvironmentConfig => crate::patterns::Severity::High,
SourceKind::FileSystem => crate::patterns::Severity::Medium,
SourceKind::Database => crate::patterns::Severity::Medium,
@ -986,11 +1080,20 @@ pub fn classify_gated_sink(
None => return out,
};
// Match against the original callee text AND a chain-normalised form
// that strips `()` between dots so a chained construction like
// `httpx.AsyncClient().post` matches a gate matcher of
// `httpx.AsyncClient.post`. Mirrors the normalisation applied by
// `classify` for flat label rules.
let callee_bytes = callee_text.as_bytes();
let normalized = normalize_chained_call(callee_text);
let normalized_bytes = normalized.as_bytes();
for gate in *gates {
let matcher = gate.callee_matcher.as_bytes();
if !match_suffix_cs(callee_bytes, matcher, gate.case_sensitive) {
if !match_suffix_cs(callee_bytes, matcher, gate.case_sensitive)
&& !match_suffix_cs(normalized_bytes, matcher, gate.case_sensitive)
{
continue;
}
@ -1473,26 +1576,69 @@ mod tests {
// CVE Hunt Session 2 (Go CVE-2023-3188 Owncast SSRF):
// `http.DefaultClient.Get/Post/Head/Do/PostForm` is the idiomatic Go
// SSRF sink shape (`http.DefaultClient` is the package-level shared
// `*http.Client`). Bare `Get`/`Post` matchers would over-match
// unrelated method names; the explicit `http.DefaultClient.*` matcher
// restricts the suffix-match to the stdlib helper while leaving
// user-defined `myClient.Get` alone (no false positives).
// `*http.Client`). These callees migrated from a flat `Sink(SSRF)`
// rule to destination-aware gated sinks so that DATA_EXFIL gates can
// coexist on the same callee (e.g. `http.DefaultClient.Post(url, _,
// body)` carries SSRF on arg 0 and DATA_EXFIL on arg 2). The
// assertions below check the gate registration rather than the flat
// classifier output.
#[test]
fn classify_go_http_default_client_get_is_ssrf_sink() {
let result = classify("go", "http.DefaultClient.Get", None);
assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
/// Go: `http.DefaultClient.Get` must produce a gated-sink match labelled
/// `Sink(SSRF)`.
fn classify_go_http_default_client_get_is_ssrf_gate() {
    // Every callback is a stub that reports nothing, so the assertion only
    // depends on a gate being registered for this callee.
    let kw_lookup_none = |_: &str| None;
    let kw_never_present = |_: &str| false;
    let result = classify_gated_sink(
        "go",
        "http.DefaultClient.Get",
        |_| None,
        kw_lookup_none,
        kw_never_present,
    );

    let has_ssrf = result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF));
    assert!(has_ssrf, "expected SSRF gate match, got {result:?}");
}
#[test]
fn classify_go_http_default_client_post_is_ssrf_sink() {
let result = classify("go", "http.DefaultClient.Post", None);
assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
/// Go: `http.DefaultClient.Post` must produce gated-sink matches for both
/// `Sink(SSRF)` and `Sink(DATA_EXFIL)`.
fn classify_go_http_default_client_post_is_ssrf_and_data_exfil_gate() {
    // Every callback is a stub that reports nothing, so the assertions only
    // depend on the gates being registered for this callee.
    let kw_lookup_none = |_: &str| None;
    let kw_never_present = |_: &str| false;
    let result = classify_gated_sink(
        "go",
        "http.DefaultClient.Post",
        |_| None,
        kw_lookup_none,
        kw_never_present,
    );

    let has_ssrf = result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF));
    assert!(has_ssrf, "expected SSRF gate match, got {result:?}");

    let has_data_exfil = result
        .iter()
        .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL));
    assert!(
        has_data_exfil,
        "expected DATA_EXFIL gate match, got {result:?}"
    );
}
#[test]
fn classify_go_http_default_client_do_is_ssrf_sink() {
let result = classify("go", "http.DefaultClient.Do", None);
assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
/// Go: `http.DefaultClient.Do` must produce a gated-sink match labelled
/// `Sink(DATA_EXFIL)`.
fn classify_go_http_default_client_do_is_data_exfil_gate() {
    // Every callback is a stub that reports nothing, so the assertion only
    // depends on a gate being registered for this callee.
    let kw_lookup_none = |_: &str| None;
    let kw_never_present = |_: &str| false;
    let result = classify_gated_sink(
        "go",
        "http.DefaultClient.Do",
        |_| None,
        kw_lookup_none,
        kw_never_present,
    );

    let has_data_exfil = result
        .iter()
        .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL));
    assert!(
        has_data_exfil,
        "expected DATA_EXFIL gate match, got {result:?}"
    );
}
#[test]

View file

@ -1,4 +1,6 @@
use crate::labels::{Cap, DataLabel, Kind, LabelRule, ParamConfig, RuntimeLabelRule};
use crate::labels::{
Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, RuntimeLabelRule, SinkGate,
};
use crate::utils::project::{DetectedFramework, FrameworkContext};
use phf::{Map, phf_map};
@ -138,8 +140,67 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// ── Cross-boundary data exfiltration ──────────────────────────────────
//
// Body-bearing outbound HTTP verb methods on the major PHP HTTP clients.
// Flat sinks here compose with the SSRF rule on `curl_exec` /
// `file_get_contents` via multi-label classification. The
// source-sensitivity gate in `effective_sink_caps` strips DATA_EXFIL
// when the contributing source is `Plain` (`$_GET`, `$_POST`, `$_REQUEST`),
// so this only fires for sensitive sources (cookies / sessions /
// server-side state / env / file / db reads).
//
// Covered clients:
// * `Guzzle\Client::post/put/patch` — guzzlehttp/guzzle
// matched by suffix on the verb method (chained `$client->post(...)`).
// * `Symfony\HttpClient::request` — symfony/http-client
// request($method, $url, ['body' => $payload, 'json' => $data, ...])
// * `Http::post` — Laravel HTTP facade (over Guzzle)
LabelRule {
matchers: &[
"Client.post",
"Client.put",
"Client.patch",
"Client.request",
"HttpClient.post",
"HttpClient.put",
"HttpClient.patch",
"HttpClient.request",
"Http.post",
"Http.put",
"Http.patch",
],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: true,
},
];
/// PHP sinks that activate only when a specific argument selects a
/// dangerous mode.
///
/// The single entry models `curl_setopt($ch, CURLOPT_POSTFIELDS, $payload)`,
/// the canonical non-OO way of binding an outbound HTTP request body in PHP.
/// The option constant at argument index 1 decides whether the call matters:
/// `CURLOPT_POSTFIELDS` (and the byref-copying variant
/// `CURLOPT_COPYPOSTFIELDS`) carry the request body, whereas the other
/// `CURLOPT_*` constants designate URL / auth / TLS / behaviour and are not
/// DATA_EXFIL-relevant. Matching on the constant's identifier keeps the gate
/// from over-firing on `curl_setopt($ch, CURLOPT_URL, $url)`, which is
/// covered elsewhere by the `curl_exec` SSRF flat sink.
///
/// Identifier-based activation relies on the macro-arg fallback in
/// `cfg::mod::classify_gated_sink` for `lang == "php"`.
pub static GATED_SINKS: &[SinkGate] = &[
    SinkGate {
        label: DataLabel::Sink(Cap::DATA_EXFIL),
        callee_matcher: "curl_setopt",
        case_sensitive: true,
        activation: GateActivation::ValueMatch,
        // Argument 1 is the CURLOPT_* constant inspected for activation.
        arg_index: 1,
        dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"],
        dangerous_prefixes: &[],
        // This gate has no keyword-argument configuration.
        keyword_name: None,
        dangerous_kwargs: &[],
        // Argument 2 (`$payload`) is the payload position.
        payload_args: &[2],
    },
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
// control-flow
"if_statement" => Kind::If,

View file

@ -44,6 +44,34 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// Session stores: session cookies / DRF / Django auth carry auth material
// the operator did not intend to leak. `infer_source_kind` maps `session`
// callees to `SourceKind::Cookie` (Sensitive) so flowing into an outbound
// request payload fires `DATA_EXFIL`. Case-sensitive: lowercase `session`
// here is the Flask global / Django request attribute; the capitalised
// `requests.Session` constructor is a client object, not a source, and
// must not be tagged.
//
// The matchers cover both attribute access (`request.session.user_id`,
// resolved as the attribute text) and the bare `session.<method>`
// pattern that follows `from flask import session`. The `=session`
// exact-match form fires only when the call is the bare top-level
// `session(...)` so accidental field projections like
// `obj.client.session` (Phase 2 chained-receiver lowering) don't get
// mis-labelled as sources.
LabelRule {
matchers: &[
"request.session",
"flask_request.session",
"flask.session",
"django.contrib.sessions",
"=session",
"session.get",
"session.pop",
],
label: DataLabel::Source(Cap::all()),
case_sensitive: true,
},
// Django-specific sources (case-sensitive to avoid request.get() dict method FP)
LabelRule {
matchers: &[
@ -208,58 +236,25 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::FILE_IO),
case_sensitive: false,
},
// Outbound HTTP — flat SSRF sinks for read-shaped methods (GET / HEAD)
// that don't carry a body. Body-bearing methods (POST / PUT / PATCH /
// DELETE / request) are modelled via destination-aware gates in
// GATED_SINKS so SSRF activation can be narrowed to the URL position
// and the cross-boundary `DATA_EXFIL` cap can attach to body kwargs as
// a separate gate. `urllib.request.urlopen` stays flat: its argument
// is a Request object whose payload-vs-URL split happens at
// `urllib.request.Request` construction (gated below).
LabelRule {
matchers: &[
"urllib.request.urlopen",
"requests.get",
"requests.post",
"requests.put",
"requests.delete",
"requests.patch",
"requests.head",
"requests.request",
"httpx.get",
"httpx.post",
"httpx.put",
"httpx.delete",
"httpx.patch",
"httpx.head",
"httpx.request",
],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// aiohttp HTTP client, SSRF sinks
LabelRule {
matchers: &[
"aiohttp.get",
"aiohttp.post",
"aiohttp.put",
"aiohttp.delete",
"aiohttp.request",
],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// Type-qualified SSRF sinks: when the receiver is tracked as
// TypeKind::HttpClient (e.g. `client = requests.Session()`,
// `client = httpx.Client()`, or `s = aiohttp.ClientSession()`),
// resolve_type_qualified_labels() constructs `"HttpClient.<method>"`
// call texts so the receiver-name is no longer load-bearing. Matches
// the existing Rust HttpClient.<method> sink set so both languages
// stay in step on the type-aware SSRF model. Motivated by the
// upstream LMDeploy CVE-2026-33626 shape:
// client = requests.Session()
// response = client.get(url, ...)
LabelRule {
matchers: &[
"aiohttp.head",
"HttpClient.get",
"HttpClient.post",
"HttpClient.put",
"HttpClient.delete",
"HttpClient.patch",
"HttpClient.head",
"HttpClient.request",
"HttpClient.send",
],
label: DataLabel::Sink(Cap::SSRF),
@ -332,6 +327,687 @@ pub static GATED_SINKS: &[SinkGate] = &[
dangerous_kwargs: &[("shell", &["True", "true"])],
activation: GateActivation::ValueMatch,
},
// ── Outbound HTTP clients (SSRF + cross-boundary data exfiltration) ───
//
// Body-bearing methods (POST / PUT / PATCH / DELETE / request) are
// gated by destination so that:
// * SSRF fires only when taint reaches the URL position (arg 0).
// * `DATA_EXFIL` fires only when taint reaches a body kwarg (`data` /
// `json` / `files` for requests; `data` / `json` for aiohttp;
// `content` / `data` / `json` / `files` for httpx and the
// type-qualified `HttpClient.<method>` gates).
// The pair lets a single `requests.post(taintedUrl, data=secret)` call
// report SSRF on the URL flow and DATA_EXFIL on the body flow as
// independent findings rather than a conflated combined cap.
//
// CFG-level kwarg-aware extraction (see `extract_destination_kwarg_pairs`)
// walks `keyword_argument` siblings and routes matching idents into the
// gate's `destination_uses` so the SSA sink scan only fires when the
// body kwarg itself is tainted.
//
// The source-sensitivity gate in `ast.rs` strips DATA_EXFIL when the
// contributing source is `Sensitivity::Plain` (raw `request.args`,
// `request.form`), so plain user input forwarded to a POST body does
// not surface — only sensitive sources (cookies, sessions, env, headers)
// produce a DATA_EXFIL finding.
SinkGate {
callee_matcher: "requests.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
SinkGate {
callee_matcher: "requests.put",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.put",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
SinkGate {
callee_matcher: "requests.patch",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.patch",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
SinkGate {
callee_matcher: "requests.delete",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.delete",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
// requests.request(method, url, ...) — note the URL is at arg 1, not
// arg 0; method is at arg 0. Body kwargs at arg 2+ via kwarg expansion.
SinkGate {
callee_matcher: "requests.request",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.request",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
// httpx — `content` is httpx's raw-bytes body kwarg; `data` covers
// form-encoded; `json` covers JSON-encoded; `files` covers multipart.
SinkGate {
callee_matcher: "httpx.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "httpx.put",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.put",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "httpx.patch",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.patch",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "httpx.delete",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.delete",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
// httpx.request(method, url, ...) — same shape as requests.request.
SinkGate {
callee_matcher: "httpx.request",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.request",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
// Type-qualified variants: `requests.Session()`, `httpx.Client()`,
// `httpx.AsyncClient()`, `aiohttp.ClientSession()` instances all resolve
// to the synthetic `HttpClient.<method>` callee text via
// `resolve_type_qualified_labels`. Covering both module-level and
// type-qualified forms ensures `s = requests.Session(); s.post(url, data=x)`
// and `client = httpx.AsyncClient(); await client.post(url, json=x)` both
// fire SSRF on the URL and DATA_EXFIL on the body kwarg.
SinkGate {
callee_matcher: "HttpClient.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "HttpClient.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "HttpClient.put",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "HttpClient.put",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "HttpClient.patch",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "HttpClient.patch",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "HttpClient.delete",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "HttpClient.delete",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "HttpClient.request",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "HttpClient.request",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
// aiohttp module-level (`aiohttp.post`, `aiohttp.put`, etc.) — uncommon
// in real code (idiomatic usage is `async with aiohttp.ClientSession()`),
// covered for completeness. ClientSession.<method> dispatches via the
// type-qualified `HttpClient.<method>` gates above.
SinkGate {
callee_matcher: "aiohttp.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "aiohttp.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json"],
},
},
SinkGate {
callee_matcher: "aiohttp.put",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "aiohttp.put",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json"],
},
},
SinkGate {
callee_matcher: "aiohttp.request",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "aiohttp.request",
arg_index: 2,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[2],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json"],
},
},
// Chained-construction variants: `httpx.AsyncClient().post(url, json=x)`
// / `httpx.Client().post(url, ...)` / `aiohttp.ClientSession().post(...)`.
// Chain-normalisation strips `()` between dots so the callee text
// becomes `httpx.AsyncClient.post`; gate matching applies to that
// normalised form so the chained shape is covered without binding to
// an intermediate variable.
SinkGate {
callee_matcher: "httpx.AsyncClient.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.AsyncClient.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "httpx.Client.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "httpx.Client.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["content", "data", "json", "files"],
},
},
SinkGate {
callee_matcher: "aiohttp.ClientSession.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "aiohttp.ClientSession.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json"],
},
},
SinkGate {
callee_matcher: "requests.Session.post",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "requests.Session.post",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data", "json", "files"],
},
},
// urllib.request.urlopen(req) — when req is a `urllib.request.Request`
// built with the `data` kwarg, that kwarg becomes the POST body. The
// gate fires on `Request(url, data=tainted)` directly: the constructor
// does not egress, but the convention is that wrapping data in a Request
// means egress is imminent (the urllib.request.Request → urlopen path).
// This is a heuristic — the real egress happens at urlopen, but tracking
// the data flow through the constructor is a fair static approximation.
SinkGate {
callee_matcher: "urllib.request.Request",
arg_index: 1,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
payload_args: &[1],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &["data"],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -28,6 +28,16 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// Sensitive request state: cookies and session stores carry auth material
// / CSRF tokens / signed user ids the operator did not intend to leak.
// `infer_source_kind` routes substrings containing "cookie" or "session"
// through `SourceKind::Cookie` (Sensitive), so flow into outbound request
// payloads activates the `DATA_EXFIL` cap added below.
LabelRule {
matchers: &["request.cookies", "request.session", "cookies", "session"],
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// ───────── Sanitizers ──────────
LabelRule {
matchers: &["CGI.escapeHTML", "ERB::Util.html_escape"],
@ -135,6 +145,55 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// ── Cross-boundary data exfiltration ──────────────────────────────────
//
// Body-bearing outbound HTTP verb methods. A flat Sink(DATA_EXFIL) here
// composes with the SSRF rule above via multi-label classification:
// `Net::HTTP.post(uri, payload)` reports SSRF on the URL flow (arg 0)
// and DATA_EXFIL on the body flow (arg 1+) as separate findings. The
// source-sensitivity gate in `effective_sink_caps` strips DATA_EXFIL
// when the contributing source is `Plain` (raw `params`), so this only
// fires for sensitive sources (cookies / session / env / headers /
// file / db reads).
//
// Covered clients:
// * `Net::HTTP.post(uri, data, headers)` — stdlib
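// * `Net::HTTP::Post.new(path)` body= setter — emitted as
// `Net::HTTP::Post.body=` after Ruby setter normalisation; flat rule
// ensures any tainted assignment to `.body` smears into the request
// (NOTE(review): no `body=` matcher appears in the list below —
// confirm it is registered elsewhere)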
// * `RestClient.post(url, payload, headers)` — rest-client gem
// * `Faraday.post(url, body, headers)` — faraday
// * `HTTParty.post(url, body: ..., headers: ...)` — already a Sink(SSRF)
// above, DATA_EXFIL adds independently
// * `Typhoeus.post(url, body: ...)` — typhoeus
LabelRule {
matchers: &[
"Net::HTTP.post",
"RestClient.post",
"RestClient.put",
"RestClient.patch",
"Faraday.post",
"Faraday.put",
"Faraday.patch",
"HTTParty.post",
"HTTParty.put",
"HTTParty.patch",
"Typhoeus.post",
"Typhoeus.put",
"Typhoeus.patch",
],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
// Generic outbound-method suffix matchers for chained / typed receivers
// (e.g. `client.post(payload)` where `client` is a configured Faraday or
// RestClient instance). Suffix-match keeps the rule compact; source
// sensitivity gates noise from plain user input.
LabelRule {
matchers: &["HttpClient.post", "HttpClient.put", "HttpClient.patch"],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
LabelRule {
matchers: &["Marshal.load", "Marshal.restore", "YAML.load"],
label: DataLabel::Sink(Cap::DESERIALIZE),

View file

@ -19,6 +19,34 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// Inbound HTTP request metadata: headers, cookies, query strings,
// and body extractors. These only carry caller-supplied bytes when
// the framework binds them (the framework-conditional rules attach
// the same labels for axum / actix / rocket extractors). Including
// the bare suffix matchers here means a `req.headers().get("h")`
// chain in non-framework code (e.g. internal helpers that take an
// `&HeaderMap`) still surfaces as a Source. `infer_source_kind`
// routes these to `Header` / `Cookie` (Sensitive), enabling
// DATA_EXFIL gating downstream.
LabelRule {
matchers: &[
// Type-qualified (receiver typed as HttpRequest, HeaderMap, ...)
"HttpRequest.headers",
"HttpRequest.cookie",
"HttpRequest.cookies",
"Request.headers",
"Request.cookies",
"Request.uri",
// Bare HeaderMap / cookie-jar accessors.
"headers.get",
"headers.get_all",
"CookieJar.get",
"CookieJar.get_private",
"CookieJar.get_signed",
],
label: DataLabel::Source(Cap::all()),
case_sensitive: false,
},
// ───────── Sanitizers ──────────
LabelRule {
matchers: &["html_escape::encode_safe", "sanitize_", "sanitize_html"],
@ -75,6 +103,34 @@ pub static RULES: &[LabelRule] = &[
"reqwest::Client.head",
"reqwest::Client.patch",
"reqwest::Client.request",
// Chained constructor + verb form: `reqwest::Client::new()
// .post(url)` reduces (via root-receiver collapse) to chain
// text `Client::new.post`, so existing `Client.post` matchers
// miss it. Cover the chained shape directly.
"Client::new.get",
"Client::new.post",
"Client::new.put",
"Client::new.delete",
"Client::new.head",
"Client::new.patch",
"Client::new.request",
// surf free verbs are themselves SSRF gates: the URL is
// their first positional argument.
"surf::get",
"surf::post",
"surf::put",
"surf::delete",
"surf::head",
"surf::patch",
"surf::connect",
"surf::trace",
// ureq free verbs are HTTP request initiators.
"ureq::get",
"ureq::post",
"ureq::put",
"ureq::delete",
"ureq::patch",
"ureq::head",
// Type-qualified (receiver typed as HttpClient)
"HttpClient.get",
"HttpClient.post",
@ -89,6 +145,68 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SSRF),
case_sensitive: false,
},
// Cross-boundary data exfiltration sinks. Outbound HTTP egress where
// a Sensitive source (env, header, cookie, file, db) reaching the
// request body / payload is a leak distinct from SSRF. Plain user
// input is silenced by the source-sensitivity gate, so these only
// fire when the source carries operator-bound state.
//
// Body-binding methods on the request builder: `body`, `json`, `form`,
// `multipart` (reqwest); `body_string`, `body_json`, `body_bytes`
// (surf); `send_string`, `send_json`, `send_form` (ureq, which
// combines body-bind and dispatch). Plus `.send()` on an HttpClient
// / RequestBuilder, where the chain receiver is typed. Chain text
// matchers like `body.send` cover the all-in-one form
// `Client::post(url).body(payload).send()`.
LabelRule {
matchers: &[
// Type-qualified terminal verbs (split form, typed receiver).
"HttpClient.send",
"HttpClient.execute",
"RequestBuilder.send",
// Type-qualified body-bind methods on a typed RequestBuilder.
"RequestBuilder.body",
"RequestBuilder.json",
"RequestBuilder.form",
"RequestBuilder.multipart",
"RequestBuilder.body_string",
"RequestBuilder.body_json",
"RequestBuilder.body_bytes",
"RequestBuilder.send_string",
"RequestBuilder.send_json",
"RequestBuilder.send_form",
// surf / ureq method names that are unambiguous in Rust:
// they only appear on HTTP request builders, so a bare-name
// suffix matcher is safe.
"body_string",
"body_json",
"body_bytes",
"send_string",
"send_json",
"send_form",
// Reqwest chain shapes. After paren-group strip the chain
// text becomes `Client::post.body.send`, so the body-bind
// verb sits before `.send` and a `body.send` suffix matcher
// pins exfil-only firing to chains that actually bind a body.
"body.send",
"json.send",
"form.send",
"multipart.send",
// hyper `Request::builder().method(...).body(payload)`: the
// body-bind step is the leak point. `.unwrap` is a common
// trailing identity method; we cover both shapes.
"Request::builder.body",
"Request::builder.method.body",
"Request::builder.method.body.unwrap",
"Request::builder.body.unwrap",
// Two-step reqwest where the user has a dedicated `Client`
// variable and uses `.execute(req)` on it.
"Client::new.send",
"Client::new.execute",
],
label: DataLabel::Sink(Cap::DATA_EXFIL),
case_sensitive: false,
},
LabelRule {
matchers: &[
"rusqlite::Connection.execute",

View file

@ -92,6 +92,22 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// Conventional forwarding wrappers, telemetry / analytics / metrics dispatch.
// See javascript.rs for rationale; mirrored here so TypeScript projects pick
// up the same convention. Override per-project via
// [analysis.languages.typescript] custom rules.
LabelRule {
matchers: &[
"serializeForUpstream",
"forwardPayload",
"tracker.send",
"analytics.track",
"metrics.report",
"logEvent",
],
label: DataLabel::Sanitizer(Cap::DATA_EXFIL),
case_sensitive: false,
},
// Conventional project-local HTML escapers. Suffix word-boundary match
// fires on bare calls to locally defined helpers (`function escapeHtml(x)`
// invoked as `escapeHtml(x)`) across codebases that follow the common
@ -113,18 +129,21 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::HTML_ESCAPE),
case_sensitive: false,
},
// Shell-exec sinks. Qualified `child_process.*` and bare forms are both
// flat sinks; receiver-name collisions are handled via EXCLUDES; the
// `=*` gates in `GATED_SINKS` below restrict checked args to arg 0
// (command string) so `execSync(cmd, { env: process.env })` no longer
// flags `process.env` flowing into the options object. See
// javascript.rs for full rationale.
LabelRule {
matchers: &[
"child_process.exec",
"child_process.execSync",
"child_process.spawn",
"child_process.execFile",
// Bare forms from destructured imports:
// const { exec, execSync } = require('child_process')
"exec",
"execSync",
"execFile",
// Common promisified wrappers around child_process.exec
"execAsync",
"execPromise",
],
@ -227,16 +246,12 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: false,
},
// ORM / query builder raw-SQL entry points
// ORM / query builder raw-SQL entry points. `$queryRawUnsafe` /
// `$executeRawUnsafe` are gated below — only arg 0 (the SQL template) is
// the injection vector; positional bind params are bound as `$1..$N`.
// See javascript.rs for the full rationale.
LabelRule {
matchers: &[
"sequelize.query",
"knex.raw",
"$queryRaw",
"$queryRawUnsafe",
"$executeRaw",
"$executeRawUnsafe",
],
matchers: &["sequelize.query", "knex.raw", "$queryRaw", "$executeRaw"],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
},
@ -264,6 +279,9 @@ pub static EXCLUDES: &[&str] = &[
"req.app",
"req.route",
"req.next",
// Dockerode container API — see javascript.rs EXCLUDES for rationale.
"container.exec",
"exec.start",
];
pub static GATED_SINKS: &[SinkGate] = &[
@ -478,6 +496,113 @@ pub static GATED_SINKS: &[SinkGate] = &[
object_destination_fields: &["body", "headers", "json"],
},
},
// ── Shell-exec sinks (SHELL_ESCAPE) ──────────────────────────────────
// See javascript.rs for the rationale. Only arg 0 (command string)
// carries the shell-injection payload; bare forms use `=` exact-only
// matching so they don't collide with any `<receiver>.exec` method.
// Qualified `child_process.*` forms stay as flat sinks; gates only fire
// when no flat sink classifies the call, so the bare destructured-import
// forms below are the only place where shell-exec needs gating.
SinkGate {
callee_matcher: "=exec",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execSync",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execFile",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execAsync",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "=execPromise",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
// ── Prisma raw-SQL with positional bind params (SQL_QUERY) ───────────
// See javascript.rs for rationale.
SinkGate {
callee_matcher: "$queryRawUnsafe",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
SinkGate {
callee_matcher: "$executeRawUnsafe",
arg_index: 0,
dangerous_values: &[],
dangerous_prefixes: &[],
label: DataLabel::Sink(Cap::SQL_QUERY),
case_sensitive: true,
payload_args: &[0],
keyword_name: None,
dangerous_kwargs: &[],
activation: GateActivation::Destination {
object_destination_fields: &[],
},
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -207,6 +207,18 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
props.insert("confidence".into(), json!(conf.to_string()));
}
// `DATA_EXFIL` findings carry the destination object-literal
// field the leak reached (`body` / `headers` / `json`); surface
// it so SARIF consumers can pivot per-destination without
// reparsing the message.
if let Some(field) = d
.evidence
.as_ref()
.and_then(|ev| ev.data_exfil_field.as_deref())
{
props.insert("data_exfil_field".into(), json!(field));
}
// Alternative-path cross-references. When the dedup pass
// at `taint::analyse_file` preserves both a validated and
// an unvalidated flow for the same `(body, sink, source)`,

View file

@ -666,6 +666,8 @@ mod tests {
exception_edges: vec![],
field_interner: self.field_interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
}
@ -880,6 +882,8 @@ mod tests {
exception_edges: vec![],
field_interner: FieldInterner::new(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let facts = analyse_body(&body, body_id());
assert!(facts.is_trivial());

View file

@ -206,7 +206,16 @@ pub fn rank_diags(diags: &mut [Diag]) {
/// Bonus based on analysis kind inferred from rule ID + evidence.
fn analysis_kind_bonus(rule_id: &str, evidence: Option<&Evidence>) -> f64 {
if rule_id.starts_with("taint-") {
if rule_id.starts_with("taint-data-exfiltration") {
// DATA_EXFIL ranks below SSRF / SQLi / CMDi: the leak class is
// a softer signal than direct payload-driven exploitation, so
// the taint-class bonus is trimmed (-3) to seat data-exfil
// findings between general taint flows and AST/CFG patterns.
// The source-kind bonus (`evidence_strength`) already separates
// cookie / env / header from less attacker-relevant origins,
// so this bonus is the only ranking discount applied.
7.0
} else if rule_id.starts_with("taint-") {
// Taint-confirmed flow is the strongest signal
10.0
} else if rule_id.starts_with("state-") {

View file

@ -1179,6 +1179,7 @@ fn type_kind_tag(k: &TypeKind) -> String {
TypeKind::Url => "Url".into(),
TypeKind::HttpClient => "HttpClient".into(),
TypeKind::LocalCollection => "LocalCollection".into(),
TypeKind::RequestBuilder => "RequestBuilder".into(),
TypeKind::Dto(_) => "Dto".into(),
}
}
@ -1872,6 +1873,7 @@ function consume() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -2026,6 +2028,8 @@ async function recentAuditLogs() {
exception_edges: vec![],
field_interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let facts = analyse_body(&body, BodyId(0));

View file

@ -104,6 +104,14 @@ pub fn issue_categories(
}
fn issue_category_label(rule_id: &str) -> &'static str {
// `taint-data-exfiltration` and the legacy `taint-unsanitised-flow`
// share the `taint` family token, but the exfil class targets a
// different threat (sensitive data leaving the trust boundary, not
// attacker payload entering it). Surface it as its own bucket so the
// dashboard category badge matches the rule semantics.
if rule_id.starts_with("taint-data-exfiltration") {
return "Data Exfiltration";
}
match extract_family(rule_id) {
"sqli" => "SQL Injection",
"xss" => "Cross-Site Scripting",
@ -221,6 +229,26 @@ mod tests {
assert_eq!(out[2].count, 2);
}
#[test]
fn issue_category_label_routes_data_exfil_to_dedicated_bucket() {
    // Both the bare rule id and the suffixed per-source form must land in
    // the dedicated exfil bucket, even though they share the `taint` family
    // token with `taint-unsanitised-flow`.
    let exfil_rule_ids = [
        "taint-data-exfiltration",
        "taint-data-exfiltration (source 1:1)",
    ];
    for rule_id in exfil_rule_ids {
        assert_eq!(issue_category_label(rule_id), "Data Exfiltration");
    }

    // A generic taint finding keeps the broader category label.
    let generic_label = issue_category_label("taint-unsanitised-flow");
    assert_eq!(generic_label, "Tainted Flow");
}
#[test]
fn issue_category_label_recognises_simple_families() {
assert_eq!(

View file

@ -445,6 +445,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
)],
)
@ -516,6 +517,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
false,
false,
@ -538,6 +541,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
true,
true,
@ -560,6 +565,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
true,
false,
@ -656,6 +663,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
)],
)

View file

@ -217,6 +217,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}

View file

@ -638,6 +638,8 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}

View file

@ -215,6 +215,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, copy_map) = copy_propagate(&mut body, &cfg);
@ -296,6 +298,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, copy_map) = copy_propagate(&mut body, &cfg);
@ -366,6 +370,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
(cfg, body)
}
@ -488,6 +494,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, _map) = copy_propagate(&mut body, &cfg);
assert_eq!(eliminated, 0, "two-operand Assign is not a copy");
@ -567,6 +575,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, _) = copy_propagate(&mut body, &cfg);
assert_eq!(eliminated, 1, "v1 should be eliminated");
@ -664,6 +674,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, _map) = copy_propagate(&mut body, &cfg);
assert_eq!(eliminated, 1);
@ -712,6 +724,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let (eliminated, map) = copy_propagate(&mut body, &cfg);
assert_eq!(eliminated, 0);

View file

@ -217,6 +217,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -265,6 +267,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -314,6 +318,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -359,6 +365,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -396,6 +404,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -460,6 +470,8 @@ mod tests {
exception_edges: vec![],
field_interner: interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -527,6 +539,8 @@ mod tests {
exception_edges: vec![],
field_interner: interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -587,6 +601,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -637,6 +653,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -724,6 +742,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);
@ -801,6 +821,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let removed = eliminate_dead_defs(&mut body, &cfg);

View file

@ -788,6 +788,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let errs = check_structural_invariants(&body);
assert!(
@ -835,6 +837,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let errs = check_structural_invariants(&body);
assert!(
@ -885,6 +889,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let errs = check_structural_invariants(&body);
assert!(
@ -913,6 +919,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let errs = check_structural_invariants(&body);
assert!(

View file

@ -4,7 +4,7 @@ use crate::ssa::type_facts::TypeKind;
use petgraph::graph::NodeIndex;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
/// Unique identifier for an SSA value (one per definition point).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
@ -353,6 +353,26 @@ pub struct SsaBody {
/// cleanly with an empty map (no migration needed).
#[serde(default)]
pub field_writes: HashMap<SsaValue, (SsaValue, FieldId)>,
/// SSA values that lowering injected for **free / closure-captured**
/// variables (variables referenced by the body but not declared as
/// formal parameters and not assigned within the body).
///
/// Lowering models every external use as an [`SsaOp::Param`] in block
/// 0 so the rename pass can reference it. Real formal parameters and
/// closure captures end up using the same op variant; this side-table
/// distinguishes the two so downstream analyses (in particular the
/// JS/TS handler-name auto-seed in
/// [`crate::taint::ssa_transfer`]) can avoid treating closure
/// captures as if they were the function's own parameters. Without
/// this distinction, a lambda body that references an out-of-scope
/// `userId` / `cmd` / `payload` would have the synthetic Param
/// auto-seeded as `UserInput`, producing a phantom source on the
/// enclosing function's declaration line.
///
/// `#[serde(default)]` for backward compatibility with summary blobs
/// produced before this field existed.
#[serde(default)]
pub synthetic_externals: HashSet<SsaValue>,
}
impl SsaBody {
@ -560,6 +580,7 @@ mod tests {
exception_edges: vec![],
field_interner: FieldInterner::new(),
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
let fid = body.intern_field("mu");
body.blocks[0].body.push(SsaInst {

View file

@ -239,18 +239,25 @@ fn lower_to_ssa_inner(
// 6. Rename variables (dominator tree preorder walk)
let dom_tree_children = build_dom_tree_children(num_blocks, &doms, &block_graph);
let (mut ssa_blocks, mut value_defs, cfg_node_map, field_interner, field_writes) =
rename_variables(
cfg,
&blocks_nodes,
&block_succs,
&block_preds,
&phi_placements,
&dom_tree_children,
&filtered_edges,
&external_vars,
&nop_nodes,
);
let (
mut ssa_blocks,
mut value_defs,
cfg_node_map,
field_interner,
field_writes,
synthetic_externals,
) = rename_variables(
cfg,
&blocks_nodes,
&block_succs,
&block_preds,
&phi_placements,
&dom_tree_children,
&filtered_edges,
&external_vars,
formal_params,
&nop_nodes,
);
// 6b. Fill any missing phi operands with a shared Undef sentinel so
// every phi has exactly one operand per predecessor. See
@ -306,6 +313,7 @@ fn lower_to_ssa_inner(
exception_edges,
field_interner,
field_writes,
synthetic_externals,
};
// 9. Catch-block reachability invariant.
@ -927,6 +935,7 @@ fn rename_variables(
dom_tree_children: &[Vec<usize>],
filtered_edges: &[(NodeIndex, NodeIndex, EdgeKind)],
external_vars: &[String],
formal_params: &[String],
nop_nodes: &HashSet<NodeIndex>,
) -> (
Vec<SsaBlock>,
@ -934,6 +943,7 @@ fn rename_variables(
HashMap<NodeIndex, SsaValue>,
crate::ssa::ir::FieldInterner,
HashMap<SsaValue, (SsaValue, crate::ssa::ir::FieldId)>,
HashSet<SsaValue>,
) {
let num_blocks = blocks_nodes.len();
let mut next_value: u32 = 0;
@ -1679,6 +1689,27 @@ fn rename_variables(
// Inject synthetic Param instructions at START of block 0 for external variables.
// These create SSA definitions so the rename pass can reference them.
// Pre-seed var_stacks so process_block sees them.
//
// `external_vars` contains both real formal parameters and free / closure-
// captured variables (variables read by the body but not declared as a
// formal and not assigned anywhere). Both end up emitted as
// [`SsaOp::Param`] in block 0; we record the SSA values that correspond
// to free vars in `synthetic_externals` so downstream analyses (the JS/TS
// handler-name auto-seed in particular) can avoid treating closure
// captures as if they were parameters of the function under analysis.
//
// **Conservative behaviour when `formal_params` is empty.** Several
// call sites (`lower_to_ssa`, `lower_to_ssa_scoped_nop`) don't supply
// formal parameter names; in that case we cannot distinguish formals
// from free vars structurally, so we leave `synthetic_externals` empty
// and the auto-seed pass keeps its pre-fix behaviour of treating every
// `Param` op as a candidate. Only callers that pass a non-empty
// `formal_params` slice (`lower_to_ssa_with_params`, used by the
// findings pipeline's per-function lowering) opt into the
// closure-capture distinction.
let mut synthetic_externals: HashSet<SsaValue> = HashSet::new();
let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect();
let track_synthetic = !formal_params.is_empty();
if !external_vars.is_empty() {
let entry_cfg_node = blocks_nodes[0][0];
let mut synthetic_body = Vec::with_capacity(external_vars.len());
@ -1691,7 +1722,8 @@ fn rename_variables(
cfg_node: entry_cfg_node,
block: BlockId(0),
});
let op = if is_receiver_name(var) {
let is_receiver = is_receiver_name(var);
let op = if is_receiver {
SsaOp::SelfParam
} else {
let op = SsaOp::Param {
@ -1700,6 +1732,28 @@ fn rename_variables(
positional_idx += 1;
op
};
// A non-receiver var is "synthetic" (a free / closure capture)
// when it is *not* one of the function's declared formals AND
// not a dotted access on a formal (`input.cmd` where `input` is
// a formal — it represents a structural projection of the
// formal, not a free variable; the auto-seed should still treat
// it as part of the formal's own taint surface). Receivers are
// intentionally excluded: `this` / `self` represent the implicit
// receiver, which always belongs to the function.
//
// Only fire when the caller supplied formal-parameter names; see
// the `track_synthetic` rationale above.
let root_is_formal = var
.split_once('.')
.map(|(root, _)| formal_set.contains(root))
.unwrap_or(false);
if track_synthetic
&& !is_receiver
&& !formal_set.contains(var.as_str())
&& !root_is_formal
{
synthetic_externals.insert(v);
}
synthetic_body.push(SsaInst {
value: v,
op,
@ -1784,6 +1838,7 @@ fn rename_variables(
cfg_node_map,
field_interner,
field_writes,
synthetic_externals,
)
}

View file

@ -417,6 +417,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}

View file

@ -440,6 +440,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let cfg: Cfg = Graph::new();
let const_values = HashMap::new();

View file

@ -25,6 +25,15 @@ pub enum TypeKind {
FileHandle,
Url,
HttpClient,
/// A pre-network HTTP request builder produced by `Client::post(url)`,
/// `surf::post(url)`, `Request::builder()`, `ureq::post(url)`, etc.
/// The body-bind methods (`body`, `json`, `form`, `multipart`,
/// `body_string`, `body_json`, `body_bytes`) and terminal verbs
/// (`send`, `send_string`, `send_json`, `send_form`) are sinks for
/// `DATA_EXFIL` when receiver-typed. Distinct from `HttpClient` so
/// type-qualified resolution can attach builder-only rules without
/// over-firing on plain client objects.
RequestBuilder,
/// A local, in-memory collection (HashMap, HashSet, Vec, etc.).
/// The auth sink gate uses this so calls like `map.insert(...)`
/// are treated as bookkeeping rather than cross-tenant sinks. No
@ -76,6 +85,7 @@ impl TypeKind {
Self::DatabaseConnection => Some("DatabaseConnection"),
Self::FileHandle => Some("FileHandle"),
Self::Url => Some("URL"),
Self::RequestBuilder => Some("RequestBuilder"),
_ => None,
}
}
@ -180,9 +190,10 @@ impl TypeFactResult {
///
/// Suppression policy:
/// * [`TypeKind::Int`] (and float, treated as numeric): suppresses
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF` ,
/// numeric values cannot carry the metacharacters required to drive
/// any of these injection classes.
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF`,
/// `DATA_EXFIL`: numeric values cannot carry the metacharacters
/// required to drive any of these injection classes, nor can they
/// encode credentials/tokens that meaningfully constitute leakage.
/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit, since
/// `true`/`false` cannot carry a payload of any kind.
pub fn is_type_safe_for_sink(
@ -191,8 +202,12 @@ pub fn is_type_safe_for_sink(
type_facts: &TypeFactResult,
) -> bool {
use crate::labels::Cap;
let type_suppressible =
Cap::SQL_QUERY | Cap::FILE_IO | Cap::SHELL_ESCAPE | Cap::HTML_ESCAPE | Cap::SSRF;
let type_suppressible = Cap::SQL_QUERY
| Cap::FILE_IO
| Cap::SHELL_ESCAPE
| Cap::HTML_ESCAPE
| Cap::SSRF
| Cap::DATA_EXFIL;
if !sink_caps.intersects(type_suppressible) {
return false;
}
@ -224,6 +239,13 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
"newHttpClient" | "newBuilder" if callee.contains("HttpClient") => {
Some(TypeKind::HttpClient)
}
// Apache HttpClient idiomatic factory:
// `CloseableHttpClient client = HttpClients.createDefault();`
// `HttpClients` contains the substring `HttpClient` so this
// doesn't widen to unrelated `createDefault` calls.
"createDefault" | "custom" if callee.contains("HttpClient") => {
Some(TypeKind::HttpClient)
}
"OkHttpClient" | "WebClient" | "RestTemplate" => Some(TypeKind::HttpClient),
"getConnection" => Some(TypeKind::DatabaseConnection),
"MongoClient" => Some(TypeKind::DatabaseConnection),
@ -340,6 +362,10 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
// so the auth sink gate recognises
// `let x = factory_fn(); x.insert(..)`.
Some(TypeKind::LocalCollection)
} else if is_rust_request_builder_constructor(base) {
// HTTP request-builder constructors across reqwest, surf,
// ureq, hyper. See [`is_rust_request_builder_constructor`].
Some(TypeKind::RequestBuilder)
} else {
None
}
@ -449,6 +475,54 @@ fn is_rust_local_collection_constructor(base: &str) -> bool {
})
}
/// Does the peeled Rust callee correspond to a known HTTP request-builder
/// constructor / factory?
///
/// Matching is purely a suffix test on `base`, so any leading path
/// qualification (`reqwest::`, `hyper::`, ...) is accepted. Covers:
/// * surf free verbs (`surf::post`, `surf::get`, ...);
/// * ureq free verbs (`ureq::post`, ...);
/// * hyper `Request::builder` (including `hyper::Request::builder`);
/// * reqwest `Client::post(url)` / `Client::get(url)` etc. (the `Client`
///   instance is itself an `HttpClient` but the verb call on it returns a
///   `RequestBuilder` whose chained methods bind body/json/form/etc.).
///
/// reqwest's `Client::new` on its own is not matched here: it produces the
/// client, not a builder.
///
/// Returns `true` when `base` ends with one of the shapes above, `false`
/// otherwise (including for the empty string).
fn is_rust_request_builder_constructor(base: &str) -> bool {
    /// `true` when `base` ends with `{prefix}{verb}` for some `verb` in
    /// `verbs`. The verb is peeled off with `strip_suffix` and the prefix is
    /// then checked on the remainder, so no temporary `String` is built.
    fn ends_with_prefixed_verb(base: &str, prefix: &str, verbs: &[&str]) -> bool {
        verbs.iter().any(|verb| {
            base.strip_suffix(*verb)
                .is_some_and(|head| head.ends_with(prefix))
        })
    }

    // surf free verbs that return Request (acts as a builder).
    const SURF_VERBS: &[&str] = &[
        "post", "get", "put", "delete", "patch", "head", "connect", "trace",
    ];
    // ureq free verbs that return Request.
    const UREQ_VERBS: &[&str] = &["post", "get", "put", "delete", "patch", "head"];
    // reqwest Client verb-on-instance. `Client::post(url)` /
    // `Client::get(url)` chained-form returns a RequestBuilder. We match
    // the constructor-style segment used by chain text after CFG receiver
    // collapse (`reqwest::Client::new.post`, `Client::post`, etc.).
    const REQWEST_CLIENT_VERBS: &[&str] =
        &["post", "get", "put", "delete", "patch", "head", "request"];

    ends_with_prefixed_verb(base, "surf::", SURF_VERBS)
        || ends_with_prefixed_verb(base, "ureq::", UREQ_VERBS)
        // hyper request builder; the suffix test also covers the fully
        // qualified `hyper::Request::builder` spelling.
        || base.ends_with("Request::builder")
        || ends_with_prefixed_verb(base, "Client::new.", REQWEST_CLIENT_VERBS)
        || ends_with_prefixed_verb(base, "Client::", REQWEST_CLIENT_VERBS)
}
pub fn is_identity_method(callee: &str) -> bool {
let suffix = callee.rsplit(['.', ':']).next().unwrap_or(callee);
matches!(
@ -1076,6 +1150,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let consts = HashMap::from([
@ -1189,6 +1265,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let consts = HashMap::new();
@ -1220,9 +1298,10 @@ mod tests {
}
/// Int-typed values must suppress every type-suppressible
/// cap, including the freshly-added `SSRF` bit. Numeric IDs
/// cannot rewrite a URL host, cannot form path traversal sequences,
/// cannot carry SQL/HTML/shell metacharacters.
/// cap, including the freshly-added `SSRF` and `DATA_EXFIL` bits.
/// Numeric IDs cannot rewrite a URL host, cannot form path
/// traversal sequences, cannot carry SQL/HTML/shell metacharacters,
/// and do not encode credentials worth exfiltrating.
#[test]
fn int_suppresses_every_type_suppressible_cap() {
use crate::labels::Cap;
@ -1236,6 +1315,7 @@ mod tests {
Cap::SHELL_ESCAPE,
Cap::HTML_ESCAPE,
Cap::SSRF,
Cap::DATA_EXFIL,
] {
assert!(
is_type_safe_for_sink(&[SsaValue(0)], cap, &result),
@ -1271,6 +1351,7 @@ mod tests {
Cap::SHELL_ESCAPE,
Cap::HTML_ESCAPE,
Cap::SSRF,
Cap::DATA_EXFIL,
] {
assert!(
is_type_safe_for_sink(&[SsaValue(0)], cap, &result),
@ -1307,14 +1388,14 @@ mod tests {
/// `is_type_safe_for_sink` requires an intentional matrix edit + a
/// test update. Truth values:
///
/// | TypeKind | SQL | FILE | SHELL | HTML | SSRF | CODE_EXEC | DESERIALIZE |
/// |-----------|-----|------|-------|------|------|-----------|-------------|
/// | Int | Y | Y | Y | Y | Y | N | N |
/// | Bool | Y | Y | Y | Y | Y | N | N |
/// | String | N | N | N | N | N | N | N |
/// | Url | N | N | N | N | N | N | N |
/// | Object | N | N | N | N | N | N | N |
/// | Unknown | N | N | N | N | N | N | N |
/// | TypeKind | SQL | FILE | SHELL | HTML | SSRF | DATA_EXFIL | CODE_EXEC | DESERIALIZE |
/// |-----------|-----|------|-------|------|------|------------|-----------|-------------|
/// | Int | Y | Y | Y | Y | Y | Y | N | N |
/// | Bool | Y | Y | Y | Y | Y | Y | N | N |
/// | String | N | N | N | N | N | N | N | N |
/// | Url | N | N | N | N | N | N | N | N |
/// | Object | N | N | N | N | N | N | N | N |
/// | Unknown | N | N | N | N | N | N | N | N |
#[test]
fn type_kind_cap_suppression_matrix() {
use crate::labels::Cap;
@ -1324,40 +1405,41 @@ mod tests {
("SHELL_ESCAPE", Cap::SHELL_ESCAPE),
("HTML_ESCAPE", Cap::HTML_ESCAPE),
("SSRF", Cap::SSRF),
("DATA_EXFIL", Cap::DATA_EXFIL),
("CODE_EXEC", Cap::CODE_EXEC),
("DESERIALIZE", Cap::DESERIALIZE),
];
// (kind_name, kind, [suppress for each cap in `caps` order])
let rows: &[(&str, TypeKind, [bool; 7])] = &[
let rows: &[(&str, TypeKind, [bool; 8])] = &[
(
"Int",
TypeKind::Int,
[true, true, true, true, true, false, false],
[true, true, true, true, true, true, false, false],
),
(
"Bool",
TypeKind::Bool,
[true, true, true, true, true, false, false],
[true, true, true, true, true, true, false, false],
),
(
"String",
TypeKind::String,
[false, false, false, false, false, false, false],
[false, false, false, false, false, false, false, false],
),
(
"Url",
TypeKind::Url,
[false, false, false, false, false, false, false],
[false, false, false, false, false, false, false, false],
),
(
"Object",
TypeKind::Object,
[false, false, false, false, false, false, false],
[false, false, false, false, false, false, false, false],
),
(
"Unknown",
TypeKind::Unknown,
[false, false, false, false, false, false, false],
[false, false, false, false, false, false, false, false],
),
];
for (kind_name, kind, expected) in rows {
@ -1389,6 +1471,7 @@ mod tests {
Cap::SHELL_ESCAPE,
Cap::HTML_ESCAPE,
Cap::SSRF,
Cap::DATA_EXFIL,
Cap::CODE_EXEC,
Cap::DESERIALIZE,
] {
@ -1487,6 +1570,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let consts = HashMap::new();

View file

@ -19,19 +19,29 @@ fn sanitize_desc(s: &str) -> String {
/// convergence node where all execution paths join before leaving the function.
///
/// **Invariant:** Only terminal exits carry the complete merged lifecycle state
/// needed for leak analysis. Return nodes are intermediate (they flow into the
/// terminal exit) and must NOT be analyzed for terminal resource state.
///
/// Detection is purely topological: a node inside a function is terminal when
/// it has no successor within the same function scope. This works for both
/// per-body graphs (Exit node is a sink) and legacy supergraphs (the
/// synthesized Return's successor is the file-level Exit with
/// needed for leak analysis. Return nodes are intermediate in per-body graphs
/// (they flow into the synthetic Exit node) but become terminal in legacy
/// supergraphs (their successor is the file-level Exit with
/// `enclosing_func = None`).
///
/// Detection combines a kind filter with a topological check. Only nodes
/// whose `StmtKind` actually terminates execution (`Exit`, `Return`, `Throw`)
/// are considered, then we require that they have no successor in the same
/// function scope. Without the kind filter, dangling Seq nodes left behind
/// when nested function literals (e.g. `obj.fn = () => {...}`) get a
/// placeholder in the parent graph would be misclassified as terminal exits
/// and produce spurious resource-leak findings at the function-literal span.
fn is_terminal_function_exit(
idx: petgraph::graph::NodeIndex,
info: &crate::cfg::NodeInfo,
cfg: &Cfg,
) -> bool {
if !matches!(
info.kind,
StmtKind::Exit | StmtKind::Return | StmtKind::Throw
) {
return false;
}
info.ast.enclosing_func.is_some()
&& !cfg
.neighbors_directed(idx, petgraph::Direction::Outgoing)
@ -62,6 +72,7 @@ pub struct StateFinding {
/// `state-unauthed-access` finding is suppressed on those spans because
/// the user-controlled input has already been proved unable to escape
/// into a privileged location.
#[allow(clippy::too_many_arguments)]
pub fn extract_findings(
result: &DataflowResult<ProductState, TransferEvent>,
cfg: &Cfg,
@ -70,6 +81,7 @@ pub fn extract_findings(
func_summaries: &crate::cfg::FuncSummaries,
enable_auth: bool,
path_safe_suppressed_sink_spans: &std::collections::HashSet<(usize, usize)>,
closure_released_var_names: Option<&std::collections::HashSet<String>>,
) -> Vec<StateFinding> {
let mut findings = Vec::new();
@ -195,6 +207,23 @@ pub fn extract_findings(
continue;
}
// Suppress leaks for variables whose release call lives in a
// nested closure (callback / event handler) outside this
// body's CFG. Common JS/TS shape:
// const ws = new WebSocket(url);
// socket.on("close", () => ws.close());
// The per-body resource analysis cannot observe the close
// inside the registered handler body; without this gate the
// handle reads as a definite leak. Match by variable name —
// closure-captured handles share the binding name with the
// handle in the outer scope.
if closure_released_var_names
.map(|s| s.contains(var_name))
.unwrap_or(false)
{
continue;
}
// Prefer direct acquire node span; fall back to proxy span
// from ResourceMethodSummary (cross-body resource tracking).
let acquire_span = acquire_node
@ -557,6 +586,7 @@ mod tests {
&HashMap::new(),
false,
&std::collections::HashSet::new(),
None,
);
assert_eq!(findings.len(), 1);
@ -617,6 +647,7 @@ mod tests {
&HashMap::new(),
false,
&std::collections::HashSet::new(),
None,
);
assert!(findings.is_empty());
@ -751,6 +782,7 @@ mod tests {
&HashMap::new(),
false,
&std::collections::HashSet::new(),
None,
);
assert!(
@ -816,6 +848,7 @@ mod tests {
&HashMap::new(),
false,
&std::collections::HashSet::new(),
None,
);
assert_eq!(

View file

@ -77,6 +77,13 @@ pub fn run_state_analysis(
// m.Lock()`) and routes them through `chain_proxies` instead. Pass
// `None` to disable, strict-additive.
ptr_proxy_hints: Option<&std::collections::HashMap<String, crate::pointer::PtrProxyHint>>,
// Names of variables whose `.close()`/release calls live in a nested
// closure (event handler, deferred callback) that the per-body CFG
// can't observe directly. Used to suppress resource-leak findings
// for handles whose cleanup is registered as a callback (`ws.on(
// "close", () => ws2.close())`). Pass `None` for languages or
// shapes that don't need this.
closure_released_var_names: Option<&std::collections::HashSet<String>>,
) -> Vec<StateFinding> {
let _span = tracing::debug_span!("run_state_analysis").entered();
@ -116,9 +123,99 @@ pub fn run_state_analysis(
func_summaries,
enable_auth,
path_safe_suppressed_sink_spans,
closure_released_var_names,
)
}
/// Build a per-body map of variable names whose release calls
/// (`.close`, `.destroy`, `.end`, `.release`, …) appear inside a
/// **descendant** body (a closure / event handler nested inside the
/// body that opens the handle).
///
/// Returned: `body_id → set of var names released somewhere inside
/// that body's nested-closure subtree`. Used by the structural
/// ResourceMisuse pass and the state-model leak pass to suppress
/// findings whose cleanup lives in a callback the per-body CFG can't
/// follow (`socket.on("close", () => ws.close())`).
///
/// Restricted to descendants — sibling methods on the same class
/// don't share resource ownership, so a release in `queryAndClose`
/// must NOT silence a leak in sibling `queryAndLeak`. Only true
/// nested-closure parent / child relationships participate.
///
/// Release patterns come from `rules::resource_pairs(lang)` and are
/// compared case-insensitively (ASCII). A pattern with a leading `.`
/// matches only as a method suffix of the callee; a pattern without one
/// also matches a callee that equals it exactly. The recorded name is the
/// call's receiver when present, otherwise the callee text before its
/// last `.`; calls with neither contribute nothing.
pub fn collect_closure_released_var_names(
    bodies: &[crate::cfg::BodyCfg],
    lang: Lang,
) -> std::collections::HashMap<crate::cfg::BodyId, std::collections::HashSet<String>> {
    use crate::cfg::{BodyId, StmtKind};
    use petgraph::visit::IntoNodeReferences;
    use std::collections::{HashMap, HashSet};

    // Normalise the release patterns once instead of re-lowercasing and
    // re-formatting them for every call node. Each entry is
    // `(lower-cased name without a leading dot, may match a bare callee)`.
    let pairs = rules::resource_pairs(lang);
    let release_patterns: Vec<(String, bool)> = pairs
        .iter()
        .flat_map(|pair| pair.release.iter())
        .map(|pattern| {
            let mut name = pattern.to_ascii_lowercase();
            let method_only = name.starts_with('.');
            if method_only {
                name.remove(0);
            }
            (name, !method_only)
        })
        .collect();

    // `callee_lower` is a release call when it ends in `.<name>`, or, for
    // patterns written without a leading dot, when it equals `<name>`.
    let is_release_call = |callee_lower: &str| {
        release_patterns.iter().any(|(name, allow_bare)| {
            (*allow_bare && callee_lower == name.as_str())
                || callee_lower
                    .strip_suffix(name.as_str())
                    .is_some_and(|head| head.ends_with('.'))
        })
    };

    // Step 1: collect releases per body. Only nested (non-toplevel)
    // closures are eligible — top-level bodies' own releases are
    // already tracked by the dataflow.
    let mut per_body: HashMap<BodyId, HashSet<String>> = HashMap::new();
    for body in bodies {
        if body.meta.parent_body_id.is_none() {
            continue;
        }
        let mut released: HashSet<String> = HashSet::new();
        for (_idx, info) in body.graph.node_references() {
            if info.kind != StmtKind::Call {
                continue;
            }
            let Some(callee) = info.call.callee.as_deref() else {
                continue;
            };
            if !is_release_call(&callee.to_ascii_lowercase()) {
                continue;
            }
            // Prefer the explicit receiver; otherwise fall back to the
            // (original-case) callee text before its last `.`.
            if let Some(rcv) = info.call.receiver.as_deref() {
                released.insert(rcv.to_string());
            } else if let Some((rcv, _)) = callee.rsplit_once('.')
                && !rcv.is_empty()
            {
                released.insert(rcv.to_string());
            }
        }
        if !released.is_empty() {
            per_body.insert(body.meta.id, released);
        }
    }

    // Step 2: roll up into ancestor bodies. Walk each non-top body's
    // parent chain and union its release set into every ancestor's
    // entry. Class methods at the same nesting level (siblings under a
    // class body) do not roll up into each other — they have distinct
    // BodyId entries and the chain only flows through `parent_body_id`.
    let parent_of: HashMap<BodyId, Option<BodyId>> = bodies
        .iter()
        .map(|b| (b.meta.id, b.meta.parent_body_id))
        .collect();
    let mut rollup: HashMap<BodyId, HashSet<String>> = HashMap::new();
    for body in bodies {
        let Some(released) = per_body.get(&body.meta.id) else {
            continue;
        };
        let mut cur = body.meta.parent_body_id;
        while let Some(pid) = cur {
            rollup
                .entry(pid)
                .or_default()
                .extend(released.iter().cloned());
            // An id missing from `bodies` ends the walk.
            cur = parent_of.get(&pid).copied().flatten();
        }
    }
    rollup
}
/// Build resource method summaries by pre-scanning all method bodies for known
/// resource acquire/release operations. Only creates summaries for methods whose
/// bodies actually contain matching operations, never infers from names alone.

View file

@ -635,6 +635,19 @@ impl DefaultTransfer<'_> {
fn apply_assignment(&self, _node_idx: NodeIndex, info: &NodeInfo, state: &mut ProductState) {
// Ownership transfer: if `defines` reassigns a tracked resource
// variable from a `uses` variable, transfer the lifecycle.
//
// Skip when the RHS is a function or lambda literal: storing a
// closure into a property (`ws.onclose = () => { ... }`,
// `obj.handler = function(){...}`) does not move ownership of the
// resources the closure body references — those identifiers appear
// in `info.taint.uses` only because `def_use` walks the literal's
// body, not because the assignment itself reads them. Without this
// gate, the first OPEN-tracked capture inside the closure body gets
// marked MOVED and the property's symbol becomes the new OPEN
// owner, which then surfaces as a spurious leak on the property.
if info.rhs_is_function_literal {
return;
}
if let Some(ref def) = info.taint.defines
&& let Some(def_sym) = self.get_sym(info, def)
{

View file

@ -158,6 +158,39 @@ pub struct SsaFuncSummary {
/// (caller_param_index, sink_arg_position, sink_caps).
#[serde(default)]
pub param_to_sink_param: Vec<(usize, usize, Cap)>,
/// Per-parameter gate-filter cap masks lifted from inner multi-gate
/// sink call sites.
///
/// When a function body contains a callee whose
/// [`crate::cfg::CallMeta::gate_filters`] carries more than one entry
/// (e.g. `fetch` is both an `SSRF` gate on the URL arg and a
/// `DATA_EXFIL` gate on the body arg), the multi-gate dispatch in
/// [`super::super::collect_block_events`] cap-narrows the event's
/// `sink_caps` to the specific gate's `label_caps`. Each
/// `(param_idx, label_caps)` entry records that this function's
/// parameter `param_idx` flowed into a gated sink whose narrowed
/// caps were `label_caps`.
///
/// Cross-file callers consume this list to preserve per-position cap
/// attribution through wrapper functions: a wrapper
/// `fn forward(url, body) { fetch(url, {body}) }` records
/// `[(0, SSRF), (1, DATA_EXFIL)]` so a caller of `forward` splits
/// URL-tainted SSRF findings from body-tainted DATA_EXFIL findings
/// instead of conflating both caps onto every parameter.
///
/// `Vec<(param_idx, label_caps)>` is sufficient at cross-file
/// granularity: the corresponding `payload_args` and
/// `destination_uses` are intra-file context that does not survive
/// the function-summary boundary (field idents reference SSA
/// values from the callee body).
///
/// Empty (the default) for callees whose internal sinks carry zero
/// or one gate filter; the existing
/// [`Self::param_to_sink`] /
/// [`Self::param_to_sink_param`] machinery already records those
/// cases without per-position cap conflict.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub param_to_gate_filters: Vec<(usize, Cap)>,
/// Parameter indices whose container identity flows to the return value
/// (e.g., function returns the same container it received as input).
///

View file

@ -441,6 +441,7 @@ fn ssa_summary_serde_round_trip_identity() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -473,6 +474,7 @@ fn ssa_summary_serde_round_trip_strip_bits() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -502,6 +504,7 @@ fn ssa_summary_serde_round_trip_add_bits() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -538,6 +541,7 @@ fn ssa_summary_serde_round_trip_all_variants() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -576,6 +580,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
gs.insert_ssa(key.clone(), v1.clone());
assert_eq!(gs.get_ssa(&key), Some(&v1));
@ -602,6 +607,7 @@ fn global_summaries_insert_ssa_exact_key_replacement() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
gs.insert_ssa(key.clone(), v2.clone());
assert_eq!(gs.get_ssa(&key), Some(&v2));
@ -648,6 +654,7 @@ fn global_summaries_merge_with_ssa_entries() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let sum_b = SsaFuncSummary {
param_to_return: vec![],
@ -670,6 +677,7 @@ fn global_summaries_merge_with_ssa_entries() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
gs1.insert_ssa(key_a.clone(), sum_a.clone());
@ -716,6 +724,7 @@ fn global_summaries_is_empty_considers_ssa() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -745,6 +754,7 @@ fn ssa_summary_serde_round_trip_param_to_sink_param() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -789,6 +799,7 @@ fn ssa_summary_serde_round_trip_container_fields() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -843,6 +854,7 @@ fn ssa_summary_serde_round_trip_return_abstract() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();
@ -916,6 +928,8 @@ fn make_callee_body(
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
opt: crate::ssa::OptimizeResult {
const_values: std::collections::HashMap::new(),
@ -1361,6 +1375,7 @@ fn global_summaries_resolve_body_requires_body_present() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
// Don't insert body
@ -3504,6 +3519,7 @@ fn cf4_return_path_transform_serde_round_trip() {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
};
let json = serde_json::to_string(&summary).unwrap();
let back: SsaFuncSummary = serde_json::from_str(&json).unwrap();

View file

@ -1382,6 +1382,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let empty_succs = HashMap::new();
@ -1441,6 +1443,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let empty_succs = HashMap::new();
@ -1573,6 +1577,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = make_finding(n0, n1);
@ -1680,6 +1686,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
// Finding path goes through B0 → B1 → B3
@ -1826,6 +1834,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {
@ -1938,6 +1948,8 @@ mod tests {
exception_edges: vec![(b0, b2)],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let mut exc_succs: HashMap<BlockId, SmallVec<[BlockId; 2]>> = HashMap::new();
@ -2004,6 +2016,8 @@ mod tests {
exception_edges: vec![(b0, b2)],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let mut exc_succs: HashMap<BlockId, SmallVec<[BlockId; 2]>> = HashMap::new();
@ -2111,6 +2125,8 @@ mod tests {
exception_edges: vec![(b1, b2)],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {

View file

@ -389,6 +389,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -434,6 +436,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -515,6 +519,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -577,6 +583,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -657,6 +665,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -728,6 +738,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -762,6 +774,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -818,6 +832,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -898,6 +914,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -976,6 +994,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);
@ -1011,6 +1031,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let info = analyse_loops(&ssa);

View file

@ -379,6 +379,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {
@ -452,6 +454,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {
@ -554,6 +558,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let ctx = SymexContext {
@ -614,6 +620,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let ctx = SymexContext {

View file

@ -353,6 +353,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let witness = state.get_sink_witness(&finding, &ssa);
@ -393,6 +395,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
assert_eq!(state.get_sink_witness(&finding, &ssa), None);
@ -430,6 +434,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
assert_eq!(state.get_sink_witness(&finding, &ssa), None);
@ -470,6 +476,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
state.widen_at_loop_head(BlockId(0), &ssa);
@ -513,6 +521,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
state.widen_at_loop_head(BlockId(0), &ssa);
@ -556,6 +566,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
state.widen_at_loop_head(BlockId(0), &ssa);

View file

@ -1012,6 +1012,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
@ -1591,6 +1593,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
let ctx = make_summary_ctx(&gs);
@ -1659,6 +1662,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
let ctx = make_summary_ctx(&gs);
@ -1727,6 +1731,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
let ctx = make_summary_ctx(&gs);
@ -1790,6 +1795,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
let ctx = make_summary_ctx(&gs);
@ -1853,6 +1859,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
let ctx = make_summary_ctx(&gs);
@ -2050,6 +2057,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -2128,6 +2136,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -2207,6 +2216,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
// Second "send", in ns B, also with same arity → ambiguous bare-name
@ -2236,6 +2246,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
// Also register the type-qualified name so Attempt 1 can find it
@ -2265,6 +2276,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -2343,6 +2355,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
@ -2423,6 +2436,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
insert_java_summary(
@ -2451,6 +2465,7 @@ mod tests {
field_points_to: Default::default(),
return_path_facts: smallvec::SmallVec::new(),
typed_call_receivers: vec![],
param_to_gate_filters: vec![],
},
);
// No "HttpClient.send" summary registered, disambiguation has 0 exact matches

View file

@ -204,8 +204,15 @@ fn sink_cap(finding: &Finding, cfg: &Cfg) -> Cap {
/// Select a witness payload string based on the vulnerability class.
fn witness_payload(cap: Cap) -> &'static str {
// Check bits in priority order (most specific first)
if cap.intersects(Cap::CODE_EXEC) {
// Check bits in priority order (most specific first).
//
// `DATA_EXFIL` is checked before the action-class caps (CODE_EXEC, SQL,
// etc.) because a data-exfil sink reflects what the *attacker reads*,
// not what they *do*: the witness needs to look like a leaked secret
// ("<SESSION_TOKEN>") rather than an injected payload ("' OR 1=1 --").
if cap.intersects(Cap::DATA_EXFIL) {
"<SESSION_TOKEN>"
} else if cap.intersects(Cap::CODE_EXEC) {
"require('child_process').execSync('id')"
} else if cap.intersects(Cap::HTML_ESCAPE) {
"<script>alert('xss')</script>"
@ -639,9 +646,21 @@ mod tests {
witness_payload(Cap::DESERIALIZE),
"malicious_serialized_object"
);
assert_eq!(witness_payload(Cap::DATA_EXFIL), "<SESSION_TOKEN>");
assert_eq!(witness_payload(Cap::CRYPTO), "TAINTED"); // fallback
}
#[test]
fn test_witness_payload_data_exfil_wins_over_action_caps() {
    // When a cap set carries both DATA_EXFIL and an action cap (SSRF
    // here), the witness must be the leaked-value placeholder: the
    // finding describes what the attacker reads, so an injection-style
    // payload would misrepresent it.
    let payload = witness_payload(Cap::DATA_EXFIL | Cap::SSRF);
    assert_eq!(payload, "<SESSION_TOKEN>");
}
#[test]
fn test_witness_payload_code_exec_separate_from_xss() {
// CODE_EXEC must return a code-execution payload, not an XSS one.
@ -776,6 +795,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {
@ -831,6 +852,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let cfg = Cfg::new();
let finding = Finding {
@ -892,6 +915,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {
@ -954,6 +979,8 @@ mod tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let finding = Finding {

View file

@ -752,6 +752,7 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
(ssa, cfg)
@ -766,6 +767,47 @@ mod tests {
assert_eq!(d.validated_false, 0);
}
/// Regression guard for cap routing: a `DemandState` built from
/// `Cap::DATA_EXFIL` must expose exactly that cap through its `caps`
/// field, both on its own and when combined with `Cap::SSRF`. If a later
/// change masks the demand on construction, this test fails.
#[test]
fn demand_state_roundtrips_data_exfil_cap() {
    // Single-cap demand: the stored bits equal the requested bits.
    let single = DemandState::new(Cap::DATA_EXFIL);
    assert_eq!(single.caps, Cap::DATA_EXFIL);
    assert!(single.caps.contains(Cap::DATA_EXFIL));

    // Combined demand: both bits survive side by side.
    let paired = DemandState::new(Cap::DATA_EXFIL | Cap::SSRF);
    assert!(paired.caps.contains(Cap::DATA_EXFIL));
    assert!(paired.caps.contains(Cap::SSRF));
}
/// End-to-end check that the backwards driver confirms a flow when both
/// the source label and the sink demand are exactly `Cap::DATA_EXFIL`.
/// Pinning the demand to a single cap means any change to how the demand
/// mask is carried through the walk shows up as a failed assertion here.
#[test]
fn driver_walks_data_exfil_source_to_sink() {
    let (ssa, mut cfg) = build_trivial_source_body();
    let demanded = Cap::DATA_EXFIL;

    // Label CFG node 0 as a DATA_EXFIL source so the cap-matching path
    // is what produces the flow.
    let source_node = NodeIndex::new(0);
    cfg[source_node]
        .taint
        .labels
        .push(DataLabel::Source(demanded));

    let ctx = BackwardsCtx::new(&ssa, &cfg, Lang::JavaScript);
    let flows = analyse_sink_backwards(&ctx, SsaValue(1), NodeIndex::new(1), demanded);

    assert_eq!(flows.len(), 1, "exactly one DATA_EXFIL flow expected");
    let flow = &flows[0];
    assert!(flow.is_confirmation(), "must confirm at the source");
    assert_eq!(flow.sink_caps, demanded);
}
#[test]
fn backward_transfer_source_terminates() {
let (ssa, _cfg) = build_trivial_source_body();
@ -800,6 +842,7 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let demand = DemandState::new(Cap::all());
let (step, next) = backward_transfer(&ssa, SsaValue(0), &demand);
@ -832,6 +875,7 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let demand = DemandState::new(Cap::all());
let (step, _next) = backward_transfer(&ssa, SsaValue(0), &demand);
@ -919,6 +963,7 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let demand = DemandState::new(Cap::all());
@ -1007,6 +1052,7 @@ mod tests {
exception_edges: Vec::new(),
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let ctx = BackwardsCtx::new(&ssa, &cfg, Lang::JavaScript);

View file

@ -4026,6 +4026,45 @@ pub(super) fn transfer_inst(
}
}
// Constructor cap narrowing: a `new X(...)` call returns an object
// instance, not a string. Caps that name a string-shaped sink
// pattern (path argument, format string, URL component, JSON
// input) cannot fire on a wrapper object, so they must not
// survive the construction. Without this narrowing, a tainted
// argument to `new SdkClient(secret)` propagates `Cap::all()`
// into the wrapper, every method call on the wrapper inherits
// those bits via receiver propagation, and any downstream
// `fs.write*` / `printf` / `JSON.parse` on a string property
// returned by an SDK method (e.g. `client.create().id`) flags
// a phantom flow that has no real path-traversal etc. payload.
//
// Caps preserved (legitimately travel through wrappers):
// - SHELL_ESCAPE / SQL_QUERY / CODE_EXEC / DESERIALIZE: a
// wrapper that captures a tainted command/query string can
// replay it via methods, so the bit must survive the wrap.
// - SSRF / DATA_EXFIL: URL/payload concerns persist on URL or
// content-bearing objects.
// - UNAUTHORIZED_ID: ownership obligation persists on a
// wrapper that carries a request-bound identifier.
// - ENV_VAR: provenance marker, never a sink trigger by
// itself.
// - HTML_ESCAPE: kept for safety, conservative dual concern
// (a wrapper used as a string in template rendering).
// - CRYPTO: kept conservatively.
//
// Caps stripped on construction:
// - FILE_IO: path strings only.
// - FMT_STRING: printf-style format args only.
// - URL_ENCODE: URL components only.
// - JSON_PARSE: parser inputs only.
if info.call.is_constructor && !return_bits.is_empty() {
let strip = Cap::FILE_IO | Cap::FMT_STRING | Cap::URL_ENCODE | Cap::JSON_PARSE;
return_bits &= !strip;
if return_bits.is_empty() {
return_origins.clear();
}
}
// Write result
if return_bits.is_empty() {
state.remove(inst.value);
@ -4314,16 +4353,41 @@ pub(super) fn transfer_inst(
// summary-extraction mode so baseline probes keep their
// intrinsic-source contract. Gate is set by the caller, e.g.
// always-on for JS/TS, only AnonymousFunction bodies for Java.
//
// The `Param` branch fires for both real formal parameters and
// synthetic externals injected by lowering for free / closure-
// captured variables (`SsaBody.synthetic_externals`). Only real
// formals should receive the heuristic seed: a closure capturing
// an out-of-scope `userId` / `cmd` / `payload` is NOT a handler
// entry point — the variable is supplied by the enclosing scope
// and seeding it here produces phantom sources anchored to the
// function's declaration line.
if transfer.auto_seed_handler_params
&& !seeded_from_scope
&& matches!(&inst.op, SsaOp::Param { .. })
&& !ssa.synthetic_externals.contains(&inst.value)
{
if let Some(var_name) = ssa
.value_defs
.get(inst.value.0 as usize)
.and_then(|vd| vd.var_name.as_deref())
{
if crate::labels::is_js_ts_handler_param_name(var_name) {
// Direct match: the Param's name itself is a handler
// identifier (e.g. `input`, `cmd`, `userId`).
//
// Root-prefix match: dotted-path Params produced by
// lowering for member-expression uses inside the body
// (`input.cmd` — an unbacked phantom Param) inherit the
// seed when their *root* is a handler-param formal.
// Without this, the field-aware suppression downstream
// sees `input.cmd` as a "clean field" and strips
// `input`'s taint, even though `input.cmd` is just a
// structural projection of the auto-seeded formal.
let root_is_handler = var_name
.split_once('.')
.map(|(root, _)| crate::labels::is_js_ts_handler_param_name(root))
.unwrap_or(false);
if crate::labels::is_js_ts_handler_param_name(var_name) || root_is_handler {
let origin = TaintOrigin {
node: inst.cfg_node,
source_kind: SourceKind::UserInput,
@ -5245,6 +5309,15 @@ fn collect_block_events(
let sink_info = resolve_sink_info(info, transfer);
let mut sink_caps = sink_info.caps;
// [detectors.data_exfil] enabled toggle. When the detector class is
// disabled per-project, strip Cap::DATA_EXFIL from sink_caps so no
// taint-data-exfiltration event is emitted regardless of which gate
// would have fired. Strict-additive: defaults to enabled, no effect
// for projects that don't opt in.
if !crate::utils::detector_options::current().data_exfil.enabled {
sink_caps &= !Cap::DATA_EXFIL;
}
// Type-qualified sink resolution: when normal sink resolution found nothing,
// try using the receiver's inferred type to construct a qualified callee name.
if sink_caps.is_empty() {
@ -5324,50 +5397,83 @@ fn collect_block_events(
for &(cb_idx, src_caps) in &resolved.source_to_callback {
let cb_name = info.arg_callees.get(cb_idx).and_then(|ac| ac.as_ref());
if let Some(cb_callee) = cb_name {
if let Some(cb_resolved) =
resolve_callee(transfer, cb_callee, caller_func, 0)
{
let matching_sink_caps = cb_resolved
.param_to_sink
.iter()
.filter(|(_, caps)| !(src_caps & *caps).is_empty())
.fold(Cap::empty(), |acc, (_, c)| acc | *c);
if !matching_sink_caps.is_empty() {
let source_kind =
crate::labels::infer_source_kind(src_caps, callee);
let origin = TaintOrigin {
node: inst.cfg_node,
source_kind,
source_span: None,
};
// Pick callback-path sink sites.
// The callback callee's `param_to_sink_sites`
// drives attribution when available; cap-only
// fallback yields `primary_sink_site = None`.
let cb_tainted: Vec<(
SsaValue,
Cap,
SmallVec<[TaintOrigin; 2]>,
)> = vec![(
// First try the standard summary-based resolution
// path (covers user-defined functions and built-ins
// that landed in label-derived summaries upstream).
// If that yields no matching sink caps, fall back
// to gated-sink classification on the callback
// callee's name — gated sinks (e.g.
// `child_process.exec` post-fix) carry their
// payload positions in the gate, not in any
// summary, and the callback pipeline still needs
// those positions to pair source caps against
// param_to_sink.
let cb_resolved = resolve_callee(transfer, cb_callee, caller_func, 0);
let mut matching_sink_caps = Cap::empty();
let cb_param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)> =
if let Some(ref r) = cb_resolved {
matching_sink_caps = r
.param_to_sink
.iter()
.filter(|(_, caps)| !(src_caps & *caps).is_empty())
.fold(Cap::empty(), |acc, (_, c)| acc | *c);
r.param_to_sink_sites.clone()
} else {
vec![]
};
if matching_sink_caps.is_empty() {
// Gate-fallback: classify_gated_sink yields the
// callback callee's payload positions + sink
// caps directly when the name matches a gated
// sink rule.
let lang_str = transfer.lang.as_str();
let gates = crate::labels::classify_gated_sink(
lang_str,
cb_callee,
|_| None,
|_| None,
|_| false,
);
for gm in gates.iter() {
if let DataLabel::Sink(bits) = gm.label {
if !(src_caps & bits).is_empty() {
matching_sink_caps |= bits;
}
}
}
}
if !matching_sink_caps.is_empty() {
let source_kind =
crate::labels::infer_source_kind(src_caps, callee);
let origin = TaintOrigin {
node: inst.cfg_node,
source_kind,
source_span: None,
};
// Pick callback-path sink sites.
// The callback callee's `param_to_sink_sites`
// drives attribution when available; cap-only
// fallback yields `primary_sink_site = None`.
let cb_tainted: Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> =
vec![(
inst.value,
src_caps & matching_sink_caps,
SmallVec::from_elem(origin, 1),
)];
let cb_sites = pick_primary_sink_sites_from_resolved(
matching_sink_caps,
&cb_resolved.param_to_sink_sites,
);
emit_ssa_taint_events(
events,
inst.cfg_node,
cb_tainted,
matching_sink_caps,
false,
None,
true,
cb_sites,
);
}
let cb_sites = pick_primary_sink_sites_from_resolved(
matching_sink_caps,
&cb_param_to_sink_sites,
);
emit_ssa_taint_events(
events,
inst.cfg_node,
cb_tainted,
matching_sink_caps,
false,
None,
true,
cb_sites,
);
}
}
}
@ -5563,8 +5669,62 @@ fn collect_block_events(
// loop with the legacy `(sink_caps, info.call.sink_payload_args,
// info.call.destination_uses)` triple, preserving prior behavior
// for every non-multi-gate site.
//
// Cross-file wrapper case: when the resolved callee summary carries
// [`SinkInfo::param_to_gate_filters`] (the wrapper's body contains
// an inner multi-gate sink whose per-position cap split was lifted
// at extraction time), expand one filter pass per `(param_idx,
// label_caps)` entry restricted to that single arg position. This
// preserves SSRF-vs-DATA_EXFIL attribution across a
// `fn forward(url, body) { fetch(url, {body}) }` wrapper that is
// NOT itself a known gated sink.
//
// Params NOT covered by `param_to_gate_filters` retain coverage
// via their `param_to_sink` entry, expanded per-position so the
// emitted event's `sink_caps` reflects the param-specific cap
// mask rather than the aggregate union. This matters for
// wrappers that mix gated sinks with label-based sinks
// (e.g. `fn dispatch(cmd, url) { execSync(cmd); fetch(url) }`),
// where param 0 reaches a non-gated SHELL_ESCAPE sink and the
// gate-filter list only carries the SSRF gate for param 1.
let multi_gate = info.call.gate_filters.len() > 1;
let summary_per_position = !multi_gate && !sink_info.param_to_gate_filters.is_empty();
type FilterEntry<'a> = (Cap, Option<&'a [usize]>, Option<&'a [String]>);
// Per-position dispatch source for the summary-per-position branch.
// First, every entry from `param_to_gate_filters` (cap-narrowed by
// the inner gate); then, for any param_to_sink index NOT mentioned
// in `param_to_gate_filters`, an entry using that param's
// `param_to_sink` cap mask.
struct PerPosEntry {
idx: [usize; 1],
caps: Cap,
}
let per_position_entries: Vec<PerPosEntry> = if summary_per_position {
let mut out: Vec<PerPosEntry> =
Vec::with_capacity(sink_info.param_to_gate_filters.len());
for (idx, caps) in &sink_info.param_to_gate_filters {
out.push(PerPosEntry {
idx: [*idx],
caps: *caps,
});
}
for (idx, caps) in &sink_info.param_to_sink {
if sink_info
.param_to_gate_filters
.iter()
.any(|(i, _)| *i == *idx)
{
continue;
}
out.push(PerPosEntry {
idx: [*idx],
caps: *caps,
});
}
out
} else {
Vec::new()
};
let filter_iter: smallvec::SmallVec<[FilterEntry<'_>; 2]> = if multi_gate {
info.call
.gate_filters
@ -5577,11 +5737,37 @@ fn collect_block_events(
)
})
.collect()
} else if summary_per_position {
per_position_entries
.iter()
.map(|e| (sink_caps & e.caps, Some(e.idx.as_slice()), None))
.collect()
} else {
smallvec::smallvec![(sink_caps, None, None)]
};
for (filter_caps, positions_override, destination_override) in filter_iter {
let mut filter_caps = filter_caps;
// Per-filter destination allowlist for DATA_EXFIL. When this
// filter would emit Cap::DATA_EXFIL and the call's destination
// arg has a trusted static prefix (configured via
// detectors.data_exfil.trusted_destinations), drop the bit
// for this filter only. Other gates on the same call site
// (notably SSRF) are unaffected. Mirrors the semantics of
// is_call_data_exfil_destination_trusted but operates per-gate
// so a multi-gate fetch site keeps SSRF attribution while
// dropping DATA_EXFIL when the destination is trusted.
if filter_caps.intersects(Cap::DATA_EXFIL) {
if let SsaOp::Call { ref args, .. } = inst.op {
if let Some(ref abs) = state.abstract_state {
if is_call_data_exfil_destination_trusted(inst, args, abs, cfg) {
filter_caps &= !Cap::DATA_EXFIL;
}
}
}
}
if filter_caps.is_empty() {
continue;
}
@ -6464,6 +6650,15 @@ struct SinkInfo {
/// coordinates. Used to attribute findings to the dangerous
/// callee-internal instruction.
param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)>,
/// Per-parameter gate-filter cap masks lifted from the callee's
/// inner multi-gate sink call sites. Mirrors
/// [`crate::summary::ssa_summary::SsaFuncSummary::param_to_gate_filters`].
/// When non-empty, the dispatcher in [`collect_block_events`]
/// expands one filter pass per `(param_idx, label_caps)` entry so
/// a wrapper carrying multiple gate classes (e.g. SSRF on the URL
/// arg + DATA_EXFIL on the body arg) attributes findings per cap
/// instead of joining them.
param_to_gate_filters: Vec<(usize, Cap)>,
}
fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo {
@ -6479,6 +6674,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo {
caps: label_sink_caps,
param_to_sink: vec![],
param_to_sink_sites: vec![],
param_to_gate_filters: vec![],
};
}
@ -6500,6 +6696,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo {
caps: r.sink_caps,
param_to_sink: r.param_to_sink,
param_to_sink_sites: r.param_to_sink_sites,
param_to_gate_filters: r.param_to_gate_filters,
};
}
@ -6525,6 +6722,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo {
caps: r.sink_caps,
param_to_sink: r.param_to_sink,
param_to_sink_sites: r.param_to_sink_sites,
param_to_gate_filters: r.param_to_gate_filters,
};
}
}
@ -6533,6 +6731,7 @@ fn resolve_sink_info(info: &NodeInfo, transfer: &SsaTaintTransfer) -> SinkInfo {
caps: Cap::empty(),
param_to_sink: vec![],
param_to_sink_sites: vec![],
param_to_gate_filters: vec![],
}
}
@ -7383,6 +7582,16 @@ fn is_abstract_safe_for_sink(
}
}
// DATA_EXFIL, destination allowlist via configured trusted prefixes.
// Mirrors the SSRF prefix-lock above but consults the user-configured
// [detectors.data_exfil] table's trusted_destinations key. Strict-
// additive: when no destinations are configured this is a no-op.
if sink_caps.intersects(Cap::DATA_EXFIL)
&& is_inst_data_exfil_destination_trusted(inst, abs, cfg)
{
return true;
}
// SHELL_ESCAPE, static-map finite-domain safety. When every tainted
// payload value is proved by the static-HashMap-lookup analysis to come
// from a bounded set of metacharacter-free literals, the call cannot
@ -7509,6 +7718,15 @@ fn is_call_abstract_safe(
}
}
// DATA_EXFIL, destination-allowlist match. Mirrors the SSRF arm above
// for the Call path. Strict-additive: a no-op when
// detectors.data_exfil.trusted_destinations is empty.
if sink_caps.intersects(Cap::DATA_EXFIL)
&& is_call_data_exfil_destination_trusted(inst, args, abs, cfg)
{
return true;
}
// SHELL_ESCAPE, static-map finite-domain safety on every non-empty arg
// group. Mirrors the non-Call path so suppression fires regardless of
// which branch the sink detector took.
@ -7785,6 +8003,118 @@ fn is_static_map_shell_safe(
})
}
/// Allowlist test for `DATA_EXFIL` destinations.
///
/// `prefix` is the statically known leading part of an outbound
/// destination URL; `trusted` is the user-configured list of trusted
/// destinations. The result is `true` when `prefix` begins with at least
/// one non-empty entry of `trusted`.
///
/// Matching rules:
/// - Each entry is compared as a plain string prefix of `prefix`.
/// - An empty `prefix` never matches.
/// - An empty entry never matches; it would otherwise be a prefix of
///   every URL and turn the allowlist into "trust everything".
///
/// Because the comparison is purely textual, entries should be
/// origin-pinned with a trailing separator (e.g. `https://api.internal/`)
/// so that a look-alike host sharing the same leading characters cannot
/// match.
fn is_string_prefix_trusted_destination(prefix: &str, trusted: &[String]) -> bool {
    !prefix.is_empty()
        && trusted
            .iter()
            .map(String::as_str)
            .filter(|entry| !entry.is_empty())
            .any(|entry| prefix.starts_with(entry))
}
/// Decide whether a call's destination (positional argument 0) points at
/// a destination listed in
/// [`crate::utils::detector_options::DataExfilDetectorOptions::trusted_destinations`].
///
/// Returns `false` immediately when no trusted destinations are
/// configured. Otherwise three pieces of evidence are tried in order,
/// and the first one that matches an allowlist entry yields `true`:
///
/// 1. The first entry of the CFG node's `call.arg_string_literals`, when
///    present. This covers a URL written inline as a string literal at
///    the call site, e.g. `fetch('https://api.internal/...', {...})`.
/// 2. The `string_prefix` attached to the call's CFG node.
/// 3. The abstract string-domain prefix of the SSA values in `args[0]`.
///    The group must be non-empty and *every* value in it must have a
///    prefix that matches; one unknown or untrusted value rejects it.
///    This covers a URL bound to an identifier before the call.
fn is_call_data_exfil_destination_trusted(
    inst: &SsaInst,
    args: &[SmallVec<[SsaValue; 2]>],
    abs: &AbstractState,
    cfg: &Cfg,
) -> bool {
    let opts = crate::utils::detector_options::current();
    let trusted = &opts.data_exfil.trusted_destinations;
    if trusted.is_empty() {
        return false;
    }

    let node_info = &cfg[inst.cfg_node];

    // (1) Literal recorded for positional argument 0.
    let literal_trusted = matches!(
        node_info.call.arg_string_literals.first(),
        Some(Some(lit)) if is_string_prefix_trusted_destination(lit, trusted)
    );
    if literal_trusted {
        return true;
    }

    // (2) Prefix attached to the call node itself.
    let node_prefix_trusted = node_info
        .string_prefix
        .as_deref()
        .is_some_and(|prefix| is_string_prefix_trusted_destination(prefix, trusted));
    if node_prefix_trusted {
        return true;
    }

    // (3) Abstract prefixes of argument 0's SSA value group: all of them
    // must be trusted, and an empty group proves nothing.
    match args.first() {
        Some(url_group) if !url_group.is_empty() => url_group.iter().all(|value| {
            abs.get(*value)
                .string
                .prefix
                .as_deref()
                .is_some_and(|p| is_string_prefix_trusted_destination(p, trusted))
        }),
        _ => false,
    }
}
/// Counterpart of [`is_call_data_exfil_destination_trusted`] for
/// instructions that are not handled through a positional Call argument
/// list; per the surrounding code it is consulted from
/// [`is_abstract_safe_for_sink`].
///
/// Returns `false` when no trusted destinations are configured.
/// Otherwise the destination is considered trusted when either:
///
/// - the `string_prefix` attached to the instruction's CFG node matches
///   an allowlist entry, or
/// - the instruction uses at least one SSA value and the abstract string
///   prefix of *every* used value matches an allowlist entry.
fn is_inst_data_exfil_destination_trusted(inst: &SsaInst, abs: &AbstractState, cfg: &Cfg) -> bool {
    let opts = crate::utils::detector_options::current();
    let trusted = &opts.data_exfil.trusted_destinations;
    if trusted.is_empty() {
        return false;
    }

    // Node-attached prefix is checked first; a match short-circuits.
    let node_info = &cfg[inst.cfg_node];
    let node_prefix_trusted = node_info
        .string_prefix
        .as_deref()
        .is_some_and(|prefix| is_string_prefix_trusted_destination(prefix, trusted));
    if node_prefix_trusted {
        return true;
    }

    // Otherwise every used value must carry a trusted prefix; with no
    // used values there is nothing to prove trust from.
    let used = inst_use_values(inst);
    !used.is_empty()
        && used.iter().all(|value| {
            abs.get(*value)
                .string
                .prefix
                .as_deref()
                .is_some_and(|p| is_string_prefix_trusted_destination(p, trusted))
        })
}
/// SSRF safety: prefix includes scheme + full host + path separator.
///
/// Soundness: if the prefix contains `scheme://host/`, the attacker cannot
@ -8026,6 +8356,21 @@ struct ResolvedSummary {
/// retained; in that case `param_to_sink` alone still drives sink
/// detection.
param_to_sink_sites: Vec<(usize, SmallVec<[SinkSite; 1]>)>,
/// Per-parameter gate-filter cap masks lifted from the callee's
/// inner multi-gate sink call sites. Mirrors
/// [`crate::summary::ssa_summary::SsaFuncSummary::param_to_gate_filters`].
///
/// Each `(param_idx, label_caps)` entry says "this caller-side
/// parameter flows to a callee-internal gated sink whose narrowed
/// caps are `label_caps`". When non-empty, the multi-gate dispatch
/// in [`collect_block_events`] expands one filter pass per entry so
/// the emitted event's `sink_caps` reflect the gate-specific cap
/// rather than the aggregate union, preserving SSRF-vs-DATA_EXFIL
/// (and similar) attribution through wrapper functions.
///
/// Empty for label, local-summary, FuncSummary, and interop paths;
/// these forms do not retain per-gate cap detail.
param_to_gate_filters: Vec<(usize, Cap)>,
propagates_taint: bool,
propagating_params: Vec<usize>,
/// Parameter indices whose container identity flows to return value.
@ -8229,18 +8574,34 @@ fn resolve_callee_full(
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
});
}
// Try label classification for the bound function (by leaf name)
// Try label classification for the bound function (by leaf name).
// Consult both flat rules (`classify_all`) and gated sinks: a
// callback bound to a gated sink (e.g. passing
// `child_process.exec` directly as the callback) still needs to
// surface its `Sink` capability so the source/callback pairing
// logic can match `param_to_sink` against the caller's source.
// The gate's `payload_args` translate directly into
// `param_to_sink` index entries.
let labels = crate::labels::classify_all(
transfer.lang.as_str(),
&real_key.name,
transfer.extra_labels,
);
if !labels.is_empty() {
let gate_matches = crate::labels::classify_gated_sink(
transfer.lang.as_str(),
&real_key.name,
|_| None,
|_| None,
|_| false,
);
if !labels.is_empty() || !gate_matches.is_empty() {
let mut source_caps = Cap::empty();
let mut sanitizer_caps = Cap::empty();
let mut sink_caps = Cap::empty();
let mut param_to_sink: Vec<(usize, Cap)> = vec![];
for lbl in &labels {
match lbl {
DataLabel::Source(bits) => source_caps |= *bits,
@ -8248,11 +8609,25 @@ fn resolve_callee_full(
DataLabel::Sink(bits) => sink_caps |= *bits,
}
}
for gm in gate_matches.iter() {
if let DataLabel::Sink(bits) = gm.label {
sink_caps |= bits;
// Map the gate's payload_args to per-param sink entries
// so source-to-callback pairing can match by index.
// Skip the dynamic-activation sentinel — without a
// concrete arity we can't enumerate positions here.
if gm.payload_args != crate::labels::ALL_ARGS_PAYLOAD {
for &idx in gm.payload_args {
param_to_sink.push((idx, bits));
}
}
}
}
return Some(ResolvedSummary {
source_caps,
sanitizer_caps,
sink_caps,
param_to_sink: vec![],
param_to_sink,
param_to_sink_sites: vec![],
propagates_taint: false,
propagating_params: vec![],
@ -8270,6 +8645,7 @@ fn resolve_callee_full(
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
});
}
}
@ -8414,6 +8790,7 @@ fn resolve_callee_full(
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
});
}
} else {
@ -8463,6 +8840,7 @@ fn resolve_callee_full(
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
};
match widened.len() {
0 => {}
@ -8533,6 +8911,7 @@ fn resolve_callee_full(
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
});
}
}
@ -8714,6 +9093,7 @@ fn convert_ssa_to_resolved_for_caller(
param_return_paths: ssa_sum.param_return_paths.clone(),
points_to: ssa_sum.points_to.clone(),
field_points_to: ssa_sum.field_points_to.clone(),
param_to_gate_filters: ssa_sum.param_to_gate_filters.clone(),
}
}
@ -8810,6 +9190,20 @@ fn merge_resolved_summaries_fanout(
}
}
// param_to_gate_filters: dedup-union (idx, caps) pairs. Each
// implementer may carry its own per-position cap split; the union
// preserves cap attribution from any implementer reachable via
// virtual dispatch.
for (idx, caps) in r.param_to_gate_filters {
if !acc
.param_to_gate_filters
.iter()
.any(|&(i, c)| i == idx && c == caps)
{
acc.param_to_gate_filters.push((idx, caps));
}
}
// SSA-precision fields: drop on any disagreement.
if acc.return_type != r.return_type {
acc.return_type = None;

View file

@ -753,6 +753,8 @@ fn origin_sort_key(o: &TaintOrigin) -> (usize, usize, u8, usize) {
crate::labels::SourceKind::Database => 3,
crate::labels::SourceKind::CaughtException => 4,
crate::labels::SourceKind::Unknown => 5,
crate::labels::SourceKind::Cookie => 6,
crate::labels::SourceKind::Header => 7,
};
(span_start, span_end, kind_tag, o.node.index())
}

View file

@ -387,6 +387,15 @@ pub fn extract_ssa_func_summary_full(
let mut param_to_return = Vec::new();
let mut param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)> = Vec::new();
let mut param_to_sink_param = Vec::new();
// Per-param gate-filter cap masks lifted from inner multi-gate sink calls.
// Populated when the per-param probe reaches a sink whose CFG node carries
// [`crate::cfg::CallMeta::gate_filters`] with more than one entry, the
// multi-gate dispatch in `collect_block_events` has already cap-narrowed
// `event.sink_caps` to the matching gate's `label_caps`, so we record the
// pair as-is. Cross-file callers consume this list to preserve per-position
// cap attribution through wrapper functions like
// `fn forward(url, body) { fetch(url, {body}) }`.
let mut param_to_gate_filters: Vec<(usize, Cap)> = Vec::new();
// Per-param return-path decomposition. Populated only when the param
// has ≥2 distinct return-block predicate hashes, a single-return-path
// callee is already precise via `param_to_return`.
@ -541,6 +550,28 @@ pub fn extract_ssa_func_summary_full(
for pos in extract_sink_arg_positions(event, ssa) {
param_to_sink_param.push((idx, pos, event.sink_caps));
}
// Per-position gate-filter cap lifting.
//
// When the sink callee carries multiple gate filters (e.g. `fetch`
// is both an SSRF gate on the URL arg and a `DATA_EXFIL` gate on
// the body arg), the multi-gate dispatch has already filtered
// `event.sink_caps` down to the specific gate's `label_caps` for
// this probe. Recording `(idx, event.sink_caps)` preserves that
// narrowing across the function-summary boundary so a caller of
// the wrapper splits SSRF from DATA_EXFIL findings instead of
// joining them under a single union.
//
// Single-gate / no-gate sinks are skipped; the existing
// `param_to_sink` machinery already records those without
// per-position cap conflict.
if !event.sink_caps.is_empty()
&& cfg[event.sink_node].call.gate_filters.len() > 1
&& !param_to_gate_filters
.iter()
.any(|&(i, c)| i == idx && c == event.sink_caps)
{
param_to_gate_filters.push((idx, event.sink_caps));
}
if event.sink_caps.is_empty() {
continue;
}
@ -641,6 +672,7 @@ pub fn extract_ssa_func_summary_full(
param_to_sink,
source_caps,
param_to_sink_param,
param_to_gate_filters,
param_container_to_return,
param_to_container_store,
return_type,

View file

@ -85,6 +85,8 @@ mod cross_file_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
},
opt: crate::ssa::OptimizeResult {
const_values: std::collections::HashMap::new(),
@ -832,6 +834,8 @@ mod primary_sink_location_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
@ -963,6 +967,8 @@ mod goto_succ_propagation_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let cfg: Cfg = Graph::new();
@ -1053,6 +1059,8 @@ mod goto_succ_propagation_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let cfg: Cfg = Graph::new();
let interner = SymbolInterner::new();
@ -1112,6 +1120,8 @@ mod goto_succ_propagation_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
@ -1298,6 +1308,8 @@ mod goto_succ_propagation_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
@ -1423,6 +1435,8 @@ mod receiver_candidates_field_proj_tests {
exception_edges: vec![],
field_interner: interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
}
}
@ -1508,6 +1522,8 @@ mod receiver_candidates_field_proj_tests {
exception_edges: vec![],
field_interner: interner,
field_writes: std::collections::HashMap::new(),
synthetic_externals: std::collections::HashSet::new(),
};
let cands =
super::super::receiver_candidates_for_type_lookup(SsaValue(0), Some(&body), Lang::Go);
@ -1550,6 +1566,7 @@ mod fanout_merge_tests {
param_return_paths: vec![],
points_to: Default::default(),
field_points_to: Default::default(),
param_to_gate_filters: vec![],
}
}
@ -1909,6 +1926,7 @@ mod field_write_tests {
exception_edges: vec![],
field_interner,
field_writes,
synthetic_externals: HashSet::new(),
};
(body, cache_id)
}
@ -2206,6 +2224,7 @@ mod field_write_tests {
m.insert(SsaValue(2), (SsaValue(0), cache_id));
m
},
synthetic_externals: HashSet::new(),
};
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0));
// v0 is Const → empty pt, the hook should not insert anything.
@ -2437,6 +2456,8 @@ mod container_elem_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
// Run pointer analysis first to confirm the result of `shift()`
@ -2575,6 +2596,8 @@ mod container_elem_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(7));
@ -2715,6 +2738,8 @@ mod container_elem_tests {
exception_edges: vec![],
field_interner: crate::ssa::ir::FieldInterner::default(),
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
let interner = SymbolInterner::new();
@ -2838,6 +2863,8 @@ mod cross_call_field_tests {
exception_edges: vec![],
field_interner,
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(7));
(body, cache_id, pf)
@ -3210,6 +3237,8 @@ mod field_taint_origin_cap_tests {
exception_edges: vec![],
field_interner,
field_writes: HashMap::new(),
synthetic_externals: HashSet::new(),
};
(body, cache_id, cfg, n_proj)
}
@ -3533,6 +3562,7 @@ mod pointer_lattice_worklist_tests {
exception_edges: vec![],
field_interner,
field_writes,
synthetic_externals: HashSet::new(),
};
let mut interner = SymbolInterner::new();

View file

@ -712,6 +712,10 @@ pub struct Config {
pub output: OutputConfig,
pub performance: PerformanceConfig,
pub analysis: AnalysisRulesConfig,
/// Per-detector knobs ([detectors.*] in nyx.conf). Currently exposes
/// `[detectors.data_exfil]` for cross-boundary leak suppression.
#[serde(default)]
pub detectors: crate::utils::detector_options::DetectorOptions,
pub server: ServerConfig,
pub runs: RunsConfig,
pub profiles: HashMap<String, ScanProfile>,
@ -1018,6 +1022,17 @@ pub(crate) fn merge_configs(mut default: Config, user: Config) -> Config {
default.profiles.insert(name, profile);
}
// --- DetectorOptions ---
// `enabled` is replaced wholesale: each `[detectors.*]` field uses
// #[serde(default)], so any omitted field already inherits the documented
// defaults during user-config deserialization. `trusted_destinations` is
// union-merged so the user adds to (rather than replaces) any future
// built-in defaults.
default.detectors.data_exfil.enabled = user.detectors.data_exfil.enabled;
extend_dedup(
&mut default.detectors.data_exfil.trusted_destinations,
user.detectors.data_exfil.trusted_destinations,
);
// --- AnalysisRulesConfig ---
// Engine options: wholesale replace. User's engine block is already
// serde-merged with defaults (via #[serde(default)] per field), so any

View file

@ -0,0 +1,129 @@
//! Per-detector runtime options.
//!
//! Mirrors the install/current pattern in [`crate::utils::analysis_options`]
//! but for detector-class knobs that live under `[detectors.*]` in
//! `nyx.conf`. Engine code that wants to consult a detector option calls
//! [`current`]; the CLI installs a resolved value before the scan starts.
//!
//! The first knobs covered here are the [`Cap::DATA_EXFIL`][crate::labels::Cap::DATA_EXFIL]
//! suppression layers:
//!
//! * `enabled` — turn the cap off entirely per-project so legitimate
//! forwarding pipelines don't surface findings.
//! * `trusted_destinations` — destination URL prefixes that suppress the
//! cap when a sink's URL argument has a static prefix matching one of
//! them. Uses the same prefix-lock plumbing the SSRF suppression has.
//!
//! Defaults are conservative: detector enabled, no trusted destinations.
use serde::{Deserialize, Serialize};
use std::sync::RwLock;
/// Options for the `Cap::DATA_EXFIL` suppression layers.
///
/// Configured through the `[detectors.data_exfil]` table. The container-level
/// `#[serde(default)]` means any key omitted from that table takes its value
/// from this type's [`Default`] impl (detector enabled, no trusted
/// destinations).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct DataExfilDetectorOptions {
/// When `false`, the entire data-exfiltration detector class is
/// suppressed for the project. Sink-time filters drop
/// [`crate::labels::Cap::DATA_EXFIL`] from sink caps before event
/// emission, so no `taint-data-exfiltration` findings reach output.
/// Defaults to `true`.
pub enabled: bool,
/// URL prefixes treated as trusted destinations for outbound
/// requests. When a sink's destination argument has a proven static
/// prefix (from the abstract string domain or an inline literal)
/// that begins with one of these entries, the
/// [`crate::labels::Cap::DATA_EXFIL`] bit is dropped before event
/// emission. Mirrors the SSRF prefix-lock semantics.
/// Defaults to an empty list.
pub trusted_destinations: Vec<String>,
}
impl Default for DataExfilDetectorOptions {
    /// Conservative defaults: the detector is switched on and no
    /// destination prefix is trusted.
    fn default() -> Self {
        let enabled = true;
        let trusted_destinations = Vec::new();
        Self {
            enabled,
            trusted_destinations,
        }
    }
}
/// Top-level `[detectors]` block.
///
/// Each field corresponds to one `[detectors.*]` table. Because of the
/// container-level `#[serde(default)]`, a table that is absent from the
/// input deserializes to that field type's [`Default`] value.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct DetectorOptions {
/// Options for the `Cap::DATA_EXFIL` detector (`[detectors.data_exfil]`).
pub data_exfil: DataExfilDetectorOptions,
}
/// Process-wide slot holding the active detector options. It stays `None`
/// until [`install`] or [`reinstall`] stores a value; while it is empty,
/// [`current`] falls back to [`DetectorOptions::default`].
static RUNTIME: RwLock<Option<DetectorOptions>> = RwLock::new(None);
/// Install the process-wide detector options with first-wins semantics.
///
/// Returns `true` when `opts` was stored. If options are already installed
/// the call leaves them untouched and returns `false`. The CLI calls this
/// once per process at scan start; library consumers that never install
/// pick up [`DetectorOptions::default`] via [`current`].
///
/// # Panics
///
/// Panics if the options lock is poisoned.
pub fn install(opts: DetectorOptions) -> bool {
    let mut slot = RUNTIME.write().expect("detector options RwLock poisoned");
    match *slot {
        // An earlier caller already won; keep its value.
        Some(_) => false,
        None => {
            *slot = Some(opts);
            true
        }
    }
}
/// Overwrite the installed options unconditionally, whether or not a value
/// was installed before. Mirrors
/// [`crate::utils::analysis_options::reinstall`] for the server's
/// per-request resolution path.
///
/// # Panics
///
/// Panics if the options lock is poisoned.
pub fn reinstall(opts: DetectorOptions) {
    let mut slot = RUNTIME.write().expect("detector options RwLock poisoned");
    *slot = Some(opts);
}
/// Read the active options. Returns the installed runtime when present,
/// otherwise [`DetectorOptions::default`].
pub fn current() -> DetectorOptions {
RUNTIME
.read()
.expect("detector options RwLock poisoned")
.clone()
.unwrap_or_default()
}
/// Test helper: empty the runtime slot so that a later [`install`] takes
/// effect again. Used only in tests that exercise different detector
/// configurations within the same process.
///
/// # Panics
///
/// Panics if the options lock is poisoned.
#[doc(hidden)]
pub fn _reset_for_tests() {
    let mut slot = RUNTIME.write().expect("detector options RwLock poisoned");
    *slot = None;
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must agree with the defaults promised in the module docs:
    /// detector enabled, no trusted destinations.
    #[test]
    fn defaults_match_documented() {
        let o = DetectorOptions::default();
        assert!(o.data_exfil.enabled);
        assert!(o.data_exfil.trusted_destinations.is_empty());
    }

    /// Non-default values survive a serialize/deserialize cycle unchanged.
    #[test]
    fn toml_roundtrip() {
        let opts = DetectorOptions {
            data_exfil: DataExfilDetectorOptions {
                enabled: false,
                trusted_destinations: vec![
                    "https://api.internal/".into(),
                    "https://telemetry.".into(),
                ],
            },
        };
        let s = toml::to_string(&opts).unwrap();
        let back: DetectorOptions = toml::from_str(&s).unwrap();
        assert_eq!(opts, back);
    }

    /// Input with no `[data_exfil]` table yields the full default value,
    /// not just a default `enabled` flag.
    #[test]
    fn missing_section_uses_defaults() {
        let toml_str = r#"# empty"#;
        let cfg: DetectorOptions = toml::from_str(toml_str).unwrap();
        assert!(cfg.data_exfil.enabled);
        assert!(cfg.data_exfil.trusted_destinations.is_empty());
        assert_eq!(cfg, DetectorOptions::default());
    }

    /// A table that sets only one key keeps the default for the other key,
    /// courtesy of the container-level `#[serde(default)]`.
    #[test]
    fn partial_section_keeps_other_defaults() {
        let toml_str = r#"
[data_exfil]
trusted_destinations = ["https://api.internal/"]
"#;
        let cfg: DetectorOptions = toml::from_str(toml_str).unwrap();
        assert!(cfg.data_exfil.enabled);
        assert_eq!(
            cfg.data_exfil.trusted_destinations,
            vec!["https://api.internal/".to_string()]
        );
    }
}

View file

@ -1,5 +1,6 @@
pub mod analysis_options;
pub mod config;
pub mod detector_options;
pub(crate) mod ext;
pub mod path;
pub mod project;
@ -8,4 +9,5 @@ pub(crate) mod snippet;
pub use analysis_options::{AnalysisOptions, SymexOptions};
pub use config::Config;
pub use detector_options::{DataExfilDetectorOptions, DetectorOptions};
pub use project::{detect_frameworks, get_project_info};