mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-06 19:35:13 +02:00
8109 lines
329 KiB
Rust
8109 lines
329 KiB
Rust
//! Tree-sitter parsing and two-pass analysis for all supported languages.
//!
//! The core type is `ParsedSource`, a thin wrapper around a parsed tree-sitter
//! tree that carries the source bytes and language. Parsing reuses a thread-local
//! [`tree_sitter::Parser`] so each worker thread keeps one live parser instance.
//!
//! ## Two-pass pipeline
//!
//! **Pass 1** (`extract_summaries_from_file`): builds the CFG, lowers to SSA,
//! and extracts a [`crate::summary::FuncSummary`] per function. Summaries
//! describe boundary behaviour: which arguments flow to sinks, which sources
//! the function reads, what taint it strips, and what it returns.
//!
//! **Pass 2** (`run_rules_on_file`): reanalyses each file with the merged
//! [`crate::summary::GlobalSummaries`] from pass 1. The taint engine runs a
//! forward dataflow worklist over SSA, resolving cross-file calls via summaries.
//!
//! Parse timeouts are tracked per-thread via [`take_last_parse_timeout_ms`]
//! so callers can surface the event as an informational diagnostic instead
//! of silently skipping the file.
|
||
|
||
#![allow(clippy::only_used_in_recursion, clippy::type_complexity)]
|
||
|
||
use crate::auth_analysis;
|
||
use crate::cfg::{Cfg, FileCfg, FuncSummaries, build_cfg, export_summaries};
|
||
use crate::cfg_analysis;
|
||
use crate::commands::scan::Diag;
|
||
use crate::errors::{NyxError, NyxResult};
|
||
use crate::evidence::{Evidence, FlowStep, SpanEvidence, StateEvidence};
|
||
use crate::labels::{
|
||
Cap, DataLabel, LangAnalysisRules, build_lang_rules, severity_for_source_kind,
|
||
};
|
||
use crate::patterns::{FindingCategory, PatternCategory, Severity};
|
||
use crate::state;
|
||
use crate::summary::ssa_summary::SsaFuncSummary;
|
||
use crate::summary::{FuncSummary, GlobalSummaries};
|
||
use crate::symbol::Lang;
|
||
use crate::utils::config::AnalysisMode;
|
||
use crate::utils::ext::lowercase_ext;
|
||
use crate::utils::{Config, query_cache};
|
||
use petgraph::graph::NodeIndex;
|
||
use std::borrow::Cow;
|
||
use std::cell::{OnceCell, RefCell};
|
||
use std::collections::{HashMap, HashSet};
|
||
use std::ops::ControlFlow;
|
||
use std::path::Path;
|
||
use std::time::Instant;
|
||
use tree_sitter::{Language, QueryCursor, StreamingIterator};
|
||
|
||
thread_local! {
|
||
static PARSER: RefCell<tree_sitter::Parser> = RefCell::new(tree_sitter::Parser::new());
|
||
/// Records the timeout budget (in ms) when a tree-sitter parse is
|
||
/// aborted due to [`parse_timeout_ms`]. Callers that want to surface
|
||
/// the event as a synthetic informational [`Diag`] read this slot
|
||
/// immediately after [`ParsedSource::try_new`] returns `Ok(None)`
|
||
/// and clear it with `take_last_parse_timeout_ms`.
|
||
static LAST_PARSE_TIMEOUT_MS: std::cell::Cell<Option<u64>> = const {
|
||
std::cell::Cell::new(None)
|
||
};
|
||
}
|
||
|
||
/// Consume and return the most recent parse-timeout event on this thread
|
||
/// (set by `ParsedSource::try_new`). Used to lift the event into a
|
||
/// synthetic [`Diag`] carrying an [`crate::engine_notes::EngineNote::ParseTimeout`].
|
||
pub fn take_last_parse_timeout_ms() -> Option<u64> {
|
||
LAST_PARSE_TIMEOUT_MS.with(|c| c.take())
|
||
}
|
||
|
||
/// Synthesize an informational diagnostic surfacing a parse-timeout event
|
||
/// for `path`. The diag carries an [`crate::engine_notes::EngineNote::ParseTimeout`]
|
||
/// in its evidence so downstream tooling can distinguish "found nothing"
|
||
/// from "parse was aborted before we could look".
|
||
fn parse_timeout_diag(path: &Path, timeout_ms: u64) -> Diag {
|
||
let mut evidence = Evidence::default();
|
||
evidence.notes.push(format!(
|
||
"tree-sitter parse exceeded timeout budget ({timeout_ms} ms); file skipped"
|
||
));
|
||
evidence
|
||
.engine_notes
|
||
.push(crate::engine_notes::EngineNote::ParseTimeout {
|
||
timeout_ms: timeout_ms.min(u32::MAX as u64) as u32,
|
||
});
|
||
Diag {
|
||
path: path.to_string_lossy().into_owned(),
|
||
line: 0,
|
||
col: 0,
|
||
severity: Severity::Low,
|
||
id: "engine.parse_timeout".into(),
|
||
category: FindingCategory::Quality,
|
||
path_validated: false,
|
||
guard_kind: None,
|
||
message: Some(format!(
|
||
"tree-sitter parse exceeded timeout budget ({timeout_ms} ms); file skipped"
|
||
)),
|
||
labels: vec![],
|
||
confidence: None,
|
||
evidence: Some(evidence),
|
||
rank_score: None,
|
||
rank_reason: None,
|
||
suppressed: false,
|
||
suppression: None,
|
||
rollup: None,
|
||
finding_id: String::new(),
|
||
alternative_finding_ids: Vec::new(),
|
||
stable_hash: 0,
|
||
}
|
||
}
|
||
|
||
/// Resolve the effective parse-timeout budget in milliseconds. Tree-sitter
|
||
/// is generally fast, but adversarially-crafted inputs (deeply ambiguous
|
||
/// grammar constructs, pathological backtracking) can drive it into slow
|
||
/// parses; the default 10 s ceiling lets a 10 000-file scan survive even if
|
||
/// every file is hostile. Configured via `analysis.engine.parse_timeout_ms`
|
||
/// in `nyx.conf` (or `--parse-timeout-ms` on the CLI); `0` disables the cap.
|
||
fn parse_timeout_ms() -> u64 {
|
||
crate::utils::analysis_options::current().parse_timeout_ms
|
||
}
|
||
|
||
/// Test-only: when the `NYX_TEST_FORCE_PANIC_PATH` env var is set, any file
/// path containing that substring triggers a deterministic panic here. Used
/// by `tests/panic_recovery_tests.rs` to exercise per-file panic behaviour in
/// the scan pipeline. The env var is re-read each call so successive tests
/// in the same process can toggle injection; `std::env::var` is an in-memory
/// lookup on supported platforms so the overhead is negligible.
///
/// # Panics
///
/// Panics when the variable is set to a non-empty value that occurs as a
/// substring of `path` (compared on its lossy UTF-8 rendering). An unset,
/// non-Unicode, or empty variable never panics.
fn maybe_inject_test_panic(path: &Path) {
    let Ok(marker) = std::env::var("NYX_TEST_FORCE_PANIC_PATH") else {
        return;
    };
    // An empty marker would match every path; treat it as "injection off".
    if marker.is_empty() || !path.to_string_lossy().contains(marker.as_str()) {
        return;
    }
    panic!(
        "NYX_TEST_FORCE_PANIC_PATH injection: {} matches {:?}",
        path.display(),
        marker
    );
}
|
||
|
||
/// Convenience alias for node indices.
|
||
fn byte_offset_to_point(tree: &tree_sitter::Tree, byte: usize) -> tree_sitter::Point {
|
||
tree.root_node()
|
||
.descendant_for_byte_range(byte, byte)
|
||
.map(|n| n.start_position())
|
||
.unwrap_or_else(|| tree_sitter::Point { row: 0, column: 0 })
|
||
}
|
||
|
||
use crate::utils::snippet::line_snippet as extract_line_snippet;
|
||
|
||
/// Resolve a `file_rel` (relative to `scan_root` per
/// [`normalize_namespace`] convention) back to the path string the
/// diagnostic pipeline expects.
///
/// * Empty `file_rel`: single-file scans normalize every namespace to
///   `""`; treat that as "the file under analysis" and return
///   `fallback.to_string_lossy()`.
/// * `scan_root` absent: there is no workspace root to resolve against, so
///   `file_rel` is returned verbatim (it may already be absolute).
/// * Otherwise: join `scan_root` with `file_rel`.
fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String {
    if file_rel.is_empty() {
        return fallback.to_string_lossy().into_owned();
    }
    scan_root.map_or_else(
        || file_rel.to_owned(),
        |root| root.join(file_rel).to_string_lossy().into_owned(),
    )
}
|
||
|
||
/// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it,
|
||
/// the parsed tree (for byte→line/col conversion) and the file path.
|
||
///
|
||
/// Returns `None` when source-sensitivity gating fully suppresses the
|
||
/// finding (the canonical case is a multi-gate `DATA_EXFIL` event whose
|
||
/// contributing source is plain user input — see the
|
||
/// `effective_caps` strip below).
|
||
fn build_taint_diag(
|
||
finding: &crate::taint::Finding,
|
||
cfg_graph: &crate::cfg::Cfg,
|
||
tree: &tree_sitter::Tree,
|
||
path: &Path,
|
||
src: &[u8],
|
||
scan_root: Option<&Path>,
|
||
) -> Option<Diag> {
|
||
let call_site_byte = cfg_graph[finding.sink].classification_span().0;
|
||
let call_site_point = byte_offset_to_point(tree, call_site_byte);
|
||
// `finding.source` should be a NodeIndex valid in this body's CFG, but
|
||
// cross-body / cross-file inline analysis has historically leaked
|
||
// callee-NodeIndex origins (see `extract_inline_return_taint`). Guard
|
||
// the lookup so a stray out-of-bounds index degrades the diagnostic
|
||
// rather than panicking the worker thread.
|
||
let source_info = cfg_graph.node_weight(finding.source);
|
||
// The reconstructed flow path is the authoritative view of where the
|
||
// taint started *in this body*. When present, prefer its first step's
|
||
// CFG span over `finding.source_span`, which can be stale across
|
||
// multi-hop cross-body remaps (e.g. JS two-level solve where a
|
||
// callee-interior source gets its span rewritten to the enclosing
|
||
// body's entry node). Fall back to `source_span`, then to the source
|
||
// NodeIndex, then finally to the sink byte.
|
||
let source_byte = finding
|
||
.flow_steps
|
||
.first()
|
||
.and_then(|s| {
|
||
cfg_graph
|
||
.node_weight(s.cfg_node)
|
||
.map(|i| i.classification_span().0)
|
||
})
|
||
.or(finding.source_span)
|
||
.or_else(|| source_info.map(|i| i.classification_span().0))
|
||
.unwrap_or(call_site_byte);
|
||
let source_point = byte_offset_to_point(tree, source_byte);
|
||
|
||
// Prefer the source CFG node's callee string when it's a call expression
|
||
// (e.g. `os.getenv("X")`). For property-access sources like
|
||
// `navigator.userAgent` there is no callee, fall back to the first flow
|
||
// step's `variable` (the SSA var name, e.g. "userAgent"), then to the
|
||
// source node's `taint.defines` / first `taint.uses` entry, before
|
||
// finally giving up and rendering "(unknown)".
|
||
let source_callee = source_info
|
||
.and_then(|i| i.call.callee.as_deref())
|
||
.map(sanitize_desc)
|
||
.or_else(|| {
|
||
finding
|
||
.flow_steps
|
||
.first()
|
||
.and_then(|s| s.var_name.as_deref())
|
||
.map(sanitize_desc)
|
||
})
|
||
.or_else(|| {
|
||
source_info
|
||
.and_then(|i| i.taint.defines.as_deref())
|
||
.map(sanitize_desc)
|
||
})
|
||
.or_else(|| {
|
||
source_info
|
||
.and_then(|i| i.taint.uses.first().map(String::as_str))
|
||
.map(sanitize_desc)
|
||
})
|
||
.unwrap_or_else(|| "(unknown)".into());
|
||
// Sink-callee attribution: when the sink node is an *argument* of a call
|
||
// (e.g. PHP `header("location: " . $_GET['x'])` — the `$_GET[...]` subscript
|
||
// carries `callee = "$_GET"` but `outer_callee = "header"`), the enclosing
|
||
// call is the real sink and should be displayed, not the source token.
|
||
// `outer_callee` is only populated for nested/argument positions, so for a
|
||
// plain call node it is None and we fall back to the node's own callee.
|
||
let call_site_callee = cfg_graph[finding.sink]
|
||
.call
|
||
.outer_callee
|
||
.as_deref()
|
||
.or(cfg_graph[finding.sink].call.callee.as_deref())
|
||
.map(sanitize_desc)
|
||
.unwrap_or_else(|| "(unknown)".into());
|
||
let kind_label = source_kind_label(finding.source_kind);
|
||
|
||
let file_path_owned = path.to_string_lossy().into_owned();
|
||
|
||
// Primary-location attribution: when the sink was resolved via a
|
||
// callee summary that carried a [`SinkSite`], `finding.primary_location`
|
||
// names the dangerous instruction inside the callee body. Use those
|
||
// coordinates as the diag's primary (file, line, col); otherwise fall
|
||
// back to the caller's call-site position.
|
||
let (primary_path, primary_line, primary_col, primary_snippet_hint) =
|
||
if let Some(loc) = finding.primary_location.as_ref() {
|
||
let abs = resolve_file_rel(&loc.file_rel, scan_root, path);
|
||
if abs != file_path_owned {
|
||
tracing::debug!(
|
||
caller_file = %file_path_owned,
|
||
primary_file = %abs,
|
||
primary_line = loc.line,
|
||
"taint finding attributed to a cross-file primary sink location",
|
||
);
|
||
}
|
||
let snippet = if loc.snippet.is_empty() {
|
||
None
|
||
} else {
|
||
Some(loc.snippet.clone())
|
||
};
|
||
(abs, loc.line as usize, loc.col as usize, snippet)
|
||
} else {
|
||
(
|
||
file_path_owned.clone(),
|
||
call_site_point.row + 1,
|
||
call_site_point.column + 1,
|
||
None,
|
||
)
|
||
};
|
||
|
||
let short_source = crate::fmt::shorten_callee(&source_callee);
|
||
let short_call_site = crate::fmt::shorten_callee(&call_site_callee);
|
||
let sink_display = primary_snippet_hint
|
||
.as_deref()
|
||
.map(crate::fmt::shorten_callee)
|
||
.unwrap_or_else(|| short_call_site.clone());
|
||
let sink_label_display = if finding.primary_location.is_some() {
|
||
format!("{call_site_callee} \u{2192} {sink_display}")
|
||
} else {
|
||
call_site_callee.clone()
|
||
};
|
||
|
||
let mut labels = vec![
|
||
(
|
||
"Source".into(),
|
||
format!(
|
||
"{source_callee} ({}:{})",
|
||
source_point.row + 1,
|
||
source_point.column + 1
|
||
),
|
||
),
|
||
("Sink".into(), sink_label_display),
|
||
];
|
||
if let Some(guard) = finding.guard_kind {
|
||
labels.push(("Path guard".into(), format!("{guard:?}")));
|
||
}
|
||
|
||
let mut evidence_notes = Vec::new();
|
||
if finding.path_validated {
|
||
evidence_notes.push("path_validated".into());
|
||
}
|
||
evidence_notes.push(format!("source_kind:{:?}", finding.source_kind));
|
||
evidence_notes.push(format!("hop_count:{}", finding.hop_count));
|
||
evidence_notes.push(format!("cap_specificity:{}", finding.cap_specificity));
|
||
if finding.uses_summary {
|
||
evidence_notes.push("uses_summary".into());
|
||
}
|
||
|
||
// Convert raw flow steps to display FlowSteps. When the finding has a
|
||
// primary_location distinct from the call site, the last raw step is
|
||
// really the Call, reclassify it and append a synthetic Sink step
|
||
// pointing at the callee-internal dangerous instruction so analysts
|
||
// see both the call site and the final sink in the trace.
|
||
let mut flow_steps: Vec<FlowStep> = finding
|
||
.flow_steps
|
||
.iter()
|
||
.enumerate()
|
||
.map(|(i, raw)| {
|
||
let step_byte = cfg_graph[raw.cfg_node].classification_span().0;
|
||
let point = byte_offset_to_point(tree, step_byte);
|
||
let snippet = extract_line_snippet(src, step_byte);
|
||
let callee = cfg_graph[raw.cfg_node].call.callee.clone();
|
||
let function = cfg_graph[raw.cfg_node].ast.enclosing_func.clone();
|
||
FlowStep {
|
||
step: (i + 1) as u32,
|
||
kind: raw.op_kind.clone(),
|
||
file: file_path_owned.clone(),
|
||
line: (point.row + 1) as u32,
|
||
col: (point.column + 1) as u32,
|
||
snippet,
|
||
variable: raw.var_name.clone(),
|
||
callee,
|
||
function,
|
||
is_cross_file: false,
|
||
}
|
||
})
|
||
.collect();
|
||
|
||
if let Some(loc) = finding.primary_location.as_ref() {
|
||
if let Some(last) = flow_steps.last_mut()
|
||
&& matches!(last.kind, crate::evidence::FlowStepKind::Sink)
|
||
{
|
||
last.kind = crate::evidence::FlowStepKind::Call;
|
||
}
|
||
let is_cross_file = primary_path != file_path_owned;
|
||
let synthetic_snippet = if loc.snippet.is_empty() {
|
||
None
|
||
} else {
|
||
Some(loc.snippet.clone())
|
||
};
|
||
let next_step = (flow_steps.len() + 1) as u32;
|
||
flow_steps.push(FlowStep {
|
||
step: next_step,
|
||
kind: crate::evidence::FlowStepKind::Sink,
|
||
file: primary_path.clone(),
|
||
line: loc.line,
|
||
col: loc.col,
|
||
snippet: synthetic_snippet,
|
||
variable: None,
|
||
callee: None,
|
||
function: None,
|
||
is_cross_file,
|
||
});
|
||
}
|
||
|
||
let sink_evidence_snippet = primary_snippet_hint.or(Some(short_call_site));
|
||
|
||
// Resolved sink capability bits, used by deduplication to distinguish
|
||
// sinks with different cap types on the same source line (e.g.
|
||
// `sink_sql(x); sink_shell(x);`).
|
||
//
|
||
// Prefer the per-finding `effective_sink_caps` (set by the SSA dispatch
|
||
// when receiver-type qualification, gated rules, or other late-binding
|
||
// resolvers contribute caps that the CFG node's static labels do not
|
||
// carry). Fall back to the union of `Sink(cap)` labels on the CFG
|
||
// node when the SSA dispatch did not narrow.
|
||
let sink_caps_bits: u32 = if !finding.effective_sink_caps.is_empty() {
|
||
finding.effective_sink_caps.bits()
|
||
} else {
|
||
cfg_graph[finding.sink]
|
||
.taint
|
||
.labels
|
||
.iter()
|
||
.filter_map(|l| match l {
|
||
crate::labels::DataLabel::Sink(c) => Some(c.bits()),
|
||
_ => None,
|
||
})
|
||
.fold(0u32, |acc, b| acc | b)
|
||
};
|
||
|
||
// Cap-specific rule-id routing.
|
||
//
|
||
// 1. `UNAUTHORIZED_ID`: namespace alongside the standalone `auth_analysis`
|
||
// subsystem's output so cross-tool aggregation lines up.
|
||
// 2. `DATA_EXFIL`: route to `taint-data-exfiltration` so SARIF surfaces a
|
||
// distinct rule id from SSRF, the two share callees (e.g. `fetch`)
|
||
// but represent different vulnerability classes.
|
||
//
|
||
// Prefer the per-finding `effective_sink_caps` (set by the multi-gate
|
||
// SSA dispatch) when populated; fall back to the union of all sink-label
|
||
// caps on the CFG node so legacy paths that build findings without
|
||
// setting `effective_sink_caps` still pick the right rule id.
|
||
let mut effective_caps = if finding.effective_sink_caps.is_empty() {
|
||
crate::labels::Cap::from_bits_truncate(sink_caps_bits)
|
||
} else {
|
||
finding.effective_sink_caps
|
||
};
|
||
|
||
// Source-sensitivity gate for `DATA_EXFIL`. Plain attacker input echoed
|
||
// back into an outbound request body / headers / json is not data
|
||
// exfiltration, the user already controls the value, surfacing it as a
|
||
// leak is noise (the canonical false-positive class for API gateways
|
||
// and telemetry forwarders that proxy `req.body`). A `DATA_EXFIL`
|
||
// finding requires the contributing source to be at least `Sensitive`
|
||
// (cookies, headers, env, db rows, file reads). Plain user-input
|
||
// sources have the cap stripped so the finding either drops entirely
|
||
// or downgrades to whatever non-`DATA_EXFIL` cap also applies (e.g.
|
||
// SSRF on the URL position of the same `fetch` call).
|
||
if effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
|
||
&& finding.source_kind.sensitivity() < crate::labels::Sensitivity::Sensitive
|
||
{
|
||
effective_caps.remove(crate::labels::Cap::DATA_EXFIL);
|
||
// The multi-gate dispatch produces one finding per (source, sink-cap)
|
||
// pair, a body-flow finding's `effective_sink_caps` is exactly the
|
||
// cap that fired (e.g. `DATA_EXFIL`). When that single cap is the
|
||
// sensitivity-stripped one, the finding has no surviving rationale
|
||
// and we drop it entirely rather than reroute it to the generic
|
||
// `taint-unsanitised-flow` bucket (which would just re-emit the same
|
||
// false positive under a different rule id). Findings with a
|
||
// multi-cap `effective_sink_caps` keep their non-DATA_EXFIL caps and
|
||
// are routed normally below.
|
||
if finding.effective_sink_caps == crate::labels::Cap::DATA_EXFIL {
|
||
return None;
|
||
}
|
||
}
|
||
|
||
// DATA_EXFIL routing.
|
||
//
|
||
// Multi-gate dispatch (JS / Go) emits one event per cap, so by this
|
||
// point each finding's `effective_sink_caps` carries exactly one bit
|
||
// and the simple `DATA_EXFIL && !SSRF` test routes correctly. Flat-
|
||
// rule paths (Java HTTP clients where type-qualified resolution
|
||
// attaches both `SSRF` and `DATA_EXFIL` Sink labels to the same call,
|
||
// e.g. `client.send(req)` covering both URL and body channels of the
|
||
// request value) produce a single dual-cap event. Disambiguate using
|
||
// the flow path: when a body-bind verb (`.body(`, `.json(`, `.form(`,
|
||
// `.multipart(`, `BodyPublishers`, `setEntity`, `bodyValue`, etc.)
|
||
// appears anywhere in the SSA flow steps or the sink chain text, the
|
||
// taint reached an outbound payload field, route to DATA_EXFIL. When
|
||
// no body-bind verb is on the path (Sensitive-tier source flowing
|
||
// straight into the URL position via `.get`/`.post`/`.send`), this is
|
||
// a real SSRF and routes to taint-unsanitised-flow regardless of
|
||
// source sensitivity. Source sensitivity is still required for the
|
||
// DATA_EXFIL route, plain user input echoed into a request body is
|
||
// not exfiltration.
|
||
let flow_has_body_bind = {
|
||
let body_bind_substrings = [
|
||
".body(",
|
||
".json(",
|
||
".form(",
|
||
".multipart(",
|
||
".bodyvalue(",
|
||
".setentity(",
|
||
"bodypublishers",
|
||
"body_string",
|
||
"body_json",
|
||
"body_bytes",
|
||
"send_string",
|
||
"send_json",
|
||
"send_form",
|
||
// Spring RestTemplate one-shot verbs that take a body argument
|
||
// inline (no separate `BodyPublishers` / `setEntity` step in the
|
||
// chain). Method-name suffixes are unique enough that bare
|
||
// substring matching is safe.
|
||
"postforobject",
|
||
"postforentity",
|
||
"patchforobject",
|
||
];
|
||
let chain_lower = call_site_callee.to_ascii_lowercase();
|
||
let in_sink = body_bind_substrings.iter().any(|m| chain_lower.contains(m));
|
||
let in_steps = finding.flow_steps.iter().any(|step| {
|
||
cfg_graph[step.cfg_node]
|
||
.call
|
||
.callee
|
||
.as_deref()
|
||
.map(|c| {
|
||
let lc = c.to_ascii_lowercase();
|
||
body_bind_substrings.iter().any(|m| lc.contains(m))
|
||
})
|
||
.unwrap_or(false)
|
||
});
|
||
in_sink || in_steps
|
||
};
|
||
// Java HTTP-client builder pattern hides the body-bind step inside a
|
||
// builder chain whose intermediate calls collapse to `HttpRequest.build`
|
||
// in the flow. When the source is unambiguously credential-bearing
|
||
// (cookies, session attributes, caught exceptions carrying stack
|
||
// frames) and the sink fires DATA_EXFIL, treat that as exfil even
|
||
// when no body-bind verb is visible in the flow. Env vars stay
|
||
// ambiguous (they often carry URL config) so they still require an
|
||
// explicit body-bind hit on the path.
|
||
let source_is_credential_bearing = matches!(
|
||
finding.source_kind,
|
||
crate::labels::SourceKind::Cookie | crate::labels::SourceKind::CaughtException
|
||
);
|
||
let is_data_exfil_rule = effective_caps.contains(crate::labels::Cap::DATA_EXFIL)
|
||
&& !effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID)
|
||
&& (!effective_caps.contains(crate::labels::Cap::SSRF)
|
||
|| (finding.source_kind.sensitivity() >= crate::labels::Sensitivity::Sensitive
|
||
&& (flow_has_body_bind || source_is_credential_bearing)));
|
||
|
||
// Cap-specific rule routing. Auth-as-taint and data-exfil keep their
|
||
// pre-existing branches so the routing rules they encode (auth-finding
|
||
// namespace alignment; body-bind / source-sensitivity gate) stay
|
||
// exactly as before. New cap classes (LDAP / XPath / Header / Open
|
||
// redirect / SSTI / XXE / Prototype pollution) route through
|
||
// `cap_rule_meta()` so the canonical rule ids in the registry are the
|
||
// single source of truth. Legacy generic taint findings continue to
|
||
// emit `taint-unsanitised-flow`.
|
||
let diag_id = if effective_caps.contains(crate::labels::Cap::UNAUTHORIZED_ID) {
|
||
"rs.auth.missing_ownership_check.taint".to_string()
|
||
} else if is_data_exfil_rule {
|
||
format!(
|
||
"taint-data-exfiltration (source {}:{})",
|
||
source_point.row + 1,
|
||
source_point.column + 1
|
||
)
|
||
} else if let Some(meta) = [
|
||
crate::labels::Cap::LDAP_INJECTION,
|
||
crate::labels::Cap::XPATH_INJECTION,
|
||
crate::labels::Cap::HEADER_INJECTION,
|
||
crate::labels::Cap::OPEN_REDIRECT,
|
||
crate::labels::Cap::SSTI,
|
||
crate::labels::Cap::XXE,
|
||
crate::labels::Cap::PROTOTYPE_POLLUTION,
|
||
]
|
||
.iter()
|
||
.find(|c| effective_caps.contains(**c))
|
||
.and_then(|c| crate::labels::cap_rule_meta(*c))
|
||
{
|
||
format!(
|
||
"{} (source {}:{})",
|
||
meta.rule_id,
|
||
source_point.row + 1,
|
||
source_point.column + 1
|
||
)
|
||
} else {
|
||
format!(
|
||
"taint-unsanitised-flow (source {}:{})",
|
||
source_point.row + 1,
|
||
source_point.column + 1
|
||
)
|
||
};
|
||
|
||
// For `DATA_EXFIL` rules, look up which destination object-literal field
|
||
// (`body` / `headers` / `json`) the tainted value reached. Each
|
||
// [`crate::cfg::GateFilter`] carries `destination_uses` (var names) in
|
||
// parallel with `destination_fields` (the field each var was bound to),
|
||
// so we walk the gate filter whose `label_caps` includes `DATA_EXFIL`
|
||
// and match the tainted var name from the last flow step. Falls back
|
||
// to the first non-empty destination field on the matching filter when
|
||
// the var-name match fails (e.g. the SSA sink event is reported on a
|
||
// copy-propagated value whose name no longer matches the original
|
||
// destination ident). `None` when the sink wasn't a destination-aware
|
||
// gate (no object literal, or non-fetch sink).
|
||
let data_exfil_field: Option<String> = if is_data_exfil_rule {
|
||
let last_var = finding
|
||
.flow_steps
|
||
.last()
|
||
.and_then(|s| s.var_name.as_deref());
|
||
let filters = &cfg_graph[finding.sink].call.gate_filters;
|
||
filters
|
||
.iter()
|
||
.find(|f| f.label_caps.contains(crate::labels::Cap::DATA_EXFIL))
|
||
.and_then(|f| {
|
||
if let (Some(uses), Some(var)) = (f.destination_uses.as_ref(), last_var)
|
||
&& let Some(idx) = uses.iter().position(|u| u == var)
|
||
{
|
||
return f.destination_fields.get(idx).cloned();
|
||
}
|
||
f.destination_fields.first().cloned()
|
||
})
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// DATA_EXFIL severity calibration (Phase: detector ranking).
|
||
//
|
||
// Generic taint severity comes from `severity_for_source_kind`, which
|
||
// maps Cookie/Header/Env to High because those sources are spicy
|
||
// *as taint roots*. For `DATA_EXFIL` we are scoring the leak class,
|
||
// not the source itself: not every Sensitive-tier source is a Secret.
|
||
// Cookies and env carry credential / session material whose leakage
|
||
// is an immediate disclosure (Secret-tier); request headers, file
|
||
// reads, db rows, and caught exceptions are Sensitive but not
|
||
// automatically secret, so they downgrade to Medium. Plain user
|
||
// input is already stripped above by the source-sensitivity gate, so
|
||
// the `_` arm here is reached only by Sensitive sources that are not
|
||
// explicit secrets.
|
||
let severity = if is_data_exfil_rule {
|
||
match finding.source_kind {
|
||
crate::labels::SourceKind::Cookie | crate::labels::SourceKind::EnvironmentConfig => {
|
||
crate::patterns::Severity::High
|
||
}
|
||
_ => crate::patterns::Severity::Medium,
|
||
}
|
||
} else if let Some(meta) = [
|
||
crate::labels::Cap::LDAP_INJECTION,
|
||
crate::labels::Cap::XPATH_INJECTION,
|
||
crate::labels::Cap::HEADER_INJECTION,
|
||
crate::labels::Cap::OPEN_REDIRECT,
|
||
crate::labels::Cap::SSTI,
|
||
crate::labels::Cap::XXE,
|
||
crate::labels::Cap::PROTOTYPE_POLLUTION,
|
||
]
|
||
.iter()
|
||
.find(|c| effective_caps.contains(**c))
|
||
.and_then(|c| crate::labels::cap_rule_meta(*c))
|
||
{
|
||
// New cap classes draw severity from the rule registry so a single
|
||
// edit to `CAP_RULE_REGISTRY` cascades through SARIF, the dashboard,
|
||
// and the integration suite without per-language source-kind nudges.
|
||
meta.severity
|
||
} else {
|
||
severity_for_source_kind(finding.source_kind)
|
||
};
|
||
|
||
// DATA_EXFIL: surface the destination field in the message so analysts
|
||
// see at a glance whether the leak reached the request body, headers,
|
||
// or json payload. Generic taint findings stay on the existing
|
||
// "unsanitised … flows from … → …" template.
|
||
let message = if is_data_exfil_rule {
|
||
let suffix = data_exfil_field
|
||
.as_deref()
|
||
.map(|f| format!(" ({f} field)"))
|
||
.unwrap_or_default();
|
||
format!("sensitive data flows from {short_source} \u{2192} {sink_display}{suffix}")
|
||
} else {
|
||
format!("unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}")
|
||
};
|
||
|
||
let mut diag = Diag {
|
||
path: primary_path.clone(),
|
||
line: primary_line,
|
||
col: primary_col,
|
||
severity,
|
||
id: diag_id,
|
||
category: FindingCategory::Security,
|
||
path_validated: finding.path_validated,
|
||
guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
|
||
message: Some(message),
|
||
labels,
|
||
confidence: None,
|
||
evidence: Some(Evidence {
|
||
source: Some(SpanEvidence {
|
||
path: file_path_owned,
|
||
line: (source_point.row + 1) as u32,
|
||
col: (source_point.column + 1) as u32,
|
||
kind: "source".into(),
|
||
snippet: Some(short_source),
|
||
}),
|
||
sink: Some(SpanEvidence {
|
||
path: primary_path.clone(),
|
||
line: primary_line as u32,
|
||
col: primary_col as u32,
|
||
kind: "sink".into(),
|
||
snippet: sink_evidence_snippet,
|
||
}),
|
||
guards: finding
|
||
.guard_kind
|
||
.map(|g| {
|
||
vec![SpanEvidence {
|
||
path: primary_path.clone(),
|
||
line: primary_line as u32,
|
||
col: 0,
|
||
kind: "guard".into(),
|
||
snippet: Some(format!("{g:?}")),
|
||
}]
|
||
})
|
||
.unwrap_or_default(),
|
||
sanitizers: vec![],
|
||
state: None,
|
||
notes: evidence_notes,
|
||
source_kind: Some(finding.source_kind),
|
||
hop_count: Some(finding.hop_count),
|
||
uses_summary: finding.uses_summary,
|
||
cap_specificity: Some(finding.cap_specificity),
|
||
flow_steps,
|
||
symbolic: finding.symbolic.clone(),
|
||
sink_caps: sink_caps_bits,
|
||
engine_notes: finding.engine_notes.clone(),
|
||
data_exfil_field,
|
||
..Default::default()
|
||
}),
|
||
rank_score: None,
|
||
rank_reason: None,
|
||
suppressed: false,
|
||
suppression: None,
|
||
rollup: None,
|
||
finding_id: finding.finding_id.clone(),
|
||
alternative_finding_ids: finding.alternative_finding_ids.to_vec(),
|
||
stable_hash: 0,
|
||
};
|
||
|
||
// Post-fill explanation and confidence limiters
|
||
let explanation = crate::evidence::generate_explanation(&diag);
|
||
let limiters = crate::evidence::compute_confidence_limiters(&diag);
|
||
if let Some(ref mut ev) = diag.evidence {
|
||
ev.explanation = explanation;
|
||
ev.confidence_limiters = limiters;
|
||
}
|
||
|
||
Some(diag)
|
||
}
|
||
|
||
/// Resolve a file extension to a language slug (e.g. `"rust"`,
|
||
/// `"javascript"`). Public façade over `lang_for_path` for callers
|
||
/// that only need the slug, used by the debug API to look up
|
||
/// per-language rule enablement without re-parsing the file.
|
||
pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> {
|
||
lang_for_path(path).map(|(_, slug)| slug)
|
||
}
|
||
|
||
/// Resolve a file extension to a (tree‑sitter Language, slug) pair.
|
||
fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
|
||
// Distinguish `.tsx` from `.ts` before normalising via `lowercase_ext` —
|
||
// the latter merges both into the `"ts"` slug, which would lose the
|
||
// information needed to pick the JSX-aware TSX grammar. The slug returned
|
||
// here stays `"typescript"` for both so all downstream KINDS / RULES /
|
||
// PARAM_CONFIG entries apply uniformly.
|
||
let raw_ext = path
|
||
.extension()
|
||
.and_then(|s| s.to_str())
|
||
.map(|s| s.to_ascii_lowercase());
|
||
if matches!(raw_ext.as_deref(), Some("tsx")) {
|
||
return Some((
|
||
Language::from(tree_sitter_typescript::LANGUAGE_TSX),
|
||
"typescript",
|
||
));
|
||
}
|
||
if matches!(raw_ext.as_deref(), Some("jsx")) {
|
||
return Some((
|
||
Language::from(tree_sitter_javascript::LANGUAGE),
|
||
"javascript",
|
||
));
|
||
}
|
||
match lowercase_ext(path) {
|
||
Some("rs") => Some((Language::from(tree_sitter_rust::LANGUAGE), "rust")),
|
||
Some("c") => Some((Language::from(tree_sitter_c::LANGUAGE), "c")),
|
||
// Real-world C++ codebases (gRPC, rocksdb, LLVM, …) overwhelmingly
|
||
// use `.cc` / `.cxx` / `.hpp` / `.hh` / `.h++` rather than the
|
||
// `.cpp` synthetic-fixture extension. Without these mappings,
|
||
// the scanner silently skipped them. Headers (`.h` is omitted
|
||
// intentionally, it's also valid C and disambiguating without a
|
||
// build system is brittle).
|
||
Some("cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++") => {
|
||
Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp"))
|
||
}
|
||
Some("java") => Some((Language::from(tree_sitter_java::LANGUAGE), "java")),
|
||
Some("go") => Some((Language::from(tree_sitter_go::LANGUAGE), "go")),
|
||
Some("php") => Some((Language::from(tree_sitter_php::LANGUAGE_PHP), "php")),
|
||
Some("py") => Some((Language::from(tree_sitter_python::LANGUAGE), "python")),
|
||
Some("ts") => Some((
|
||
Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT),
|
||
"typescript",
|
||
)),
|
||
Some("js") => Some((
|
||
Language::from(tree_sitter_javascript::LANGUAGE),
|
||
"javascript",
|
||
)),
|
||
Some("rb") => Some((Language::from(tree_sitter_ruby::LANGUAGE), "ruby")),
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
/// Table of every language slug the scanner can parse, each paired with the
/// (lowercase) file extensions that map to it.
///
/// This mirrors the extension arms of [`lang_for_path`]. That function does
/// not read this table, so the two must be edited together; the
/// `supported_extensions_resolve_to_their_slug` test asserts they stay in
/// sync.
pub(crate) const SUPPORTED_LANGUAGE_EXTENSIONS: &[(&str, &[&str])] = &[
    ("rust", &["rs"]),
    ("c", &["c"]),
    // `.h` is deliberately absent, matching `lang_for_path`.
    ("cpp", &["cpp", "cc", "cxx", "c++", "hpp", "hxx", "hh", "h++"]),
    ("java", &["java"]),
    ("go", &["go"]),
    ("php", &["php"]),
    ("python", &["py"]),
    ("typescript", &["ts", "tsx"]),
    ("javascript", &["js", "jsx"]),
    ("ruby", &["rb"]),
];
|
||
|
||
/// File extensions associated with a language slug (case-insensitive). Returns
|
||
/// an empty slice if `slug` is not a supported language.
|
||
pub fn extensions_for_lang(slug: &str) -> &'static [&'static str] {
|
||
SUPPORTED_LANGUAGE_EXTENSIONS
|
||
.iter()
|
||
.find(|(s, _)| s.eq_ignore_ascii_case(slug))
|
||
.map(|(_, exts)| *exts)
|
||
.unwrap_or(&[])
|
||
}
|
||
|
||
/// Fast binary-file guard: returns `true` when strictly more than 1% of
/// `bytes` are NUL.
///
/// Empty input is never considered binary. The ratio is tested by
/// cross-multiplication (`nul * 100 > len`) rather than by dividing first:
/// integer division truncates, so `nul * 100 / len > 1` only fires once the
/// NUL share reaches 2%, letting files with e.g. 1.5% NUL bytes through.
fn is_binary(bytes: &[u8]) -> bool {
    let nul_count = bytes.iter().filter(|&&b| b == 0).count();
    // `saturating_mul` keeps the comparison well-defined even on targets
    // where `usize` is narrow and the input is very large.
    nul_count.saturating_mul(100) > bytes.len()
}
|
||
|
||
/// Decide whether a path names a test file.
///
/// Matching is done on the file name (case-sensitively) using per-language
/// naming conventions, plus the `__tests__` directory convention used by
/// JS/TS tooling.
///
/// Broad directory checks (`test/`, `tests/`, `fixtures/`) are deliberately
/// not performed here because they are too broad when scanning absolute
/// paths. Severity downgrade for those directories lives in
/// [`is_nonprod_path`].
pub(crate) fn is_test_file(path: &Path) -> bool {
    // File-name suffixes that mark a test module. Every entry includes the
    // file extension, and the check looks only at the file name, never at
    // class declarations inside the file.
    const TEST_SUFFIXES: &[&str] = &[
        // JS / TS
        ".test.js",
        ".test.ts",
        ".test.jsx",
        ".test.tsx",
        ".test.mjs",
        ".test.cjs",
        ".spec.js",
        ".spec.ts",
        ".spec.jsx",
        ".spec.tsx",
        ".spec.mjs",
        ".spec.cjs",
        // Python (`pytest` and `unittest` conventions)
        "_test.py",
        "_tests.py",
        // Java (JUnit / TestNG)
        "Test.java",
        "Tests.java",
        "IT.java",
        // PHP (PHPUnit)
        "Test.php",
        // Ruby (RSpec / Minitest)
        "_spec.rb",
        "_test.rb",
        // Go
        "_test.go",
        // Rust (uncommon but used by some crates)
        "_test.rs",
        "_tests.rs",
        // C / C++ (varies; cover the common shapes)
        "_test.c",
        "_test.cc",
        "_test.cpp",
        "_test.cxx",
        "_test.h",
        "_test.hpp",
    ];

    // File-name prefixes for conventions that put the marker at the start.
    // The single `test_` entry serves both Python (`pytest`) and C / C++
    // test runners; `PREFIXED_EXTS` restricts it to those languages.
    const TEST_PREFIXES: &[&str] = &["test_"];
    const PREFIXED_EXTS: &[&str] = &[".py", ".c", ".cc", ".cpp", ".cxx"];

    // Exact file names that are always test infrastructure. `conftest.py`
    // is the pytest fixture entry point (a test helper, never prod).
    const TEST_EXACT: &[&str] = &["conftest.py"];

    let name_marks_test = path
        .file_name()
        .and_then(|n| n.to_str())
        .is_some_and(|name| {
            let by_suffix = TEST_SUFFIXES.iter().any(|suffix| name.ends_with(suffix));
            let by_prefix = TEST_PREFIXES.iter().any(|prefix| name.starts_with(prefix))
                && PREFIXED_EXTS.iter().any(|ext| name.ends_with(ext));
            by_suffix || by_prefix || TEST_EXACT.contains(&name)
        });
    if name_marks_test {
        return true;
    }

    // `__tests__` is specific enough (React/Jest convention) to match on
    // directory. Other test directories (`tests/`, `test/`, `spec/`)
    // overlap with production paths in some real codebases (e.g. django
    // apps that ship a `tests` submodule alongside production code under
    // the same package), so the broad directory check stays in
    // [`is_nonprod_path`] for severity downgrade only.
    path.components()
        .any(|part| matches!(part, std::path::Component::Normal(dir) if dir == "__tests__"))
}
|
||
|
||
/// Detect bundled or minified third-party assets that the engine should not
/// analyse. Such files are produced by build tooling or shipped verbatim
/// from upstream packages and cannot be remediated by the codebase author,
/// so findings raised against them are noise.
///
/// Any one of the following is sufficient (all comparisons are ASCII
/// case-insensitive):
/// * The file name ends in `.min.js`, `.min.css`, `.bundle.js`,
///   `.bundled.js`, `.umd.js`, `.umd.min.js`, `.iife.js`, or `.iife.min.js`.
/// * Some path component is `bower_components` (legacy front-end package
///   directory).
/// * Some path component is `vendor` or `vendors` AND the file has a
///   front-end asset extension (`js`, `mjs`, `cjs`, `jsx`, `ts`, `tsx`,
///   `css`, `scss`, `less`). The extension restriction keeps Go module
///   vendoring (`vendor/<pkg>/*.go`) from being suppressed.
///
/// The check is conservative: files are skipped only when the evidence is
/// unambiguous. Hand-authored vendored plugins that lack a `.min` suffix and
/// live outside `vendor/` (e.g. `webapp/.../scripts/jquery-ui-plugin.js`)
/// are still parsed; their findings flow through `is_nonprod_path` for
/// severity downgrade instead.
pub(crate) fn is_vendored_asset_path(path: &Path) -> bool {
    const BUNDLE_SUFFIXES: &[&str] = &[
        ".min.js",
        ".min.css",
        ".bundle.js",
        ".bundled.js",
        ".umd.js",
        ".umd.min.js",
        ".iife.js",
        ".iife.min.js",
    ];
    const FRONT_END_EXTS: &[&str] = &[
        "js", "mjs", "cjs", "jsx", "ts", "tsx", "css", "scss", "less",
    ];

    // Trigger 1: bundler / minifier file-name suffix.
    let bundled_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .map(str::to_ascii_lowercase)
        .is_some_and(|lower| BUNDLE_SUFFIXES.iter().any(|s| lower.ends_with(s)));
    if bundled_name {
        return true;
    }

    // Triggers 2 and 3 share one walk over the path components. Components
    // that are not valid UTF-8 are ignored.
    let mut under_vendor = false;
    for part in path.components() {
        let std::path::Component::Normal(os_name) = part else {
            continue;
        };
        let Some(dir) = os_name.to_str() else {
            continue;
        };
        if dir.eq_ignore_ascii_case("bower_components") {
            return true;
        }
        under_vendor |=
            dir.eq_ignore_ascii_case("vendor") || dir.eq_ignore_ascii_case("vendors");
    }

    under_vendor
        && path
            .extension()
            .and_then(|e| e.to_str())
            .is_some_and(|ext| FRONT_END_EXTS.iter().any(|known| ext.eq_ignore_ascii_case(known)))
}
|
||
|
||
/// Report whether a pattern ID belongs to a class of finding that is
/// noise-prone in test files (fixture credentials, non-crypto randomness,
/// weak hashes, plain HTTP in test harnesses).
///
/// IDs are matched by suffix so that one entry covers every per-language
/// prefix (`js.`, `ts.`, `go.`, `php.`, `py.`, `rb.`, `java.`). Each entry
/// is informational at best inside a test module: hardcoded test API
/// tokens, weak hashes used for fast deterministic test data, insecure RNG
/// used for fixture seeding.
fn is_test_suppressible_pattern(id: &str) -> bool {
    const SUPPRESSIBLE_SUFFIXES: &[&str] = &[
        ".secrets.hardcoded_secret",
        ".secrets.hardcoded_key",
        ".crypto.hardcoded_key",
        ".crypto.math_random",
        ".crypto.insecure_random",
        ".crypto.weak_digest",
        ".crypto.weak_algorithm",
        ".crypto.md5",
        ".crypto.sha1",
        ".crypto.rand",
        ".transport.fetch_http",
    ];
    SUPPRESSIBLE_SUFFIXES
        .iter()
        .any(|suffix| id.ends_with(suffix))
}
|
||
|
||
/// Decide whether a path lies in a non-production context (tests, vendor,
/// benchmarks, examples, build scripts, docs, ...).
///
/// Used to downgrade severity for findings in paths that are unlikely to
/// represent attack surface. A path qualifies when its file name is
/// `build.rs` or ends in `.min.js`, or when any path component is exactly
/// one of the `NONPROD_DIRS` names. All comparisons are case-sensitive.
fn is_nonprod_path(path: &Path) -> bool {
    const NONPROD_DIRS: &[&str] = &[
        "tests",
        "test",
        "__tests__",
        "benches",
        "benchmarks",
        "examples",
        "build",
        "scripts",
        "docs",
        "js_tests",
        "fixtures",
        "vendor",
    ];
    const NONPROD_FILES: &[&str] = &["build.rs"];

    let nonprod_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .is_some_and(|name| NONPROD_FILES.contains(&name) || name.ends_with(".min.js"));

    // Every normal component is checked, including the final one, so a file
    // literally named e.g. `scripts` also qualifies.
    nonprod_name
        || path.components().any(|part| match part {
            std::path::Component::Normal(os_name) => os_name
                .to_str()
                .is_some_and(|dir| NONPROD_DIRS.contains(&dir)),
            _ => false,
        })
}
|
||
|
||
/// Normalize a callee description for display.
///
/// Thin wrapper that forwards `s` to `crate::fmt::normalize_snippet` and
/// returns its result unchanged; the exact normalisation rules are defined
/// by that helper.
fn sanitize_desc(s: &str) -> String {
    crate::fmt::normalize_snippet(s)
}
|
||
|
||
/// Human-readable label for a `SourceKind`.
|
||
fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str {
|
||
use crate::labels::SourceKind;
|
||
match sk {
|
||
SourceKind::UserInput => "user input",
|
||
SourceKind::Cookie => "cookie value",
|
||
SourceKind::Header => "request header",
|
||
SourceKind::EnvironmentConfig => "environment config",
|
||
SourceKind::FileSystem => "file system data",
|
||
SourceKind::Database => "database result",
|
||
SourceKind::CaughtException => "caught exception",
|
||
SourceKind::Unknown => "tainted data",
|
||
}
|
||
}
|
||
|
||
/// Downgrade severity by one tier: High→Medium, Medium→Low, Low→Low.
|
||
fn downgrade_severity(s: Severity) -> Severity {
|
||
match s {
|
||
Severity::High => Severity::Medium,
|
||
Severity::Medium => Severity::Low,
|
||
Severity::Low => Severity::Low,
|
||
}
|
||
}
|
||
|
||
// ParsedSource + ParsedFile: shared parse/CFG pipeline
|
||
|
||
/// Level 1: parsed tree + lang info. No CFG construction.
///
/// Borrows the source bytes and path from the caller for `'a`; built by
/// `ParsedSource::try_new`.
struct ParsedSource<'a> {
    /// Tree-sitter syntax tree produced by parsing `bytes`.
    tree: tree_sitter::Tree,
    /// Tree-sitter grammar the file was parsed with.
    ts_lang: Language,
    /// Language slug paired with `ts_lang` (e.g. `"rust"`, `"typescript"`).
    lang_slug: &'static str,
    /// Raw source bytes the tree was parsed from.
    bytes: &'a [u8],
    /// Path of the source file.
    path: &'a Path,
    /// `path` rendered via `to_string_lossy`, computed once at construction.
    file_path_str: Cow<'a, str>,
}
|
||
|
||
impl<'a> ParsedSource<'a> {
|
||
/// Parse bytes into a tree-sitter AST. Returns `None` for binary files,
|
||
/// parse timeouts, or unsupported languages. File-size filtering is
|
||
/// handled at the walker boundary via
|
||
/// [`ScannerConfig::max_file_size_mb`]; the timeout check here defends
|
||
/// against hostile inputs (pathological grammar ambiguities) that could
|
||
/// tie up a worker indefinitely even for files within the size cap.
|
||
fn try_new(bytes: &'a [u8], path: &'a Path) -> NyxResult<Option<Self>> {
|
||
// Clear any stale parse-timeout signal from a prior `try_new` on
|
||
// this thread that the caller did not consume. Ensures the slot
|
||
// always reflects "this parse" by the time we return.
|
||
LAST_PARSE_TIMEOUT_MS.with(|c| c.set(None));
|
||
if is_vendored_asset_path(path) {
|
||
return Ok(None);
|
||
}
|
||
if is_binary(bytes) {
|
||
return Ok(None);
|
||
}
|
||
let Some((ts_lang, lang_slug)) = lang_for_path(path) else {
|
||
return Ok(None);
|
||
};
|
||
let timeout_ms = parse_timeout_ms();
|
||
let start = Instant::now();
|
||
let mut timed_out = false;
|
||
let parsed = PARSER.with(|cell| -> NyxResult<Option<tree_sitter::Tree>> {
|
||
let mut parser = cell.borrow_mut();
|
||
parser.set_language(&ts_lang)?;
|
||
if timeout_ms == 0 {
|
||
return Ok(parser.parse(bytes, None));
|
||
}
|
||
let len = bytes.len();
|
||
let mut input = |i: usize, _pt: tree_sitter::Point| -> &[u8] {
|
||
if i < len { &bytes[i..] } else { &[] }
|
||
};
|
||
let mut progress = |_state: &tree_sitter::ParseState| -> ControlFlow<()> {
|
||
if start.elapsed().as_millis() as u64 >= timeout_ms {
|
||
timed_out = true;
|
||
ControlFlow::Break(())
|
||
} else {
|
||
ControlFlow::Continue(())
|
||
}
|
||
};
|
||
let options = tree_sitter::ParseOptions::new().progress_callback(&mut progress);
|
||
Ok(parser.parse_with_options(&mut input, None, Some(options)))
|
||
})?;
|
||
let Some(tree) = parsed else {
|
||
if timed_out {
|
||
tracing::warn!(
|
||
file = %path.display(),
|
||
timeout_ms,
|
||
"tree-sitter parse timed out; skipping file",
|
||
);
|
||
LAST_PARSE_TIMEOUT_MS.with(|c| c.set(Some(timeout_ms)));
|
||
return Ok(None);
|
||
}
|
||
return Err(NyxError::Other("tree-sitter failed".into()));
|
||
};
|
||
let file_path_str = path.to_string_lossy();
|
||
Ok(Some(Self {
|
||
tree,
|
||
ts_lang,
|
||
lang_slug,
|
||
bytes,
|
||
path,
|
||
file_path_str,
|
||
}))
|
||
}
|
||
|
||
/// Run AST pattern queries and return diagnostics.
|
||
fn run_ast_queries(&self, cfg: &Config) -> Vec<Diag> {
|
||
let root = self.tree.root_node();
|
||
let compiled = query_cache::for_lang(self.lang_slug, self.ts_lang.clone());
|
||
let mut cursor = QueryCursor::new();
|
||
let mut out = Vec::new();
|
||
let in_test_file = is_test_file(self.path);
|
||
|
||
for cq in compiled.iter() {
|
||
if cq.meta.severity > cfg.scanner.min_severity {
|
||
continue;
|
||
}
|
||
// Suppress noise-prone patterns in test files
|
||
if in_test_file && is_test_suppressible_pattern(cq.meta.id) {
|
||
continue;
|
||
}
|
||
let mut matches = cursor.matches(&cq.query, root, self.bytes);
|
||
while let Some(m) = matches.next() {
|
||
if let Some(cap) = m.captures.iter().find(|c| c.index == 0) {
|
||
// Layer A: suppress Security findings on calls with all-literal args.
|
||
//
|
||
// Carve-outs for categories where the literal argument IS
|
||
// the bug (algorithm choice, hardcoded secret, insecure
|
||
// protocol scheme, unsafe config flag): suppression would
|
||
// silence the actual signal. Hash algorithms picked from
|
||
// string literals (`MessageDigest.getInstance("MD5")`,
|
||
// `hashlib.md5(b"…")`) are weak regardless of caller-side
|
||
// data flow.
|
||
if cq.meta.category.finding_category() == FindingCategory::Security
|
||
&& !matches!(
|
||
cq.meta.category,
|
||
PatternCategory::Crypto
|
||
| PatternCategory::Secrets
|
||
| PatternCategory::InsecureConfig
|
||
| PatternCategory::InsecureTransport
|
||
)
|
||
&& is_call_all_args_literal(cap.node, self.bytes, self.lang_slug)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer B: PHP `include $var` where $var is a formal parameter
|
||
// of the immediately enclosing function/method/closure and is
|
||
// not reassigned before the include. This is the canonical
|
||
// PHP autoloader / scope-isolated-include shape (composer's
|
||
// ClassLoader, PSR-4 loaders, route-file loaders); the
|
||
// pattern rule is heuristic without taint and over-fires
|
||
// here. A taint-aware sink check (the engine's
|
||
// taint-unsanitised-flow rule) still catches the case where
|
||
// a tainted value reaches the parameter at the call site.
|
||
if cq.meta.id == "php.path.include_variable"
|
||
&& self.lang_slug == "php"
|
||
&& is_php_include_param_passthrough(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C: PHP `unserialize($x, ['allowed_classes' => [...]])`
|
||
// or `unserialize($x, ['allowed_classes' => false])` ,
|
||
// PHP 7+ structural mitigation against object injection.
|
||
// When the call passes an `allowed_classes` option set to
|
||
// either `false` (no class instantiation) or an array
|
||
// literal of explicit class names, the deserialised data
|
||
// cannot construct arbitrary user classes. Skip
|
||
// `allowed_classes => true` (the unsafe default) and
|
||
// dynamic / variable values (let those fire).
|
||
if cq.meta.id == "php.deser.unserialize"
|
||
&& self.lang_slug == "php"
|
||
&& is_php_unserialize_allowed_classes_restricted(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C2: PHP `Serializable::unserialize($input)` magic
|
||
// method body — `public function unserialize($x) { ...
|
||
// unserialize($x) ... }`. This is the legacy
|
||
// `Serializable` interface contract (deprecated since PHP
|
||
// 8.1). PHP itself invokes the method when restoring an
|
||
// instance, so the body's `\unserialize($x)` call cannot
|
||
// be removed without breaking the interface. The
|
||
// actionable signal is at the class level (the class
|
||
// implements Serializable — fix is to migrate to
|
||
// `__serialize` / `__unserialize`), not at this call
|
||
// site. Genuine deserialization sinks (free-function
|
||
// `unserialize($_GET[..])`, helpers reading from session
|
||
// / cache, etc.) keep firing because they are not inside
|
||
// a method declaration named `unserialize` with a single
|
||
// formal parameter passed straight to the call.
|
||
if cq.meta.id == "php.deser.unserialize"
|
||
&& self.lang_slug == "php"
|
||
&& is_php_unserialize_magic_method_passthrough(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C3: PHP `unserialize($x)` inside a PHPUnit
|
||
// assertion of the form
|
||
// `$this->assertSame(LITERAL, unserialize($x))`
|
||
// (or `assertEquals` / `assertNull` / static / self
|
||
// / parent dispatch variants). The literal expected
|
||
// value bounds the unserialize result so the
|
||
// call-site cannot release attacker-controlled
|
||
// object graphs into the test process — failed
|
||
// assertions abort the test rather than leak side
|
||
// effects. Drupal / Joomla / Nextcloud each carry
|
||
// tens of these `Serializable` round-trip
|
||
// assertions in their test trees and every firing
|
||
// is noise.
|
||
if cq.meta.id == "php.deser.unserialize"
|
||
&& self.lang_slug == "php"
|
||
&& is_php_unserialize_inside_phpunit_assertion(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C4: Python `pickle.loads` / `yaml.load` /
|
||
// `shelve.open` / kindred deserialization sinks
|
||
// wrapped in a `unittest.TestCase` assertion whose
|
||
// other argument is a literal expected value (or
|
||
// whose verb itself constrains the result, e.g.
|
||
// `assertIsNone(pickle.loads(blob))`). The
|
||
// assertion bounds the deser result so attacker-
|
||
// controlled blobs would fail loudly rather than
|
||
// leak side effects out of the test boundary.
|
||
// Mirrors the PHP Layer C3 recogniser; deferred
|
||
// note in `project_realrepo_*.md` flagged the same
|
||
// FP shape on Python test trees.
|
||
if matches!(
|
||
cq.meta.id,
|
||
"py.deser.pickle_loads" | "py.deser.yaml_load" | "py.deser.shelve_open"
|
||
) && self.lang_slug == "python"
|
||
&& is_python_deser_inside_unittest_assertion(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C5: Ruby `Marshal.load` / `YAML.load` /
|
||
// `Psych.load` wrapped in a Minitest assertion
|
||
// (`assert_equal LIT, deser`, `assert_nil deser`,
|
||
// `assert deser`, `refute_equal LIT, deser`, ...) or
|
||
// an RSpec matcher chain (`expect(deser).to eq(LIT)`,
|
||
// `expect(deser).to be_nil`, `be_a(TYPE)`, ...).
|
||
// Same bounding semantics as the PHP / Python paths:
|
||
// a poisoned blob fails the assertion loudly rather
|
||
// than leak object-injection side effects out of
|
||
// the test boundary.
|
||
if matches!(cq.meta.id, "rb.deser.marshal_load" | "rb.deser.yaml_load")
|
||
&& self.lang_slug == "ruby"
|
||
&& is_ruby_deser_inside_test_assertion(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer D: C/C++ buffer-overflow pattern rules
|
||
// (`{c,cpp}.memory.strcpy`, `strcat`, `sprintf`) fire
|
||
// syntactically on every call regardless of argument
|
||
// bounds. The pattern's stated danger ("no bounds
|
||
// checking on destination buffer" / "no length limit on
|
||
// output buffer") is only realisable when the source /
|
||
// format-string contributes attacker-controlled length.
|
||
// When the source argument is a string literal (or a
|
||
// ternary of two string literals), the contributed length
|
||
// is statically bounded, there is no overflow vector
|
||
// for an attacker even if the destination buffer is
|
||
// mis-sized. Same principle for `sprintf` when the
|
||
// format string is a literal containing no bare `%s`
|
||
// (only width-bounded numeric / char specifiers, or
|
||
// precision-bounded `%.<N>s` / `%.*s`).
|
||
if (self.lang_slug == "c" || self.lang_slug == "cpp")
|
||
&& is_c_buffer_call_literal_safe(cq.meta.id, cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer E: C++ `reinterpret_cast<T>(x)` when T is a
|
||
// type explicitly defined as safe by the C++ aliasing
|
||
// rules — byte-pointer family (`char*`, `unsigned
|
||
// char*`, `uint8_t*`, `std::byte*`, etc., per
|
||
// [basic.lval]/11), `void*`, the integer round-trip
|
||
// types `uintptr_t` / `intptr_t`, and the BSD-socket
|
||
// `sockaddr` family (POSIX intentionally type-puns
|
||
// `sockaddr*` <-> `sockaddr_in*` etc.). A pattern
|
||
// rule cannot tell these from genuinely dangerous
|
||
// strict-aliasing UB casts, so it over-fires
|
||
// dramatically on serialization, hashing, and
|
||
// socket-API code where the cast is the canonical
|
||
// (and standard-blessed) idiom.
|
||
if self.lang_slug == "cpp"
|
||
&& is_cpp_cast_target_type_safe(cq.meta.id, cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer F: PHP `md5()` / `sha1()` flagged as weak hash
|
||
// functions, but used in a non-cryptographic context
|
||
// (ETag generation, cache-key / array-index hashing,
|
||
// identifier fingerprinting, deduplication). The
|
||
// pattern rule cannot distinguish weak-hash crypto
|
||
// misuse from these idiomatic uses, so it over-fires
|
||
// on every `md5(...)` callsite regardless of the
|
||
// surrounding consuming context. Suppress when the
|
||
// call's *consuming context* yields a name that
|
||
// matches a recognised non-cryptographic identifier
|
||
// pattern (variable / field / array-key / method
|
||
// suffix). Genuine weak-hash crypto misuse —
|
||
// `$password_hash = md5(...)`, `$signature = md5(...)`,
|
||
// `$tokenHash = md5(...)` — keeps firing because the
|
||
// name contains an excluded crypto-keyword substring.
|
||
if (cq.meta.id == "php.crypto.md5" || cq.meta.id == "php.crypto.sha1")
|
||
&& self.lang_slug == "php"
|
||
&& is_php_weak_hash_non_crypto_use(cap.node, self.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
let point = cap.node.start_position();
|
||
out.push(Diag {
|
||
path: self.path.to_string_lossy().into_owned(),
|
||
line: point.row + 1,
|
||
col: point.column + 1,
|
||
severity: cq.meta.severity,
|
||
id: cq.meta.id.to_owned(),
|
||
category: cq.meta.category.finding_category(),
|
||
path_validated: false,
|
||
guard_kind: None,
|
||
message: Some(cq.meta.description.to_owned()),
|
||
labels: vec![],
|
||
confidence: Some(cq.meta.confidence),
|
||
evidence: Some(Evidence {
|
||
source: None,
|
||
sink: Some(SpanEvidence {
|
||
path: self.path.to_string_lossy().into_owned(),
|
||
line: (point.row + 1) as u32,
|
||
col: (point.column + 1) as u32,
|
||
kind: "sink".into(),
|
||
snippet: None,
|
||
}),
|
||
guards: vec![],
|
||
sanitizers: vec![],
|
||
state: None,
|
||
notes: vec![],
|
||
..Default::default()
|
||
}),
|
||
rank_score: None,
|
||
rank_reason: None,
|
||
suppressed: false,
|
||
suppression: None,
|
||
rollup: None,
|
||
finding_id: String::new(),
|
||
alternative_finding_ids: Vec::new(),
|
||
stable_hash: 0,
|
||
});
|
||
}
|
||
}
|
||
}
|
||
out
|
||
}
|
||
|
||
/// Sort, dedup, and optionally downgrade severity for non-production paths.
|
||
///
|
||
/// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id,
|
||
/// line, col)`, severity is NOT part of the key. Two diags that agree
|
||
/// on (line, col, id) but differ in severity (e.g. a pattern-rule finding
|
||
/// plus a taint-pipeline finding on the same call) would otherwise survive
|
||
/// dedup here and crash the indexer with a UNIQUE constraint violation.
|
||
/// Sorting severity ascending (Severity::High < Medium < Low) means
|
||
/// `dedup_by` keeps the first occurrence, preserving the highest severity.
|
||
fn finalize_diags(&self, out: &mut Vec<Diag>, cfg: &Config) {
|
||
out.sort_by(|a, b| {
|
||
(a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity))
|
||
});
|
||
out.dedup_by(|a, b| a.line == b.line && a.col == b.col && a.id == b.id);
|
||
|
||
if !cfg.scanner.include_nonprod && is_nonprod_path(self.path) {
|
||
for d in out.iter_mut() {
|
||
d.severity = downgrade_severity(d.severity);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Level 2: adds CFG graph, summaries, lang rules on top of ParsedSource.
///
/// Built by `ParsedFile::from_source`.
struct ParsedFile<'a> {
    /// The level-1 parse result (tree, bytes, path, language) this file was
    /// built from.
    source: ParsedSource<'a>,
    /// Output of `build_cfg` for this file.
    file_cfg: FileCfg,
    /// Language rule set from `build_lang_rules`, possibly augmented with
    /// rules for frameworks detected from in-file imports.
    lang_rules: LangAnalysisRules,
    /// `true` when `lang_rules` has any extra labels, terminators, or event
    /// handlers; gates whether `rules_ref` hands the rules out.
    has_lang_rules: bool,
    /// Per-body SSA + const-prop + type-fact cache, lazily populated on first
    /// request and indexed by `BodyId.0`. Was being recomputed 2-3× per body
    /// across `run_cfg_analyses_with_lowered` (cfg analyses + state analyses)
    /// and `run_auth_analyses` (`collect_file_var_types`); on the gin profile
    /// `build_body_const_facts` accounted for 13.6% of wall-clock and a
    /// single-pass cache collapses that to ~4.5%.
    body_const_facts_cache: OnceCell<Vec<Option<cfg_analysis::BodyConstFacts>>>,
}
|
||
|
||
impl<'a> ParsedFile<'a> {
|
||
/// Build CFG + lang rules from a parsed source.
|
||
fn from_source(source: ParsedSource<'a>, cfg: &Config) -> Self {
|
||
let mut lang_rules = build_lang_rules(cfg, source.lang_slug);
|
||
// Single-file scans rarely have a nearby package.json, so the
|
||
// project-level `FrameworkContext` misses frameworks the file
|
||
// obviously imports. Augment the per-file rule set with any
|
||
// framework-conditional rules keyed off in-file import specifiers
|
||
// (e.g. `import fastify from 'fastify'`). Idempotent, skips
|
||
// frameworks already active from the manifest pass.
|
||
let in_file_fws =
|
||
crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug);
|
||
let missing: Vec<_> = in_file_fws
|
||
.into_iter()
|
||
.filter(|fw| !lang_rules.frameworks.contains(fw))
|
||
.collect();
|
||
if !missing.is_empty() {
|
||
let aug_ctx = crate::utils::project::FrameworkContext {
|
||
frameworks: missing.clone(),
|
||
inspected_langs: std::collections::HashSet::new(),
|
||
};
|
||
lang_rules
|
||
.extra_labels
|
||
.extend(crate::labels::framework_rules_for_lang_pub(
|
||
source.lang_slug,
|
||
&aug_ctx,
|
||
));
|
||
lang_rules.frameworks.extend(missing);
|
||
}
|
||
let has_lang_rules = !lang_rules.extra_labels.is_empty()
|
||
|| !lang_rules.terminators.is_empty()
|
||
|| !lang_rules.event_handlers.is_empty();
|
||
let rules_ref = if has_lang_rules {
|
||
Some(&lang_rules)
|
||
} else {
|
||
None
|
||
};
|
||
let mut file_cfg = build_cfg(
|
||
&source.tree,
|
||
source.bytes,
|
||
source.lang_slug,
|
||
&source.file_path_str,
|
||
rules_ref,
|
||
);
|
||
|
||
// Phase 04: when the scan paths produced a project ModuleGraph,
|
||
// resolve this file's imports against it and stash both on the
|
||
// FileCfg (for local consumers) and on the global per-file
|
||
// ImportTable (for cross-file lookups in phases 05/09/10). The
|
||
// wiring is no-op for non-JS/TS files and for direct callers of
|
||
// `analyse_file_fused` that pass a `Config` without a resolver
|
||
// (e.g. unit tests).
|
||
if let Some(graph) = cfg.module_graph.as_deref() {
|
||
let bindings = crate::resolve::extract_resolved_imports(
|
||
&source.tree,
|
||
source.bytes,
|
||
source.path,
|
||
graph,
|
||
source.lang_slug,
|
||
);
|
||
if !bindings.is_empty() {
|
||
graph.record_imports_for_file(source.path.to_path_buf(), bindings.clone());
|
||
file_cfg.resolved_imports = bindings;
|
||
}
|
||
}
|
||
|
||
Self {
|
||
source,
|
||
file_cfg,
|
||
lang_rules,
|
||
has_lang_rules,
|
||
body_const_facts_cache: OnceCell::new(),
|
||
}
|
||
}
|
||
|
||
/// Per-body const-fact cache, computed once on first request and shared
|
||
/// across every per-body iteration in this file's analysis. Indexed by
|
||
/// `BodyId.0` so callers can look up by body identity.
|
||
fn body_const_facts_all(&self) -> &[Option<cfg_analysis::BodyConstFacts>] {
|
||
self.body_const_facts_cache.get_or_init(|| {
|
||
let lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
|
||
self.file_cfg
|
||
.bodies
|
||
.iter()
|
||
.map(|b| cfg_analysis::build_body_const_facts(b, lang))
|
||
.collect()
|
||
})
|
||
}
|
||
|
||
/// Look up the cached const facts for a specific body.
|
||
fn body_const_facts(
|
||
&self,
|
||
body: &crate::cfg::BodyCfg,
|
||
) -> Option<&cfg_analysis::BodyConstFacts> {
|
||
let all = self.body_const_facts_all();
|
||
all.get(body.meta.id.0 as usize).and_then(|f| f.as_ref())
|
||
}
|
||
|
||
/// The top-level body's CFG graph (for backward-compatible access).
///
/// Borrows the `graph` field of `self.file_cfg.toplevel()`.
fn cfg_graph(&self) -> &Cfg {
    &self.file_cfg.toplevel().graph
}
|
||
|
||
/// The top-level body's entry node.
///
/// Returns the `entry` field of `self.file_cfg.toplevel()`. Marked
/// `dead_code`-allowed because nothing visible here calls it.
#[allow(dead_code)]
fn entry(&self) -> NodeIndex {
    self.file_cfg.toplevel().entry
}
|
||
|
||
/// The function summaries stored on this file's CFG
/// (`self.file_cfg.summaries`).
fn local_summaries(&self) -> &FuncSummaries {
    &self.file_cfg.summaries
}
|
||
|
||
/// The language rules for this file, or `None` when the rule set was empty
/// at construction time (`has_lang_rules` is `false`).
fn rules_ref(&self) -> Option<&LangAnalysisRules> {
    self.has_lang_rules.then_some(&self.lang_rules)
}
|
||
|
||
/// Export this file's function summaries without a scan root.
///
/// Shorthand for `export_summaries_with_root(None)`.
fn export_summaries(&self) -> Vec<FuncSummary> {
    self.export_summaries_with_root(None)
}
|
||
|
||
fn export_summaries_with_root(&self, scan_root: Option<&Path>) -> Vec<FuncSummary> {
|
||
let mut out = export_summaries(
|
||
self.local_summaries(),
|
||
&self.source.file_path_str,
|
||
self.source.lang_slug,
|
||
);
|
||
|
||
// every
|
||
// `FuncSummary` exported from this file carries a copy of the
|
||
// file's `hierarchy_edges` so the inheritance / impl /
|
||
// implements relationships persist through SQLite round-trips
|
||
// and re-merge into `crate::callgraph::TypeHierarchyIndex` at
|
||
// call-graph build time. Cheap (one clone per summary) and
|
||
// strictly additive, `merge_summaries` deduplicates downstream.
|
||
if !self.file_cfg.hierarchy_edges.is_empty() {
|
||
let edges = self.file_cfg.hierarchy_edges.clone();
|
||
for s in &mut out {
|
||
s.hierarchy_edges = edges.clone();
|
||
}
|
||
}
|
||
|
||
// Phase 10 — annotate entry-point summaries. Match each
|
||
// summary's body span (looked up via `FuncSummaries` keyed on
|
||
// `FuncKey`) against the per-file `entry_kinds` table so the
|
||
// tag survives SQLite round-trips and cross-file consumption.
|
||
if !self.file_cfg.entry_kinds.is_empty() {
|
||
// Build a (name, container, disambig) → span lookup from
|
||
// the file's bodies so we can associate each exported
|
||
// FuncSummary with its body span.
|
||
let mut by_identity: std::collections::HashMap<
|
||
(String, String, Option<u32>),
|
||
(usize, usize),
|
||
> = std::collections::HashMap::new();
|
||
for body in self.file_cfg.function_bodies() {
|
||
if let Some(key) = &body.meta.func_key {
|
||
by_identity.insert(
|
||
(key.name.clone(), key.container.clone(), key.disambig),
|
||
body.meta.span,
|
||
);
|
||
}
|
||
}
|
||
for s in &mut out {
|
||
let id = (s.name.clone(), s.container.clone(), s.disambig);
|
||
if let Some(span) = by_identity.get(&id) {
|
||
s.entry_kind = self.file_cfg.entry_kinds.get(span).cloned();
|
||
}
|
||
}
|
||
}
|
||
|
||
// Rust-specific enrichment: derive the crate-relative module path for
|
||
// this file and parse every top-level `use` declaration into an alias
|
||
// map. The information lets the call graph resolve same-name functions
|
||
// across modules and is cheap enough to compute once per file and
|
||
// duplicate across the file's summaries. Non-Rust files skip all of
|
||
// this and keep the new fields at `None`.
|
||
if self.source.lang_slug == "rust" && !out.is_empty() {
|
||
let module_path = crate::rust_resolve::derive_module_path(self.source.path, scan_root);
|
||
let use_map =
|
||
crate::rust_resolve::parse_rust_use_map(self.source.bytes, &self.source.tree);
|
||
|
||
let aliases = if use_map.aliases.is_empty() {
|
||
None
|
||
} else {
|
||
Some(use_map.aliases)
|
||
};
|
||
let wildcards = if use_map.wildcards.is_empty() {
|
||
None
|
||
} else {
|
||
Some(use_map.wildcards)
|
||
};
|
||
|
||
for s in &mut out {
|
||
s.module_path = module_path.clone();
|
||
s.rust_use_map = aliases.clone();
|
||
s.rust_wildcards = wildcards.clone();
|
||
}
|
||
}
|
||
|
||
out
|
||
}
|
||
|
||
/// Extract SSA function summaries for all functions in this file.
|
||
/// Extract SSA summaries and eligible callee bodies in a single lowering pass.
|
||
///
|
||
/// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`].
|
||
/// The `FuncKey` identity preserves `(lang, namespace, container, name,
|
||
/// arity, disambig, kind)`, so two same-name definitions in this file
|
||
/// (e.g. a free `process` and a `Worker::process`, or overloads with
|
||
/// different arities) land on distinct entries instead of the later one
|
||
/// shadowing the earlier one.
|
||
fn extract_ssa_artifacts(
|
||
&self,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
module_graph: Option<&crate::resolve::ModuleGraph>,
|
||
) -> (
|
||
Vec<(crate::symbol::FuncKey, SsaFuncSummary)>,
|
||
Vec<(
|
||
crate::symbol::FuncKey,
|
||
crate::taint::ssa_transfer::CalleeSsaBody,
|
||
)>,
|
||
) {
|
||
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
|
||
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
|
||
let namespace = crate::symbol::namespace_with_package(
|
||
&self.source.file_path_str,
|
||
scan_root_str.as_deref(),
|
||
module_graph,
|
||
);
|
||
|
||
// Use the FileCfg path (same one `analyse_file` uses at taint time) so
|
||
// the SSA summaries stored cross-file match exactly what pass 2 will
|
||
// resolve against, no NodeIndex-space or entry-detection drift.
|
||
let locator = crate::summary::SinkSiteLocator {
|
||
tree: &self.source.tree,
|
||
bytes: self.source.bytes,
|
||
file_rel: &namespace,
|
||
};
|
||
let (summaries, bodies) = crate::taint::extract_ssa_artifacts_from_file_cfg(
|
||
&self.file_cfg,
|
||
caller_lang,
|
||
&namespace,
|
||
self.local_summaries(),
|
||
global_summaries,
|
||
Some(&locator),
|
||
scan_root_str.as_deref(),
|
||
module_graph,
|
||
);
|
||
|
||
(summaries.into_iter().collect(), bodies)
|
||
}
|
||
|
||
/// Lower every function body in this file to SSA exactly once. Used by
|
||
/// [`analyse_file_fused`] to share the result between the taint engine
|
||
/// ([`run_cfg_analyses_with_lowered`]) and the SSA artifact filter
|
||
/// ([`build_eligible_bodies_from_lowered`]), the prior code path lowered
|
||
/// twice (once inside `analyse_file`, once inside
|
||
/// `extract_ssa_artifacts_from_file_cfg`) and accounted for ~24% of the
|
||
/// pass-2 wall-clock on the bench corpus.
|
||
///
|
||
/// # Locator policy
|
||
///
|
||
/// Attaches a [`crate::summary::SinkSiteLocator`] so intra-file
|
||
/// summaries record concrete sink coordinates and a `from_chain` flag
|
||
/// distinguishing chain-hop markers from this body's own locator span.
|
||
/// Pass-2 emission then gates promotion into `Finding.primary_location`
|
||
/// on `from_chain || file_rel != caller_namespace`, see
|
||
/// [`crate::taint::ssa_transfer::should_promote_sink_site`].
|
||
///
|
||
/// Same-file single-hop helpers continue to surface the flow finding
|
||
/// at the call site (their site is `from_chain=false` and lives in the
|
||
/// caller's namespace, gate fails). Multi-hop chains promote because
|
||
/// `summary_extract` flips `from_chain=true` on every site that came
|
||
/// via `event.primary_sink_site`, the callee already pierced through
|
||
/// at least one summary boundary to record the deepest coordinates.
|
||
/// Cross-file callees promote because `file_rel` differs. This
|
||
/// preserves the closure-capture / lambda / helper-with-internal-sink
|
||
/// fixture shape (two findings: deep + call-site) while gaining
|
||
/// deep-line attribution on multi-hop chains that have no per-frame
|
||
/// intermediate finding to dedup with. See "Multi-hop intra-file
|
||
/// sink attribution gap" in deferred.md for the design tradeoff.
|
||
fn lower_ssa_for_fused(
|
||
&self,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
module_graph: Option<&crate::resolve::ModuleGraph>,
|
||
) -> (
|
||
std::collections::HashMap<
|
||
crate::symbol::FuncKey,
|
||
crate::summary::ssa_summary::SsaFuncSummary,
|
||
>,
|
||
std::collections::HashMap<
|
||
crate::symbol::FuncKey,
|
||
crate::taint::ssa_transfer::CalleeSsaBody,
|
||
>,
|
||
) {
|
||
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
|
||
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
|
||
let namespace = crate::symbol::namespace_with_package(
|
||
&self.source.file_path_str,
|
||
scan_root_str.as_deref(),
|
||
module_graph,
|
||
);
|
||
let locator = crate::summary::SinkSiteLocator {
|
||
tree: &self.source.tree,
|
||
bytes: self.source.bytes,
|
||
file_rel: &namespace,
|
||
};
|
||
crate::taint::lower_all_functions_from_bodies(
|
||
&self.file_cfg,
|
||
caller_lang,
|
||
&namespace,
|
||
self.local_summaries(),
|
||
global_summaries,
|
||
Some(&locator),
|
||
scan_root_str.as_deref(),
|
||
module_graph,
|
||
)
|
||
}
|
||
|
||
/// Run taint analysis, CFG structural analyses, and state-model analysis.
|
||
///
|
||
/// Wrapper around [`run_cfg_analyses_with_lowered`] that lowers SSA
|
||
/// internally (the standalone path). Callers that already hold a
|
||
/// pre-lowered result (today: only [`analyse_file_fused`]) should use
|
||
/// the `_with_lowered` variant directly to avoid the duplicate
|
||
/// lowering.
|
||
fn run_cfg_analyses(
|
||
&self,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> Vec<Diag> {
|
||
// Reset before lowering: probes during lowering may publish
|
||
// path-safe-suppressed sink spans that state analysis consumes,
|
||
// and the SSA engine may publish all-validated sink spans that
|
||
// AST-pattern suppression consumes. See the equivalent resets
|
||
// in `analyse_file_fused`.
|
||
crate::taint::ssa_transfer::reset_path_safe_suppressed_spans();
|
||
crate::taint::ssa_transfer::reset_all_validated_spans();
|
||
let (ssa_summaries, callee_bodies) =
|
||
self.lower_ssa_for_fused(global_summaries, scan_root, cfg.module_graph.as_deref());
|
||
self.run_cfg_analyses_with_lowered(
|
||
cfg,
|
||
global_summaries,
|
||
scan_root,
|
||
&ssa_summaries,
|
||
&callee_bodies,
|
||
)
|
||
}
|
||
|
||
/// Like [`run_cfg_analyses`] but takes pre-lowered SSA summaries +
|
||
/// callee bodies and threads them into [`taint::analyse_file_with_lowered`].
|
||
/// Used by [`analyse_file_fused`] to share the lowering with the SSA
|
||
/// artifact extractor.
|
||
#[allow(clippy::too_many_arguments)]
|
||
fn run_cfg_analyses_with_lowered(
|
||
&self,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
ssa_summaries: &std::collections::HashMap<
|
||
crate::symbol::FuncKey,
|
||
crate::summary::ssa_summary::SsaFuncSummary,
|
||
>,
|
||
callee_bodies: &std::collections::HashMap<
|
||
crate::symbol::FuncKey,
|
||
crate::taint::ssa_transfer::CalleeSsaBody,
|
||
>,
|
||
) -> Vec<Diag> {
|
||
let mut out = Vec::new();
|
||
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
|
||
|
||
// ── Taint analysis ──────────────────────────────────────────────
|
||
tracing::debug!("Running taint analysis on: {}", self.source.path.display());
|
||
tracing::debug!("Func summaries: {:?}", self.local_summaries());
|
||
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
|
||
let namespace = crate::symbol::namespace_with_package(
|
||
&self.source.file_path_str,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
);
|
||
let extra = if self.lang_rules.extra_labels.is_empty() {
|
||
None
|
||
} else {
|
||
Some(self.lang_rules.extra_labels.as_slice())
|
||
};
|
||
// Phase-09 cross-package import lookup. Built per-file from the
|
||
// resolver's verdict; consumed by `resolve_callee_full` step 0.7
|
||
// when a flat-name lookup would otherwise miss.
|
||
let cross_package_imports = crate::taint::build_cross_package_func_keys(
|
||
&self.file_cfg.resolved_imports,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
caller_lang,
|
||
);
|
||
let cross_package_imports_ref = if cross_package_imports.is_empty() {
|
||
None
|
||
} else {
|
||
Some(&cross_package_imports)
|
||
};
|
||
let taint_results = crate::taint::analyse_file_with_lowered(
|
||
&self.file_cfg,
|
||
self.local_summaries(),
|
||
global_summaries,
|
||
caller_lang,
|
||
&namespace,
|
||
&[],
|
||
extra,
|
||
ssa_summaries,
|
||
callee_bodies,
|
||
cross_package_imports_ref,
|
||
);
|
||
// Drain the path-safe-suppressed sink-span set published by the
|
||
// SSA taint engine. Used below by the state-analysis pass to
|
||
// suppress `state-unauthed-access` on sinks the taint engine has
|
||
// already proved cannot reach a privileged location.
|
||
let path_safe_suppressed_spans =
|
||
crate::taint::ssa_transfer::take_path_safe_suppressed_spans();
|
||
for finding in &taint_results {
|
||
let body_cfg = &self.file_cfg.body(finding.body_id).graph;
|
||
|
||
// Suppress internal redirect taint findings: res.redirect(`/path/...`)
|
||
// with a path-prefix argument is server-relative, not an open redirect.
|
||
let sink_info = &body_cfg[finding.sink];
|
||
let sink_has_ssrf = sink_info
|
||
.taint
|
||
.labels
|
||
.iter()
|
||
.any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SSRF)));
|
||
if sink_has_ssrf
|
||
&& let Some(ref callee) = sink_info.call.callee
|
||
&& (callee.ends_with("redirect") || callee.ends_with("Redirect"))
|
||
&& crate::cfg_analysis::guards::has_redirect_path_prefix(
|
||
self.source.bytes,
|
||
sink_info.ast.span,
|
||
)
|
||
{
|
||
continue;
|
||
}
|
||
|
||
if let Some(diag) = build_taint_diag(
|
||
finding,
|
||
body_cfg,
|
||
&self.source.tree,
|
||
self.source.path,
|
||
self.source.bytes,
|
||
scan_root,
|
||
) {
|
||
out.push(diag);
|
||
}
|
||
}
|
||
|
||
// ── CFG structural analyses (per body) ─────────────────────────
|
||
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
|
||
// Pre-compute, per body, the set of variable names whose
|
||
// release / close calls live in a NESTED closure body inside
|
||
// that body (e.g. `socket.on("close", () => ws.close())`).
|
||
// Both the structural ResourceMisuse pass and the state-model
|
||
// leak pass consult it to suppress findings whose cleanup is
|
||
// registered as a callback the per-body CFG can't follow.
|
||
// Only descendants count — sibling methods on the same class
|
||
// don't share resource ownership.
|
||
let closure_released_per_body =
|
||
state::collect_closure_released_var_names(&self.file_cfg.bodies, caller_lang);
|
||
let empty_set: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||
for body in &self.file_cfg.bodies {
|
||
let body_taint: Vec<_> = taint_results
|
||
.iter()
|
||
.filter(|f| f.body_id == body.meta.id)
|
||
.cloned()
|
||
.collect();
|
||
let body_const_facts = self.body_const_facts(body);
|
||
let cfg_ctx = cfg_analysis::AnalysisContext {
|
||
cfg: &body.graph,
|
||
entry: body.entry,
|
||
lang: caller_lang,
|
||
source_bytes: self.source.bytes,
|
||
func_summaries: self.local_summaries(),
|
||
global_summaries,
|
||
ssa_summaries: Some(ssa_summaries),
|
||
taint_findings: &body_taint,
|
||
analysis_rules: self.rules_ref(),
|
||
taint_active,
|
||
body_const_facts,
|
||
type_facts: body_const_facts.map(|f| &f.type_facts),
|
||
auth_decorators: &body.meta.auth_decorators,
|
||
closure_released_var_names: Some(
|
||
closure_released_per_body
|
||
.get(&body.meta.id)
|
||
.unwrap_or(&empty_set),
|
||
),
|
||
class_constant_scalars: Some(&self.file_cfg.class_constant_scalars),
|
||
};
|
||
for cf in cfg_analysis::run_all(&cfg_ctx) {
|
||
// Layer C4 mirror at the CFG-emission point: Python
|
||
// `pickle.loads` / `yaml.load` / `shelve.open` calls
|
||
// wrapped inside a `unittest.TestCase` literal-bound
|
||
// assertion fire `cfg-unguarded-sink` because the
|
||
// structural rule has no taint context. Apply the
|
||
// same recogniser used by the AST-pattern layer so
|
||
// both sides agree on what counts as test-bound deser.
|
||
if cf.rule_id == "cfg-unguarded-sink"
|
||
&& self.source.lang_slug == "python"
|
||
&& let Some(node) = self
|
||
.source
|
||
.tree
|
||
.root_node()
|
||
.descendant_for_byte_range(cf.span.0, cf.span.1)
|
||
&& is_python_deser_inside_unittest_assertion(node, self.source.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
// Layer C5 mirror: Ruby `Marshal.load` / `YAML.load` /
|
||
// `Psych.load` inside Minitest / RSpec assertions also
|
||
// fire `cfg-unguarded-sink` from the structural rule
|
||
// (which has no taint context). Apply the same
|
||
// recogniser used by the AST-pattern layer so both
|
||
// sides agree on what counts as test-bound deser.
|
||
if cf.rule_id == "cfg-unguarded-sink"
|
||
&& self.source.lang_slug == "ruby"
|
||
&& let Some(node) = self
|
||
.source
|
||
.tree
|
||
.root_node()
|
||
.descendant_for_byte_range(cf.span.0, cf.span.1)
|
||
&& is_ruby_deser_inside_test_assertion(node, self.source.bytes)
|
||
{
|
||
continue;
|
||
}
|
||
let point = byte_offset_to_point(&self.source.tree, cf.span.0);
|
||
let cfg_confidence = Some(match cf.confidence {
|
||
cfg_analysis::Confidence::High => crate::evidence::Confidence::High,
|
||
cfg_analysis::Confidence::Medium => crate::evidence::Confidence::Medium,
|
||
cfg_analysis::Confidence::Low => crate::evidence::Confidence::Low,
|
||
});
|
||
// Carry the sink node's resolved Sink caps onto the structural
|
||
// finding's evidence so downstream cap-classification (and the
|
||
// eval `cap_of`) buckets `cfg-unguarded-sink` under its real cap
|
||
// (sqli/cmdi/ssrf/…) instead of the catch-all `other`. Without
|
||
// this every taint-less structural sink finding fell through to
|
||
// `other`, hiding real recall (e.g. dvpwa `cur.execute` SQLi)
|
||
// and inflating the `other` bucket. Non-sink structural findings
|
||
// (resource-leak, auth-gap) carry no Sink label, so this is 0.
|
||
let cf_sink_caps: u32 = cf
|
||
.evidence
|
||
.first()
|
||
.map(|&n| {
|
||
cfg_ctx.cfg[n].taint.labels.iter().fold(0u32, |acc, l| {
|
||
if let crate::labels::DataLabel::Sink(c) = l {
|
||
acc | c.bits()
|
||
} else {
|
||
acc
|
||
}
|
||
})
|
||
})
|
||
.unwrap_or(0);
|
||
let cf_category = FindingCategory::for_structural_rule(&cf.rule_id);
|
||
out.push(Diag {
|
||
path: self.source.path.to_string_lossy().into_owned(),
|
||
line: point.row + 1,
|
||
col: point.column + 1,
|
||
severity: cf.severity,
|
||
id: cf.rule_id,
|
||
category: cf_category,
|
||
path_validated: false,
|
||
guard_kind: None,
|
||
message: Some(cf.message),
|
||
labels: vec![],
|
||
confidence: cfg_confidence,
|
||
evidence: Some(Evidence {
|
||
source: None,
|
||
sink: Some(SpanEvidence {
|
||
path: self.source.path.to_string_lossy().into_owned(),
|
||
line: (point.row + 1) as u32,
|
||
col: (point.column + 1) as u32,
|
||
kind: "sink".into(),
|
||
snippet: None,
|
||
}),
|
||
sink_caps: cf_sink_caps,
|
||
guards: vec![],
|
||
sanitizers: vec![],
|
||
state: None,
|
||
notes: vec![],
|
||
..Default::default()
|
||
}),
|
||
rank_score: None,
|
||
rank_reason: None,
|
||
suppressed: false,
|
||
suppression: None,
|
||
rollup: None,
|
||
finding_id: String::new(),
|
||
alternative_finding_ids: Vec::new(),
|
||
stable_hash: 0,
|
||
});
|
||
}
|
||
} // end for body in bodies (CFG structural analyses)
|
||
|
||
// ── State-model dataflow analysis (per body) ─────────────────────
|
||
if cfg.scanner.enable_state_analysis {
|
||
let resource_method_summaries =
|
||
state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang);
|
||
let mut all_state_findings = Vec::new();
|
||
for body in &self.file_cfg.bodies {
|
||
// When `NYX_POINTER_ANALYSIS=1` is set, derive a
|
||
// `var_name → PtrProxyHint` map from the body's
|
||
// points-to facts so the proxy-acquire transfer can
|
||
// suppress SymbolId attribution on field-aliased
|
||
// receivers (e.g. `m := c.mu; m.Lock()`).
|
||
let body_pointer_hints = self.body_const_facts(body).and_then(|f| {
|
||
f.pointer_facts
|
||
.as_ref()
|
||
.map(|pf| pf.name_proxy_hints(&f.ssa))
|
||
});
|
||
let state_findings = state::run_state_analysis(
|
||
&body.graph,
|
||
body.entry,
|
||
caller_lang,
|
||
self.source.bytes,
|
||
self.local_summaries(),
|
||
global_summaries,
|
||
cfg.scanner.enable_auth_analysis,
|
||
&resource_method_summaries,
|
||
&body.meta.auth_decorators,
|
||
&path_safe_suppressed_spans,
|
||
body_pointer_hints.as_ref(),
|
||
Some(
|
||
closure_released_per_body
|
||
.get(&body.meta.id)
|
||
.unwrap_or(&empty_set),
|
||
),
|
||
);
|
||
|
||
for sf in &state_findings {
|
||
let point = byte_offset_to_point(&self.source.tree, sf.span.0);
|
||
out.push(Diag {
|
||
path: self.source.path.to_string_lossy().into_owned(),
|
||
line: point.row + 1,
|
||
col: point.column + 1,
|
||
severity: sf.severity,
|
||
id: sf.rule_id.clone(),
|
||
category: FindingCategory::for_structural_rule(&sf.rule_id),
|
||
path_validated: false,
|
||
guard_kind: None,
|
||
message: Some(sf.message.clone()),
|
||
labels: vec![],
|
||
confidence: None,
|
||
evidence: Some(Evidence {
|
||
source: None,
|
||
sink: Some(SpanEvidence {
|
||
path: self.source.path.to_string_lossy().into_owned(),
|
||
line: (point.row + 1) as u32,
|
||
col: (point.column + 1) as u32,
|
||
kind: "sink".into(),
|
||
snippet: None,
|
||
}),
|
||
guards: vec![],
|
||
sanitizers: vec![],
|
||
state: Some(StateEvidence {
|
||
machine: sf.machine.into(),
|
||
subject: sf.subject.clone(),
|
||
from_state: sf.from_state.into(),
|
||
to_state: sf.to_state.into(),
|
||
}),
|
||
notes: vec![],
|
||
..Default::default()
|
||
}),
|
||
rank_score: None,
|
||
rank_reason: None,
|
||
suppressed: false,
|
||
suppression: None,
|
||
rollup: None,
|
||
finding_id: String::new(),
|
||
alternative_finding_ids: Vec::new(),
|
||
stable_hash: 0,
|
||
});
|
||
}
|
||
|
||
all_state_findings.extend(state_findings);
|
||
} // end for body in bodies (state analysis)
|
||
|
||
// Suppress cfg-resource-leak / cfg-auth-gap when state analysis
|
||
// already covers the same line (state analysis is more precise).
|
||
let state_lines: std::collections::HashSet<usize> = all_state_findings
|
||
.iter()
|
||
.map(|sf| byte_offset_to_point(&self.source.tree, sf.span.0).row + 1)
|
||
.collect();
|
||
if !all_state_findings.is_empty() {
|
||
out.retain(|d| {
|
||
!((d.id == "cfg-resource-leak" || d.id == "cfg-auth-gap")
|
||
&& state_lines.contains(&d.line))
|
||
});
|
||
}
|
||
}
|
||
|
||
out
|
||
}
|
||
|
||
/// Run AST-backed authorization analyses that do not require CFG construction.
|
||
fn run_auth_analyses(
|
||
&self,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> Vec<Diag> {
|
||
// Harvest SSA-derived variable types across every body in the
|
||
// file so `run_auth_analysis` can refine sink classification by
|
||
// receiver type (e.g. `HttpClient::send` → `OutboundNetwork`,
|
||
// `HashMap::new`-bound var → `InMemoryLocal`).
|
||
let var_types = self.collect_file_var_types();
|
||
auth_analysis::run_auth_analysis(
|
||
&self.source.tree,
|
||
self.source.bytes,
|
||
self.source.lang_slug,
|
||
self.source.path,
|
||
cfg,
|
||
var_types.as_ref(),
|
||
global_summaries,
|
||
scan_root,
|
||
)
|
||
}
|
||
|
||
/// Build a per-file `var_name → TypeKind` map from SSA + type facts.
|
||
/// Conflicting non-`Unknown` types across bodies drop the entry ,
|
||
/// absence is safe because the auth sink gate falls back to
|
||
/// syntactic heuristics. Returns `None` when no body produces a
|
||
/// typed variable.
|
||
fn collect_file_var_types(&self) -> Option<auth_analysis::VarTypes> {
|
||
let mut merged: std::collections::HashMap<String, crate::ssa::type_facts::TypeKind> =
|
||
std::collections::HashMap::new();
|
||
let mut dropped: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||
for body in &self.file_cfg.bodies {
|
||
let Some(facts) = self.body_const_facts(body) else {
|
||
continue;
|
||
};
|
||
for (idx, def) in facts.ssa.value_defs.iter().enumerate() {
|
||
let Some(name) = def.var_name.as_ref() else {
|
||
continue;
|
||
};
|
||
let Some(ty) = facts.type_facts.get_type(crate::ssa::SsaValue(idx as u32)) else {
|
||
continue;
|
||
};
|
||
if matches!(ty, crate::ssa::type_facts::TypeKind::Unknown) {
|
||
continue;
|
||
}
|
||
if dropped.contains(name) {
|
||
continue;
|
||
}
|
||
match merged.get(name) {
|
||
Some(existing) if existing == ty => {}
|
||
Some(_) => {
|
||
merged.remove(name);
|
||
dropped.insert(name.clone());
|
||
}
|
||
None => {
|
||
merged.insert(name.clone(), ty.clone());
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if merged.is_empty() {
|
||
None
|
||
} else {
|
||
Some(merged)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Pass 1: Extract function summaries (no taint analysis)
|
||
|
||
/// Extract function summaries from pre-read bytes.
|
||
///
|
||
/// This is the core **pass 1** implementation. Callers that already hold the
|
||
/// file contents should use this variant to avoid a redundant `fs::read`.
|
||
pub fn extract_summaries_from_bytes(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
) -> NyxResult<Vec<FuncSummary>> {
|
||
let _span = tracing::debug_span!("extract_summaries", file = %path.display()).entered();
|
||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||
return Ok(vec![]);
|
||
};
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
Ok(parsed.export_summaries())
|
||
}
|
||
|
||
/// Like [`extract_summaries_from_bytes`] but forwards `scan_root` so Rust
|
||
/// summaries carry their crate-relative module path.
|
||
pub fn extract_summaries_from_bytes_with_root(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
scan_root: Option<&Path>,
|
||
) -> NyxResult<Vec<FuncSummary>> {
|
||
let _span = tracing::debug_span!("extract_summaries", file = %path.display()).entered();
|
||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||
return Ok(vec![]);
|
||
};
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
Ok(parsed.export_summaries_with_root(scan_root))
|
||
}
|
||
|
||
/// Convenience wrapper that reads the file then delegates to
|
||
/// [`extract_summaries_from_bytes`].
|
||
#[allow(dead_code)] // used by benchmarks and lib consumers
|
||
pub fn extract_summaries_from_file(path: &Path, cfg: &Config) -> NyxResult<Vec<FuncSummary>> {
|
||
let bytes = std::fs::read(path)?;
|
||
extract_summaries_from_bytes(&bytes, path, cfg)
|
||
}
|
||
|
||
/// Build a CFG from a file and return the graph, entry node, function summaries,
|
||
/// and language.
|
||
///
|
||
/// Returns `None` for binary files or unsupported languages.
|
||
/// Intended for benchmarks and isolated testing of state analysis.
|
||
pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult<Option<(FileCfg, Lang)>> {
|
||
let bytes = std::fs::read(path)?;
|
||
let Some(source) = ParsedSource::try_new(&bytes, path)? else {
|
||
return Ok(None);
|
||
};
|
||
let lang = Lang::from_slug(source.lang_slug).unwrap_or(Lang::C);
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
Ok(Some((parsed.file_cfg, lang)))
|
||
}
|
||
|
||
/// Parse a file and return its `AuthorizationModel` for debug inspection.
|
||
///
|
||
/// Runs only the auth-extraction pipeline, no taint, no CFG construction.
|
||
/// Returns `None` for binary files or unsupported languages. Used by the
|
||
/// `/api/debug/auth` route to surface the structured authorization model
|
||
/// (routes, units, sensitive operations, auth checks) in the debug UI.
|
||
pub fn extract_auth_model_for_debug(
|
||
path: &Path,
|
||
cfg: &Config,
|
||
) -> NyxResult<Option<auth_analysis::model::AuthorizationModel>> {
|
||
let bytes = std::fs::read(path)?;
|
||
let Some(source) = ParsedSource::try_new(&bytes, path)? else {
|
||
return Ok(None);
|
||
};
|
||
let rules = auth_analysis::config::build_auth_rules(cfg, source.lang_slug);
|
||
if !rules.enabled {
|
||
return Ok(Some(auth_analysis::model::AuthorizationModel::default()));
|
||
}
|
||
let model = auth_analysis::extract::extract_authorization_model(
|
||
source.lang_slug,
|
||
cfg.framework_ctx.as_ref(),
|
||
&source.tree,
|
||
source.bytes,
|
||
source.path,
|
||
&rules,
|
||
None,
|
||
);
|
||
Ok(Some(model))
|
||
}
|
||
|
||
/// Production-equivalent fused-path stage timing.
|
||
///
|
||
/// Returns `[parse+CFG, shared_lower, taint_flow, build_eligible,
|
||
/// ast_queries, suppression, auth, run_cfg_state]` in µs, plus
|
||
/// the per-substage breakdown of `shared_lower` from the thread-local
|
||
/// timers in `taint::perf_lower_timings_*`.
|
||
///
|
||
/// Mirrors `analyse_file_fused`'s control flow so each chunk is timed
|
||
/// without the double-lowering overcount that `perf_stage_breakdown`
|
||
/// suffers (the latter calls `run_cfg_analyses` and
|
||
/// `extract_ssa_artifacts` separately, both of which lower).
|
||
#[doc(hidden)]
|
||
pub fn perf_stage_breakdown_fused(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
global_summaries: Option<&crate::summary::GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> Option<([u128; 8], [u128; 7])> {
|
||
use std::time::Instant;
|
||
let s_parse = Instant::now();
|
||
let source = ParsedSource::try_new(bytes, path).ok()??;
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
let t_parse_cfg = s_parse.elapsed().as_micros();
|
||
|
||
crate::taint::ssa_transfer::reset_path_safe_suppressed_spans();
|
||
crate::taint::ssa_transfer::reset_all_validated_spans();
|
||
crate::taint::perf_lower_timings_start();
|
||
|
||
let s_lower = Instant::now();
|
||
let (lowered_summaries, lowered_bodies) =
|
||
parsed.lower_ssa_for_fused(global_summaries, scan_root, cfg.module_graph.as_deref());
|
||
let t_lower = s_lower.elapsed().as_micros();
|
||
let lower_breakdown = crate::taint::perf_lower_timings_take().unwrap_or([0; 7]);
|
||
|
||
let s_taint = Instant::now();
|
||
let taint_diags = parsed.run_cfg_analyses_with_lowered(
|
||
cfg,
|
||
global_summaries,
|
||
scan_root,
|
||
&lowered_summaries,
|
||
&lowered_bodies,
|
||
);
|
||
let t_taint_flow = s_taint.elapsed().as_micros();
|
||
|
||
let s_eligible = Instant::now();
|
||
let _ = crate::taint::build_eligible_bodies(&parsed.file_cfg, lowered_bodies);
|
||
let t_eligible = s_eligible.elapsed().as_micros();
|
||
|
||
let s_ast = Instant::now();
|
||
let ast_findings = parsed.source.run_ast_queries(cfg);
|
||
let t_ast = s_ast.elapsed().as_micros();
|
||
|
||
let s_suppr = Instant::now();
|
||
let suppression =
|
||
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &taint_diags);
|
||
let _filtered: Vec<_> = ast_findings
|
||
.into_iter()
|
||
.filter(|d| {
|
||
!suppression.should_suppress(&d.id, d.line)
|
||
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
|
||
})
|
||
.collect();
|
||
let t_suppr = s_suppr.elapsed().as_micros();
|
||
|
||
let s_auth = Instant::now();
|
||
let _ = parsed.run_auth_analyses(cfg, global_summaries, scan_root);
|
||
let t_auth = s_auth.elapsed().as_micros();
|
||
|
||
// 8th slot reserved (state-analysis breakdown if needed later);
|
||
// currently included in t_taint_flow.
|
||
let t_state = 0u128;
|
||
|
||
Some((
|
||
[
|
||
t_parse_cfg,
|
||
t_lower,
|
||
t_taint_flow,
|
||
t_eligible,
|
||
t_ast,
|
||
t_suppr,
|
||
t_auth,
|
||
t_state,
|
||
],
|
||
lower_breakdown,
|
||
))
|
||
}
|
||
|
||
/// Diagnostic stage-timing helper for the perf audit.
|
||
///
|
||
/// Times each stage of pass 2 internally and returns µs counts. Returns
|
||
/// `None` for unsupported languages. Not used in production, just for
|
||
/// `tests/perf_breakdown.rs` to attribute time inside `run_rules_on_bytes`
|
||
/// without touching the hot path.
|
||
#[doc(hidden)]
|
||
pub fn perf_stage_breakdown(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
global_summaries: Option<&crate::summary::GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> Option<[u128; 6]> {
|
||
use std::time::Instant;
|
||
let s_parse = Instant::now();
|
||
let source = ParsedSource::try_new(bytes, path).ok()??;
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
let t_parse_cfg = s_parse.elapsed().as_micros();
|
||
|
||
let s_taint = Instant::now();
|
||
let taint = parsed.run_cfg_analyses(cfg, global_summaries, scan_root);
|
||
let t_taint = s_taint.elapsed().as_micros();
|
||
|
||
let s_suppr = Instant::now();
|
||
let _ = TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &taint);
|
||
let t_suppr = s_suppr.elapsed().as_micros();
|
||
|
||
let s_ast = Instant::now();
|
||
let _ast_findings = parsed.source.run_ast_queries(cfg);
|
||
let t_ast = s_ast.elapsed().as_micros();
|
||
|
||
let s_auth = Instant::now();
|
||
let _ = parsed.run_auth_analyses(cfg, global_summaries, scan_root);
|
||
let t_auth = s_auth.elapsed().as_micros();
|
||
|
||
let s_ssa = Instant::now();
|
||
let _ = parsed.extract_ssa_artifacts(global_summaries, scan_root, cfg.module_graph.as_deref());
|
||
let t_ssa = s_ssa.elapsed().as_micros();
|
||
|
||
Some([t_parse_cfg, t_taint, t_suppr, t_ast, t_auth, t_ssa])
|
||
}
|
||
|
||
/// Extract both `FuncSummary` and `SsaFuncSummary` from pre-read bytes.
|
||
///
|
||
/// This is the shared pass-1 pipeline for indexed scans: parses once, builds
|
||
/// CFG once, and returns both summary types. Uses the same `ParsedFile`
|
||
/// pipeline as `analyse_file_fused`, no divergent extraction path.
|
||
pub fn extract_all_summaries_from_bytes(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
scan_root: Option<&Path>,
|
||
) -> NyxResult<(
|
||
Vec<FuncSummary>,
|
||
Vec<(crate::symbol::FuncKey, SsaFuncSummary)>,
|
||
Vec<(
|
||
crate::symbol::FuncKey,
|
||
crate::taint::ssa_transfer::CalleeSsaBody,
|
||
)>,
|
||
Vec<(
|
||
crate::symbol::FuncKey,
|
||
auth_analysis::model::AuthCheckSummary,
|
||
)>,
|
||
Option<(
|
||
String,
|
||
std::sync::Arc<HashMap<String, crate::symbol::FuncKey>>,
|
||
)>,
|
||
)> {
|
||
let _span = tracing::debug_span!("extract_all_summaries", file = %path.display()).entered();
|
||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||
return Ok((vec![], vec![], vec![], vec![], None));
|
||
};
|
||
let lang_slug = source.lang_slug;
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
let func_summaries = parsed.export_summaries_with_root(scan_root);
|
||
let (ssa_summaries, ssa_bodies) =
|
||
parsed.extract_ssa_artifacts(None, scan_root, cfg.module_graph.as_deref());
|
||
let auth_summaries = auth_analysis::extract_auth_summaries_by_key(
|
||
&parsed.source.tree,
|
||
parsed.source.bytes,
|
||
lang_slug,
|
||
parsed.source.path,
|
||
cfg,
|
||
scan_root,
|
||
);
|
||
let cross_package_imports = if parsed.file_cfg.resolved_imports.is_empty() {
|
||
None
|
||
} else {
|
||
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
|
||
let ns = crate::symbol::namespace_with_package(
|
||
&parsed.source.file_path_str,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
);
|
||
let caller_lang = Lang::from_slug(parsed.source.lang_slug).unwrap_or(Lang::Rust);
|
||
let map = crate::taint::build_cross_package_func_keys(
|
||
&parsed.file_cfg.resolved_imports,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
caller_lang,
|
||
);
|
||
if map.is_empty() {
|
||
None
|
||
} else {
|
||
Some((ns, std::sync::Arc::new(map)))
|
||
}
|
||
};
|
||
Ok((
|
||
func_summaries,
|
||
ssa_summaries,
|
||
ssa_bodies,
|
||
auth_summaries,
|
||
cross_package_imports,
|
||
))
|
||
}
|
||
|
||
// Constant-argument suppression helper
|
||
|
||
/// Returns `true` when the captured call node has only literal arguments
|
||
/// (string, number, boolean, null/nil/none), or identifier arguments that
|
||
/// resolve to a file-level scalar constant (`const NAME = "x"` at module
|
||
/// scope and equivalent in Java / Go / Python / Rust). Used to suppress
|
||
/// AST pattern findings on provably-constant calls like
|
||
/// `os.system(DEFAULT_CMD)` where `DEFAULT_CMD = "ls -la"`.
|
||
///
|
||
/// Conservative: returns `false` whenever the tree structure is unclear or
|
||
/// any argument is non-literal (including interpolated strings).
|
||
fn is_call_all_args_literal(node: tree_sitter::Node, bytes: &[u8], lang_slug: &str) -> bool {
|
||
// Walk upwards from the captured node to find the closest call_expression
|
||
// (or similar) ancestor, then locate its argument list child.
|
||
let call_node = find_enclosing_call(node);
|
||
let call_node = match call_node {
|
||
Some(n) => n,
|
||
None => return false,
|
||
};
|
||
|
||
// Find the argument_list / arguments child of the call node.
|
||
let arg_list = find_arg_list(call_node);
|
||
let arg_list = match arg_list {
|
||
Some(n) => n,
|
||
None => return false,
|
||
};
|
||
|
||
// Build the file-level scalar binding set lazily: only resolve once per
|
||
// call, never if every arg is a syntactic literal. Cheap: walks the
|
||
// file root's direct children for const / module-level assignment forms.
|
||
let scalars = file_level_scalar_bindings(node, bytes, lang_slug);
|
||
|
||
let mut has_any_arg = false;
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
let child = match arg_list.named_child(i) {
|
||
Some(c) => c,
|
||
None => continue,
|
||
};
|
||
has_any_arg = true;
|
||
if !is_literal_or_named_scalar(child, bytes, &scalars) {
|
||
return false;
|
||
}
|
||
}
|
||
|
||
// If the argument list is empty (no args), we conservatively do NOT
|
||
// suppress, the danger may come from side effects, not arguments.
|
||
has_any_arg
|
||
}
|
||
|
||
/// Walk up from `node` to the file root and collect every file-level scalar
|
||
/// binding name reachable on this language. Empty set for languages without
|
||
/// a recognised binding form (JS/TS, Ruby, PHP, C/C++).
|
||
fn file_level_scalar_bindings(
|
||
node: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
lang_slug: &str,
|
||
) -> std::collections::HashSet<String> {
|
||
let mut root = node;
|
||
while let Some(p) = root.parent() {
|
||
root = p;
|
||
}
|
||
crate::cfg::safe_fields::collect_class_constant_scalars(root, lang_slug, bytes)
|
||
.into_keys()
|
||
.collect()
|
||
}
|
||
|
||
/// Like [`is_literal_node`] but also accepts identifiers that resolve to a
|
||
/// file-level scalar binding (constant string / number / bool).
|
||
fn is_literal_or_named_scalar(
|
||
node: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
scalars: &std::collections::HashSet<String>,
|
||
) -> bool {
|
||
if is_literal_node(node, bytes) {
|
||
return true;
|
||
}
|
||
let kind = node.kind();
|
||
// Identifier forms vary across grammars. PHP / Ruby use `variable_name`;
|
||
// every other supported language uses bare `identifier`. An `argument`
|
||
// wrapper (PHP / Go) lifts a single child — unwrap and recurse.
|
||
match kind {
|
||
"identifier" | "variable_name" => {
|
||
let Ok(text) = std::str::from_utf8(&bytes[node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
scalars.contains(text)
|
||
}
|
||
"argument" => node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_literal_or_named_scalar(c, bytes, scalars)),
|
||
// Unary / binary forms over a scalar binding remain a literal-valued
|
||
// expression at compile time.
|
||
"unary_expression" | "unary_op" => node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_literal_or_named_scalar(c, bytes, scalars)),
|
||
"binary_expression" | "concatenated_string" => {
|
||
node.named_child_count() >= 2
|
||
&& (0..node.named_child_count() as u32).all(|i| {
|
||
node.named_child(i)
|
||
.is_some_and(|c| is_literal_or_named_scalar(c, bytes, scalars))
|
||
})
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Walk up to find a call-expression-like ancestor of the captured node.
|
||
/// Stops at statement/block boundaries to avoid matching unrelated outer calls.
|
||
fn find_enclosing_call(mut node: tree_sitter::Node) -> Option<tree_sitter::Node> {
|
||
// The captured node may already be the call, or it could be the callee
|
||
// identifier inside a call_expression. Walk up a few levels.
|
||
for _ in 0..4 {
|
||
let kind = node.kind();
|
||
if kind.contains("call") && !kind.contains("callee") {
|
||
return Some(node);
|
||
}
|
||
// Java / PHP / C-family kinds that don't have "call" in their name
|
||
// but represent the same call shape for arg-list inspection.
|
||
if matches!(
|
||
kind,
|
||
"function_call_expression"
|
||
| "method_invocation"
|
||
| "object_creation_expression"
|
||
| "explicit_constructor_invocation"
|
||
) {
|
||
return Some(node);
|
||
}
|
||
// Stop at scope/statement boundaries, don't cross into outer calls
|
||
if kind.contains("block")
|
||
|| kind.contains("body")
|
||
|| kind == "program"
|
||
|| kind == "module"
|
||
|| kind == "expression_statement"
|
||
{
|
||
return None;
|
||
}
|
||
node = node.parent()?;
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Find the argument-list child of a call node across languages.
|
||
fn find_arg_list(call: tree_sitter::Node) -> Option<tree_sitter::Node> {
|
||
for i in 0..call.child_count() as u32 {
|
||
if let Some(child) = call.child(i) {
|
||
let kind = child.kind();
|
||
// Common argument list node kinds across languages:
|
||
// Python/JS/TS/Java/Go/C/C++/Rust: argument_list / arguments
|
||
// PHP: arguments
|
||
// Ruby: argument_list
|
||
if kind == "argument_list" || kind == "arguments" || kind == "actual_parameters" {
|
||
return Some(child);
|
||
}
|
||
}
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Check if a tree-sitter node represents a literal value.
|
||
fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let kind = node.kind();
|
||
match kind {
|
||
// String literals, but Python's `string` node also covers
|
||
// f-strings, which carry `interpolation` children. An f-string
|
||
// with interpolation is *not* a literal: it embeds arbitrary
|
||
// expressions, so a sink call like `cursor.execute(f"…{x}")`
|
||
// must not be suppressed under Layer A's "all-literal args"
|
||
// shortcut. Same shape applies to any tree-sitter grammar
|
||
// that nests an `interpolation` (or `string_interpolation`)
|
||
// child inside a string node.
|
||
"string"
|
||
| "string_literal"
|
||
| "interpreted_string_literal"
|
||
| "raw_string_literal"
|
||
| "string_content"
|
||
| "string_fragment" => !has_interpolation(node),
|
||
|
||
// Numeric literals
|
||
"integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => {
|
||
true
|
||
}
|
||
|
||
// Boolean / null / nil / none
|
||
"true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean"
|
||
| "boolean_literal" => true,
|
||
|
||
// PHP encapsed_string: safe only if it has no variable interpolation
|
||
"encapsed_string" => {
|
||
// If it contains `$` variable interpolation nodes, it's not literal
|
||
!has_interpolation(node)
|
||
}
|
||
|
||
// Wrapper nodes: PHP wraps each arg in an `argument` node,
|
||
// Go uses `argument` too. Unwrap and check the inner value.
|
||
"argument" => {
|
||
node.named_child_count() == 1
|
||
&& node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_literal_node(c, bytes))
|
||
}
|
||
|
||
// Unary minus on a number literal: `-42`
|
||
"unary_expression" | "unary_op" => {
|
||
node.named_child_count() == 1
|
||
&& node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_literal_node(c, bytes))
|
||
}
|
||
|
||
// String concatenation of literals: `"a" + "b"` or `"a" . "b"`
|
||
"binary_expression" | "concatenated_string" => {
|
||
node.named_child_count() >= 2
|
||
&& (0..node.named_child_count() as u32).all(|i| {
|
||
node.named_child(i)
|
||
.is_some_and(|c| is_literal_node(c, bytes))
|
||
})
|
||
}
|
||
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// PHP-only: returns `true` when the captured `include_expression` node is
|
||
/// `include $var` (or `require $var`, etc.) and `$var` is a formal parameter
|
||
/// of the immediately enclosing function / method / closure / arrow function,
|
||
/// with no assignment to `$var` between the function body start and the
|
||
/// include site. This is the canonical PHP autoloader / scope-isolated
|
||
/// `Closure::bind(static function ($file) { include $file; }, ...)` shape;
|
||
/// composer's `ClassLoader::initializeIncludeClosure`, PSR-4 loaders, and
|
||
/// route-file loaders all match this. The pattern rule is intentionally
|
||
/// heuristic (no taint), so a parameter pass-through is the broadest
|
||
/// safe-suppression boundary; if the caller passes a tainted value, the
|
||
/// engine's separate taint-unsanitised-flow rule still fires.
|
||
fn is_php_include_param_passthrough(include_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
// tree-sitter-php shape:
|
||
// include_expression
|
||
// variable_name
|
||
// name "<param>"
|
||
let var_node = include_node.named_child(0);
|
||
let Some(var_node) = var_node else {
|
||
return false;
|
||
};
|
||
if var_node.kind() != "variable_name" {
|
||
return false;
|
||
}
|
||
let name_node = var_node.named_child(0);
|
||
let Some(name_node) = name_node else {
|
||
return false;
|
||
};
|
||
let var_name = match std::str::from_utf8(&bytes[name_node.byte_range()]) {
|
||
Ok(s) => s,
|
||
Err(_) => return false,
|
||
};
|
||
|
||
// Walk up to the enclosing function/method/closure.
|
||
let mut cur = include_node;
|
||
while let Some(parent) = cur.parent() {
|
||
match parent.kind() {
|
||
"method_declaration"
|
||
| "function_definition"
|
||
| "anonymous_function"
|
||
| "anonymous_function_creation_expression"
|
||
| "arrow_function" => {
|
||
let params = parent
|
||
.child_by_field_name("parameters")
|
||
.or_else(|| find_named_child_of_kind(parent, "formal_parameters"));
|
||
let Some(params) = params else {
|
||
return false;
|
||
};
|
||
if !param_list_contains_name(params, var_name, bytes) {
|
||
return false;
|
||
}
|
||
// Reassignment guard: if the variable is reassigned inside the
|
||
// function body before the include, the parameter-pass-through
|
||
// assumption breaks down.
|
||
let body = parent
|
||
.child_by_field_name("body")
|
||
.or_else(|| find_named_child_of_kind(parent, "compound_statement"));
|
||
let body_start = body.map(|b| b.start_byte()).unwrap_or(parent.start_byte());
|
||
if is_var_reassigned_before(
|
||
body.unwrap_or(parent),
|
||
var_name,
|
||
include_node.start_byte(),
|
||
body_start,
|
||
bytes,
|
||
) {
|
||
return false;
|
||
}
|
||
return true;
|
||
}
|
||
// Stop at class/program scope without a matching function, bare
|
||
// top-level `include $var` does not benefit from this guard.
|
||
"program" | "class_declaration" | "trait_declaration" | "interface_declaration" => {
|
||
return false;
|
||
}
|
||
_ => {}
|
||
}
|
||
cur = parent;
|
||
}
|
||
false
|
||
}
|
||
|
||
fn find_named_child_of_kind<'a>(
|
||
parent: tree_sitter::Node<'a>,
|
||
kind: &str,
|
||
) -> Option<tree_sitter::Node<'a>> {
|
||
for i in 0..parent.named_child_count() as u32 {
|
||
if let Some(child) = parent.named_child(i)
|
||
&& child.kind() == kind
|
||
{
|
||
return Some(child);
|
||
}
|
||
}
|
||
None
|
||
}
|
||
|
||
fn param_list_contains_name(params: tree_sitter::Node, target_name: &str, bytes: &[u8]) -> bool {
|
||
for i in 0..params.named_child_count() as u32 {
|
||
let Some(param) = params.named_child(i) else {
|
||
continue;
|
||
};
|
||
if !matches!(
|
||
param.kind(),
|
||
"simple_parameter"
|
||
| "variadic_parameter"
|
||
| "property_promotion_parameter"
|
||
| "promoted_constructor_parameter"
|
||
) {
|
||
continue;
|
||
}
|
||
// simple_parameter has a `variable_name` child whose `name` child is the bare ident.
|
||
let var_node = param
|
||
.child_by_field_name("name")
|
||
.or_else(|| find_named_child_of_kind(param, "variable_name"));
|
||
let Some(var_node) = var_node else {
|
||
continue;
|
||
};
|
||
let name_node = if var_node.kind() == "variable_name" {
|
||
var_node.named_child(0)
|
||
} else {
|
||
Some(var_node)
|
||
};
|
||
let Some(name_node) = name_node else {
|
||
continue;
|
||
};
|
||
if let Ok(name) = std::str::from_utf8(&bytes[name_node.byte_range()])
|
||
&& name == target_name
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// Walk the function body looking for any `assignment_expression` whose LHS
|
||
/// names `target_name`, between `body_start` (inclusive) and `before_byte`
|
||
/// (exclusive). Crosses nested scopes (closures inside the function are
|
||
/// rare in this idiom, and reassignment inside them wouldn't shadow the
|
||
/// outer parameter).
|
||
fn is_var_reassigned_before(
|
||
root: tree_sitter::Node,
|
||
target_name: &str,
|
||
before_byte: usize,
|
||
body_start: usize,
|
||
bytes: &[u8],
|
||
) -> bool {
|
||
let mut stack = vec![root];
|
||
while let Some(node) = stack.pop() {
|
||
if node.start_byte() >= before_byte {
|
||
continue;
|
||
}
|
||
if node.end_byte() <= body_start {
|
||
continue;
|
||
}
|
||
if node.kind() == "assignment_expression" {
|
||
// LHS is the first named child (or the `left` field in newer grammars).
|
||
let lhs = node
|
||
.child_by_field_name("left")
|
||
.or_else(|| node.named_child(0));
|
||
if let Some(lhs) = lhs
|
||
&& lhs.kind() == "variable_name"
|
||
&& let Some(n) = lhs.named_child(0)
|
||
&& let Ok(s) = std::str::from_utf8(&bytes[n.byte_range()])
|
||
&& s == target_name
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i) {
|
||
stack.push(c);
|
||
}
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// PHP-only: returns `true` when the captured `function_call_expression`
|
||
/// node is `unserialize($x, [..., 'allowed_classes' => <ARRAY|false>, ...])`.
|
||
/// This is the canonical PHP 7+ structural mitigation against object
|
||
/// injection, explicitly restricting which classes the deserialiser may
|
||
/// instantiate. Only suppress when the option is either:
|
||
///
|
||
/// - `'allowed_classes' => false` (no class instantiation), or
|
||
/// - `'allowed_classes' => [Foo::class]` (an array literal allow-list).
|
||
///
|
||
/// `'allowed_classes' => true` (the unsafe default) and dynamic values
|
||
/// (`'allowed_classes' => $opts`) leave the finding in place.
|
||
fn is_php_unserialize_allowed_classes_restricted(
|
||
cap_node: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
) -> bool {
|
||
// The pattern captures `@n` (the function name) at index 0, so walk up
|
||
// to the enclosing function_call_expression.
|
||
let call_node = if cap_node.kind() == "function_call_expression" {
|
||
cap_node
|
||
} else {
|
||
let mut cur = cap_node;
|
||
let mut found = None;
|
||
for _ in 0..4 {
|
||
if cur.kind() == "function_call_expression" {
|
||
found = Some(cur);
|
||
break;
|
||
}
|
||
match cur.parent() {
|
||
Some(p) => cur = p,
|
||
None => break,
|
||
}
|
||
}
|
||
match found {
|
||
Some(c) => c,
|
||
None => return false,
|
||
}
|
||
};
|
||
let arg_list = find_named_child_of_kind(call_node, "arguments");
|
||
let Some(arg_list) = arg_list else {
|
||
return false;
|
||
};
|
||
// arg 0 is the data; arg 1 is the options array.
|
||
let mut args = Vec::new();
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
if let Some(c) = arg_list.named_child(i)
|
||
&& c.kind() == "argument"
|
||
{
|
||
args.push(c);
|
||
}
|
||
}
|
||
if args.len() < 2 {
|
||
return false;
|
||
}
|
||
// Unwrap the `argument` wrapper to its inner expression.
|
||
let opts = args[1].named_child(0);
|
||
let Some(opts) = opts else { return false };
|
||
if opts.kind() != "array_creation_expression" {
|
||
return false;
|
||
}
|
||
// Walk array_element_initializer children looking for the
|
||
// 'allowed_classes' key.
|
||
for i in 0..opts.named_child_count() as u32 {
|
||
let Some(elem) = opts.named_child(i) else {
|
||
continue;
|
||
};
|
||
if elem.kind() != "array_element_initializer" {
|
||
continue;
|
||
}
|
||
// Two named children: key, value.
|
||
if elem.named_child_count() < 2 {
|
||
continue;
|
||
}
|
||
let key = elem.named_child(0);
|
||
let value = elem.named_child(1);
|
||
let (Some(key), Some(value)) = (key, value) else {
|
||
continue;
|
||
};
|
||
if !is_string_literal_with_text(key, "allowed_classes", bytes) {
|
||
continue;
|
||
}
|
||
// Accept structural mitigation forms. The intent signal is
|
||
// "developer explicitly set allowed_classes to something other than
|
||
// `true`":
|
||
// - boolean `false` , no class instantiation at all
|
||
// - array literal , explicit allow-list
|
||
// - class-constant reference , `self::ALLOWED_CLASSES` /
|
||
// `Foo::CONSTANTS` resolved to
|
||
// a const array; engine cannot
|
||
// statically inspect, but the
|
||
// explicit option already
|
||
// distinguishes safe usage from
|
||
// the unsafe default.
|
||
match value.kind() {
|
||
"boolean" => {
|
||
if let Ok(s) = std::str::from_utf8(&bytes[value.byte_range()])
|
||
&& s.eq_ignore_ascii_case("false")
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
"array_creation_expression"
|
||
| "class_constant_access_expression"
|
||
| "scoped_property_access_expression" => return true,
|
||
_ => {}
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// PHP-only: returns `true` when the captured `function_call_expression`
|
||
/// is the canonical `Serializable::unserialize($input)` magic-method
|
||
/// pass-through — i.e. the call is inside a `method_declaration` named
|
||
/// exactly `unserialize` (PHP method names are case-insensitive) with
|
||
/// one formal parameter, and the call's single argument is the bare
|
||
/// parameter variable.
|
||
///
|
||
/// **Why this is a non-actionable site for `php.deser.unserialize`:**
|
||
/// `Serializable::unserialize($input)` is an interface contract method
|
||
/// that PHP itself invokes when restoring an instance via the runtime
|
||
/// `\unserialize($bytes)` machinery. The implementation MUST decode
|
||
/// `$input` (the body's `\unserialize(...)` call) — there is no
|
||
/// "safer" rewrite that preserves the contract. The actionable signal
|
||
/// is at the class level (the class implements the deprecated
|
||
/// `Serializable` interface — fix is to migrate to `__serialize` /
|
||
/// `__unserialize`), not at this call site.
|
||
///
|
||
/// Conservative recognition:
|
||
/// - method must be a `method_declaration` (NOT a free `function_definition` —
|
||
/// the magic semantics only apply to instance methods)
|
||
/// - method name == `unserialize` (case-insensitive)
|
||
/// - exactly 1 formal parameter
|
||
/// - call has exactly 1 argument
|
||
/// - argument's inner expression is a `variable_name` whose name equals the
|
||
/// formal parameter's name
|
||
///
|
||
/// Genuine deserialization sinks (free `unserialize($_GET[...])`, helpers
|
||
/// reading from session/cache and passing through, etc.) keep firing
|
||
/// because they are not inside a method declaration named `unserialize`.
|
||
fn is_php_unserialize_magic_method_passthrough(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
// The pattern captures `@n` (the function name); locate the enclosing
|
||
// function_call_expression.
|
||
let call_node = if cap_node.kind() == "function_call_expression" {
|
||
cap_node
|
||
} else {
|
||
let mut cur = cap_node;
|
||
let mut found = None;
|
||
for _ in 0..4 {
|
||
if cur.kind() == "function_call_expression" {
|
||
found = Some(cur);
|
||
break;
|
||
}
|
||
match cur.parent() {
|
||
Some(p) => cur = p,
|
||
None => break,
|
||
}
|
||
}
|
||
match found {
|
||
Some(c) => c,
|
||
None => return false,
|
||
}
|
||
};
|
||
|
||
// Walk up to the nearest method_declaration. Stop at any other
|
||
// function-introducing scope (free function, closure, arrow) — those
|
||
// are not the Serializable contract.
|
||
let mut cur = call_node;
|
||
let method = loop {
|
||
let Some(parent) = cur.parent() else {
|
||
return false;
|
||
};
|
||
match parent.kind() {
|
||
"method_declaration" => break parent,
|
||
"function_definition"
|
||
| "anonymous_function"
|
||
| "anonymous_function_creation_expression"
|
||
| "arrow_function"
|
||
| "program" => return false,
|
||
_ => {}
|
||
}
|
||
cur = parent;
|
||
};
|
||
|
||
// Method name must be exactly `unserialize` (case-insensitive).
|
||
let Some(name_node) = method
|
||
.child_by_field_name("name")
|
||
.or_else(|| find_named_child_of_kind(method, "name"))
|
||
else {
|
||
return false;
|
||
};
|
||
let Ok(method_name) = std::str::from_utf8(&bytes[name_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
if !method_name.eq_ignore_ascii_case("unserialize") {
|
||
return false;
|
||
}
|
||
|
||
// Method must have exactly 1 formal parameter; capture its bare name.
|
||
let Some(params) = method
|
||
.child_by_field_name("parameters")
|
||
.or_else(|| find_named_child_of_kind(method, "formal_parameters"))
|
||
else {
|
||
return false;
|
||
};
|
||
let mut formal_params: Vec<tree_sitter::Node> = Vec::new();
|
||
for i in 0..params.named_child_count() as u32 {
|
||
if let Some(p) = params.named_child(i)
|
||
&& matches!(
|
||
p.kind(),
|
||
"simple_parameter"
|
||
| "variadic_parameter"
|
||
| "property_promotion_parameter"
|
||
| "promoted_constructor_parameter"
|
||
)
|
||
{
|
||
formal_params.push(p);
|
||
}
|
||
}
|
||
if formal_params.len() != 1 {
|
||
return false;
|
||
}
|
||
let param = formal_params[0];
|
||
let var_node = param
|
||
.child_by_field_name("name")
|
||
.or_else(|| find_named_child_of_kind(param, "variable_name"));
|
||
let Some(var_node) = var_node else {
|
||
return false;
|
||
};
|
||
let inner_name_node = if var_node.kind() == "variable_name" {
|
||
var_node.named_child(0)
|
||
} else {
|
||
Some(var_node)
|
||
};
|
||
let Some(inner_name_node) = inner_name_node else {
|
||
return false;
|
||
};
|
||
let Ok(param_name) = std::str::from_utf8(&bytes[inner_name_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
|
||
// Call must have exactly 1 argument that is the bare parameter variable.
|
||
let Some(arg_list) = find_named_child_of_kind(call_node, "arguments") else {
|
||
return false;
|
||
};
|
||
let mut args: Vec<tree_sitter::Node> = Vec::new();
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
if let Some(c) = arg_list.named_child(i)
|
||
&& c.kind() == "argument"
|
||
{
|
||
args.push(c);
|
||
}
|
||
}
|
||
if args.len() != 1 {
|
||
return false;
|
||
}
|
||
let inner = args[0].named_child(0);
|
||
let Some(inner) = inner else { return false };
|
||
if inner.kind() != "variable_name" {
|
||
return false;
|
||
}
|
||
let Some(arg_name_node) = inner.named_child(0) else {
|
||
return false;
|
||
};
|
||
let Ok(arg_name) = std::str::from_utf8(&bytes[arg_name_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
arg_name == param_name
|
||
}
|
||
|
||
/// PHP-only Layer C3: returns `true` when an `unserialize($x)` call
|
||
/// site is the second (or later) argument of a PHPUnit assertion call
|
||
/// whose first (expected) argument is a literal expression
|
||
/// (scalar, array literal, class constant access, or unary on a
|
||
/// literal).
|
||
///
|
||
/// **Why this is a non-actionable site for `php.deser.unserialize`:**
|
||
/// PHPUnit's `assertSame($expected, $actual)` /
|
||
/// `assertEquals(...)` / `assertNull(...)` family bound the
|
||
/// `unserialize` result to the literal expected value: if the
|
||
/// `$blob` argument were attacker-controlled and produced a
|
||
/// different shape, the assertion would fail loudly rather than
|
||
/// permit any object-injection side effect to escape the test
|
||
/// boundary. Drupal, Joomla, and Nextcloud each carry tens of
|
||
/// these `Serializable` / cache / session round-trip tests and
|
||
/// every firing is noise; the actionable signal lives at the
|
||
/// production call sites that thread real input through
|
||
/// `unserialize` without an assertion sandwich.
|
||
///
|
||
/// Conservative recognition:
|
||
/// - the `unserialize(...)` call must be wrapped in an `argument`
|
||
/// node whose parent is `arguments`
|
||
/// - the enclosing call must be a `member_call_expression`,
|
||
/// `nullsafe_member_call_expression`, `scoped_call_expression`,
|
||
/// or `function_call_expression` with a method/function name
|
||
/// starting with `assert` (case-insensitive) — covers the entire
|
||
/// PHPUnit assertion family
|
||
/// - the assertion must have at least two argument slots (an
|
||
/// expected/actual pair)
|
||
/// - the first argument's inner expression must be a literal: a
|
||
/// string / number / boolean / null literal, an
|
||
/// `array_creation_expression` whose elements are recursively
|
||
/// literal, a `class_constant_access_expression`, or a unary
|
||
/// sign on one of the above
|
||
///
|
||
/// Genuine production sites (`unserialize($_GET[...])`, helpers
|
||
/// reading from session/cache and handing the value to caller
|
||
/// code) keep firing because they are not wrapped in a PHPUnit
|
||
/// assertion. Single-argument assertions (`assertNotNull($x)`)
|
||
/// and assertions whose expected value is itself dynamic
|
||
/// (`assertEquals($computed, unserialize($blob))`) keep firing
|
||
/// because the bound is not statically verifiable.
|
||
fn is_php_unserialize_inside_phpunit_assertion(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
// The pattern captures `@n` (the function name); locate the enclosing
|
||
// function_call_expression. Mirrors the magic-method recogniser.
|
||
let call_node = if cap_node.kind() == "function_call_expression" {
|
||
cap_node
|
||
} else {
|
||
let mut cur = cap_node;
|
||
let mut found = None;
|
||
for _ in 0..4 {
|
||
if cur.kind() == "function_call_expression" {
|
||
found = Some(cur);
|
||
break;
|
||
}
|
||
match cur.parent() {
|
||
Some(p) => cur = p,
|
||
None => break,
|
||
}
|
||
}
|
||
match found {
|
||
Some(c) => c,
|
||
None => return false,
|
||
}
|
||
};
|
||
|
||
// The unserialize call must sit directly inside an `argument` wrapper
|
||
// that is itself inside an `arguments` list. Reject any wrapping
|
||
// expression (binary, conditional, etc.) — those break the literal
|
||
// bounding the assertion provides.
|
||
let Some(arg_wrapper) = call_node.parent() else {
|
||
return false;
|
||
};
|
||
if arg_wrapper.kind() != "argument" {
|
||
return false;
|
||
}
|
||
let Some(arguments) = arg_wrapper.parent() else {
|
||
return false;
|
||
};
|
||
if arguments.kind() != "arguments" {
|
||
return false;
|
||
}
|
||
let Some(assertion_call) = arguments.parent() else {
|
||
return false;
|
||
};
|
||
if !matches!(
|
||
assertion_call.kind(),
|
||
"member_call_expression"
|
||
| "nullsafe_member_call_expression"
|
||
| "scoped_call_expression"
|
||
| "function_call_expression"
|
||
) {
|
||
return false;
|
||
}
|
||
|
||
// Method/function name must start with `assert` (case-insensitive).
|
||
let name_node = assertion_call
|
||
.child_by_field_name("name")
|
||
.or_else(|| find_named_child_of_kind(assertion_call, "name"));
|
||
let Some(name_node) = name_node else {
|
||
return false;
|
||
};
|
||
let Ok(method_name) = std::str::from_utf8(&bytes[name_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
if !method_name
|
||
.chars()
|
||
.take(6)
|
||
.collect::<String>()
|
||
.eq_ignore_ascii_case("assert")
|
||
{
|
||
return false;
|
||
}
|
||
|
||
// Collect the assertion's argument wrappers.
|
||
let mut args: Vec<tree_sitter::Node> = Vec::new();
|
||
for i in 0..arguments.named_child_count() as u32 {
|
||
if let Some(c) = arguments.named_child(i)
|
||
&& c.kind() == "argument"
|
||
{
|
||
args.push(c);
|
||
}
|
||
}
|
||
if args.is_empty() {
|
||
return false;
|
||
}
|
||
|
||
// Single-arg assertions: the verb itself bounds the result
|
||
// (`assertNull`, `assertIsArray`, `assertTrue`, ...). Restrict to
|
||
// a curated set so generic `assertSomething(unserialize($x))`
|
||
// helpers without a documented bound don't qualify.
|
||
if args.len() == 1 {
|
||
return is_phpunit_single_arg_bounding_verb(method_name);
|
||
}
|
||
|
||
// Multi-arg assertions: the first argument is the expected /
|
||
// literal-pinned value (PHPUnit's documented `$expected, $actual`
|
||
// order). The expected must be a static literal expression.
|
||
let Some(first_inner) = args[0].named_child(0) else {
|
||
return false;
|
||
};
|
||
is_php_assertion_literal_expected(first_inner, bytes)
|
||
}
|
||
|
||
/// Returns `true` when `name` (compared ASCII-case-insensitively) is one of
/// the PHPUnit single-argument assertion verbs whose name itself constrains
/// the inspected value to a known type or constant.
///
/// When `unserialize($x)` is the sole argument of such an assertion, a failed
/// assertion aborts the test rather than letting an object-injection side
/// effect escape.
fn is_phpunit_single_arg_bounding_verb(name: &str) -> bool {
    // Lower-case spellings of the curated verbs.
    const BOUNDING_VERBS: &[&str] = &[
        "assertnull",
        "assertnotnull",
        "assertempty",
        "assertnotempty",
        "asserttrue",
        "assertfalse",
        "assertnan",
        "assertfinite",
        "assertinfinite",
        "assertisarray",
        "assertisnotarray",
        "assertisbool",
        "assertisnotbool",
        "assertiscallable",
        "assertisnotcallable",
        "assertisfloat",
        "assertisnotfloat",
        "assertisint",
        "assertisnotint",
        "assertisiterable",
        "assertisnotiterable",
        "assertisnumeric",
        "assertisnotnumeric",
        "assertisobject",
        "assertisnotobject",
        "assertisresource",
        "assertisnotresource",
        "assertisclosedresource",
        "assertisnotclosedresource",
        "assertisstring",
        "assertisnotstring",
        "assertisscalar",
        "assertisnotscalar",
    ];

    BOUNDING_VERBS
        .iter()
        .any(|verb| name.eq_ignore_ascii_case(verb))
}
|
||
|
||
/// PHP-only helper: returns `true` if `node` is a statically literal
|
||
/// expression suitable as the "expected" argument of a PHPUnit
|
||
/// assertion. Recursive: array elements must themselves be literal.
|
||
/// Class constants (`Foo::BAR`) count as literal — they resolve to
|
||
/// build-time values and PHPUnit treats them as expected pinning.
|
||
fn is_php_assertion_literal_expected(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
match node.kind() {
|
||
"string"
|
||
| "integer"
|
||
| "float"
|
||
| "boolean"
|
||
| "null"
|
||
| "true"
|
||
| "false"
|
||
| "class_constant_access_expression"
|
||
| "scoped_property_access_expression" => true,
|
||
"encapsed_string" => !has_interpolation(node),
|
||
"unary_op_expression" => node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_php_assertion_literal_expected(c, bytes)),
|
||
"array_creation_expression" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(child) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if child.kind() != "array_element_initializer" {
|
||
return false;
|
||
}
|
||
// array_element_initializer can have one (value) or
|
||
// two (key, value) named children; both must be literal.
|
||
for j in 0..child.named_child_count() as u32 {
|
||
let Some(grand) = child.named_child(j) else {
|
||
return false;
|
||
};
|
||
if !is_php_assertion_literal_expected(grand, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
}
|
||
true
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Python-only Layer C4: returns `true` when a deserialization call
|
||
/// (`pickle.loads`, `yaml.load`, `shelve.open`, etc.) sits inside a
|
||
/// test assertion that bounds the result to a literal-expected shape.
|
||
///
|
||
/// Two assertion idioms are recognised:
|
||
/// 1. `unittest.TestCase` style — `self.assertEqual(LITERAL, pickle.loads(b))`,
|
||
/// `self.assertIsNone(pickle.loads(b))`, etc.
|
||
/// 2. pytest plain `assert` — `assert pickle.loads(b) == LITERAL`,
|
||
/// `assert pickle.loads(b) is None`, `assert isinstance(pickle.loads(b),
|
||
/// dict)`, `assert pickle.loads(b)` (truthy), `assert not
|
||
/// pickle.loads(b)` (falsy).
|
||
///
|
||
/// **Why this is a non-actionable site:** the assertion bounds the
|
||
/// deser result to a literal expected; if the blob argument were
|
||
/// attacker-controlled and produced a different shape, the assertion
|
||
/// would fail loudly rather than permit any object-injection side
|
||
/// effect to escape the test boundary. Python projects ship
|
||
/// round-trip tests for every pickled / YAML-loaded data class, and
|
||
/// every firing on those test bodies is noise.
|
||
///
|
||
/// Conservative recognition:
|
||
/// - the deser call must reach the assertion through allowed wrappers
|
||
/// only (parenthesized_expression, comparison_operator with literal
|
||
/// counterpart, unary `not`, `isinstance(_, TYPE)`, `bool` / `len` /
|
||
/// `type` / `id` single-arg wrap); boolean ops and conditional
|
||
/// expressions break the bound and reject.
|
||
/// - unittest verbs must start with `assert` or `fail` (case-sensitive
|
||
/// per Python conventions) and pass the curated single-arg / multi-
|
||
/// arg bounding tables.
|
||
/// - pytest plain `assert` requires the deser to be the asserted
|
||
/// expression (named_child(0) of `assert_statement`), not the
|
||
/// optional message at named_child(1).
|
||
fn is_python_deser_inside_unittest_assertion(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
// Three entry shapes:
|
||
// (a) unittest AST-pattern: `cap_node` is the `pickle` / `yaml` /
|
||
// `shelve` identifier under the deser call's `function.object`
|
||
// path. Walk up to the deser call, then up to an outer
|
||
// assertion call via `argument_list`.
|
||
// (b) unittest CFG-emission: `cap_node` is somewhere inside the
|
||
// OUTER assertion call (`self.assertEqual(...)`). Look for a
|
||
// deser sub-call inside its argument_list.
|
||
// (c) pytest plain-assert: `cap_node` resolves to the deser call,
|
||
// which sits directly under an `assert_statement` (possibly
|
||
// through allowed bounding wrappers).
|
||
let enclosing_call = find_enclosing_call(cap_node);
|
||
let Some(enclosing_call) = enclosing_call else {
|
||
return false;
|
||
};
|
||
|
||
// Path (a)/(c): enclosing call IS the deser.
|
||
if is_python_deser_call(enclosing_call, bytes) {
|
||
// (a) walk to outer call assertion via argument_list.
|
||
if let Some(arg_list) = enclosing_call.parent()
|
||
&& arg_list.kind() == "argument_list"
|
||
&& let Some(assertion_call) = arg_list.parent()
|
||
&& assertion_call.kind() == "call"
|
||
&& python_assertion_bounds_deser(assertion_call, enclosing_call, bytes)
|
||
{
|
||
return true;
|
||
}
|
||
// (c) walk up to assert_statement through allowed wrappers.
|
||
if python_pytest_assert_bounds_deser(enclosing_call, bytes) {
|
||
return true;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
// Path (b): enclosing call IS an assertion that wraps a deser arg.
|
||
if let Some(deser_call) = python_find_direct_deser_arg(enclosing_call, bytes) {
|
||
return python_assertion_bounds_deser(enclosing_call, deser_call, bytes);
|
||
}
|
||
|
||
false
|
||
}
|
||
|
||
/// Search the assertion call's argument_list for a direct child that
|
||
/// is a recognised deserialization call. Direct child only — wrapped
|
||
/// expressions (binary, conditional, parenthesized) break the literal
|
||
/// bound and must keep firing.
|
||
fn python_find_direct_deser_arg<'tree>(
|
||
assertion_call: tree_sitter::Node<'tree>,
|
||
bytes: &[u8],
|
||
) -> Option<tree_sitter::Node<'tree>> {
|
||
let arg_list = assertion_call.child_by_field_name("arguments")?;
|
||
if arg_list.kind() != "argument_list" {
|
||
return None;
|
||
}
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
let Some(c) = arg_list.named_child(i) else {
|
||
continue;
|
||
};
|
||
if c.kind() == "call" && is_python_deser_call(c, bytes) {
|
||
return Some(c);
|
||
}
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Core bounding check: given an assertion `call` node and the
|
||
/// deser sub-call inside its arg list, decide whether the assertion
|
||
/// bounds the deser result so the call is non-actionable.
|
||
fn python_assertion_bounds_deser(
|
||
assertion_call: tree_sitter::Node,
|
||
deser_call: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
) -> bool {
|
||
let Some(func) = assertion_call.child_by_field_name("function") else {
|
||
return false;
|
||
};
|
||
let name_node = match func.kind() {
|
||
"attribute" => func
|
||
.child_by_field_name("attribute")
|
||
.or_else(|| find_named_child_of_kind(func, "identifier")),
|
||
"identifier" => Some(func),
|
||
_ => return false,
|
||
};
|
||
let Some(name_node) = name_node else {
|
||
return false;
|
||
};
|
||
let Ok(verb) = std::str::from_utf8(&bytes[name_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let lowered = verb.to_ascii_lowercase();
|
||
if !(lowered.starts_with("assert") || lowered.starts_with("fail")) {
|
||
return false;
|
||
}
|
||
|
||
let Some(arg_list) = assertion_call.child_by_field_name("arguments") else {
|
||
return false;
|
||
};
|
||
if arg_list.kind() != "argument_list" {
|
||
return false;
|
||
}
|
||
let mut pos_args: Vec<tree_sitter::Node> = Vec::new();
|
||
let mut deser_pos: Option<usize> = None;
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
let Some(c) = arg_list.named_child(i) else {
|
||
continue;
|
||
};
|
||
if c.kind() == "keyword_argument" {
|
||
continue;
|
||
}
|
||
if c.id() == deser_call.id() {
|
||
deser_pos = Some(pos_args.len());
|
||
}
|
||
pos_args.push(c);
|
||
}
|
||
let Some(deser_pos) = deser_pos else {
|
||
return false;
|
||
};
|
||
if pos_args.is_empty() {
|
||
return false;
|
||
}
|
||
|
||
if pos_args.len() == 1 {
|
||
return is_python_unittest_single_arg_bounding_verb(verb);
|
||
}
|
||
|
||
if matches!(verb, "assertIsInstance" | "assertNotIsInstance") {
|
||
let type_pos = if deser_pos == 0 { 1 } else { 0 };
|
||
if let Some(type_arg) = pos_args.get(type_pos)
|
||
&& is_python_type_reference(*type_arg)
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
|
||
if !is_python_unittest_multi_arg_bounding_verb(verb) {
|
||
return false;
|
||
}
|
||
for (i, arg) in pos_args.iter().enumerate() {
|
||
if i == deser_pos {
|
||
continue;
|
||
}
|
||
if is_python_assertion_literal_expected(*arg, bytes) {
|
||
return true;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// pytest plain-`assert` bounding check. `deser_call` must be the
|
||
/// recognised deser invocation; we walk upward through allowed
|
||
/// wrappers until we reach an `assert_statement` whose first named
|
||
/// child (the asserted expression, NOT the optional message) is the
|
||
/// chain we walked. Boolean operators and conditional expressions
|
||
/// break the bound (they can short-circuit past the assertion).
|
||
fn python_pytest_assert_bounds_deser(deser_call: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let mut cur = deser_call;
|
||
for _ in 0..8 {
|
||
let Some(parent) = cur.parent() else {
|
||
return false;
|
||
};
|
||
match parent.kind() {
|
||
"assert_statement" => {
|
||
// Asserted expression sits at named_child(0); the
|
||
// optional message sits at named_child(1).
|
||
let first = parent.named_child(0);
|
||
return first.is_some_and(|n| n.id() == cur.id());
|
||
}
|
||
"comparison_operator" => {
|
||
if !python_comparison_other_side_is_literal(parent, cur, bytes) {
|
||
return false;
|
||
}
|
||
cur = parent;
|
||
}
|
||
// `not deser` parses as `not_operator`; `+/-/~ deser` as
|
||
// `unary_operator`. Both leave the deser-side as the sole
|
||
// operand and bound the assertion result to a scalar.
|
||
"unary_operator" | "not_operator" => {
|
||
cur = parent;
|
||
}
|
||
"parenthesized_expression" => {
|
||
cur = parent;
|
||
}
|
||
"argument_list" => {
|
||
let Some(parent_call) = parent.parent() else {
|
||
return false;
|
||
};
|
||
if parent_call.kind() != "call" {
|
||
return false;
|
||
}
|
||
let Some(func) = parent_call.child_by_field_name("function") else {
|
||
return false;
|
||
};
|
||
if func.kind() != "identifier" {
|
||
return false;
|
||
}
|
||
let Ok(name) = std::str::from_utf8(&bytes[func.byte_range()]) else {
|
||
return false;
|
||
};
|
||
match name {
|
||
"isinstance" => {
|
||
// isinstance(deser, TYPE) — deser must be at
|
||
// positional index 0 and the second positional
|
||
// arg must be a type reference.
|
||
let mut pos = 0usize;
|
||
let mut found_at: Option<usize> = None;
|
||
let mut other_args: Vec<tree_sitter::Node> = Vec::new();
|
||
for i in 0..parent.named_child_count() as u32 {
|
||
let Some(c) = parent.named_child(i) else {
|
||
return false;
|
||
};
|
||
if c.kind() == "keyword_argument" {
|
||
continue;
|
||
}
|
||
if c.id() == cur.id() {
|
||
found_at = Some(pos);
|
||
} else {
|
||
other_args.push(c);
|
||
}
|
||
pos += 1;
|
||
}
|
||
if found_at != Some(0)
|
||
|| other_args.len() != 1
|
||
|| !is_python_type_reference(other_args[0])
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
"bool" | "len" | "type" | "id" => {
|
||
// bool(deser) / len(deser) / type(deser) /
|
||
// id(deser) — single-arg scalar wrappers.
|
||
let mut named_count = 0usize;
|
||
for i in 0..parent.named_child_count() as u32 {
|
||
let Some(c) = parent.named_child(i) else {
|
||
return false;
|
||
};
|
||
if c.kind() == "keyword_argument" {
|
||
continue;
|
||
}
|
||
named_count += 1;
|
||
}
|
||
if named_count != 1 {
|
||
return false;
|
||
}
|
||
}
|
||
_ => return false,
|
||
}
|
||
cur = parent_call;
|
||
}
|
||
// Boolean ops and conditionals can short-circuit and let
|
||
// a poisoned blob's side effect run before the assertion
|
||
// fires. Reject so the original finding stands.
|
||
"boolean_operator" | "conditional_expression" => return false,
|
||
_ => return false,
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// `comparison_operator` bounding: the other operand(s) must all be
|
||
/// literal expressions (recursive literal classifier). Operator-kind
|
||
/// children (`is` / `is_not` / `in` / `not_in` are named in
|
||
/// tree-sitter-python) are skipped. Also requires `deser_side` to
|
||
/// actually be one of the named children, defending against unrelated
|
||
/// chained comparisons.
|
||
fn python_comparison_other_side_is_literal(
|
||
cmp: tree_sitter::Node,
|
||
deser_side: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
) -> bool {
|
||
let mut found_self = false;
|
||
for i in 0..cmp.named_child_count() as u32 {
|
||
let Some(c) = cmp.named_child(i) else {
|
||
return false;
|
||
};
|
||
match c.kind() {
|
||
"is" | "is_not" | "in" | "not_in" => continue,
|
||
_ => {}
|
||
}
|
||
if c.id() == deser_side.id() {
|
||
found_self = true;
|
||
continue;
|
||
}
|
||
if !is_python_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
found_self
|
||
}
|
||
|
||
/// Returns `true` when `call_node` is a Python `call` whose callee
|
||
/// is a recognised deserialization function (`pickle.loads` /
|
||
/// `pickle.load` / `yaml.load` / `shelve.open` / `marshal.loads` /
|
||
/// `marshal.load`). Plain identifier callees (`loads(blob)` after
|
||
/// `from pickle import loads`) are also recognised by leaf name to
|
||
/// match the import-shape ambiguity.
|
||
fn is_python_deser_call(call_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let Some(func) = call_node.child_by_field_name("function") else {
|
||
return false;
|
||
};
|
||
match func.kind() {
|
||
"attribute" => {
|
||
let Some(obj) = func.child_by_field_name("object") else {
|
||
return false;
|
||
};
|
||
let Some(attr) = func.child_by_field_name("attribute") else {
|
||
return false;
|
||
};
|
||
let Ok(obj_text) = std::str::from_utf8(&bytes[obj.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let Ok(attr_text) = std::str::from_utf8(&bytes[attr.byte_range()]) else {
|
||
return false;
|
||
};
|
||
matches!(
|
||
(obj_text, attr_text),
|
||
("pickle", "loads")
|
||
| ("pickle", "load")
|
||
| ("cPickle", "loads")
|
||
| ("cPickle", "load")
|
||
| ("yaml", "load")
|
||
| ("yaml", "unsafe_load")
|
||
| ("shelve", "open")
|
||
| ("marshal", "loads")
|
||
| ("marshal", "load")
|
||
)
|
||
}
|
||
"identifier" => {
|
||
let Ok(name) = std::str::from_utf8(&bytes[func.byte_range()]) else {
|
||
return false;
|
||
};
|
||
matches!(name, "loads" | "load" | "unsafe_load")
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Single-arg `unittest.TestCase` assertion verbs whose name alone
/// constrains the inspected value.
///
/// When the deser call is the sole positional argument of one of these
/// verbs, a failed assertion aborts the test instead of letting an
/// object-injection side effect escape. Matching is exact and
/// case-sensitive.
fn is_python_unittest_single_arg_bounding_verb(name: &str) -> bool {
    const SINGLE_ARG_VERBS: &[&str] = &[
        "assertIsNone",
        "assertIsNotNone",
        "assertTrue",
        "assertFalse",
        "assertNotNone",
        "assertNone",
        "failIf",
        "failUnless",
        "assert_",
    ];
    SINGLE_ARG_VERBS.contains(&name)
}

/// Multi-arg `unittest.TestCase` assertion verbs that perform a
/// literal-comparable bound on their value positions: equality, identity,
/// ordering, container equality, membership and regex match.
///
/// Matching is exact and case-sensitive. Legacy aliases
/// (`assertEquals`, `failUnlessEqual`, ...) and snake_case forms
/// (`assert_equal`, `assert_not_equal`) are included.
fn is_python_unittest_multi_arg_bounding_verb(name: &str) -> bool {
    const MULTI_ARG_VERBS: &[&str] = &[
        // equality / identity
        "assertEqual",
        "assertEquals",
        "assertNotEqual",
        "assertNotEquals",
        "assert_equal",
        "assert_not_equal",
        "assertIs",
        "assertIsNot",
        "assertAlmostEqual",
        "assertNotAlmostEqual",
        // ordering
        "assertGreater",
        "assertGreaterEqual",
        "assertLess",
        "assertLessEqual",
        // container equality
        "assertListEqual",
        "assertTupleEqual",
        "assertDictEqual",
        "assertSetEqual",
        "assertSequenceEqual",
        "assertMultiLineEqual",
        "assertCountEqual",
        "assertItemsEqual",
        // membership / regex
        "assertIn",
        "assertNotIn",
        "assertRegex",
        "assertNotRegex",
        "assertRegexpMatches",
        "assertNotRegexpMatches",
        // legacy fail* aliases
        "failUnlessEqual",
        "failIfEqual",
    ];
    MULTI_ARG_VERBS.contains(&name)
}

/// Recognise a Python type reference suitable as the second arg to
|
||
/// `assertIsInstance(value, type)`. Accepts builtin/user-class
|
||
/// identifiers, dotted attribute access (`module.Type`), generic
|
||
/// subscripts (`list[int]`), and tuples-of-types.
|
||
fn is_python_type_reference(node: tree_sitter::Node) -> bool {
|
||
match node.kind() {
|
||
"identifier" | "attribute" | "subscript" => true,
|
||
"tuple" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_python_type_reference(c) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Python literal expression suitable as the "expected" argument of
|
||
/// a `unittest.TestCase.assertEqual`-family assertion. Recursive:
|
||
/// list / tuple / set / dict elements and unary signs on numerics
|
||
/// must themselves be literal. Identifier references and attribute
|
||
/// access do NOT count (those could resolve to dynamic values).
|
||
fn is_python_assertion_literal_expected(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
match node.kind() {
|
||
"string" => !has_python_string_interpolation(node),
|
||
"concatenated_string" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_python_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
"integer" | "float" | "true" | "false" | "none" | "ellipsis" => true,
|
||
"unary_operator" => node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_python_assertion_literal_expected(c, bytes)),
|
||
"list" | "tuple" | "set" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_python_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
"dictionary" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if c.kind() != "pair" {
|
||
return false;
|
||
}
|
||
let Some(key) = c.child_by_field_name("key") else {
|
||
return false;
|
||
};
|
||
let Some(value) = c.child_by_field_name("value") else {
|
||
return false;
|
||
};
|
||
if !is_python_assertion_literal_expected(key, bytes) {
|
||
return false;
|
||
}
|
||
if !is_python_assertion_literal_expected(value, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Python f-strings are `string` nodes with `interpolation` children.
|
||
/// Treat them as non-literal because the interpolated value is
|
||
/// dynamic.
|
||
fn has_python_string_interpolation(node: tree_sitter::Node) -> bool {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i)
|
||
&& c.kind() == "interpolation"
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// Ruby Layer C5: returns `true` when a `Marshal.load` / `YAML.load` /
|
||
/// `Psych.load` call sits directly inside a Minitest assertion or RSpec
|
||
/// matcher chain whose other operand is a literal expected. Same
|
||
/// non-actionability rationale as the Python and PHP recognisers
|
||
/// above: round-trip tests bound the deser result to a literal, a
|
||
/// poisoned blob would fail the assertion, no object-injection side
|
||
/// effect escapes the test boundary.
|
||
///
|
||
/// Conservative recognition:
|
||
/// - Minitest: `assert_equal LIT, deser`, `assert_nil deser`,
|
||
/// `assert deser` (truthy), and the `refute_*` mirrors.
|
||
/// - RSpec: `expect(deser).to eq(LIT)`, `expect(deser).to be_nil`,
|
||
/// `expect(deser).to be_a(TYPE)`, `be_truthy`, `not_to`/`to_not`.
|
||
/// - Old-style `.should ==` chains are NOT recognised (they're
|
||
/// discouraged in modern RSpec and the AST shape parses as a
|
||
/// `binary` rather than the receiver-method-arguments shape).
|
||
fn is_ruby_deser_inside_test_assertion(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let enclosing_call = find_enclosing_call(cap_node);
|
||
let Some(deser_call) = enclosing_call else {
|
||
return false;
|
||
};
|
||
if !is_ruby_deser_call(deser_call, bytes) {
|
||
return false;
|
||
}
|
||
let Some(arg_list) = deser_call.parent() else {
|
||
return false;
|
||
};
|
||
if arg_list.kind() != "argument_list" {
|
||
return false;
|
||
}
|
||
let Some(outer_call) = arg_list.parent() else {
|
||
return false;
|
||
};
|
||
if outer_call.kind() != "call" {
|
||
return false;
|
||
}
|
||
if outer_call.child_by_field_name("receiver").is_some() {
|
||
return false;
|
||
}
|
||
let Some(method_node) = outer_call.child_by_field_name("method") else {
|
||
return false;
|
||
};
|
||
let Ok(name) = std::str::from_utf8(&bytes[method_node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
|
||
if is_ruby_minitest_single_arg_bounding_verb(name)
|
||
|| is_ruby_minitest_multi_arg_bounding_verb(name)
|
||
|| matches!(
|
||
name,
|
||
"assert_kind_of" | "assert_instance_of" | "refute_kind_of" | "refute_instance_of"
|
||
)
|
||
{
|
||
return ruby_minitest_assertion_bounds_deser(outer_call, deser_call, bytes);
|
||
}
|
||
|
||
if name == "expect" {
|
||
let Some(rspec_outer) = outer_call.parent() else {
|
||
return false;
|
||
};
|
||
if rspec_outer.kind() != "call" {
|
||
return false;
|
||
}
|
||
let Some(receiver) = rspec_outer.child_by_field_name("receiver") else {
|
||
return false;
|
||
};
|
||
if receiver.id() != outer_call.id() {
|
||
return false;
|
||
}
|
||
let Some(rspec_method) = rspec_outer.child_by_field_name("method") else {
|
||
return false;
|
||
};
|
||
let Ok(verb) = std::str::from_utf8(&bytes[rspec_method.byte_range()]) else {
|
||
return false;
|
||
};
|
||
if !matches!(verb, "to" | "not_to" | "to_not") {
|
||
return false;
|
||
}
|
||
let Some(matcher_args) = rspec_outer.child_by_field_name("arguments") else {
|
||
return false;
|
||
};
|
||
return ruby_rspec_matcher_bounds_deser(matcher_args, bytes);
|
||
}
|
||
|
||
false
|
||
}
|
||
|
||
/// `Marshal.load` / `YAML.load` / `YAML.unsafe_load` / `Psych.load` /
|
||
/// `Psych.unsafe_load` shape recogniser. Only the canonical `Module.method`
|
||
/// chain — bare-leaf `load(b)` is ambiguous in Ruby and not flagged as a
|
||
/// pattern hit, so no need to handle it here.
|
||
fn is_ruby_deser_call(call_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let Some(receiver) = call_node.child_by_field_name("receiver") else {
|
||
return false;
|
||
};
|
||
let Some(method) = call_node.child_by_field_name("method") else {
|
||
return false;
|
||
};
|
||
if receiver.kind() != "constant" {
|
||
return false;
|
||
}
|
||
let Ok(recv_text) = std::str::from_utf8(&bytes[receiver.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let Ok(method_text) = std::str::from_utf8(&bytes[method.byte_range()]) else {
|
||
return false;
|
||
};
|
||
matches!(
|
||
(recv_text, method_text),
|
||
("Marshal", "load")
|
||
| ("Marshal", "restore")
|
||
| ("YAML", "load")
|
||
| ("YAML", "unsafe_load")
|
||
| ("YAML", "load_file")
|
||
| ("Psych", "load")
|
||
| ("Psych", "unsafe_load")
|
||
| ("Psych", "load_file")
|
||
)
|
||
}
|
||
|
||
fn ruby_minitest_assertion_bounds_deser(
|
||
call: tree_sitter::Node,
|
||
deser_call: tree_sitter::Node,
|
||
bytes: &[u8],
|
||
) -> bool {
|
||
let Some(method) = call.child_by_field_name("method") else {
|
||
return false;
|
||
};
|
||
let Ok(name) = std::str::from_utf8(&bytes[method.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let Some(arg_list) = call.child_by_field_name("arguments") else {
|
||
return false;
|
||
};
|
||
let mut pos_args: Vec<tree_sitter::Node> = Vec::new();
|
||
let mut deser_pos: Option<usize> = None;
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
let Some(c) = arg_list.named_child(i) else {
|
||
continue;
|
||
};
|
||
// Minitest verbs accept a trailing message argument as last
|
||
// positional; both that and the value positions are checked
|
||
// through the literal tester so kwargs and hash splats are
|
||
// the only kinds that need to be stripped here.
|
||
if matches!(c.kind(), "pair" | "hash_splat_argument") {
|
||
continue;
|
||
}
|
||
if c.id() == deser_call.id() {
|
||
deser_pos = Some(pos_args.len());
|
||
}
|
||
pos_args.push(c);
|
||
}
|
||
let Some(deser_pos) = deser_pos else {
|
||
return false;
|
||
};
|
||
if pos_args.is_empty() {
|
||
return false;
|
||
}
|
||
|
||
if pos_args.len() == 1 {
|
||
return is_ruby_minitest_single_arg_bounding_verb(name);
|
||
}
|
||
|
||
if matches!(
|
||
name,
|
||
"assert_kind_of" | "assert_instance_of" | "refute_kind_of" | "refute_instance_of"
|
||
) {
|
||
let type_pos = if deser_pos == 0 { 1 } else { 0 };
|
||
if let Some(type_arg) = pos_args.get(type_pos)
|
||
&& is_ruby_type_reference(*type_arg)
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
|
||
if !is_ruby_minitest_multi_arg_bounding_verb(name) {
|
||
return false;
|
||
}
|
||
for (i, arg) in pos_args.iter().enumerate() {
|
||
if i == deser_pos {
|
||
continue;
|
||
}
|
||
if is_ruby_assertion_literal_expected(*arg, bytes) {
|
||
return true;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
fn ruby_rspec_matcher_bounds_deser(args_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let Some(matcher) = args_node.named_child(0) else {
|
||
return false;
|
||
};
|
||
match matcher.kind() {
|
||
"identifier" => {
|
||
// Bare-name matchers: be_nil, be_truthy, be_falsey, etc.
|
||
let Ok(name) = std::str::from_utf8(&bytes[matcher.byte_range()]) else {
|
||
return false;
|
||
};
|
||
is_ruby_rspec_bare_matcher(name)
|
||
}
|
||
"call" => {
|
||
let Some(method) = matcher.child_by_field_name("method") else {
|
||
return false;
|
||
};
|
||
let Ok(name) = std::str::from_utf8(&bytes[method.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let Some(matcher_args) = matcher.child_by_field_name("arguments") else {
|
||
return false;
|
||
};
|
||
match name {
|
||
"eq" | "eql" | "equal" | "match_array" | "contain_exactly" => {
|
||
let mut any = false;
|
||
for i in 0..matcher_args.named_child_count() as u32 {
|
||
let Some(c) = matcher_args.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_ruby_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
any = true;
|
||
}
|
||
any
|
||
}
|
||
"be_a" | "be_an" | "be_kind_of" | "be_instance_of" | "be_a_kind_of" => {
|
||
let Some(c) = matcher_args.named_child(0) else {
|
||
return false;
|
||
};
|
||
is_ruby_type_reference(c)
|
||
}
|
||
"be" => {
|
||
// `be(LITERAL)` — `be == LIT` shape isn't representable here,
|
||
// accept a single literal arg.
|
||
let Some(c) = matcher_args.named_child(0) else {
|
||
return false;
|
||
};
|
||
is_ruby_assertion_literal_expected(c, bytes)
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Minitest verbs that bound a value passed as their only positional
/// argument: truthiness, nil-ness and emptiness checks, plus their
/// `refute_*` mirrors. Matching is exact and case-sensitive.
fn is_ruby_minitest_single_arg_bounding_verb(name: &str) -> bool {
    const SINGLE_ARG_VERBS: &[&str] = &[
        "assert",
        "assert_nil",
        "refute",
        "refute_nil",
        "assert_empty",
        "refute_empty",
    ];
    SINGLE_ARG_VERBS.contains(&name)
}

/// Minitest verbs that compare a value against at least one other
/// operand: equality, delta/epsilon closeness, inclusion, pattern match,
/// operator/predicate checks and identity, plus their `refute_*` mirrors.
/// Matching is exact and case-sensitive.
fn is_ruby_minitest_multi_arg_bounding_verb(name: &str) -> bool {
    const MULTI_ARG_VERBS: &[&str] = &[
        "assert_equal",
        "assert_not_equal",
        "refute_equal",
        "assert_in_delta",
        "assert_in_epsilon",
        "assert_includes",
        "refute_includes",
        "assert_match",
        "refute_match",
        "assert_operator",
        "refute_operator",
        "assert_predicate",
        "refute_predicate",
        "assert_same",
        "refute_same",
    ];
    MULTI_ARG_VERBS.contains(&name)
}

/// RSpec matchers that take no argument and are accepted as bounding the
/// expected value when written as a bare identifier (e.g.
/// `expect(x).to be_nil`). Matching is exact and case-sensitive.
fn is_ruby_rspec_bare_matcher(name: &str) -> bool {
    const BARE_MATCHERS: &[&str] = &[
        "be_nil",
        "be_truthy",
        "be_falsey",
        "be_falsy",
        "be_empty",
        "be_present",
        "be_zero",
        "be_positive",
        "be_negative",
    ];
    BARE_MATCHERS.contains(&name)
}

fn is_ruby_type_reference(node: tree_sitter::Node) -> bool {
|
||
matches!(node.kind(), "constant" | "scope_resolution" | "identifier")
|
||
}
|
||
|
||
/// Recursive Ruby literal classifier. Strings count when they have no
|
||
/// `interpolation` children (`"hello"` literal yes, `"#{x}"` no).
|
||
/// Symbols, numbers, booleans, `nil`, arrays / hashes (recursive),
|
||
/// negative numeric unary, and ranges with literal endpoints all
|
||
/// qualify.
|
||
fn is_ruby_assertion_literal_expected(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
match node.kind() {
|
||
"string" => !has_ruby_string_interpolation(node),
|
||
"string_array" | "symbol_array" => true,
|
||
"integer" | "float" | "true" | "false" | "nil" | "simple_symbol" | "hash_key_symbol"
|
||
| "rational" | "complex" | "regex" => true,
|
||
"unary" => node
|
||
.named_child(0)
|
||
.is_some_and(|c| is_ruby_assertion_literal_expected(c, bytes)),
|
||
"array" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_ruby_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
"hash" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(pair) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if pair.kind() != "pair" {
|
||
return false;
|
||
}
|
||
let Some(key) = pair.child_by_field_name("key") else {
|
||
return false;
|
||
};
|
||
let Some(value) = pair.child_by_field_name("value") else {
|
||
return false;
|
||
};
|
||
if !is_ruby_assertion_literal_expected(key, bytes) {
|
||
return false;
|
||
}
|
||
if !is_ruby_assertion_literal_expected(value, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
"range" => {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
let Some(c) = node.named_child(i) else {
|
||
return false;
|
||
};
|
||
if !is_ruby_assertion_literal_expected(c, bytes) {
|
||
return false;
|
||
}
|
||
}
|
||
true
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
fn has_ruby_string_interpolation(node: tree_sitter::Node) -> bool {
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i)
|
||
&& c.kind() == "interpolation"
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// C/C++-only Layer D: structural suppression of buffer-overflow pattern
|
||
/// rules when the source / format-string argument is a literal whose
|
||
/// contributed length is statically bounded.
|
||
///
|
||
/// **Policy (vulnerability detection, not style):** Nyx flags
|
||
/// `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf` (and the
|
||
/// `cpp.memory.*` mirrors) when the source argument can carry
|
||
/// attacker-controlled length. Calls whose source is a string literal
|
||
/// have a compile-time bound and cannot overflow due to attacker input
|
||
///, a too-small destination is a fixed developer bug (caught by
|
||
/// compiler warnings / `-fstack-protector` / clang-tidy / ASan), not an
|
||
/// exploitable channel. Suppressing these literal-source calls is a
|
||
/// deliberate noise / false-positive reduction aligned with Nyx's scope
|
||
/// (vulnerability detection over style enforcement).
|
||
///
|
||
/// **Test coverage convention:**
|
||
/// - Negative cases (suppression correct) live alongside other state /
|
||
/// lifecycle fixtures and are recorded as soft expectations
|
||
/// (`must_match: false`) in `*.expect.json`. The notes there
|
||
/// reference this function so future authors can trace why the AST
|
||
/// pattern doesn't fire. Examples:
|
||
/// - `tests/fixtures/real_world/c/state/malloc_lifecycle.expect.json`
|
||
/// - `tests/fixtures/real_world/cpp/state/new_delete.expect.json`
|
||
/// - `tests/fixtures/real_world/cpp/state/malloc_branches.expect.json`
|
||
/// - Positive cases (suppression must NOT fire, source is a parameter
|
||
/// or other attacker-reachable value) live as hard expectations
|
||
/// (`must_match: true`) in the taint fixtures:
|
||
/// - `tests/fixtures/real_world/c/taint/buffer_overflow.c`
|
||
/// - `tests/fixtures/real_world/cpp/taint/gets_strcpy.cpp`
|
||
///
|
||
/// Removing this function or weakening its predicate would be caught by
|
||
/// neither, it would be caught by the unit tests below.
|
||
///
|
||
/// Pattern rules `c.memory.strcpy` / `c.memory.strcat` / `c.memory.sprintf`
|
||
/// (and the `cpp.memory.*` mirrors) flag the call syntactically; their
|
||
/// stated danger is "no bounds checking on destination buffer" / "no length
|
||
/// limit on output buffer". That danger is realised only when the source
|
||
/// argument can carry attacker-controlled length. When the source is a
|
||
/// string literal the bound is fixed at compile time, so the call cannot
|
||
/// overflow due to attacker input (a too-small destination is a fixed
|
||
/// developer bug, not an exploitable channel).
|
||
///
|
||
/// Shapes recognised:
|
||
/// - `strcpy(dst, "literal")` → suppress
|
||
/// - `strcpy(dst, COND ? "a" : "b")` → suppress (ternary of two
|
||
/// string-literal branches; the postgres `formatting.c` shape)
|
||
/// - `strcat(dst, "literal")` → same
|
||
/// - `sprintf(dst, "format")` where the format string is a literal
|
||
/// containing no bare `%s` (only width/precision-bounded specifiers
|
||
/// like `%d`, `%lld`, `%c`, `%.*s`, `%.5s`)
|
||
/// → suppress
|
||
///
|
||
/// Conservative refusals:
|
||
/// - source / format is an identifier (could be tainted, e.g.
|
||
/// `sprintf(buf, fmt, …)`) → keep firing
|
||
/// - format is `concatenated_string` containing identifier macros (e.g.
|
||
/// `"%" PRId64`), we cannot statically expand the macro, so refuse
|
||
/// - bare `%s` in format → keep firing (could read unbounded length)
|
||
fn is_c_buffer_call_literal_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let kind = match rule_id {
|
||
"c.memory.strcpy" | "cpp.memory.strcpy" => CBufferRule::StrcpyOrCat,
|
||
"c.memory.strcat" | "cpp.memory.strcat" => CBufferRule::StrcpyOrCat,
|
||
"c.memory.sprintf" | "cpp.memory.sprintf" => CBufferRule::Sprintf,
|
||
_ => return false,
|
||
};
|
||
let call = find_enclosing_call(cap_node);
|
||
let Some(call) = call else { return false };
|
||
let arg_list = find_arg_list(call);
|
||
let Some(arg_list) = arg_list else {
|
||
return false;
|
||
};
|
||
let mut args = Vec::new();
|
||
for i in 0..arg_list.named_child_count() as u32 {
|
||
if let Some(c) = arg_list.named_child(i) {
|
||
args.push(c);
|
||
}
|
||
}
|
||
if args.len() < 2 {
|
||
return false;
|
||
}
|
||
let src = args[1];
|
||
match kind {
|
||
CBufferRule::StrcpyOrCat => is_c_string_literal_or_lit_ternary(src, bytes),
|
||
CBufferRule::Sprintf => {
|
||
// Format must be a single string literal with safe specifiers.
|
||
// Refuse identifiers and concatenated_string (PRI* macros).
|
||
if !matches!(
|
||
src.kind(),
|
||
"string_literal" | "raw_string_literal" | "string"
|
||
) {
|
||
return false;
|
||
}
|
||
let Some(text) = c_string_literal_payload(src, bytes) else {
|
||
return false;
|
||
};
|
||
sprintf_format_is_safe(&text)
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Which family of C/C++ buffer-writing rule a finding belongs to; selects
/// the literal-safety check applied to the call's second argument.
#[derive(Clone, Copy)]
enum CBufferRule {
    /// `strcpy` / `strcat`: the second argument is the source string.
    StrcpyOrCat,
    /// `sprintf`: the second argument is the format string.
    Sprintf,
}
|
||
|
||
/// True for: a C/C++ string literal, OR a `conditional_expression` whose
|
||
/// consequence + alternative are both either string literals or ALL_CAPS
|
||
/// identifiers (the canonical preprocessor-macro naming convention for
|
||
/// string-constant `#define`s, `P_M_STR`, `A_M_STR`, `BG_NAME`, etc., used
|
||
/// pervasively in postgres' `formatting.c::DCH_a_m`). Parenthesised forms
|
||
/// are unwrapped.
|
||
///
|
||
/// The ALL_CAPS heuristic recognises identifiers whose every character is
|
||
/// in `[A-Z0-9_]` and which contain at least one alphabetic letter.
|
||
/// Variables in C/C++ are conventionally lower / camelCase; macros are
|
||
/// SHOUTING_SNAKE. False acceptance of an actual variable is possible but
|
||
/// extraordinarily rare in real codebases.
|
||
fn is_c_string_literal_or_lit_ternary(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let n = unwrap_c_paren(node);
|
||
match n.kind() {
|
||
"string_literal" | "raw_string_literal" | "string" => true,
|
||
"conditional_expression" => {
|
||
// tree-sitter-c shape: condition, consequence, alternative as
|
||
// named children. Accept when BOTH branches are string
|
||
// literals or ALL_CAPS identifiers.
|
||
let mut branches: Vec<tree_sitter::Node> = Vec::new();
|
||
for i in 0..n.named_child_count() as u32 {
|
||
if let Some(c) = n.named_child(i) {
|
||
branches.push(c);
|
||
}
|
||
}
|
||
if branches.len() < 3 {
|
||
return false;
|
||
}
|
||
// first child is the condition; the next two are the branches.
|
||
let conseq = unwrap_c_paren(branches[1]);
|
||
let alt = unwrap_c_paren(branches[2]);
|
||
is_c_lit_or_macro_branch(conseq, bytes) && is_c_lit_or_macro_branch(alt, bytes)
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
fn is_c_lit_or_macro_branch(node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
match node.kind() {
|
||
"string_literal" | "raw_string_literal" | "string" => true,
|
||
"identifier" => {
|
||
let Ok(name) = std::str::from_utf8(&bytes[node.byte_range()]) else {
|
||
return false;
|
||
};
|
||
is_all_caps_macro_name(name)
|
||
}
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
/// Returns `true` when `s` consists solely of ASCII uppercase letters, ASCII
/// digits and underscores, and contains at least one uppercase letter. The
/// empty string is rejected.
fn is_all_caps_macro_name(s: &str) -> bool {
    let is_macro_char = |c: char| c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_';
    // `all` is vacuously true for "", so the `any` clause also rejects the
    // empty string and digit/underscore-only names.
    s.chars().all(is_macro_char) && s.chars().any(|c| c.is_ascii_uppercase())
}
|
||
|
||
/// Peels up to four nested `parenthesized_expression` wrappers off `node`
/// and returns the innermost node reached. Stops early at the first node
/// that is not parenthesised or has no named child.
fn unwrap_c_paren(node: tree_sitter::Node) -> tree_sitter::Node {
    let mut current = node;
    let mut remaining = 4;
    while remaining > 0 && current.kind() == "parenthesized_expression" {
        match current.named_child(0) {
            Some(inner) => current = inner,
            None => break,
        }
        remaining -= 1;
    }
    current
}
|
||
|
||
/// Extract the textual payload of a C/C++ string literal node, stripping
|
||
/// the surrounding double-quotes and the optional encoding prefix
|
||
/// (`L"..."`, `u8"..."`, `R"(...)"`). Returns `None` if the bytes are not
|
||
/// valid UTF-8 or the literal cannot be decoded.
|
||
fn c_string_literal_payload(node: tree_sitter::Node, bytes: &[u8]) -> Option<String> {
|
||
// Prefer a `string_content` child if tree-sitter exposes one.
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i)
|
||
&& c.kind() == "string_content"
|
||
&& let Ok(s) = std::str::from_utf8(&bytes[c.byte_range()])
|
||
{
|
||
return Some(s.to_string());
|
||
}
|
||
}
|
||
// Fall back: strip the surrounding quotes from the full literal text.
|
||
let raw = std::str::from_utf8(&bytes[node.byte_range()]).ok()?;
|
||
let trimmed = raw.trim();
|
||
// Drop optional encoding prefix.
|
||
let after_prefix = trimmed
|
||
.trim_start_matches('L')
|
||
.trim_start_matches("u8")
|
||
.trim_start_matches('u')
|
||
.trim_start_matches('U');
|
||
let s = after_prefix
|
||
.strip_prefix('"')
|
||
.and_then(|s| s.strip_suffix('"'));
|
||
s.map(|s| s.to_string())
|
||
}
|
||
|
||
/// Returns `true` when a `printf`-family format string can never overflow a
/// destination buffer due to attacker-controlled length.
///
/// Every `%` conversion in `fmt` is parsed (flags, width, precision, length
/// modifiers, conversion character) and the format is refused (`false`) when
/// any of the following holds:
///
/// - a `%s` has no precision: bare `%s` and width-only `%5s` are unbounded
///   because width is a *minimum*; `%.5s` / `%.*s` are accepted because
///   precision caps the maximum;
/// - the field width is `*`: the minimum width then comes from a runtime
///   argument, so the output length is argument-controlled for every
///   conversion (`%*d`, `%*.3s`);
/// - a numeric / char / pointer conversion has a `.*` precision: for these
///   conversions precision is a minimum digit count, not a cap (`%.*d`,
///   `%.*f`);
/// - the conversion character is not recognised (e.g. `%S`);
/// - the format ends in the middle of a conversion (trailing `%`).
///
/// `%%` is a literal percent sign. A format with no conversions is safe.
pub(crate) fn sprintf_format_is_safe(fmt: &str) -> bool {
    let bytes = fmt.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] != b'%' {
            i += 1;
            continue;
        }
        i += 1;
        match bytes.get(i) {
            // Trailing `%`: malformed, refuse to suppress.
            None => return false,
            Some(b'%') => {
                i += 1;
                continue;
            }
            Some(_) => {}
        }

        // Flags.
        while matches!(
            bytes.get(i),
            Some(b'-' | b'+' | b'#' | b' ' | b'0' | b'\'')
        ) {
            i += 1;
        }

        // Width. A literal width is a fixed compile-time minimum; a `*` width
        // is supplied at runtime and makes the output length unbounded.
        if bytes.get(i) == Some(&b'*') {
            return false;
        }
        while bytes.get(i).is_some_and(u8::is_ascii_digit) {
            i += 1;
        }

        // Optional precision (`.` followed by digits, `*`, or nothing).
        let mut has_precision = false;
        let mut dynamic_precision = false;
        if bytes.get(i) == Some(&b'.') {
            has_precision = true;
            i += 1;
            if bytes.get(i) == Some(&b'*') {
                dynamic_precision = true;
                i += 1;
            } else {
                while bytes.get(i).is_some_and(u8::is_ascii_digit) {
                    i += 1;
                }
            }
        }

        // Length modifiers: h hh l ll L q z j t.
        while matches!(
            bytes.get(i),
            Some(b'h' | b'l' | b'L' | b'q' | b'z' | b'j' | b't')
        ) {
            i += 1;
        }

        let Some(&conv) = bytes.get(i) else {
            return false;
        };
        i += 1;

        match conv {
            // Numeric / char / pointer conversions: bounded output unless the
            // precision (a minimum digit count here) is runtime-supplied.
            b'd' | b'i' | b'u' | b'o' | b'x' | b'X' | b'c' | b'e' | b'E' | b'f' | b'F' | b'g'
            | b'G' | b'a' | b'A' | b'p' | b'n' => {
                if dynamic_precision {
                    return false;
                }
            }
            // String conversion: only safe when precision-bounded.
            // NOTE(review): C treats a negative `*` precision as omitted, so
            // `%.*s` is only a cap when the argument is non-negative; it is
            // kept as accepted to match the documented policy.
            b's' => {
                if !has_precision {
                    return false;
                }
            }
            // Unknown conversion (e.g. `%S` wide-char on Windows is
            // unbounded): conservative refuse.
            _ => return false,
        }
    }
    true
}
|
||
|
||
fn is_string_literal_with_text(node: tree_sitter::Node, text: &str, bytes: &[u8]) -> bool {
|
||
if node.kind() != "string" && node.kind() != "encapsed_string" {
|
||
return false;
|
||
}
|
||
// Look for a single string_content / string_value child.
|
||
let mut payload = None;
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i)
|
||
&& (c.kind() == "string_content" || c.kind() == "string_value")
|
||
{
|
||
payload = Some(c);
|
||
break;
|
||
}
|
||
}
|
||
let Some(payload) = payload else {
|
||
// Fall back: PHP single-quoted strings sometimes inline the content.
|
||
if let Ok(s) = std::str::from_utf8(&bytes[node.byte_range()]) {
|
||
let trimmed = s.trim_matches(|c| c == '\'' || c == '"');
|
||
return trimmed == text;
|
||
}
|
||
return false;
|
||
};
|
||
if let Ok(s) = std::str::from_utf8(&bytes[payload.byte_range()]) {
|
||
return s == text;
|
||
}
|
||
false
|
||
}
|
||
|
||
/// C++-only Layer E: structural suppression of `cpp.memory.reinterpret_cast`
|
||
/// when the cast's target type is explicitly defined as safe by the C++
|
||
/// aliasing rules.
|
||
///
|
||
/// `reinterpret_cast<T>(x)` is *not* always undefined behaviour — the C++
|
||
/// standard ([basic.lval]/11) explicitly permits accessing any object
|
||
/// representation through a pointer to `char`, `unsigned char`, or
|
||
/// `std::byte` (and, by long-standing convention, `int8_t` / `uint8_t`).
|
||
/// `void*` is similarly safe because reads / writes are illegal through it
|
||
/// (the program must always cast back before dereferencing). The integer
|
||
/// round-trip `uintptr_t` / `intptr_t` is guaranteed lossless by the
|
||
/// standard. POSIX additionally type-puns the `sockaddr` family — the
|
||
/// BSD-socket API takes `struct sockaddr *` and the program must cast from
|
||
/// `sockaddr_in*` / `sockaddr_in6*` / `sockaddr_un*` / `sockaddr_storage*`,
|
||
/// which is the API's intended use.
|
||
///
|
||
/// The pattern rule `cpp.memory.reinterpret_cast` cannot distinguish these
|
||
/// well-defined casts from genuinely dangerous strict-aliasing UB casts
|
||
/// (`reinterpret_cast<MyStruct*>(buf)`), so it over-fires by ~70% on
|
||
/// real-repo serialization, hashing, IPC, and socket-API code where the
|
||
/// cast is the canonical (and standard-blessed) idiom. Suppressing the
|
||
/// well-defined target-type set is a layer-2 structural fix (per the
|
||
/// bughunt depth hierarchy): the engine recognises the property
|
||
/// (well-defined target type) that makes the cast safe in C++ and
|
||
/// suppresses based on it. Genuine strict-aliasing risk casts (target is
|
||
/// a user struct / class type) keep firing.
|
||
///
|
||
/// Shapes recognised (any pointer depth `>= 1` unless noted):
|
||
/// - `char*`, `signed char*`, `unsigned char*`, `wchar_t*`
|
||
/// - `uint8_t*`, `int8_t*`, `std::byte*`, `byte*`
|
||
/// - `void*`
|
||
/// - `uintptr_t`, `std::uintptr_t`, `intptr_t`, `std::intptr_t` (no
|
||
/// pointer depth required — the standard guarantees the lossless
|
||
/// round-trip even for the integer form)
|
||
/// - `sockaddr*`, `struct sockaddr*`, `sockaddr_in*`, `sockaddr_in6*`,
|
||
/// `sockaddr_un*`, `sockaddr_storage*` (any of the BSD-socket
|
||
/// address-structure family)
|
||
///
|
||
/// Conservative refusals (kept firing): user-defined struct / class
|
||
/// pointer targets, template type parameters (`T*`), and any target the
|
||
/// normaliser cannot identify.
|
||
fn is_cpp_cast_target_type_safe(rule_id: &str, cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
if rule_id != "cpp.memory.reinterpret_cast" {
|
||
return false;
|
||
}
|
||
// `cap_node` is the `(identifier) @n` "reinterpret_cast" capture (the
|
||
// pattern's index-0 capture, by query-string order — see Layer A's
|
||
// `c.index == 0` selection in `run_ast_queries`). Walk up via
|
||
// `find_enclosing_call` to reach the outer `call_expression`. Its
|
||
// `function` field is a `template_function` whose `arguments` field is
|
||
// the `template_argument_list` carrying the target type.
|
||
let call = find_enclosing_call(cap_node);
|
||
let Some(call) = call else { return false };
|
||
let func = call.child_by_field_name("function");
|
||
let Some(func) = func else { return false };
|
||
if func.kind() != "template_function" {
|
||
return false;
|
||
}
|
||
let targs = func.child_by_field_name("arguments");
|
||
let Some(targs) = targs else { return false };
|
||
if targs.kind() != "template_argument_list" {
|
||
return false;
|
||
}
|
||
let Ok(text) = std::str::from_utf8(&bytes[targs.byte_range()]) else {
|
||
return false;
|
||
};
|
||
let inner = text
|
||
.trim()
|
||
.trim_start_matches('<')
|
||
.trim_end_matches('>')
|
||
.trim();
|
||
cpp_cast_target_type_is_safe(inner)
|
||
}
|
||
|
||
/// Normalise a C++ cast target type string and report whether it names a
|
||
/// well-defined-by-aliasing-rules type per the policy in
|
||
/// [`is_cpp_cast_target_type_safe`]. Public to the module so the unit
|
||
/// tests can pin the canonical and adversarial shapes.
|
||
pub(crate) fn cpp_cast_target_type_is_safe(s: &str) -> bool {
|
||
// Collapse all internal whitespace (tabs, newlines, multiple spaces)
|
||
// to single spaces so the normalised form is `const char *` with one
|
||
// space between every token.
|
||
let normalised: String = {
|
||
let mut out = String::with_capacity(s.len());
|
||
let mut prev_ws = true;
|
||
for ch in s.chars() {
|
||
if ch.is_whitespace() {
|
||
if !prev_ws {
|
||
out.push(' ');
|
||
prev_ws = true;
|
||
}
|
||
} else {
|
||
out.push(ch);
|
||
prev_ws = false;
|
||
}
|
||
}
|
||
out.trim().to_string()
|
||
};
|
||
let Some(base) = strip_pointer_and_cv(&normalised) else {
|
||
return false;
|
||
};
|
||
// Pointer-indirection depth = count of `*` tokens in the normalised
|
||
// form (whitespace already collapsed; compound forms with parens /
|
||
// brackets / templates are filtered by `strip_pointer_and_cv`).
|
||
let depth = normalised.chars().filter(|c| *c == '*').count();
|
||
|
||
// Depth 0 (value cast): only the pointer<->integer round-trip types
|
||
// are well-defined. Aliasing *through* a `uintptr_t*` / `intptr_t*`
|
||
// is **not** covered by the standard exemption — only converting a
|
||
// pointer value to/from the integer type is defined behaviour
|
||
// ([basic.compound]/3). Therefore we accept these names only at
|
||
// depth 0.
|
||
if depth == 0 {
|
||
return matches!(
|
||
base.as_str(),
|
||
"uintptr_t" | "intptr_t" | "std::uintptr_t" | "std::intptr_t"
|
||
);
|
||
}
|
||
|
||
// Depth >= 2 (pointer-to-pointer and beyond) is never safe: the
|
||
// [basic.lval]/11 aliasing exemption is for accessing an object's
|
||
// representation as bytes through a single pointer indirection.
|
||
// Reading a `char*` object through a `char**` is a strict-aliasing
|
||
// violation, and the same logic applies to `void**`, `uint8_t**`,
|
||
// etc.
|
||
if depth != 1 {
|
||
return false;
|
||
}
|
||
|
||
// Depth 1: standard aliasing exemption for byte-view access plus
|
||
// POSIX socket type-punning and the opaque `void*` target.
|
||
matches!(
|
||
base.as_str(),
|
||
"char"
|
||
| "signed char"
|
||
| "unsigned char"
|
||
| "wchar_t"
|
||
| "uint8_t"
|
||
| "int8_t"
|
||
| "std::byte"
|
||
| "byte"
|
||
| "void"
|
||
| "sockaddr"
|
||
| "struct sockaddr"
|
||
| "sockaddr_in"
|
||
| "sockaddr_in6"
|
||
| "sockaddr_un"
|
||
| "sockaddr_storage"
|
||
| "struct sockaddr_in"
|
||
| "struct sockaddr_in6"
|
||
| "struct sockaddr_un"
|
||
| "struct sockaddr_storage"
|
||
)
|
||
}
|
||
|
||
/// Strip a C++ cast target's leading / trailing `const` / `volatile`
/// qualifiers and trailing `*` characters (any depth), returning the bare
/// base type name.
///
/// Handled shapes include `T*`, `T**`, `const T*`, `T const*`, `T* const`
/// and `T*const`. A trailing qualifier is only stripped when it is a separate
/// token, i.e. directly preceded by a space or a `*`, so an identifier that
/// merely ends in `const` (`myconst`) is left intact.
///
/// Returns `None` when nothing is left, or when the remainder contains any
/// character other than ASCII alphanumerics, `_`, `:` or a space. That
/// rejects function pointers (`void(*)(int)`), template instantiations
/// (`vector<int>`), arrays, references and multi-argument lists.
fn strip_pointer_and_cv(s: &str) -> Option<String> {
    let mut t: &str = s.trim();

    // Leading qualifiers, possibly several (`const volatile T`).
    loop {
        let Some(rest) = t
            .strip_prefix("const ")
            .or_else(|| t.strip_prefix("volatile "))
        else {
            break;
        };
        t = rest.trim_start();
    }

    // Trailing `*`s and qualifiers in any interleaving. Each pass removes one
    // suffix token; the loop ends when neither kind matches.
    loop {
        let trimmed = t.trim_end();
        if let Some(rest) = trimmed.strip_suffix('*') {
            t = rest;
            continue;
        }
        let without_qualifier = ["const", "volatile"].iter().find_map(|qualifier| {
            trimmed
                .strip_suffix(*qualifier)
                // Token boundary: `T const`, `T* const` and `T*const`, but
                // not an identifier that merely ends in the keyword.
                .filter(|rest| rest.ends_with(|c: char| c == ' ' || c == '*'))
        });
        match without_qualifier {
            Some(rest) => t = rest,
            None => break,
        }
    }

    let base = t.trim();
    if base.is_empty() {
        return None;
    }

    // Accept identifier characters, `_`, `:` (for `std::byte`) and spaces
    // (for `unsigned char` / `struct sockaddr`); refuse everything else.
    let is_plain_name = base
        .chars()
        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_' || ch == ':' || ch == ' ');
    is_plain_name.then(|| base.to_string())
}
|
||
|
||
/// PHP-only Layer F: structural suppression of `php.crypto.md5` /
|
||
/// `php.crypto.sha1` when the call's *consuming context* yields a name
|
||
/// that matches a recognised non-cryptographic identifier pattern.
|
||
///
|
||
/// The pattern rule fires syntactically on every `md5(...)` /
|
||
/// `sha1(...)` callsite regardless of how the result is used. In real
|
||
/// PHP code these functions are pervasively used for non-cryptographic
|
||
/// purposes — ETag generation (HTTP cache validators), array/cache-key
|
||
/// hashing, dedup fingerprints, content addressing for templates — and
|
||
/// those uses do not realise the "weak hash function" risk the rule
|
||
/// names. Suppress only when the consuming context yields a name from
|
||
/// a recognised non-crypto suffix set, while keeping every callsite
|
||
/// whose name contains a crypto-keyword substring (`password`,
|
||
/// `secret`, `token`, `signature`, `hmac`, `digest`, `salt`, …).
|
||
///
|
||
/// Consuming contexts inspected (walk up through transparent wrappers
|
||
/// — `binary_expression` for concat / equality, `parenthesized_expression`,
|
||
/// `conditional_expression`, `argument`):
|
||
/// - `assignment_expression` (covers `=`, `??=`, `+=`, …) — resolve
|
||
/// the LHS to a final identifier (variable name, member-access
|
||
/// property name, or string-literal subscript index).
|
||
/// - `array_element_initializer` — the key is a string literal whose
|
||
/// contents are the consuming name.
|
||
/// - `subscript_expression` where the call sits in the index position
|
||
/// — using a hash as an array index is intrinsically non-crypto.
|
||
/// - `return_statement` — resolve the enclosing
|
||
/// `function_definition` / `method_declaration` name (with the
|
||
/// conventional `get` prefix stripped).
|
||
///
|
||
/// All other consuming forms (bare expression statements, comparison
|
||
/// operands without an LHS, lambda returns, arguments to user-defined
|
||
/// helpers) keep firing.
|
||
fn is_php_weak_hash_non_crypto_use(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
|
||
let call = if cap_node.kind() == "function_call_expression" {
|
||
cap_node
|
||
} else {
|
||
let mut cur = cap_node;
|
||
let mut found = None;
|
||
for _ in 0..4 {
|
||
if cur.kind() == "function_call_expression" {
|
||
found = Some(cur);
|
||
break;
|
||
}
|
||
match cur.parent() {
|
||
Some(p) => cur = p,
|
||
None => break,
|
||
}
|
||
}
|
||
match found {
|
||
Some(c) => c,
|
||
None => return false,
|
||
}
|
||
};
|
||
|
||
let mut cur = call;
|
||
let mut steps = 0u32;
|
||
while let Some(parent) = cur.parent() {
|
||
if steps > 16 {
|
||
return false;
|
||
}
|
||
steps += 1;
|
||
match parent.kind() {
|
||
// Transparent wrappers — keep walking to find the
|
||
// consumer. These node kinds preserve the value flowing
|
||
// out of the md5/sha1 call without transforming its
|
||
// semantics, so we let the OUTER context (LHS name,
|
||
// array key, return method, etc.) classify the use.
|
||
//
|
||
// - `binary_expression`: concat (`'foo_' . md5($x)`),
|
||
// equality (`md5($x) === $stored`), arithmetic.
|
||
// - `parenthesized_expression`: redundant parens.
|
||
// - `conditional_expression`: `$cond ? md5($x) : ''`.
|
||
// - `argument` / `arguments`: positional / wrapped arg
|
||
// lists — the enclosing call (`substr(md5($x), 0, 8)`,
|
||
// `$q->createNamedParameter(md5($x))`) is what matters.
|
||
// - `function_call_expression`: identity-shaped wrappers
|
||
// such as `substr(...)`, `strtolower(...)`,
|
||
// `urlencode(...)` which propagate the hash to its
|
||
// real consumer.
|
||
// - `encapsed_string`: `"prefix-{md5($x)}"` interpolation.
|
||
//
|
||
// `member_call_expression` / `nullsafe_member_call_expression`
|
||
// are NOT in this transparent set — they have their own
|
||
// arm below that performs lookup-verb classification on
|
||
// the method name (`->get(md5($k))`, `->set(...)`, …)
|
||
// before optionally falling through to the outer
|
||
// consumer.
|
||
"binary_expression"
|
||
| "parenthesized_expression"
|
||
| "conditional_expression"
|
||
| "argument"
|
||
| "arguments"
|
||
| "function_call_expression"
|
||
| "encapsed_string" => {}
|
||
"assignment_expression" | "augmented_assignment_expression" => {
|
||
let lhs = parent
|
||
.child_by_field_name("left")
|
||
.or_else(|| parent.named_child(0));
|
||
let Some(lhs) = lhs else {
|
||
return false;
|
||
};
|
||
return resolve_php_lvalue_name(lhs, bytes)
|
||
.map(|n| name_is_non_crypto(&n))
|
||
.unwrap_or(false);
|
||
}
|
||
"array_element_initializer" => {
|
||
if parent.named_child_count() < 2 {
|
||
return false;
|
||
}
|
||
let key = parent.named_child(0);
|
||
let Some(key) = key else {
|
||
return false;
|
||
};
|
||
let Some(key_text) = string_literal_text(key, bytes) else {
|
||
return false;
|
||
};
|
||
return name_is_non_crypto(&key_text);
|
||
}
|
||
"subscript_expression" => {
|
||
// tree-sitter-php: subscript_expression has the receiver as
|
||
// the first named child and the index as the second. If our
|
||
// call sits past the receiver's end byte, we are the index.
|
||
let r0 = parent.named_child(0);
|
||
let Some(r0) = r0 else {
|
||
cur = parent;
|
||
continue;
|
||
};
|
||
if call.start_byte() >= r0.end_byte() {
|
||
return true;
|
||
}
|
||
// Otherwise we're inside the receiver chain; the surrounding
|
||
// `assignment_expression` (if any) will resolve the LHS name.
|
||
}
|
||
"member_call_expression" | "nullsafe_member_call_expression" => {
|
||
// The md5/sha1 result is being passed as an argument to a
|
||
// method call. When the method name is a recognised
|
||
// key/cache/lookup verb (`get`, `set`, `has`, `delete`,
|
||
// `fetch`, `store`, `find`, `getItem`, `setItem`, …), the
|
||
// result is being used as a non-cryptographic lookup key —
|
||
// canonical for cache backends, hash maps, and storage
|
||
// adapters where the developer is hashing arbitrary input
|
||
// to a fixed-length, character-safe key. Genuine
|
||
// crypto-comparison wrappers (`hash_equals`, `verify`,
|
||
// `password_verify`) keep firing because their method
|
||
// name does not match the verb set.
|
||
let name_node = parent.child_by_field_name("name").or_else(|| {
|
||
// Fallback: last named child is the method name.
|
||
let count = parent.named_child_count();
|
||
if count == 0 {
|
||
None
|
||
} else {
|
||
parent.named_child(count as u32 - 1)
|
||
}
|
||
});
|
||
if let Some(nn) = name_node
|
||
&& nn.kind() == "name"
|
||
&& let Ok(method) = std::str::from_utf8(&bytes[nn.byte_range()])
|
||
&& method_is_lookup_verb(method)
|
||
{
|
||
return true;
|
||
}
|
||
// Otherwise treat as transparent so the OUTER consumer can
|
||
// classify (`$x = $cache->get(sha1($k))` resolves LHS `x`).
|
||
}
|
||
"return_statement" => {
|
||
let mut p = parent;
|
||
for _ in 0..10 {
|
||
let Some(pp) = p.parent() else {
|
||
return false;
|
||
};
|
||
p = pp;
|
||
let kind = p.kind();
|
||
if kind == "method_declaration" || kind == "function_definition" {
|
||
let Some(nn) = p
|
||
.child_by_field_name("name")
|
||
.or_else(|| find_named_child_of_kind(p, "name"))
|
||
else {
|
||
return false;
|
||
};
|
||
let Ok(name) = std::str::from_utf8(&bytes[nn.byte_range()]) else {
|
||
return false;
|
||
};
|
||
return method_name_is_non_crypto(name);
|
||
}
|
||
if kind == "anonymous_function"
|
||
|| kind == "arrow_function"
|
||
|| kind == "anonymous_function_creation_expression"
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
return false;
|
||
}
|
||
// Halt at scope / statement boundaries we cannot resolve through.
|
||
"expression_statement"
|
||
| "compound_statement"
|
||
| "method_declaration"
|
||
| "function_definition"
|
||
| "anonymous_function"
|
||
| "anonymous_function_creation_expression"
|
||
| "arrow_function"
|
||
| "program" => return false,
|
||
_ => return false,
|
||
}
|
||
cur = parent;
|
||
}
|
||
false
|
||
}
|
||
|
||
/// Resolve the final identifier of a PHP l-value expression to a string
|
||
/// suitable for [`name_is_non_crypto`] classification.
|
||
///
|
||
/// Handles:
|
||
/// - `$variable` (`variable_name` → inner name child)
|
||
/// - `$obj->property` (`member_access_expression` → name field)
|
||
/// - `$arr['literal_key']` (`subscript_expression` → string-literal index)
|
||
/// - `Class::$static` / `self::$prop` (`scoped_property_access_expression`)
|
||
///
|
||
/// Returns `None` for unrecognised l-value shapes (dynamic property
|
||
/// access, computed indices, function-call l-values, etc.); the caller
|
||
/// then falls back to keeping the finding.
|
||
fn resolve_php_lvalue_name(lhs: tree_sitter::Node, bytes: &[u8]) -> Option<String> {
|
||
let lhs = unwrap_php_paren(lhs);
|
||
match lhs.kind() {
|
||
"variable_name" => {
|
||
let name_node = lhs.named_child(0)?;
|
||
std::str::from_utf8(&bytes[name_node.byte_range()])
|
||
.ok()
|
||
.map(String::from)
|
||
}
|
||
"member_access_expression" => {
|
||
let n = lhs.child_by_field_name("name").or_else(|| {
|
||
let count = lhs.named_child_count();
|
||
if count == 0 {
|
||
None
|
||
} else {
|
||
lhs.named_child(count as u32 - 1)
|
||
}
|
||
})?;
|
||
// Property access can name a `name` (bare ident) or a
|
||
// `variable_name` (dynamic ${$x} — which we don't resolve).
|
||
if n.kind() == "name" {
|
||
std::str::from_utf8(&bytes[n.byte_range()])
|
||
.ok()
|
||
.map(String::from)
|
||
} else {
|
||
None
|
||
}
|
||
}
|
||
"subscript_expression" => {
|
||
if lhs.named_child_count() >= 2 {
|
||
let idx = lhs.named_child(1)?;
|
||
if let Some(txt) = string_literal_text(idx, bytes) {
|
||
return Some(txt);
|
||
}
|
||
}
|
||
// Dynamic / non-literal index: recurse into the receiver
|
||
// so `$columnNamesHashes[$col]` resolves to
|
||
// `columnNamesHashes`. This handles canonical
|
||
// `$lookup_by_hash[$key] = md5($key)` shapes.
|
||
let r = lhs.named_child(0)?;
|
||
resolve_php_lvalue_name(r, bytes)
|
||
}
|
||
"scoped_property_access_expression" => {
|
||
let count = lhs.named_child_count();
|
||
if count == 0 {
|
||
return None;
|
||
}
|
||
let prop = lhs.named_child(count as u32 - 1)?;
|
||
// The static property is a `variable_name`. Reuse this
|
||
// function recursively to extract the bare name.
|
||
resolve_php_lvalue_name(prop, bytes)
|
||
}
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
/// Return the textual contents of a PHP string literal node (`string`
|
||
/// or `encapsed_string`), stripping surrounding quotes. Returns `None`
|
||
/// for any non-string node and for interpolated `encapsed_string`s
|
||
/// containing template variables.
|
||
fn string_literal_text(node: tree_sitter::Node, bytes: &[u8]) -> Option<String> {
|
||
if node.kind() != "string" && node.kind() != "encapsed_string" {
|
||
return None;
|
||
}
|
||
if has_interpolation(node) {
|
||
return None;
|
||
}
|
||
for i in 0..node.named_child_count() as u32 {
|
||
if let Some(c) = node.named_child(i)
|
||
&& (c.kind() == "string_content" || c.kind() == "string_value")
|
||
{
|
||
return std::str::from_utf8(&bytes[c.byte_range()])
|
||
.ok()
|
||
.map(String::from);
|
||
}
|
||
}
|
||
if let Ok(s) = std::str::from_utf8(&bytes[node.byte_range()]) {
|
||
let trimmed = s.trim_matches(|c| c == '\'' || c == '"');
|
||
return Some(trimmed.to_string());
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Strips up to four nested `parenthesized_expression` wrappers from `node`,
/// returning the innermost node reached. Stops at the first node that is not
/// parenthesised or has no named child.
fn unwrap_php_paren(node: tree_sitter::Node) -> tree_sitter::Node {
    let mut inner = node;
    for _ in 0..4 {
        if inner.kind() != "parenthesized_expression" {
            break;
        }
        let Some(child) = inner.named_child(0) else {
            break;
        };
        inner = child;
    }
    inner
}
|
||
|
||
/// Classify a PHP identifier as non-cryptographic by name.
///
/// Two tiers, compared ASCII-case-insensitively:
///
/// 1. Reject (keep the finding) when the name contains any crypto keyword as
///    a substring (`password`, `secret`, `token`, `salt`, ...), so compound
///    names such as `hashedPassword` / `tokenHash` are kept. `sig` and `mac`
///    are rejected only as the whole name or behind an underscore (`_sig`,
///    `_mac`, plus the `sig_` ending), because the bare substrings occur in
///    harmless names (`signal`, `design`, `magic`).
/// 2. Accept when the name ends in a recognised safe suffix and one of these
///    holds: the suffix is the whole name; the byte before it is ASCII and
///    not a letter (underscore, digit); the suffix starts with an uppercase
///    letter in the original casing (camelCase boundary); or the suffix is at
///    least four characters long.
///
/// Everything else, including the empty string, is rejected. False rejections
/// of safe names are acceptable; false acceptances of crypto names are not.
pub(crate) fn name_is_non_crypto(name: &str) -> bool {
    static CRYPTO_EXCLUDES: &[&str] = &[
        "password",
        "passwd",
        "pw_hash",
        "pwhash",
        "pwdhash",
        "pwd_hash",
        "passhash",
        "pass_hash",
        "secret",
        "token",
        "signature",
        "signed",
        "hmac",
        "digest",
        "verifier",
        "challenge",
        "csrf",
        "salt",
        "nonce_secret",
        "auth_code",
        "authcode",
        "auth_key",
        "authkey",
        "private",
        "credential",
        "creds",
        "encryption",
        "decryption",
        "encryptkey",
        "decryptkey",
        "encrypt_key",
        "decrypt_key",
        "apikey",
        "api_key",
    ];
    static SAFE_SUFFIXES: &[&str] = &[
        "hash",
        "hashes",
        "etag",
        "etags",
        "md5",
        "sha1",
        "fingerprint",
        "fingerprints",
        "cachekey",
        "cache_key",
        "cacheid",
        "cache_id",
        "id",
        "uid",
        "uuid",
        "guid",
        "name_hash",
        "checksum",
        "slot",
        "bucket",
        "seed",
        "marker",
        "tag",
        "gravatar",
        "hashid",
        "opaque",
        "shortid",
        "short_id",
        "fnv",
        "fingerprintkey",
        "anchor",
        "version",
        "buster",
        "cachebuster",
        "cache_buster",
        "revision",
        "rev",
    ];

    if name.is_empty() {
        return false;
    }
    let lowered = name.to_ascii_lowercase();

    // Tier 1: hard rejections.
    if CRYPTO_EXCLUDES
        .iter()
        .any(|keyword| lowered.contains(*keyword))
    {
        return false;
    }
    let is_sig = lowered == "sig" || lowered.ends_with("_sig") || lowered.ends_with("sig_");
    let is_mac = lowered == "mac" || lowered.ends_with("_mac");
    if is_sig || is_mac {
        return false;
    }

    // Tier 2: permissive safe-suffix recognition. ASCII lowercasing keeps the
    // byte length, so offsets into `lowered` are valid in `original` too.
    let original = name.as_bytes();
    SAFE_SUFFIXES.iter().any(|suffix| {
        let Some(stem) = lowered.strip_suffix(*suffix) else {
            return false;
        };
        let boundary = stem.len();
        if boundary == 0 {
            // The suffix is the entire name.
            return true;
        }
        // Non-ASCII bytes before the suffix are treated as identifier
        // letters, i.e. no word boundary.
        let previous = original[boundary - 1];
        let after_non_letter = previous.is_ascii() && !previous.is_ascii_alphabetic();
        // `storageId`, `tableHash`, `sqlMd5`.
        let camel_case_start = original[boundary].is_ascii_uppercase();
        // Suffixes of four or more characters need no boundary at all.
        after_non_letter || camel_case_start || suffix.len() >= 4
    })
}
|
||
|
||
/// Like [`name_is_non_crypto`] but with a leading `get` prefix stripped
|
||
/// to recognise the canonical `getETag` / `getHash` / `getCacheKey`
|
||
/// accessor naming convention. Pass the original-case name through so
|
||
/// downstream camelCase-boundary detection still works.
|
||
fn method_name_is_non_crypto(name: &str) -> bool {
|
||
let stripped = name
|
||
.strip_prefix("get")
|
||
.or_else(|| name.strip_prefix("Get"))
|
||
.unwrap_or(name);
|
||
if name_is_non_crypto(stripped) {
|
||
return true;
|
||
}
|
||
// Some accessors keep the prefix (e.g., `recoveryKeyId`,
|
||
// `formatPath` returning a hashed-path identifier). Also try the
|
||
// raw name for camelCase-boundary suffix detection.
|
||
name_is_non_crypto(name)
|
||
}
|
||
|
||
/// Decide whether a PHP method name denotes a lookup / cache / store /
/// container key-or-value operation.
///
/// When `md5(...)` / `sha1(...)` is handed to such a method the digest is
/// being used as a content-addressed key rather than for cryptographic
/// strength. The accepted set is deliberately narrow so that cryptographic
/// comparison helpers (`hash_equals`, `verify`, `password_verify`,
/// `decryptWith`) are not matched.
///
/// Matching is ASCII-case-insensitive and succeeds when either:
/// * the whole name is one of the exact verbs below, or
/// * the name starts with `get` / `set` / `has` / `create` / `build` and the
///   remainder ends in one of the result-type hints (e.g. `getCacheKey`,
///   `buildEtag`).
fn method_is_lookup_verb(method: &str) -> bool {
    // Names accepted only as an exact (lowercased) match.
    const EXACT_VERBS: &[&str] = &[
        "get",
        "set",
        "has",
        "delete",
        "remove",
        "fetch",
        "store",
        "put",
        "save",
        "exists",
        "find",
        "lookup",
        "getitem",
        "setitem",
        "hasitem",
        "deleteitem",
        "addtag",
        "addtotag",
        "key",
        "keyfor",
        "containskey",
        "haskey",
        "loadbykey",
        "fetchbykey",
        "getbykey",
        "setbykey",
        "deletebykey",
        "incr",
        "incrby",
        "decr",
        "decrby",
        "expire",
        "ttl",
        "namespacekey",
        "cachekey",
    ];
    // Verb prefixes that may introduce a composite accessor name.
    const ACCESSOR_PREFIXES: &[&str] = &["get", "set", "has", "create", "build"];
    // Endings that hint at a non-crypto-typed result.
    const RESULT_HINTS: &[&str] = &[
        "cachekey",
        "key",
        "id",
        "hash",
        "etag",
        "uid",
        "tag",
        "fingerprint",
    ];

    let lowered = method.to_ascii_lowercase();
    if EXACT_VERBS.iter().any(|&verb| verb == lowered.as_str()) {
        return true;
    }

    // Composite forms: strip the (single) matching verb prefix, then look
    // at how the remainder ends.
    let remainder = ACCESSOR_PREFIXES
        .iter()
        .find_map(|&prefix| lowered.strip_prefix(prefix));
    match remainder {
        Some(rest) => RESULT_HINTS.iter().any(|&hint| rest.ends_with(hint)),
        None => false,
    }
}
|
||
|
||
/// Check if a string node contains interpolation (e.g., PHP `"Hello $name"`).
|
||
fn has_interpolation(node: tree_sitter::Node) -> bool {
|
||
for i in 0..node.child_count() as u32 {
|
||
if let Some(child) = node.child(i) {
|
||
let kind = child.kind();
|
||
if kind == "variable_name"
|
||
|| kind == "simple_variable"
|
||
|| kind.contains("interpolation")
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
// Layer B: AST pattern suppression when taint confirms safety
|
||
|
||
/// Map the second segment of a pattern ID (e.g. "cmdi" from "py.cmdi.os_system")
|
||
/// to the `Cap` that taint analysis models. Returns `None` for categories taint
|
||
/// cannot subsume (memory safety, crypto, etc.), so those patterns are never suppressed.
|
||
fn pattern_category_cap(pattern_id: &str) -> Option<Cap> {
|
||
let category = pattern_id.split('.').nth(1)?;
|
||
match category {
|
||
"cmdi" => Some(Cap::SHELL_ESCAPE),
|
||
"xss" => Some(Cap::HTML_ESCAPE),
|
||
"sqli" => Some(Cap::SQL_QUERY),
|
||
"code_exec" => Some(Cap::CODE_EXEC),
|
||
"ssrf" => Some(Cap::SSRF),
|
||
"path" => Some(Cap::FILE_IO),
|
||
// deser/memory/crypto: taint cannot fully subsume these structural patterns
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
/// Per-file evidence used to decide whether an AST pattern finding may be
/// dropped.
///
/// The context is assembled from the CFG and the taint results. A pattern
/// finding is only dropped when this evidence shows that taint analysis
/// actually evaluated the data flow (and found it safe), or that a flow
/// finding for the same vulnerability class already covers the line.
///
/// Function scopes are keyed by `Option<String>`: the enclosing function
/// name as recorded on the CFG node, `None` when the node carries none.
struct TaintSuppressionCtx {
    /// Lines holding Source-labelled nodes, grouped by function scope.
    source_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
    /// Lines holding Sanitizer-labelled nodes, grouped by function scope.
    ///
    /// An explicit sanitizer is the structural signal that taint analysis
    /// evaluated (and cleared) the flow, so AST-pattern suppression is safe
    /// even when no taint finding fired in that function.
    sanitizer_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
    /// Enclosing function scope of the sink node found at each line.
    sink_func_at_line: HashMap<usize, Option<String>>,
    /// Lines at which taint emitted a `taint-unsanitised-flow` finding.
    taint_finding_lines: HashSet<usize>,
    /// The same finding lines, partitioned by function scope.
    ///
    /// Consulted by Condition 4 of [`Self::should_suppress`], together with
    /// `sanitizer_lines_by_func`, to tell "taint proved safe" apart from
    /// "taint failed to track".
    taint_finding_lines_by_func: HashMap<Option<String>, HashSet<usize>>,
    /// Functions in which the SSA engine emitted at least one
    /// `all_validated` event, i.e. every tainted input to *some* sink in
    /// the function passed a recognised validation / sanitisation
    /// predicate.
    ///
    /// Populated from `take_all_validated_spans`. This is positive evidence
    /// that the engine reached a sink in the function and proved it safe,
    /// even though no `taint-unsanitised-flow` finding fired and no
    /// Sanitizer label is present. It covers validation, dominator-based
    /// pruning, early-return guards, type-check predicates and
    /// interprocedural sanitiser wrappers, all of which clear taint through
    /// SSA branch narrowing rather than through a labelled sanitiser node.
    engine_validated_funcs: HashSet<Option<String>>,
    /// Functions in which a Source-defined variable is later rebound to a
    /// literal right-hand side (a node carrying `TaintMeta.const_text`) in
    /// the same scope, where the rebinding node has no Source label.
    ///
    /// Positive evidence that SSA renaming structurally kills the source's
    /// taint before a sink can read it. Covers shapes such as
    /// `cmd = getenv(); cmd = "echo hello"; system(cmd)`, where the rebind
    /// is what makes the code safe but neither a `Sanitizer` label nor a
    /// `taint-unsanitised-flow` finding exists to witness it.
    source_killed_funcs: HashSet<Option<String>>,
    /// Functions that call a same-file helper whose own body contains a
    /// labelled Sanitizer node.
    ///
    /// Positive evidence that interprocedural analysis cleared the flow
    /// through a user-defined wrapper (e.g. `def sanitize(s): return
    /// shlex.quote(s)`). The per-function Sanitizer check only sees
    /// sanitisers that sit directly in the caller's scope; without this
    /// signal every helper-wrapped sanitiser would surface as an
    /// AST-pattern false positive.
    interproc_sanitizer_callers: HashSet<Option<String>>,
    /// Union of sink-cap bits of the cap-specific taint findings at each
    /// line.
    ///
    /// Used by [`Self::is_redundant_ast_pattern`] to drop an AST-pattern
    /// finding only when the flow engine already emitted a specific rule id
    /// for the same vulnerability class. The legacy generic findings
    /// (`taint-unsanitised-flow`, `cfg-unguarded-sink`) are not canonical
    /// enough to subsume language-specific AST rule IDs such as
    /// `py.cmdi.subprocess_shell` or `c.cmdi.system`, so they are not
    /// counted here.
    specific_taint_finding_caps_by_line: HashMap<usize, u32>,
}
|
||
|
||
impl TaintSuppressionCtx {
|
||
/// Build suppression context from ALL per-body CFG graphs, tree (for
|
||
/// byte→line mapping), and existing taint findings.
|
||
///
|
||
/// Scans every body's graph (not just top-level) so that Source/Sink
|
||
/// nodes inside function bodies are visible for suppression decisions.
|
||
fn build(file_cfg: &FileCfg, tree: &tree_sitter::Tree, taint_diags: &[Diag]) -> Self {
|
||
let mut source_lines_by_func: HashMap<Option<String>, HashSet<usize>> = HashMap::new();
|
||
let mut sanitizer_lines_by_func: HashMap<Option<String>, HashSet<usize>> = HashMap::new();
|
||
let mut sink_func_at_line: HashMap<usize, Option<String>> = HashMap::new();
|
||
// Per-function (var_name, source_line) pairs for Source nodes whose
|
||
// `defines` is set. Used below to detect SSA source kills via
|
||
// const reassignment (`cmd = getenv(); cmd = "echo hello"`).
|
||
let mut source_var_defs_by_func: HashMap<Option<String>, Vec<(String, usize)>> =
|
||
HashMap::new();
|
||
// Per-function (var_name, line) pairs for nodes that bind a
|
||
// variable to a literal RHS (carry `TaintMeta.const_text`).
|
||
// Used to match against `source_var_defs_by_func` for kill
|
||
// detection.
|
||
let mut const_def_var_by_func: HashMap<Option<String>, Vec<(String, usize)>> =
|
||
HashMap::new();
|
||
// Set of `enclosing_func` names whose body contains at least
|
||
// one labelled Sanitizer. These are user-defined sanitiser
|
||
// wrappers callable from other functions in the same file
|
||
// (e.g. `def sanitize(s): return shlex.quote(s)`).
|
||
let mut sanitizer_funcs: HashSet<String> = HashSet::new();
|
||
// Per-function set of bare callee names invoked from this
|
||
// function's body. Bare = last `.`-separated segment, so
|
||
// `this.sanitize`, `obj.sanitize`, and `sanitize` all collapse
|
||
// to the same key for matching against `sanitizer_funcs`.
|
||
let mut callees_by_func: HashMap<Option<String>, HashSet<String>> = HashMap::new();
|
||
|
||
for body in &file_cfg.bodies {
|
||
for idx in body.graph.node_indices() {
|
||
let info = &body.graph[idx];
|
||
let mut has_source = false;
|
||
let mut has_sink = false;
|
||
let mut has_sanitizer = false;
|
||
for label in &info.taint.labels {
|
||
match label {
|
||
DataLabel::Source(_) => has_source = true,
|
||
DataLabel::Sink(_) => has_sink = true,
|
||
DataLabel::Sanitizer(_) => has_sanitizer = true,
|
||
}
|
||
}
|
||
// Skip synthetic source nodes emitted by `pre_emit_arg_source_nodes`
|
||
// (`__nyx_src_*` / `__nyx_chainsrc_*`). These are a CFG-level
|
||
// synthesis that hoists a source-labeled member-expression into
|
||
// its own Source node so taint can see a definition; absence of
|
||
// a downstream taint finding through such a synth source does
|
||
// NOT prove safety, it can also mean the engine couldn't
|
||
// propagate the taint (e.g. `&req` with `var req struct{}`
|
||
// where points-to doesn't track the address-of of a stack
|
||
// variable). Treating synth sources as "real" sources here
|
||
// would silently silence AST-pattern findings on every Go
|
||
// CRUD handler whose Decode destination is an `&req`-style
|
||
// address-of-local.
|
||
let is_synth_source = info.taint.defines.as_deref().is_some_and(|d| {
|
||
d.starts_with("__nyx_src_") || d.starts_with("__nyx_chainsrc_")
|
||
});
|
||
let byte = info.classification_span().0;
|
||
let point = byte_offset_to_point(tree, byte);
|
||
let line = point.row + 1;
|
||
if has_source && !is_synth_source {
|
||
source_lines_by_func
|
||
.entry(info.ast.enclosing_func.clone())
|
||
.or_default()
|
||
.insert(line);
|
||
if let Some(var) = info.taint.defines.as_deref() {
|
||
source_var_defs_by_func
|
||
.entry(info.ast.enclosing_func.clone())
|
||
.or_default()
|
||
.push((var.to_string(), line));
|
||
}
|
||
}
|
||
if has_sanitizer {
|
||
sanitizer_lines_by_func
|
||
.entry(info.ast.enclosing_func.clone())
|
||
.or_default()
|
||
.insert(line);
|
||
if let Some(func_name) = info.ast.enclosing_func.as_deref() {
|
||
sanitizer_funcs.insert(func_name.to_string());
|
||
}
|
||
}
|
||
if has_sink {
|
||
sink_func_at_line.insert(line, info.ast.enclosing_func.clone());
|
||
}
|
||
// Const-rebind detection: a node that defines a variable
|
||
// from a literal RHS and carries no Source label is a
|
||
// candidate kill site. Skip nodes that are themselves
|
||
// Sources (a literal-init source like `cmd := "ls"` is
|
||
// not a kill).
|
||
if !has_source
|
||
&& let (Some(var), Some(_)) = (
|
||
info.taint.defines.as_deref(),
|
||
info.taint.const_text.as_ref(),
|
||
)
|
||
{
|
||
const_def_var_by_func
|
||
.entry(info.ast.enclosing_func.clone())
|
||
.or_default()
|
||
.push((var.to_string(), line));
|
||
}
|
||
// Per-function callee inventory for interprocedural
|
||
// sanitiser detection. `bare_method_name` collapses
|
||
// `this.sanitize` / `obj.sanitize` / `sanitize` to the
|
||
// same key so receiver-prefixed Java/Ruby/etc. calls
|
||
// match a bare-named helper definition. Also include
|
||
// `arg_callees` so `println(... + sanitize(name) +
|
||
// ...)` recognises the inline sanitiser call buried
|
||
// inside the sink's argument expression.
|
||
let bare_inserts: Vec<&str> = info
|
||
.call
|
||
.callee
|
||
.as_deref()
|
||
.into_iter()
|
||
.chain(info.arg_callees.iter().filter_map(|c| c.as_deref()))
|
||
.collect();
|
||
if !bare_inserts.is_empty() {
|
||
let entry = callees_by_func
|
||
.entry(info.ast.enclosing_func.clone())
|
||
.or_default();
|
||
for callee in bare_inserts {
|
||
let bare = crate::labels::bare_method_name(callee);
|
||
if !bare.is_empty() {
|
||
entry.insert(bare.to_string());
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Source-kill detection: a function is "source-killed" when at
|
||
// least one of its Source-defined variables is re-bound to a
|
||
// literal at a later line in the same scope. Captures
|
||
// `safe_reassigned`-style fixtures: the SSA engine renames the
|
||
// sink-read SSA value to a clean constant before any sink can
|
||
// observe taint, but neither a `Sanitizer` label nor a
|
||
// `taint-unsanitised-flow` finding fires to witness the kill.
|
||
let mut source_killed_funcs: HashSet<Option<String>> = HashSet::new();
|
||
for (func, src_defs) in &source_var_defs_by_func {
|
||
let Some(kills) = const_def_var_by_func.get(func) else {
|
||
continue;
|
||
};
|
||
for (src_var, src_line) in src_defs {
|
||
if kills
|
||
.iter()
|
||
.any(|(kill_var, kill_line)| kill_var == src_var && kill_line > src_line)
|
||
{
|
||
source_killed_funcs.insert(func.clone());
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Interprocedural sanitiser caller detection: a function is
|
||
// an "interproc sanitiser caller" when its body invokes any
|
||
// helper whose own body contains a labelled Sanitizer. This
|
||
// handles wrappers like `def sanitize(s): return
|
||
// shlex.quote(s)`, the engine clears taint via
|
||
// inline analysis, but the caller's scope has no labelled
|
||
// Sanitizer of its own to satisfy Condition 4(b).
|
||
let mut interproc_sanitizer_callers: HashSet<Option<String>> = HashSet::new();
|
||
if !sanitizer_funcs.is_empty() {
|
||
for (func, callees) in &callees_by_func {
|
||
if callees.iter().any(|c| sanitizer_funcs.contains(c)) {
|
||
interproc_sanitizer_callers.insert(func.clone());
|
||
}
|
||
}
|
||
}
|
||
|
||
// Drain the SSA engine's all-validated sink spans, attribute
|
||
// each to its enclosing function via `sink_func_at_line`, and
|
||
// record the function as "engine-validated". The set was
|
||
// populated by `ssa_events_to_findings` whenever the engine
|
||
// emitted an `SsaTaintEvent { all_validated: true, .. }` ,
|
||
// i.e. the engine reached a sink and proved every tainted
|
||
// input passed validation. This is the broadest form of
|
||
// engine-success evidence, covering predicate validation
|
||
// (`if !allowed[x]`), dominator early-return, type-check
|
||
// (`Atoi` / `typeof`), and interprocedural sanitiser
|
||
// wrappers.
|
||
let mut engine_validated_funcs: HashSet<Option<String>> = HashSet::new();
|
||
for (start, _end) in crate::taint::ssa_transfer::take_all_validated_spans() {
|
||
let line = byte_offset_to_point(tree, start).row + 1;
|
||
if let Some(func) = sink_func_at_line.get(&line) {
|
||
engine_validated_funcs.insert(func.clone());
|
||
}
|
||
}
|
||
|
||
let taint_finding_lines: HashSet<usize> = taint_diags
|
||
.iter()
|
||
.filter(|d| d.id.starts_with("taint-unsanitised-flow"))
|
||
.map(|d| d.line)
|
||
.collect();
|
||
|
||
// Cap bits per line for cap-specific flow-backed findings only, so a
|
||
// redundant AST pattern at the same line+cap can be dropped in favour
|
||
// of the richer flow. Do not count legacy generic findings here:
|
||
// `taint-unsanitised-flow` and `cfg-unguarded-sink` carry evidence,
|
||
// but their rule ids are deliberately catch-alls, while AST `cmdi`,
|
||
// `sqli`, etc. IDs are the canonical namespace many tests, SARIF
|
||
// consumers, and dynamic-verification spec derivation rely on.
|
||
let mut specific_taint_finding_caps_by_line: HashMap<usize, u32> = HashMap::new();
|
||
for d in taint_diags {
|
||
if d.id.starts_with("taint-") && !d.id.starts_with("taint-unsanitised-flow") {
|
||
if let Some(caps) = d.evidence.as_ref().map(|e| e.sink_caps) {
|
||
if caps != 0 {
|
||
*specific_taint_finding_caps_by_line
|
||
.entry(d.line)
|
||
.or_default() |= caps;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Per-function partition of taint findings. Maps each finding's
|
||
// line to the enclosing function scope by reusing
|
||
// `sink_func_at_line` (the same span/function mapping the Sink-side
|
||
// of taint analysis populated above).
|
||
let mut taint_finding_lines_by_func: HashMap<Option<String>, HashSet<usize>> =
|
||
HashMap::new();
|
||
for line in &taint_finding_lines {
|
||
let func = sink_func_at_line.get(line).cloned().unwrap_or(None);
|
||
taint_finding_lines_by_func
|
||
.entry(func)
|
||
.or_default()
|
||
.insert(*line);
|
||
}
|
||
|
||
Self {
|
||
source_lines_by_func,
|
||
sanitizer_lines_by_func,
|
||
sink_func_at_line,
|
||
taint_finding_lines,
|
||
taint_finding_lines_by_func,
|
||
engine_validated_funcs,
|
||
source_killed_funcs,
|
||
interproc_sanitizer_callers,
|
||
specific_taint_finding_caps_by_line,
|
||
}
|
||
}
|
||
|
||
/// Returns `true` when an AST pattern finding is a redundant restatement
|
||
/// of a flow the taint engine already reported at the same line.
|
||
///
|
||
/// The taint / structural flow finding carries source + path evidence the
|
||
/// bare pattern lacks, so when both fire at the same line for the same
|
||
/// cap the pattern is pure duplicate noise. This is the
|
||
/// taint-found-it-UNSAFE counterpart to [`Self::should_suppress`]'s
|
||
/// taint-found-it-SAFE logic: there, no flow finding means the pattern
|
||
/// may carry unique signal; here, a same-cap flow finding means it does
|
||
/// not. Cap-matched (not line-only) so a pattern whose cap differs from
|
||
/// the co-located flow's cap — a genuinely distinct sink — is preserved.
|
||
fn is_redundant_ast_pattern(&self, pattern_id: &str, line: usize) -> bool {
|
||
let Some(cap) = pattern_category_cap(pattern_id) else {
|
||
return false;
|
||
};
|
||
self.specific_taint_finding_caps_by_line
|
||
.get(&line)
|
||
.is_some_and(|caps| caps & cap.bits() != 0)
|
||
}
|
||
|
||
/// Returns `true` if this AST pattern finding should be suppressed.
|
||
fn should_suppress(&self, pattern_id: &str, line: usize) -> bool {
|
||
// Condition 1: pattern category maps to a Cap taint models
|
||
if pattern_category_cap(pattern_id).is_none() {
|
||
return false;
|
||
}
|
||
// Condition 2: at least one Source exists in the same function scope
|
||
// at an EARLIER line (upstream in control flow). This prevents suppression
|
||
// when the only Source is co-located (dual-label) or downstream from the
|
||
// sink, since taint couldn't have evaluated a flow that doesn't exist.
|
||
let func = match self.sink_func_at_line.get(&line) {
|
||
Some(f) => f,
|
||
None => return false, // No CFG sink at this line, taint had no opportunity to evaluate
|
||
};
|
||
match self.source_lines_by_func.get(func) {
|
||
Some(source_lines) => {
|
||
if !source_lines.iter().any(|&sl| sl < line) {
|
||
return false;
|
||
}
|
||
}
|
||
None => return false,
|
||
}
|
||
// Condition 3: no taint finding at this line (taint found it safe)
|
||
if self.taint_finding_lines.contains(&line) {
|
||
return false;
|
||
}
|
||
// Condition 4: distinguish "taint proved safe" from "taint failed
|
||
// to track". Suppress only when there's a structural signal that
|
||
// taint analysis actually evaluated this flow:
|
||
// (a) the function fired at least one taint-unsanitised-flow
|
||
// finding (engine ran successfully and reached *some* sink),
|
||
// OR
|
||
// (b) the function contains an explicit Sanitizer node (the
|
||
// canonical mechanism by which a flow is cleared, e.g.
|
||
// `escapeshellarg` between $_GET and `system`),
|
||
// OR
|
||
// (c) the SSA engine emitted at least one `all_validated`
|
||
// event in this function (engine reached *some* sink and
|
||
// proved every tainted input was validated, covers
|
||
// predicate validation, dominator early-return,
|
||
// type-check predicates, and interprocedural sanitiser
|
||
// wrappers that don't carry an explicit Sanitizer
|
||
// label),
|
||
// OR
|
||
// (d) the function rebinds a Source's defining variable to
|
||
// a literal RHS at a later line (engine's SSA renaming
|
||
// structurally kills taint before any sink reads it ,
|
||
// covers `cmd = getenv(); cmd = "echo"; system(cmd)`),
|
||
// OR
|
||
// (e) the function calls a same-file helper whose body
|
||
// contains a labelled Sanitizer (interprocedural
|
||
// sanitiser wrapper, covers `def sanitize(s): return
|
||
// shlex.quote(s)` patterns where the engine clears
|
||
// taint via inline analysis but the caller's
|
||
// scope has no Sanitizer label of its own).
|
||
//
|
||
// When none hold, we can't distinguish silent engine failure
|
||
// from real safety, e.g. Go points-to limitation on `&local`
|
||
// Decode destinations leaves the chain writeback fired but the
|
||
// field-cell propagation dead, suppressing legitimate
|
||
// AST-pattern findings on every Go CRUD handler whose Decode
|
||
// destination is a stack-local address-of.
|
||
let func_has_taint_finding = self
|
||
.taint_finding_lines_by_func
|
||
.get(func)
|
||
.is_some_and(|s| !s.is_empty());
|
||
let func_has_sanitizer = self
|
||
.sanitizer_lines_by_func
|
||
.get(func)
|
||
.is_some_and(|s| !s.is_empty());
|
||
let func_engine_validated = self.engine_validated_funcs.contains(func);
|
||
let func_source_killed = self.source_killed_funcs.contains(func);
|
||
let func_interproc_sanitizer = self.interproc_sanitizer_callers.contains(func);
|
||
if !func_has_taint_finding
|
||
&& !func_has_sanitizer
|
||
&& !func_engine_validated
|
||
&& !func_source_killed
|
||
&& !func_interproc_sanitizer
|
||
{
|
||
return false;
|
||
}
|
||
true
|
||
}
|
||
}
|
||
|
||
// Pass 2 / single-file: full rule execution (AST queries + taint)
|
||
|
||
/// Run all enabled analyses on pre-read bytes and return diagnostics.
|
||
///
|
||
/// This is the core **pass 2** implementation. Callers that already hold the
|
||
/// file contents should use this variant to avoid a redundant `fs::read`.
|
||
pub fn run_rules_on_bytes(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> NyxResult<Vec<Diag>> {
|
||
let _span = tracing::debug_span!("run_rules", file = %path.display()).entered();
|
||
maybe_inject_test_panic(path);
|
||
|
||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||
// Not a recognized tree-sitter language, try text-based patterns,
|
||
// but first surface a parse-timeout synthetic diag if that's what
|
||
// caused try_new to return None.
|
||
let mut out = scan_text_based_patterns(bytes, path, cfg);
|
||
if let Some(timeout_ms) = take_last_parse_timeout_ms() {
|
||
out.push(parse_timeout_diag(path, timeout_ms));
|
||
}
|
||
return Ok(out);
|
||
};
|
||
|
||
let mut out = Vec::new();
|
||
|
||
// CFG construction + taint + cfg_analysis only needed for CFG-capable modes.
|
||
let needs_cfg = matches!(
|
||
cfg.scanner.mode,
|
||
AnalysisMode::Full | AnalysisMode::Cfg | AnalysisMode::Taint
|
||
);
|
||
|
||
if needs_cfg {
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
out.extend(parsed.run_cfg_analyses(cfg, global_summaries, scan_root));
|
||
if cfg.scanner.mode == AnalysisMode::Full {
|
||
// Layer B: suppress AST findings where taint confirmed safety
|
||
let suppression =
|
||
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out);
|
||
let ast_findings = parsed.source.run_ast_queries(cfg);
|
||
out.extend(ast_findings.into_iter().filter(|d| {
|
||
!suppression.should_suppress(&d.id, d.line)
|
||
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
|
||
}));
|
||
}
|
||
if cfg.scanner.mode == AnalysisMode::Full {
|
||
out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root));
|
||
}
|
||
parsed.source.finalize_diags(&mut out, cfg);
|
||
} else {
|
||
// AST-only: no CFG construction (fast path preserved)
|
||
out.extend(source.run_ast_queries(cfg));
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root));
|
||
parsed.source.finalize_diags(&mut out, cfg);
|
||
}
|
||
|
||
Ok(out)
|
||
}
|
||
|
||
/// Convenience wrapper that reads the file then delegates to
|
||
/// [`run_rules_on_bytes`].
|
||
pub fn run_rules_on_file(
|
||
path: &Path,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> NyxResult<Vec<Diag>> {
|
||
let bytes = std::fs::read(path)?;
|
||
run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root)
|
||
}
|
||
|
||
// Fused single-pass: extract summaries + run full analysis in one parse/CFG
|
||
|
||
/// Result of a fused analysis pass: both function summaries and diagnostics.
|
||
pub struct FusedResult {
|
||
pub summaries: Vec<FuncSummary>,
|
||
pub diags: Vec<Diag>,
|
||
/// SSA-derived per-parameter summaries keyed by canonical
|
||
/// [`crate::symbol::FuncKey`]. Keys preserve `(lang, namespace,
|
||
/// container, name, arity, disambig, kind)` so two same-name definitions
|
||
/// in the same file never collide.
|
||
pub ssa_summaries: Vec<(crate::symbol::FuncKey, SsaFuncSummary)>,
|
||
pub cfg_nodes: usize,
|
||
/// Eligible callee bodies for cross-file symex, keyed by
|
||
/// canonical [`crate::symbol::FuncKey`] (same identity model as
|
||
/// `ssa_summaries`).
|
||
pub ssa_bodies: Vec<(
|
||
crate::symbol::FuncKey,
|
||
crate::taint::ssa_transfer::CalleeSsaBody,
|
||
)>,
|
||
/// Per-function auth-check summaries for cross-file helper
|
||
/// lifting. One entry per analysis unit whose body proves at
|
||
/// least one positional parameter under an ownership / membership
|
||
/// / admin / authorization check; empty for files with no such
|
||
/// helpers.
|
||
pub auth_summaries: Vec<(
|
||
crate::symbol::FuncKey,
|
||
auth_analysis::model::AuthCheckSummary,
|
||
)>,
|
||
/// Per-Python-file router-level dep declarations + `include_router`
|
||
/// edges for cross-file FastAPI router-dep propagation. `None` for
|
||
/// non-Python files; `Some((module_id, facts))` for Python files
|
||
/// where `module_id` is the file's
|
||
/// [`auth_analysis::router_facts::module_id_for_storage`] key.
|
||
/// Pass 1 collects these into
|
||
/// `GlobalSummaries.router_facts_by_module`; pass 2 resolves them
|
||
/// per-file via `GlobalSummaries::resolve_cross_file_router_deps`.
|
||
pub router_facts: Option<(String, auth_analysis::router_facts::PerFileRouterFacts)>,
|
||
/// Per-file Phase-09 cross-package import map. `None` when the
|
||
/// file's resolver produced no resolved bindings; otherwise
|
||
/// `Some((namespace, map))` where `namespace` is the file's
|
||
/// scan-root-relative path (matching `FuncKey::namespace`) and
|
||
/// `map` maps each local import binding name (e.g. `escapeHtml`)
|
||
/// to the canonical `FuncKey` of the imported function in its
|
||
/// own package. Pass 1 collects these into
|
||
/// `GlobalSummaries.cross_package_imports_by_namespace`; pass 2's
|
||
/// `inline_analyse_callee` consults the index when an inlined
|
||
/// callee body's own `cross_package_imports` Arc is empty (the
|
||
/// indexed-mode case where bodies round-trip through SQLite and
|
||
/// the Arc field is `#[serde(skip)]`).
|
||
pub cross_package_imports: Option<(
|
||
String,
|
||
std::sync::Arc<HashMap<String, crate::symbol::FuncKey>>,
|
||
)>,
|
||
}
|
||
|
||
/// Parse the file once, build the CFG once, and produce both function
|
||
/// summaries (for cross-file resolution) and full diagnostics (AST analyses +
|
||
/// taint + CFG structural analyses).
|
||
///
|
||
/// When `global_summaries` is `None`, the taint engine runs with local
|
||
/// context only (equivalent to pass 1 + partial pass 2). A second call
|
||
/// to `run_taint_only` can refine findings with the full cross-file view
|
||
/// without re-parsing or re-building the CFG.
|
||
pub fn analyse_file_fused(
|
||
bytes: &[u8],
|
||
path: &Path,
|
||
cfg: &Config,
|
||
global_summaries: Option<&GlobalSummaries>,
|
||
scan_root: Option<&Path>,
|
||
) -> NyxResult<FusedResult> {
|
||
let _span = tracing::debug_span!("analyse_fused", file = %path.display()).entered();
|
||
maybe_inject_test_panic(path);
|
||
|
||
let Some(source) = ParsedSource::try_new(bytes, path)? else {
|
||
// Not a recognized tree-sitter language, try text-based patterns,
|
||
// and surface a parse-timeout synthetic diag if that's what caused
|
||
// try_new to return None.
|
||
let mut diags = scan_text_based_patterns(bytes, path, cfg);
|
||
if let Some(timeout_ms) = take_last_parse_timeout_ms() {
|
||
diags.push(parse_timeout_diag(path, timeout_ms));
|
||
}
|
||
return Ok(FusedResult {
|
||
summaries: vec![],
|
||
diags,
|
||
ssa_summaries: vec![],
|
||
cfg_nodes: 0,
|
||
ssa_bodies: vec![],
|
||
auth_summaries: vec![],
|
||
router_facts: None,
|
||
cross_package_imports: None,
|
||
});
|
||
};
|
||
|
||
let parsed = ParsedFile::from_source(source, cfg);
|
||
let cfg_nodes = parsed.cfg_graph().node_count();
|
||
let summaries = parsed.export_summaries_with_root(scan_root);
|
||
|
||
let mut out = Vec::new();
|
||
|
||
let needs_cfg = matches!(
|
||
cfg.scanner.mode,
|
||
AnalysisMode::Full | AnalysisMode::Cfg | AnalysisMode::Taint
|
||
);
|
||
|
||
let (ssa_summaries, ssa_bodies) = if needs_cfg {
|
||
// Lower SSA exactly once and feed both the taint engine and the
|
||
// SSA-artifact extractor. Pre-fix, both consumers re-lowered the
|
||
// same `FileCfg` independently, `lower_all_functions_from_bodies`
|
||
// accounted for ~20% of `analyse_file_fused` wall-clock on the
|
||
// bench corpus.
|
||
//
|
||
// Reset the path-safe-suppressed span set BEFORE lowering: the
|
||
// per-parameter probes inside the lowering phase publish spans
|
||
// (`record_path_safe_suppressed_span`), and the state-analysis
|
||
// pass downstream relies on those spans surviving until
|
||
// `take_path_safe_suppressed_spans` drains the set inside
|
||
// `run_cfg_analyses_with_lowered`. The all-validated span set
|
||
// (cap-agnostic, AST-pattern suppression evidence) follows the
|
||
// same lifecycle and is drained inside `TaintSuppressionCtx`.
|
||
crate::taint::ssa_transfer::reset_path_safe_suppressed_spans();
|
||
crate::taint::ssa_transfer::reset_all_validated_spans();
|
||
let (lowered_summaries, lowered_bodies) =
|
||
parsed.lower_ssa_for_fused(global_summaries, scan_root, cfg.module_graph.as_deref());
|
||
out.extend(parsed.run_cfg_analyses_with_lowered(
|
||
cfg,
|
||
global_summaries,
|
||
scan_root,
|
||
&lowered_summaries,
|
||
&lowered_bodies,
|
||
));
|
||
let eligible_bodies = crate::taint::build_eligible_bodies(&parsed.file_cfg, lowered_bodies);
|
||
let summaries_vec: Vec<_> = lowered_summaries.into_iter().collect();
|
||
(summaries_vec, eligible_bodies)
|
||
} else {
|
||
(vec![], vec![])
|
||
};
|
||
|
||
let mut auth_summaries: Vec<(
|
||
crate::symbol::FuncKey,
|
||
auth_analysis::model::AuthCheckSummary,
|
||
)> = Vec::new();
|
||
|
||
// Per-file router-dep facts for cross-file FastAPI propagation.
|
||
// Extracted unconditionally for Python files so pass 1 can persist
|
||
// them into `GlobalSummaries.router_facts_by_module` even on Cfg /
|
||
// Taint modes (the auth analysis itself runs only under Full, but
|
||
// the index has to be populated by the time pass 2 launches).
|
||
let router_facts_for_this_file = if parsed.source.lang_slug == "python" {
|
||
auth_analysis::router_facts::module_id_for_storage(parsed.source.path).map(|module_id| {
|
||
let facts = auth_analysis::router_facts::extract_router_facts_for_python(
|
||
&parsed.source.tree,
|
||
parsed.source.bytes,
|
||
);
|
||
(module_id, facts)
|
||
})
|
||
} else {
|
||
None
|
||
};
|
||
|
||
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
|
||
let ast_findings = parsed.source.run_ast_queries(cfg);
|
||
// Layer B only applies when taint had the opportunity to evaluate
|
||
if needs_cfg && cfg.scanner.mode == AnalysisMode::Full {
|
||
let suppression =
|
||
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out);
|
||
out.extend(ast_findings.into_iter().filter(|d| {
|
||
!suppression.should_suppress(&d.id, d.line)
|
||
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
|
||
}));
|
||
} else {
|
||
out.extend(ast_findings);
|
||
}
|
||
// Build the AuthorizationModel exactly once per file when Full
|
||
// mode needs both diagnostics AND per-file summaries; pre-fix
|
||
// the diag path and the summary path each ran their own
|
||
// `extract::extract_authorization_model`, duplicating
|
||
// `collect_top_level_units` + every framework extractor's AST
|
||
// walk. See `auth_analysis::run_auth_analysis_with_model` for
|
||
// measured savings.
|
||
let auth_rules = auth_analysis::config::build_auth_rules(cfg, parsed.source.lang_slug);
|
||
if auth_rules.enabled {
|
||
// Resolve cross-file router-deps for the current file (Python only).
|
||
// The resolved map lives on `AuthorizationModel.cross_file_router_deps`
|
||
// BEFORE `FlaskExtractor::extract` runs, so the in-extractor merge
|
||
// sees both inline router-deps and the cross-file lift in one pass.
|
||
let cross_file_router_deps = if parsed.source.lang_slug == "python"
|
||
&& let Some(gs) = global_summaries
|
||
&& let Some(child_module_id) =
|
||
auth_analysis::router_facts::module_id_for_path(parsed.source.path)
|
||
{
|
||
let resolved = gs.resolve_cross_file_router_deps(&child_module_id);
|
||
if resolved.is_empty() {
|
||
None
|
||
} else {
|
||
Some(resolved)
|
||
}
|
||
} else {
|
||
None
|
||
};
|
||
let auth_model = auth_analysis::extract::extract_authorization_model(
|
||
parsed.source.lang_slug,
|
||
cfg.framework_ctx.as_ref(),
|
||
&parsed.source.tree,
|
||
parsed.source.bytes,
|
||
parsed.source.path,
|
||
&auth_rules,
|
||
cross_file_router_deps.as_ref(),
|
||
);
|
||
// Extract summaries from the **base** model (pre var-types,
|
||
// pre-helper-lifting) so the persisted per-file summary
|
||
// carries only the helper's own intrinsic auth checks,
|
||
// matching the legacy `extract_auth_summaries_by_key` path
|
||
// bit-for-bit.
|
||
if cfg.scanner.mode == AnalysisMode::Full {
|
||
auth_summaries = auth_analysis::extract_auth_summaries_from_model(
|
||
&auth_model,
|
||
parsed.source.lang_slug,
|
||
parsed.source.path,
|
||
scan_root,
|
||
);
|
||
}
|
||
let var_types = parsed.collect_file_var_types();
|
||
out.extend(auth_analysis::run_auth_analysis_with_model(
|
||
auth_model,
|
||
&parsed.source.tree,
|
||
parsed.source.lang_slug,
|
||
parsed.source.path,
|
||
&auth_rules,
|
||
var_types.as_ref(),
|
||
global_summaries,
|
||
scan_root,
|
||
));
|
||
}
|
||
}
|
||
parsed.source.finalize_diags(&mut out, cfg);
|
||
|
||
let cross_package_imports_for_this_file = if parsed.file_cfg.resolved_imports.is_empty() {
|
||
None
|
||
} else {
|
||
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
|
||
let ns = crate::symbol::namespace_with_package(
|
||
&parsed.source.file_path_str,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
);
|
||
let caller_lang = Lang::from_slug(parsed.source.lang_slug).unwrap_or(Lang::Rust);
|
||
let map = crate::taint::build_cross_package_func_keys(
|
||
&parsed.file_cfg.resolved_imports,
|
||
scan_root_str.as_deref(),
|
||
cfg.module_graph.as_deref(),
|
||
caller_lang,
|
||
);
|
||
if map.is_empty() {
|
||
None
|
||
} else {
|
||
Some((ns, std::sync::Arc::new(map)))
|
||
}
|
||
};
|
||
|
||
Ok(FusedResult {
|
||
summaries,
|
||
diags: out,
|
||
ssa_summaries,
|
||
cfg_nodes,
|
||
ssa_bodies,
|
||
auth_summaries,
|
||
router_facts: router_facts_for_this_file,
|
||
cross_package_imports: cross_package_imports_for_this_file,
|
||
})
|
||
}
|
||
|
||
// Text-based pattern scanning (non-tree-sitter files)
|
||
|
||
/// Run text-based pattern scanners on files whose extension is not supported
|
||
/// by tree-sitter. Currently handles `.ejs` templates.
|
||
fn scan_text_based_patterns(bytes: &[u8], path: &Path, cfg: &Config) -> Vec<Diag> {
|
||
let ext = lowercase_ext(path);
|
||
match ext {
|
||
Some("ejs") => {
|
||
let mut diags = crate::patterns::ejs::scan_ejs_file(path, bytes);
|
||
// Respect severity filter
|
||
diags.retain(|d| d.severity <= cfg.scanner.min_severity);
|
||
diags
|
||
}
|
||
_ => vec![],
|
||
}
|
||
}
|
||
|
||
/// A plain-text file with an unrecognised extension must scan without error
/// and produce no diagnostics.
#[test]
fn unknown_extension_returns_empty() {
    let workspace = tempfile::tempdir().unwrap();
    let notes_path = workspace.path().join("notes.txt");
    std::fs::write(&notes_path, "just some text").unwrap();

    let config = Config::default();
    let findings = run_rules_on_file(&notes_path, &config, None, None)
        .expect("function should never error on plain text");

    assert!(findings.is_empty());
}
|
||
|
||
/// A file consisting solely of NUL bytes must be skipped by the scanner and
/// therefore yield no diagnostics.
#[test]
fn binary_file_guard_triggers() {
    let dir = tempfile::tempdir().unwrap();
    let bin = dir.path().join("junk.bin");

    // 2 KiB of zero bytes. The previous version additionally looped over
    // every third index writing `0` into this already-zeroed buffer; that
    // loop changed nothing and has been dropped.
    let data = vec![0_u8; 2048];
    std::fs::write(&bin, &data).unwrap();

    let diags = run_rules_on_file(&bin, &Config::default(), None, None).unwrap();
    assert!(diags.is_empty(), "binary files are skipped");
}
|
||
|
||
/// `is_nonprod_path` must flag common non-production locations and leave
/// ordinary production paths alone.
#[test]
fn nonprod_path_detection() {
    // Paths that must be recognised as non-production.
    let nonprod_paths = [
        "project/tests/test_main.py",
        "src/__tests__/foo.js",
        "benches/bench.rs",
        "vendor/lib/foo.py",
        "src/build.rs",
        "dist/app.min.js",
        "examples/demo.py",
        "fixtures/data.json",
    ];
    for candidate in nonprod_paths {
        assert!(
            is_nonprod_path(Path::new(candidate)),
            "expected non-production path: {candidate}"
        );
    }

    // Production paths must NOT match.
    let prod_paths = ["src/main.rs", "lib/handler.py", "app/views.py"];
    for candidate in prod_paths {
        assert!(
            !is_nonprod_path(Path::new(candidate)),
            "expected production path: {candidate}"
        );
    }
}
|
||
|
||
/// `is_test_file` must recognise the test-file naming conventions of every
/// supported language and must not match ordinary production sources.
#[test]
fn test_file_detection_covers_all_supported_languages() {
    let test_files = [
        // JS / TS: the existing surface, kept as a regression guard.
        "src/foo.test.js",
        "src/foo.test.ts",
        "src/foo.spec.tsx",
        "src/foo.test.mjs",
        "src/__tests__/Component.jsx",
        // Python.
        "tests/test_login.py",
        "project/views_test.py",
        "project/tests/conftest.py",
        "project/foo_tests.py",
        // Java (JUnit / TestNG).
        "src/UserTest.java",
        "src/UserTests.java",
        "src/UserIT.java",
        // PHP (PHPUnit).
        "tests/unit/Gis/GisVisualizationTest.php",
        // Ruby (RSpec / Minitest).
        "spec/widget_spec.rb",
        "test/widget_test.rb",
        // Go.
        "pkg/auth/login_test.go",
        // Rust (uncommon but valid).
        "src/parser_test.rs",
        // C / C++.
        "src/auth_test.c",
        "src/auth_test.cpp",
        "tests/test_main.cc",
    ];
    for candidate in test_files {
        assert!(
            is_test_file(Path::new(candidate)),
            "expected test file: {candidate}"
        );
    }

    // Production paths must NOT match.
    let production_files = [
        "src/main.rs",
        "src/UserController.java",
        "app/views.py",
        "pkg/auth/login.go",
        "src/handler.go",
        "src/Foo.php",
        "src/Controllers/Operations.php",
    ];
    for candidate in production_files {
        assert!(
            !is_test_file(Path::new(candidate)),
            "expected production file: {candidate}"
        );
    }
}
|
||
|
||
/// `is_test_suppressible_pattern` must accept the noisy crypto / secrets /
/// transport rule ids across languages while rejecting rules that describe
/// real attack surface.
#[test]
fn test_suppressible_pattern_covers_cross_language_noise() {
    let suppressible = [
        // JS / TS: pre-existing surface, kept as a regression guard.
        "js.crypto.math_random",
        "ts.crypto.math_random",
        "js.secrets.hardcoded_secret",
        "ts.transport.fetch_http",
        // Cross-language extensions, added so weak crypto, hardcoded test
        // tokens and insecure RNG used as fixture seeds do not surface as
        // findings inside test modules.
        "php.crypto.md5",
        "php.crypto.sha1",
        "php.crypto.rand",
        "py.crypto.md5",
        "py.crypto.sha1",
        "rb.crypto.md5",
        "go.crypto.md5",
        "go.crypto.sha1",
        "go.secrets.hardcoded_key",
        "java.crypto.weak_digest",
        "java.crypto.insecure_random",
    ];
    for rule_id in suppressible {
        assert!(
            is_test_suppressible_pattern(rule_id),
            "expected suppressible in tests: {rule_id}"
        );
    }

    // Other security-relevant patterns must NOT be suppressed in tests: they
    // capture real attack surface that test fixtures themselves can
    // demonstrate (deserialization, command injection, taint flows).
    let never_suppressed = [
        "php.deser.unserialize",
        "py.deser.pickle_loads",
        "php.cmdi.system",
        "taint-unsanitised-flow",
        "cfg-unguarded-sink",
    ];
    for rule_id in never_suppressed {
        assert!(
            !is_test_suppressible_pattern(rule_id),
            "expected NOT suppressible in tests: {rule_id}"
        );
    }
}
|
||
|
||
/// `is_vendored_asset_path` must match minified/bundled assets and vendored
/// front-end directories, while leaving hand-authored sources and non-asset
/// files under `vendor/` in scope.
#[test]
fn vendored_asset_path_detection() {
    // (path, expected verdict), listed in the order the checks run.
    let expectations = [
        // Minified bundle filename markers always trigger.
        ("src/main/webapp/scripts/jquery-ui.custom.min.js", true),
        ("core/assets/htmx.min.js", true),
        ("public/app.bundle.js", true),
        ("dist/transliteration.umd.min.js", true),
        ("dist/lib.iife.js", true),
        ("css/site.min.css", true),
        // Path-component triggers: bower_components is unambiguous.
        ("bower_components/lodash/lodash.js", true),
        // `vendor/` triggers only for front-end asset extensions, so Go module
        // vendoring under `vendor/` keeps being scanned.
        ("core/assets/vendor/jquery/jquery.js", true),
        ("src/vendors/foo/lib.css", true),
        ("vendor/github.com/foo/bar/lib.go", false),
        ("vendor/github.com/foo/bar/lib.rs", false),
        // Hand-authored production paths must NOT match.
        ("src/main.js", false),
        ("app/components/Button.tsx", false),
        ("lib/handler.py", false),
        // Plain `.js` outside vendor/bower with no `.min` suffix stays in
        // scope even when the directory hints at third-party origin; the
        // engine's existing `is_nonprod_path` downgrade still fires for those.
        (
            "webapp/WEB-INF/view/scripts/jquery-ui/jquery-ui-timepicker-addon.js",
            false,
        ),
    ];

    for (candidate, expected) in expectations {
        assert!(
            is_vendored_asset_path(Path::new(candidate)) == expected,
            "is_vendored_asset_path({candidate}) should be {expected}"
        );
    }
}
|
||
|
||
/// `downgrade_severity` lowers a severity by one step and leaves `Low` as is.
#[test]
fn severity_downgrade_works() {
    let steps = [
        (Severity::High, Severity::Medium),
        (Severity::Medium, Severity::Low),
        (Severity::Low, Severity::Low),
    ];
    for (input, expected) in steps {
        assert_eq!(downgrade_severity(input), expected);
    }
}
|
||
|
||
/// Findings inside a `tests/` directory must not be reported at `High`
/// severity under the default configuration. The file is scanned a second
/// time with `include_nonprod = true`; that run is only required to succeed.
#[test]
fn nonprod_path_downgrades_findings() {
    let workspace = tempfile::tempdir().unwrap();

    // Place the fixture under a "tests" directory.
    let tests_dir = workspace.path().join("tests");
    std::fs::create_dir_all(&tests_dir).unwrap();
    let fixture = tests_dir.join("test_cmd.py");
    std::fs::write(
        &fixture,
        b"import os\ndef test():\n cmd = os.environ['X']\n os.system(cmd)\n",
    )
    .unwrap();

    // Default config: everything under tests/ should be downgraded (no HIGH).
    let default_cfg = Config::default();
    let default_findings = run_rules_on_file(&fixture, &default_cfg, None, None).unwrap();
    let still_high: Vec<_> = default_findings
        .iter()
        .filter(|d| d.severity == Severity::High)
        .collect();
    assert!(
        still_high.is_empty(),
        "Findings in tests/ should be downgraded from HIGH; got {:?}",
        still_high
    );

    // With include_nonprod=true the nonprod downgrade is not supposed to
    // apply. Not every diagnostic is necessarily HIGH, so no assertion is made
    // on the result; the scan merely has to complete successfully.
    let mut include_nonprod_cfg = Config::default();
    include_nonprod_cfg.scanner.include_nonprod = true;
    let _unchecked_findings =
        run_rules_on_file(&fixture, &include_nonprod_cfg, None, None).unwrap();
}
|
||
|
||
/// `is_call_all_args_literal` must report calls whose arguments are all
/// literals (directly, or via a file-level scalar binding) and must reject a
/// call that takes an unbound variable.
#[test]
fn constant_arg_suppression_works() {
    /// Parses `code` with `grammar`, runs `query_src` over the tree, and
    /// returns the `is_call_all_args_literal` verdict for the capture with
    /// index 0 of the first match. Panics when the query does not match.
    fn first_hit_is_all_literal(
        grammar: &tree_sitter::Language,
        code: &[u8],
        query_src: &str,
        slug: &str,
    ) -> bool {
        use tree_sitter::StreamingIterator;

        let mut parser = tree_sitter::Parser::new();
        parser.set_language(grammar).unwrap();
        let tree = parser.parse(code, None).unwrap();
        let query = tree_sitter::Query::new(grammar, query_src).unwrap();
        let mut cursor = tree_sitter::QueryCursor::new();
        let mut hits = cursor.matches(&query, tree.root_node(), code);
        let first_hit = hits.next().expect("query should match");
        let anchor = first_hit.captures.iter().find(|c| c.index == 0).unwrap();
        is_call_all_args_literal(anchor.node, code, slug)
    }

    const PHP_SYSTEM_QUERY: &str = r#"(function_call_expression
function: (name) @n (#match? @n "^(system)$"))
@vuln"#;
    const PY_OS_SYSTEM_QUERY: &str = r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "os")
attribute: (identifier) @fn (#eq? @fn "system")))
@vuln"#;
    const GO_EXEC_QUERY: &str = r#"(call_expression
function: (selector_expression
field: (field_identifier) @m (#eq? @m "Exec")))
@vuln"#;

    let php = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let python = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let go = tree_sitter::Language::from(tree_sitter_go::LANGUAGE);

    // PHP: system("echo health-ok") should be suppressed.
    assert!(
        first_hit_is_all_literal(
            &php,
            b"<?php\nsystem(\"echo health-ok\");\n",
            PHP_SYSTEM_QUERY,
            "php",
        ),
        "PHP system(\"echo health-ok\") should have all-literal args"
    );

    // Python: os.system("echo health-ok") should be suppressed.
    assert!(
        first_hit_is_all_literal(
            &python,
            b"import os\nos.system(\"echo health-ok\")\n",
            PY_OS_SYSTEM_QUERY,
            "python",
        ),
        "Python os.system(\"echo health-ok\") should have all-literal args"
    );

    // Python: os.system(cmd) should NOT be suppressed (variable arg).
    assert!(
        !first_hit_is_all_literal(
            &python,
            b"import os\nos.system(cmd)\n",
            PY_OS_SYSTEM_QUERY,
            "python",
        ),
        "Python os.system(cmd) should NOT have all-literal args"
    );

    // Python: os.system(DEFAULT_CMD) with module-level `DEFAULT_CMD = "ls -la"`
    // should be suppressed via the file-level scalar binding map.
    assert!(
        first_hit_is_all_literal(
            &python,
            b"import os\nDEFAULT_CMD = \"ls -la\"\nos.system(DEFAULT_CMD)\n",
            PY_OS_SYSTEM_QUERY,
            "python",
        ),
        "os.system(DEFAULT_CMD) with module-level scalar should be suppressed"
    );

    // Go: db.Exec(DriverName) with package-level `const DriverName = "postgres"`
    // should be suppressed via the file-level scalar binding map.
    assert!(
        first_hit_is_all_literal(
            &go,
            b"package main\nconst DriverName = \"postgres\"\nfunc f(db Db) { db.Exec(DriverName) }\n",
            GO_EXEC_QUERY,
            "go",
        ),
        "db.Exec(DriverName) with package-level const should be suppressed"
    );
}
|
||
|
||
/// Test helper: compiles `query_str` against the Python grammar, runs it over
/// `tree`, and hands back the node captured at index 0 in the first match.
///
/// Panics if the query fails to compile, produces no match, or the first
/// match has no capture with index 0.
#[cfg(test)]
fn first_python_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut walker = tree_sitter::QueryCursor::new();
    let mut hits = walker.matches(&compiled, tree.root_node(), code);
    let first_hit = hits.next().expect("query should match");
    first_hit
        .captures
        .iter()
        .find_map(|c| (c.index == 0).then_some(c.node))
        .expect("capture index 0")
}
|
||
|
||
/// Test helper: compiles `query_str` against the Ruby grammar, runs it over
/// `tree`, and hands back the node captured at index 0 in the first match.
///
/// Panics if the query fails to compile, produces no match, or the first
/// match has no capture with index 0.
#[cfg(test)]
fn first_ruby_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut walker = tree_sitter::QueryCursor::new();
    let mut hits = walker.matches(&compiled, tree.root_node(), code);
    let first_hit = hits.next().expect("query should match");
    first_hit
        .captures
        .iter()
        .find_map(|c| (c.index == 0).then_some(c.node))
        .expect("capture index 0")
}
|
||
|
||
/// Test helper: compiles `query_str` against the PHP grammar, runs it over
/// `tree`, and hands back the node captured at index 0 in the first match.
///
/// Panics if the query fails to compile, produces no match, or the first
/// match has no capture with index 0.
#[cfg(test)]
fn first_php_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut walker = tree_sitter::QueryCursor::new();
    let mut hits = walker.matches(&compiled, tree.root_node(), code);
    let first_hit = hits.next().expect("query should match");
    first_hit
        .captures
        .iter()
        .find_map(|c| (c.index == 0).then_some(c.node))
        .expect("capture index 0")
}
|
||
|
||
/// `is_php_include_param_passthrough` must accept an `include` of an
/// untouched function/closure/method parameter and reject locals, reassigned
/// parameters, and top-level includes.
#[test]
fn php_include_param_passthrough_recognises_canonical_shapes() {
    let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let include_query = r#"(include_expression (variable_name)) @vuln"#;

    // Parses one PHP snippet and evaluates the predicate on its first
    // `include <variable>` expression.
    let mut is_passthrough = |code: &[u8]| -> bool {
        let tree = parser.parse(code, None).unwrap();
        let include_node = first_php_capture(&tree, code, include_query);
        is_php_include_param_passthrough(include_node, code)
    };

    // Closure parameter pass-through (composer ClassLoader idiom).
    assert!(
        is_passthrough(b"<?php\nstatic $cb = function ($file) { include $file; };\n"),
        "closure param pass-through should be recognised"
    );

    // Method parameter pass-through.
    assert!(
        is_passthrough(b"<?php\nclass C { function f(string $file): void { include $file; } }\n"),
        "method param pass-through should be recognised"
    );

    // Local variable assigned from concat, NOT a pass-through.
    assert!(
        !is_passthrough(
            b"<?php\nclass C { function f(string $base): void { $f = $base . '/x.php'; include $f; } }\n"
        ),
        "concat-built local should NOT be treated as pass-through"
    );

    // Param reassigned before include, NOT a pass-through.
    assert!(
        !is_passthrough(b"<?php\nfunction f($file) { $file = $_GET['x']; include $file; }\n"),
        "reassigned param should NOT be treated as pass-through"
    );

    // Top-level (no enclosing function), NOT a pass-through.
    assert!(
        !is_passthrough(b"<?php\n$file = $_GET['x'];\ninclude $file;\n"),
        "top-level include should NOT be treated as pass-through"
    );
}
|
||
|
||
/// `is_php_unserialize_allowed_classes_restricted` must accept `unserialize`
/// calls whose options array restricts `allowed_classes`, and reject calls
/// with `allowed_classes => true`, no options, or a dynamic options variable.
#[test]
fn php_unserialize_allowed_classes_recognises_safe_forms() {
    let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let unserialize_query =
        r#"(function_call_expression function: (name) @n (#eq? @n "unserialize")) @vuln"#;

    // Parses one PHP snippet and evaluates the predicate on its first
    // `unserialize(...)` call.
    let mut is_restricted = |code: &[u8]| -> bool {
        let tree = parser.parse(code, None).unwrap();
        let call_node = first_php_capture(&tree, code, unserialize_query);
        is_php_unserialize_allowed_classes_restricted(call_node, code)
    };

    // allowed_classes => false
    assert!(
        is_restricted(b"<?php\n$x = unserialize($d, ['allowed_classes' => false]);\n"),
        "allowed_classes => false should be recognised as safe"
    );

    // allowed_classes => [Foo::class] (explicit class-list array)
    assert!(
        is_restricted(b"<?php\n$x = unserialize($d, ['allowed_classes' => [Foo::class]]);\n"),
        "allowed_classes => [array] should be recognised as safe"
    );

    // allowed_classes => self::A (class constant reference)
    assert!(
        is_restricted(
            b"<?php\nclass C { const A = []; function f($d) { return unserialize($d, ['allowed_classes' => self::A]); } }\n"
        ),
        "allowed_classes => self::CONST should be recognised as safe"
    );

    // allowed_classes => true, unsafe default, must NOT be suppressed
    assert!(
        !is_restricted(b"<?php\n$x = unserialize($d, ['allowed_classes' => true]);\n"),
        "allowed_classes => true is the unsafe default, should NOT be suppressed"
    );

    // No second arg, must NOT be suppressed
    assert!(
        !is_restricted(b"<?php\n$x = unserialize($d);\n"),
        "single-arg unserialize should NOT be suppressed"
    );

    // Dynamic options variable, must NOT be suppressed
    assert!(
        !is_restricted(b"<?php\n$x = unserialize($d, $opts);\n"),
        "dynamic options variable should NOT be suppressed"
    );
}
|
||
|
||
/// `is_php_unserialize_magic_method_passthrough` must accept only the
/// one-parameter `unserialize` method that passes its bare parameter straight
/// to `unserialize()`, and reject every other shape exercised below.
#[test]
fn php_unserialize_magic_method_passthrough_recognises_serializable_contract() {
    let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let unserialize_query =
        r#"(function_call_expression function: (name) @n (#eq? @n "unserialize")) @vuln"#;

    // Parses one PHP snippet and evaluates the predicate on its first
    // `unserialize(...)` call.
    let mut is_magic_passthrough = |code: &[u8]| -> bool {
        let tree = parser.parse(code, None).unwrap();
        let call_node = first_php_capture(&tree, code, unserialize_query);
        is_php_unserialize_magic_method_passthrough(call_node, code)
    };

    // Canonical Serializable::unserialize delegating to __unserialize.
    assert!(
        is_magic_passthrough(
            b"<?php\nclass R {\n public function unserialize($serialized): void {\n $this->__unserialize(unserialize($serialized));\n }\n}\n"
        ),
        "Serializable::unserialize($x) → unserialize($x) should be suppressed"
    );

    // Multi-target list-destructuring assignment shape (Joomla Cli/Input).
    assert!(
        is_magic_passthrough(
            b"<?php\nclass C {\n public function unserialize($input) {\n [$this->a, $this->b] = unserialize($input);\n }\n}\n"
        ),
        "list-destructuring inside Serializable::unserialize should be suppressed"
    );

    // Case-insensitive method name (PHP semantics).
    assert!(
        is_magic_passthrough(
            b"<?php\nclass C { public function UnSerialize($d) { return unserialize($d); } }\n"
        ),
        "method name should match case-insensitively (PHP)"
    );

    // Free function `unserialize` is NOT a magic method, must NOT be suppressed.
    assert!(
        !is_magic_passthrough(b"<?php\nfunction load($d) { return unserialize($d); }\n"),
        "free function should NOT be suppressed"
    );

    // Different method name, NOT a Serializable contract, must NOT be suppressed.
    assert!(
        !is_magic_passthrough(
            b"<?php\nclass C { public function decode($d) { return unserialize($d); } }\n"
        ),
        "method named `decode` should NOT be suppressed"
    );

    // Method named `unserialize` but with TWO params, NOT the magic signature,
    // must NOT be suppressed.
    assert!(
        !is_magic_passthrough(
            b"<?php\nclass C { public function unserialize($d, $opts) { return unserialize($d, $opts); } }\n"
        ),
        "two-param method named unserialize should NOT be suppressed"
    );

    // Magic-method signature but the call argument is NOT the formal param:
    // the user is unserializing some other source. Must NOT be suppressed.
    assert!(
        !is_magic_passthrough(
            b"<?php\nclass C { public function unserialize($input) { return unserialize($_GET['x']); } }\n"
        ),
        "non-pass-through arg inside magic method should NOT be suppressed"
    );

    // Wrapped argument (`unserialize(trim($input))`) is NOT a bare-param
    // pass-through, so keep firing. This shape covers cache/session
    // pass-throughs that the rule should still surface.
    assert!(
        !is_magic_passthrough(
            b"<?php\nclass C { public function unserialize($input) { return unserialize(trim($input)); } }\n"
        ),
        "wrapped argument inside magic method should NOT be suppressed (conservative)"
    );

    // Anonymous function (defensive: anonymous_function is not a
    // method_declaration).
    assert!(
        !is_magic_passthrough(b"<?php\n$f = function($input) { return unserialize($input); };\n"),
        "closure should NOT be suppressed"
    );
}
|
||
|
||
/// Exercises `is_php_unserialize_inside_phpunit_assertion` against a series
/// of small PHP fixtures. Each fixture is parsed, the first `unserialize`
/// call is captured, and the recogniser's verdict is compared with the
/// expected one (`true` = finding suppressed, `false` = finding kept).
#[test]
fn php_unserialize_inside_phpunit_assertion_recognises_roundtrip_shapes() {
    let php = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&php).unwrap();
    let query = r#"(function_call_expression function: (name) @n (#eq? @n "unserialize")) @vuln"#;

    // Shared driver: parse the fixture, take the first capture, check the verdict.
    let mut check = |src: &[u8], suppressed: bool, msg: &str| {
        let tree = parser.parse(src, None).unwrap();
        let node = first_php_capture(&tree, src, query);
        assert!(
            is_php_unserialize_inside_phpunit_assertion(node, src) == suppressed,
            "{msg}"
        );
    };

    // Positives ----------------------------------------------------------

    // Canonical `assertSame` with an array literal as the expected value.
    check(
        b"<?php\nclass T { public function t() { $this->assertSame(['a' => 1], unserialize($b)); } }\n",
        true,
        "assertSame(literal array, unserialize($x)) should be suppressed",
    );

    // `assertEquals` with a scalar string literal as the expected value.
    check(
        b"<?php\nclass T { public function t() { $this->assertEquals('hello', unserialize($b)); } }\n",
        true,
        "assertEquals(literal string, unserialize($x)) should be suppressed",
    );

    // Static dispatch through `static::`.
    check(
        b"<?php\nclass T { public function t() { static::assertSame(['x'], unserialize($b)); } }\n",
        true,
        "static::assertSame should be suppressed",
    );

    // Static dispatch through `self::`.
    check(
        b"<?php\nclass T { public function t() { self::assertEquals(['y'], unserialize($b)); } }\n",
        true,
        "self::assertEquals should be suppressed",
    );

    // Single-argument verb `assertNull`: the verb itself bounds the result.
    check(
        b"<?php\nclass T { public function t() { $this->assertNull(unserialize($b)); } }\n",
        true,
        "assertNull(unserialize($x)) should be suppressed (verb bounds the result)",
    );

    // Single-argument verb `assertIsArray`.
    check(
        b"<?php\nclass T { public function t() { $this->assertIsArray(unserialize($b)); } }\n",
        true,
        "assertIsArray(unserialize($x)) should be suppressed",
    );

    // Mixed-case method name; PHP method names are case-insensitive.
    check(
        b"<?php\nclass T { public function t() { $this->AssertSame(['z'], unserialize($b)); } }\n",
        true,
        "method name should match case-insensitively",
    );

    // Negatives ----------------------------------------------------------

    // `unserialize` outside of any assertion call keeps firing.
    check(
        b"<?php\n$x = unserialize($_GET['blob']);\n",
        false,
        "unserialize outside any assertion should NOT be suppressed",
    );

    // `assertEquals` whose first argument is a variable rather than a
    // literal: the result is not statically pinned.
    check(
        b"<?php\nclass T { public function t($e) { $this->assertEquals($e, unserialize($b)); } }\n",
        false,
        "assertEquals($computed, unserialize($x)) should NOT be suppressed",
    );

    // Single-argument call to an unrecognised `assert*` verb.
    check(
        b"<?php\nclass T { public function t() { $this->assertSomethingCustom(unserialize($b)); } }\n",
        false,
        "1-arg unknown assertion verb should NOT be suppressed",
    );

    // The call is wrapped in a `?:` expression, so `unserialize` is no
    // longer the direct argument. Conservative: keep firing.
    check(
        b"<?php\nclass T { public function t() { $this->assertSame(['x'], unserialize($b) ?: []); } }\n",
        false,
        "wrapped (ternary) unserialize argument should NOT be suppressed",
    );

    // Method whose name does not start with `assert`.
    check(
        b"<?php\nclass T { public function t() { $this->log(['x'], unserialize($b)); } }\n",
        false,
        "non-assert method should NOT be suppressed",
    );

    // `assertSame` called with the deserialisation as its only argument, so
    // there is no expected value to bound it — defensive.
    check(
        b"<?php\nclass T { public function t() { $this->assertSame(unserialize($b)); } }\n",
        false,
        "single-arg `assertSame(unserialize($x))` should NOT be suppressed (no expected)",
    );
}
|
||
|
||
/// Exercises `is_python_deser_inside_unittest_assertion` on `self.assert*`
/// method-call shapes. Each fixture is parsed, the first deserialisation
/// call matching the given query is captured, and the recogniser's verdict
/// is compared with the expected one (`true` = suppressed, `false` = kept).
#[test]
fn python_deser_inside_unittest_assertion_recognises_roundtrip_shapes() {
    let python = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&python).unwrap();

    // Pickle pattern equivalent: capture the `pickle` identifier under the
    // deser call's `function.object` path.
    let attr_query = r#"(call function: (attribute object: (identifier) @pkg (#eq? @pkg "pickle") attribute: (identifier) @fn (#match? @fn "^loads?$"))) @vuln"#;
    // Variant for free-function calls: matches a bare identifier callee.
    let bare_query = r#"(call function: (identifier) @fn (#match? @fn "^loads?$")) @vuln"#;

    // Shared driver: parse the fixture, take the first capture of `query`,
    // check the verdict.
    let mut check = |query: &str, src: &[u8], suppressed: bool, msg: &str| {
        let tree = parser.parse(src, None).unwrap();
        let node = first_python_capture(&tree, src, query);
        assert!(
            is_python_deser_inside_unittest_assertion(node, src) == suppressed,
            "{msg}"
        );
    };

    // Positives ----------------------------------------------------------

    // Canonical `assertEqual` with a dict literal as the expected value.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertEqual({'a': 1}, pickle.loads(b))\n",
        true,
        "assertEqual(dict literal, pickle.loads(b)) should be suppressed",
    );

    // `assertEquals` with a list literal as the expected value.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertEquals([1, 2, 3], pickle.loads(b))\n",
        true,
        "assertEquals(list literal, pickle.loads(b)) should be suppressed",
    );

    // Reversed ordering: deser call first, literal second.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertEqual(pickle.loads(b), {'k': 'v'})\n",
        true,
        "assertEqual(pickle.loads(b), dict literal) should be suppressed",
    );

    // Unary-negative numeric literal as the expected value.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertEqual(-7, pickle.loads(b))\n",
        true,
        "assertEqual(unary-negative literal, pickle.loads(b)) should be suppressed",
    );

    // Single-argument verb `assertIsNone`.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertIsNone(pickle.loads(b))\n",
        true,
        "assertIsNone(pickle.loads(b)) should be suppressed (verb bounds)",
    );

    // Single-argument verb `assertTrue`.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertTrue(pickle.loads(b))\n",
        true,
        "assertTrue(pickle.loads(b)) should be suppressed (verb bounds)",
    );

    // `assertIsInstance(value, type)`.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertIsInstance(pickle.loads(b), dict)\n",
        true,
        "assertIsInstance(pickle.loads(b), dict) should be suppressed (type bounds)",
    );

    // A trailing `msg=` keyword argument is informational only; the
    // literal-positional bound is still satisfied.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertEqual([1], pickle.loads(b), msg='preserve')\n",
        true,
        "msg= kwarg should not break the literal-positional bound",
    );

    // Free-function shape (`from pickle import loads`), captured with the
    // bare-identifier query.
    check(
        bare_query,
        b"from pickle import loads\nclass T:\n def t(self, b):\n self.assertEqual([1], loads(b))\n",
        true,
        "assertEqual(literal, loads(b)) for `from pickle import loads` should be suppressed",
    );

    // Negatives ----------------------------------------------------------

    // Production call with no assertion wrapped around it.
    check(
        attr_query,
        b"import pickle\ndef handler(blob):\n return pickle.loads(blob)\n",
        false,
        "production pickle.loads should NOT be suppressed",
    );

    // Expected value is a variable, not a literal.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b, expected):\n self.assertEqual(expected, pickle.loads(b))\n",
        false,
        "assertEqual(non-literal, pickle.loads(b)) should NOT be suppressed",
    );

    // Method whose name is not an `assert*` verb.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.checkEqual([1], pickle.loads(b))\n",
        false,
        "checkEqual (non-assert verb) should NOT be suppressed",
    );

    // Deser call wrapped in a conditional expression: the bound is broken.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b, c):\n self.assertEqual([1], pickle.loads(b) if c else [])\n",
        false,
        "ternary wrapping pickle.loads should NOT be suppressed",
    );

    // Unrecognised single-argument `assert*` verb.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b):\n self.assertCustomCheck(pickle.loads(b))\n",
        false,
        "assertCustomCheck single-arg should NOT be suppressed (verb not in bounding set)",
    );

    // `assertEqual` where neither positional argument is a literal.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b, e):\n self.assertEqual(e, pickle.loads(b))\n",
        false,
        "two non-literal positional args should NOT be suppressed",
    );

    // Expected value is an f-string with interpolation.
    check(
        attr_query,
        b"import pickle\nclass T:\n def t(self, b, x):\n self.assertEqual(f'pre-{x}', pickle.loads(b))\n",
        false,
        "f-string expected (interpolation) should NOT be suppressed",
    );
}
|
||
|
||
/// Invariants for the pytest plain-`assert` round-trip recogniser. The
/// entry point is the same one the unittest-style test uses (the function
/// handles both idioms); here the asserted shape sits under an
/// `assert_statement` rather than a `unittest.TestCase` method call.
#[test]
fn python_deser_inside_pytest_assert_recognises_roundtrip_shapes() {
    let python = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&python).unwrap();
    let query = r#"(call function: (attribute object: (identifier) @pkg (#eq? @pkg "pickle") attribute: (identifier) @fn (#match? @fn "^loads?$"))) @vuln"#;

    // Shared driver: parse the fixture, take the first capture, check the verdict.
    let mut check = |src: &[u8], suppressed: bool, msg: &str| {
        let tree = parser.parse(src, None).unwrap();
        let node = first_python_capture(&tree, src, query);
        assert!(
            is_python_deser_inside_unittest_assertion(node, src) == suppressed,
            "{msg}"
        );
    };

    // Positives ----------------------------------------------------------

    // assert deser == LITERAL
    check(
        b"import pickle\ndef t(b):\n assert pickle.loads(b) == [1, 2, 3]\n",
        true,
        "assert deser == [literal] should be suppressed",
    );

    // assert deser is None
    check(
        b"import pickle\ndef t(b):\n assert pickle.loads(b) is None\n",
        true,
        "assert deser is None should be suppressed",
    );

    // assert deser in [LITERAL, ...]
    check(
        b"import pickle\ndef t(b):\n assert pickle.loads(b) in [1, 2, 3]\n",
        true,
        "assert deser in [literal] should be suppressed",
    );

    // assert deser (bare truthiness check)
    check(
        b"import pickle\ndef t(b):\n assert pickle.loads(b)\n",
        true,
        "assert deser (truthy bare) should be suppressed",
    );

    // assert not deser
    check(
        b"import pickle\ndef t(b):\n assert not pickle.loads(b)\n",
        true,
        "assert not deser should be suppressed",
    );

    // assert isinstance(deser, dict)
    check(
        b"import pickle\ndef t(b):\n assert isinstance(pickle.loads(b), dict)\n",
        true,
        "assert isinstance(deser, dict) should be suppressed",
    );

    // assert (deser == LITERAL) — comparison wrapped in parentheses.
    check(
        b"import pickle\ndef t(b):\n assert (pickle.loads(b) == [1])\n",
        true,
        "assert (deser == literal) with paren wrap should be suppressed",
    );

    // assert deser == LITERAL, "msg"
    check(
        b"import pickle\ndef t(b):\n assert pickle.loads(b) == 1, 'round trip'\n",
        true,
        "assert deser == literal, msg should be suppressed (msg is named_child(1))",
    );

    // assert bool(deser)
    check(
        b"import pickle\ndef t(b):\n assert bool(pickle.loads(b))\n",
        true,
        "assert bool(deser) should be suppressed",
    );

    // assert len(deser) == 3
    check(
        b"import pickle\ndef t(b):\n assert len(pickle.loads(b)) == 3\n",
        true,
        "assert len(deser) == int_literal should be suppressed",
    );

    // Negatives ----------------------------------------------------------

    // assert deser and X — boolean operator, must keep firing.
    check(
        b"import pickle\ndef t(b, x):\n assert pickle.loads(b) and x\n",
        false,
        "assert deser and X (boolean op) should NOT be suppressed",
    );

    // assert (deser if cond else X) — conditional expression.
    check(
        b"import pickle\ndef t(b, c):\n assert (pickle.loads(b) if c else 0)\n",
        false,
        "assert (deser if c else x) should NOT be suppressed",
    );

    // assert wrapper(deser) == LITERAL — an arbitrary user function breaks
    // the bound.
    check(
        b"import pickle\ndef t(b):\n assert wrapper(pickle.loads(b)) == [1]\n",
        false,
        "assert wrapper(deser) == literal should NOT be suppressed",
    );

    // assert deser == non-literal — the bound depends on a dynamic variable.
    check(
        b"import pickle\ndef t(b, e):\n assert pickle.loads(b) == e\n",
        false,
        "assert deser == non_literal should NOT be suppressed",
    );

    // assert isinstance(deser, t) where `t` is computed at runtime.
    // `t` is an `identifier` and `is_python_type_reference` accepts
    // identifier (assertIsInstance treats user-class identifiers as type
    // references), so this case stays suppressed. Pinned to document the
    // matching behaviour rather than tighten it.
    check(
        b"import pickle\ndef t(b):\n t = some_type_factory()\n assert isinstance(pickle.loads(b), t)\n",
        true,
        "assert isinstance(deser, identifier) treats identifier as type ref",
    );

    // Assignment-then-assert: the deser call sits in
    // `actual = pickle.loads(b)`, not under the assert. Must keep firing.
    check(
        b"import pickle\ndef t(b):\n actual = pickle.loads(b)\n assert actual == [1]\n",
        false,
        "deser bound to a name then asserted should NOT be suppressed (assignment context)",
    );
}
|
||
|
||
/// Ruby Layer C5 invariants. The recogniser has to accept Minitest
/// `assert_*`/`refute_*` shapes and RSpec `expect(_).to MATCHER` shapes,
/// and has to reject production calls, dynamic expected values and
/// unrelated wrappers.
#[test]
fn ruby_deser_inside_test_assertion_recognises_roundtrip_shapes() {
    let ruby = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&ruby).unwrap();
    // Capture the `Marshal` constant under the deser call's `receiver` field.
    let query = r#"(call receiver: (constant) @recv (#eq? @recv "Marshal") method: (identifier) @m (#eq? @m "load")) @vuln"#;

    // Shared driver: parse the fixture, take the first capture, check the verdict.
    let mut check = |src: &[u8], suppressed: bool, msg: &str| {
        let tree = parser.parse(src, None).unwrap();
        let node = first_ruby_capture(&tree, src, query);
        assert!(
            is_ruby_deser_inside_test_assertion(node, src) == suppressed,
            "{msg}"
        );
    };

    // Positives ----------------------------------------------------------

    // Minitest assert_equal LITERAL, deser
    check(
        b"class T\n def t(b)\n assert_equal [1, 2, 3], Marshal.load(b)\n end\nend\n",
        true,
        "assert_equal [literal], Marshal.load(b) should be suppressed",
    );

    // Minitest assert_nil
    check(
        b"class T\n def t(b)\n assert_nil Marshal.load(b)\n end\nend\n",
        true,
        "assert_nil Marshal.load(b) should be suppressed",
    );

    // Minitest single-argument truthy assert
    check(
        b"class T\n def t(b)\n assert Marshal.load(b)\n end\nend\n",
        true,
        "assert Marshal.load(b) (truthy) should be suppressed",
    );

    // Minitest assert_kind_of TYPE, deser
    check(
        b"class T\n def t(b)\n assert_kind_of Array, Marshal.load(b)\n end\nend\n",
        true,
        "assert_kind_of TYPE, deser should be suppressed",
    );

    // Minitest refute_equal
    check(
        b"class T\n def t(b)\n refute_equal [9, 9], Marshal.load(b)\n end\nend\n",
        true,
        "refute_equal [literal], deser should be suppressed",
    );

    // RSpec expect(deser).to eq(LITERAL)
    check(
        b"describe X do\n it 'x' do\n expect(Marshal.load(b)).to eq([1, 2, 3])\n end\nend\n",
        true,
        "expect(deser).to eq([literal]) should be suppressed",
    );

    // RSpec expect(deser).to be_nil
    check(
        b"describe X do\n it 'x' do\n expect(Marshal.load(b)).to be_nil\n end\nend\n",
        true,
        "expect(deser).to be_nil should be suppressed",
    );

    // RSpec expect(deser).to be_a(TYPE)
    check(
        b"describe X do\n it 'x' do\n expect(Marshal.load(b)).to be_a(Array)\n end\nend\n",
        true,
        "expect(deser).to be_a(TYPE) should be suppressed",
    );

    // RSpec not_to be_nil
    check(
        b"describe X do\n it 'x' do\n expect(Marshal.load(b)).not_to be_nil\n end\nend\n",
        true,
        "expect(deser).not_to be_nil should be suppressed",
    );

    // Negatives ----------------------------------------------------------

    // Production call with no assertion around it keeps firing.
    check(
        b"def handler(blob)\n Marshal.load(blob)\nend\n",
        false,
        "production Marshal.load should NOT be suppressed",
    );

    // assert_equal with a dynamic expected value keeps firing.
    check(
        b"class T\n def t(b, expected)\n assert_equal expected, Marshal.load(b)\n end\nend\n",
        false,
        "assert_equal non_literal, deser should NOT be suppressed",
    );

    // RSpec expect(deser).to eq(dynamic) keeps firing.
    check(
        b"describe X do\n it 'x' do\n expect(Marshal.load(b)).to eq(expected)\n end\nend\n",
        false,
        "expect(deser).to eq(non_literal) should NOT be suppressed",
    );

    // Custom verb that is not in the bounding sets keeps firing.
    check(
        b"class T\n def t(b)\n custom_check Marshal.load(b)\n end\nend\n",
        false,
        "non-assertion-verb wrap should NOT be suppressed",
    );

    // Old-style RSpec `.should == LIT` parses as `binary`, not the expected
    // receiver-method-arguments shape, so it keeps firing.
    check(
        b"describe X do\n it 'x' do\n Marshal.load(b).should == [1]\n end\nend\n",
        false,
        "old-style .should == LIT should NOT be suppressed",
    );
}
|
||
|
||
#[test]
|
||
fn php_weak_hash_non_crypto_use_recognises_canonical_shapes() {
|
||
let mut parser = tree_sitter::Parser::new();
|
||
let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
|
||
parser.set_language(&lang).unwrap();
|
||
let q = r#"(function_call_expression function: (name) @n (#match? @n "^(md5|sha1)$")) @vuln"#;
|
||
|
||
// ETag concat returned from getETag() — return-statement enclosing
|
||
// method name path.
|
||
let code = b"<?php\nclass C { public function getETag(): string { return '\"' . md5($this->data) . '\"'; } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"getETag concat should be suppressed"
|
||
);
|
||
|
||
// Array element value with a string-literal key whose name is non-crypto.
|
||
let code = b"<?php\nfunction f($x) { return ['table_name_hash' => md5($x)]; }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"array element with `*_hash` key should be suppressed"
|
||
);
|
||
|
||
// Subscript LHS with a string-literal index `'etag'`.
|
||
let code = b"<?php\nfunction f($x, &$row) { $row['etag'] = md5($x); }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"subscript LHS with 'etag' key should be suppressed"
|
||
);
|
||
|
||
// Member-access LHS named `storageId` (camelCase boundary on `Id` suffix).
|
||
let code = b"<?php\nclass C { function f() { $this->storageId = md5($this->id); } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"member-access LHS `storageId` should be suppressed"
|
||
);
|
||
|
||
// Null-coalescing assignment with subscript LHS.
|
||
let code = b"<?php\nfunction f($t, &$tables) { $tables[$t]['hash'] ??= md5($t); }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"??= subscript LHS with 'hash' key should be suppressed"
|
||
);
|
||
|
||
// Call result used as an array index.
|
||
let code = b"<?php\nfunction f($a, $x) { return $a[md5($x)]; }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"md5 used as subscript index should be suppressed"
|
||
);
|
||
|
||
// Cache-style lookup verb (`$cache->get(sha1(...))`).
|
||
let code = b"<?php\nclass C { public $cache; function f($u) { return $this->cache->get(sha1($u)); } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"method call to lookup-verb `get(sha1(..))` should be suppressed"
|
||
);
|
||
|
||
// Createnamedparameter wrapper around md5 inside an array element value.
|
||
let code = b"<?php\nclass C { public $q; function f($d) { $this->q->insert('t')->values(['etag' => $this->q->createNamedParameter(md5($d))]); } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"wrapper-call inside array element with `etag` key should be suppressed"
|
||
);
|
||
|
||
// Dynamic-index subscript LHS with a non-crypto receiver name.
|
||
let code = b"<?php\nfunction f($cols) { $columnNamesHashes = []; foreach ($cols as $c) { $columnNamesHashes[$c] = md5($c); } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
is_php_weak_hash_non_crypto_use(cap, code),
|
||
"subscript LHS with dynamic index — receiver name `*Hashes` should drive suppression"
|
||
);
|
||
|
||
// Crypto consumer — keep firing. $this->password = md5($pwd).
|
||
let code =
|
||
b"<?php\nclass C { public $password; function f($p) { $this->password = md5($p); } }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
!is_php_weak_hash_non_crypto_use(cap, code),
|
||
"$this->password = md5(...) is crypto storage and must NOT be suppressed"
|
||
);
|
||
|
||
// Compound name with crypto-keyword substring. $tokenHash = md5(...).
|
||
let code = b"<?php\nfunction f($x) { $tokenHash = md5($x); }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
!is_php_weak_hash_non_crypto_use(cap, code),
|
||
"$tokenHash compound name must NOT be suppressed (contains 'token')"
|
||
);
|
||
|
||
// pw_hash compound — must NOT be suppressed.
|
||
let code = b"<?php\nfunction f($p) { $pw_hash = md5($p); }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
!is_php_weak_hash_non_crypto_use(cap, code),
|
||
"$pw_hash compound name must NOT be suppressed"
|
||
);
|
||
|
||
// Bare statement / unrecognised consumer — keep firing.
|
||
let code = b"<?php\nfunction f($x) { var_dump(md5($x)); }\n";
|
||
let tree = parser.parse(code, None).unwrap();
|
||
let cap = first_php_capture(&tree, code, q);
|
||
assert!(
|
||
!is_php_weak_hash_non_crypto_use(cap, code),
|
||
"var_dump(md5(...)) has no recognisable consumer name and must NOT be suppressed"
|
||
);
|
||
}
|
||
|
||
/// Table-driven check of `name_is_non_crypto`: each entry pairs an identifier
/// with the classification the predicate is expected to return. Entries are
/// evaluated in order, so the first mismatch reported is the earliest one.
#[test]
fn name_is_non_crypto_recognises_word_boundary_suffixes() {
    let cases: &[(&str, bool)] = &[
        // A safe suffix standing alone, or following an underscore.
        ("hash", true),
        ("etag", true),
        ("table_name_hash", true),
        ("table_id", true),
        ("cache_key", true),
        // A safe suffix introduced by a camelCase hump.
        ("storageId", true),
        ("tableHash", true),
        ("sqlMd5", true),
        ("cacheBuster", true),
        // A long suffix (four or more characters) glued on with no boundary.
        ("columnnameshashes", true),
        ("tablefingerprint", true),
        // A short suffix preceded by a digit rather than a letter.
        ("v1id", true),
        // Compound names carrying a crypto keyword must keep firing.
        ("password_hash", false),
        ("hashedPassword", false),
        ("tokenHash", false),
        ("signatureHash", false),
        ("pw_hash", false),
        ("digest", false),
        ("hmac", false),
        ("salt", false),
        ("private_key", false),
        // Bare `key`/`keys` and `apiKey`-style names are credential
        // candidates and must keep firing, while the specific safe compounds
        // `cache_key`/`cachekey` are still expected to be suppressed
        // (presumably through dedicated `SAFE_SUFFIXES` entries; that table
        // is defined elsewhere).
        ("key", false),
        ("keys", false),
        ("apiKey", false),
        ("api_key", false),
        ("apiKeyHash", false),
        ("api_key_hash", false),
        ("cache_key", true),
        ("cachekey", true),
        // Ordinary words that merely end in `id`: two-character suffix with
        // no word boundary in front of it, so no classification.
        ("said", false),
        ("void", false),
        ("rapid", false),
        // Generic names with no recognisable suffix, plus the empty string.
        ("x", false),
        ("result", false),
        ("output", false),
        ("", false),
        // A non-ASCII letter in front of a short suffix is not a word
        // boundary, so `tëid` stays unclassified.
        ("tëid", false),
        // A non-ASCII letter in front of a long suffix still classifies
        // through the length fallback, like `columnnameshashes` above.
        ("tëhash", true),
        // An underscore boundary keeps working after a non-ASCII letter.
        ("tablë_id", true),
    ];

    for &(name, expected) in cases {
        assert_eq!(
            name_is_non_crypto(name),
            expected,
            "name_is_non_crypto({:?})",
            name
        );
    }
}
|
||
|
||
/// Table-driven check of `method_is_lookup_verb`: each entry pairs a method
/// name with the expected verdict, evaluated in order.
#[test]
fn method_is_lookup_verb_recognises_cache_verbs() {
    let cases: &[(&str, bool)] = &[
        // Plain lookup / storage verbs.
        ("get", true),
        ("set", true),
        ("has", true),
        ("delete", true),
        ("fetch", true),
        ("getItem", true),
        ("setItem", true),
        // Verb prefix combined with a non-crypto suffix.
        ("getCacheKey", true),
        ("setCacheKey", true),
        ("buildKey", true),
        ("createId", true),
        ("hasFingerprint", true),
        // Crypto comparison / transformation helpers and unrelated method
        // names must not be treated as lookup verbs.
        ("hash_equals", false),
        ("verify", false),
        ("password_verify", false),
        ("decrypt", false),
        ("encrypt", false),
        ("sign", false),
        ("invoke", false),
        ("doSomething", false),
    ];

    for &(method, expected) in cases {
        assert_eq!(
            method_is_lookup_verb(method),
            expected,
            "method_is_lookup_verb({:?})",
            method
        );
    }
}
|
||
|
||
/// Table-driven check of `sprintf_format_is_safe`: each entry pairs a format
/// string with the expected verdict, evaluated in order.
#[test]
fn sprintf_format_safety_classifier() {
    let cases: &[(&str, bool)] = &[
        // No conversions at all, or only numeric / char / pointer
        // conversions, whose output length is bounded by definition.
        ("", true),
        ("hello world", true),
        ("%d", true),
        ("%lld%c", true),
        ("fixed=%d/%c", true),
        ("%5d %x %llo", true),
        ("%%literal-percent", true),
        ("%p", true),
        // `%s` with an explicit precision: output is capped at the precision.
        (" %.*s", true),
        ("%.5s", true),
        ("[%-.10s]", true),
        // Bare `%s`, or `%s` with only a width. A width is a minimum, so the
        // output length is unbounded and the format must not count as safe.
        ("%s", false),
        ("hello %s world", false),
        ("%5s", false),
        ("[%-20s]", false),
        // Unknown or incomplete conversions are refused conservatively.
        ("%S", false),
        ("%", false),
        ("%lZ", false),
    ];

    for &(format, expected) in cases {
        assert_eq!(
            sprintf_format_is_safe(format),
            expected,
            "sprintf_format_is_safe({:?})",
            format
        );
    }
}
|
||
|
||
/// Test helper: compiles `query_str` against the tree-sitter C grammar, runs
/// it over `tree`, and returns the node bound to capture index 0 in the first
/// match.
///
/// # Panics
///
/// Panics if the query does not compile, if it produces no match, or if the
/// first match carries no capture with index 0.
#[cfg(test)]
fn first_c_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_c::LANGUAGE);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut cursor = tree_sitter::QueryCursor::new();
    let mut hits = cursor.matches(&compiled, tree.root_node(), code);
    let first = hits.next().expect("query should match");

    // Locate the capture by position, then copy its node out of the match.
    let slot = first
        .captures
        .iter()
        .position(|capture| capture.index == 0)
        .expect("capture index 0");
    first.captures[slot].node
}
|
||
|
||
/// Test helper: compiles `query_str` against the tree-sitter C++ grammar,
/// runs it over `tree`, and returns the node bound to capture index 0 in the
/// first match.
///
/// # Panics
///
/// Panics if the query does not compile, if it produces no match, or if the
/// first match carries no capture with index 0.
#[cfg(test)]
fn first_cpp_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_cpp::LANGUAGE);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut cursor = tree_sitter::QueryCursor::new();
    let mut hits = cursor.matches(&compiled, tree.root_node(), code);
    let first = hits.next().expect("query should match");

    // Locate the capture by position, then copy its node out of the match.
    let slot = first
        .captures
        .iter()
        .position(|capture| capture.index == 0)
        .expect("capture index 0");
    first.captures[slot].node
}
|
||
|
||
/// Table-driven check of `cpp_cast_target_type_is_safe`: each entry pairs a
/// cast target type (as written in source) with the expected verdict,
/// evaluated in order.
#[test]
fn cpp_cast_target_type_is_safe_recognises_canonical_shapes() {
    use crate::ast::cpp_cast_target_type_is_safe as is_safe;

    let cases: &[(&str, bool)] = &[
        // Byte-pointer family: C++ explicitly permits byte-level access.
        ("char*", true),
        ("char *", true),
        ("const char*", true),
        ("const char *", true),
        ("unsigned char*", true),
        ("const unsigned char*", true),
        ("signed char*", true),
        ("uint8_t*", true),
        ("const uint8_t*", true),
        ("int8_t*", true),
        ("std::byte*", true),
        ("const std::byte*", true),
        ("byte*", true),
        ("wchar_t*", true),
        // `void*` is a well-defined target.
        ("void*", true),
        ("const void*", true),
        // Integer round-trip types are accepted only as a value cast
        // (pointer depth 0); the pointer forms are rejected further down.
        ("uintptr_t", true),
        ("std::uintptr_t", true),
        ("intptr_t", true),
        ("std::intptr_t", true),
        // BSD socket family: POSIX intentionally type-puns these.
        ("sockaddr*", true),
        ("struct sockaddr*", true),
        ("sockaddr_in*", true),
        ("sockaddr_in6*", true),
        ("sockaddr_un*", true),
        ("sockaddr_storage*", true),
        // Multi-token spellings with extra whitespace or trailing
        // qualifiers should be normalised before matching.
        ("const uint8_t *", true),
        ("uint8_t * const", true),
        ("const unsigned char *", true),
        // Pointer-to-pointer is outside the [basic.lval]/11 aliasing
        // exemption: reading a `char*` object through `char**` is a
        // strict-aliasing violation, and likewise for `void**`,
        // `uint8_t**`, and so on.
        ("char**", false),
        ("uint8_t**", false),
        ("void**", false),
        ("void **", false),
        // Pointers to the integer round-trip types are not safe either:
        // only the pointer<->integer value conversion is well-defined, not
        // aliasing through a pointer-to-`uintptr_t`.
        ("uintptr_t*", false),
        ("intptr_t*", false),
        ("std::uintptr_t*", false),
        // Arbitrary user types and other shapes must keep firing.
        ("MyStruct*", false),
        ("InstanceType*", false),
        ("DBImpl*", false),
        ("C*", false),
        ("CPP*", false),
        ("T*", false),
        ("secp256k1_keypair*", false),
        ("PIP_ADAPTER_ADDRESSES", false),
        ("std::vector<int>*", false),
        ("void(*)(int)", false),
        ("char[10]", false),
        // Non-pointer types are safe only for the round-trip integers, so
        // plain integers and bare byte types (no pointer) do not match.
        ("int", false),
        ("size_t", false),
        ("uint64_t", false),
        ("char", false),
        ("uint8_t", false),
    ];

    for &(target, expected) in cases {
        assert_eq!(
            is_safe(target),
            expected,
            "cpp_cast_target_type_is_safe({:?})",
            target
        );
    }
}
|
||
|
||
/// Exercises `is_cpp_cast_target_type_safe` on parsed C++ snippets: each
/// snippet contains one `reinterpret_cast<...>` call, and the test asserts
/// whether the finding is suppressed for that cast target type.
#[test]
fn cpp_reinterpret_cast_layer_e_recognises_byte_pointer_targets() {
    const RULE: &str = "cpp.memory.reinterpret_cast";

    let grammar = tree_sitter::Language::from(tree_sitter_cpp::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();

    let q = r#"(call_expression
        function: (template_function
            name: (identifier) @n (#eq? @n "reinterpret_cast")))
        @vuln"#;

    // Parses `src`, takes the node returned by `first_cpp_capture` for the
    // query above, and reports the predicate's verdict under `rule_id`.
    let mut suppressed = |rule_id: &str, src: &[u8]| -> bool {
        let tree = parser.parse(src, None).unwrap();
        let cap = first_cpp_capture(&tree, src, q);
        is_cpp_cast_target_type_safe(rule_id, cap, src)
    };

    // Byte-pointer target: the leveldb / serialization shape.
    assert!(
        suppressed(
            RULE,
            b"void f(int* p) { auto q = reinterpret_cast<uint8_t*>(p); (void)q; }\n",
        ),
        "reinterpret_cast<uint8_t*> must be suppressed (byte-pointer target)"
    );

    // Qualified, const-prefixed byte pointer.
    assert!(
        suppressed(
            RULE,
            b"#include <cstddef>\nvoid f(int* p) { auto q = reinterpret_cast<const std::byte*>(p); (void)q; }\n",
        ),
        "reinterpret_cast<const std::byte*> must be suppressed"
    );

    // Synthetic address cast to `void*`.
    assert!(
        suppressed(
            RULE,
            b"void* f() { return reinterpret_cast<void*>(0x08000000); }\n",
        ),
        "reinterpret_cast<void*> must be suppressed (synthetic address)"
    );

    // Pointer-to-integer round-trip.
    assert!(
        suppressed(
            RULE,
            b"#include <cstdint>\nuintptr_t f(int* p) { return reinterpret_cast<uintptr_t>(p); }\n",
        ),
        "reinterpret_cast<uintptr_t> must be suppressed (integer round-trip)"
    );

    // POSIX socket-API pun.
    assert!(
        suppressed(
            RULE,
            b"struct sockaddr_in { int x; };\nstruct sockaddr;\nvoid f(struct sockaddr_in* a) { auto* s = reinterpret_cast<sockaddr*>(a); (void)s; }\n",
        ),
        "reinterpret_cast<sockaddr*> must be suppressed (BSD socket pun)"
    );

    // Cast to an arbitrary struct pointer: strict-aliasing risk, so the
    // finding must survive.
    let my_struct_src: &[u8] = b"struct MyStruct { int a; };\nMyStruct* f(char* buf) { return reinterpret_cast<MyStruct*>(buf); }\n";
    assert!(
        !suppressed(RULE, my_struct_src),
        "reinterpret_cast<MyStruct*> must NOT be suppressed (genuine strict-aliasing risk)"
    );

    // The same snippet under a different rule id must not be suppressed.
    assert!(
        !suppressed("cpp.memory.const_cast", my_struct_src),
        "Layer E must only fire for cpp.memory.reinterpret_cast"
    );
}
|
||
|
||
/// Exercises `is_c_buffer_call_literal_safe` on parsed C snippets containing
/// a single `strcpy` / `strcat` / `sprintf` call, asserting for each whether
/// the buffer-overflow finding is suppressed.
#[test]
fn c_buffer_call_literal_safe_recognises_canonical_shapes() {
    let grammar = tree_sitter::Language::from(tree_sitter_c::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();

    let q_strcpy = r#"(call_expression function: (identifier) @id (#eq? @id "strcpy")) @vuln"#;
    let q_strcat = r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#;
    let q_sprintf = r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#;

    // Parses `src`, takes the node returned by `first_c_capture` for
    // `query`, and reports the predicate's verdict under `rule_id`.
    let mut suppressed = |rule_id: &str, query: &str, src: &[u8]| -> bool {
        let tree = parser.parse(src, None).unwrap();
        let cap = first_c_capture(&tree, src, query);
        is_c_buffer_call_literal_safe(rule_id, cap, src)
    };

    // strcpy(dst, "literal"): the postgres autoprewarm shape.
    assert!(
        suppressed(
            "c.memory.strcpy",
            q_strcpy,
            b"void f(char *d) { strcpy(d, \"pg_prewarm\"); }\n",
        ),
        "strcpy with string-literal source must be suppressed"
    );

    // strcpy(dst, cond ? "a" : "b"): ternary over string literals.
    assert!(
        suppressed(
            "c.memory.strcpy",
            q_strcpy,
            b"void f(char *s, int h) { strcpy(s, (h >= 12) ? \"p.m.\" : \"a.m.\"); }\n",
        ),
        "strcpy with ternary-of-literals source must be suppressed"
    );

    // strcpy(dst, cond ? P_M_STR : A_M_STR): the postgres formatting.c shape
    // with #define'd ALL_CAPS string-constant macros.
    assert!(
        suppressed(
            "c.memory.strcpy",
            q_strcpy,
            b"#define P_M_STR \"p.m.\"\n#define A_M_STR \"a.m.\"\nvoid f(char *s, int h) { strcpy(s, (h >= 12) ? P_M_STR : A_M_STR); }\n",
        ),
        "strcpy with ternary-of-ALL_CAPS-macros must be suppressed"
    );

    // strcpy(dst, cond ? var_a : var_b): lowercase variables are not a
    // recognisable macro shape, so the finding must survive.
    assert!(
        !suppressed(
            "c.memory.strcpy",
            q_strcpy,
            b"void f(char *s, int h, char *a, char *b) { strcpy(s, (h >= 12) ? a : b); }\n",
        ),
        "strcpy with ternary-of-lowercase-vars must NOT be suppressed"
    );

    // strcat(dst, "literal"): same principle as strcpy.
    assert!(
        suppressed(
            "c.memory.strcat",
            q_strcat,
            b"void f(char *d) { strcat(d, \" (done)\"); }\n",
        ),
        "strcat with string-literal source must be suppressed"
    );

    // sprintf(dst, "%lld%c", ...): numeric-only format string.
    assert!(
        suppressed(
            "c.memory.sprintf",
            q_sprintf,
            b"void f(char *cp, long long v, char u) { sprintf(cp, \"%lld%c\", v, u); }\n",
        ),
        "sprintf with numeric-only format must be suppressed"
    );

    // sprintf(str, " %.*s", n, x): precision-bounded `%s`.
    assert!(
        suppressed(
            "c.memory.sprintf",
            q_sprintf,
            b"void f(char *str, int n, const char *x) { sprintf(str, \" %.*s\", n, x); }\n",
        ),
        "sprintf with precision-bounded `%.*s` must be suppressed"
    );

    // strcpy(dst, src) with a non-literal source must keep firing.
    assert!(
        !suppressed(
            "c.memory.strcpy",
            q_strcpy,
            b"void f(char *d, char **a) { strcpy(d, a[1]); }\n",
        ),
        "strcpy with non-literal source must NOT be suppressed"
    );

    // sprintf with a bare `%s` must keep firing. `%` has no special meaning
    // in a Rust format string, so the message spells the conversion as a
    // plain `%s` (a C-style `%%` would be printed verbatim).
    assert!(
        !suppressed(
            "c.memory.sprintf",
            q_sprintf,
            b"void f(char *b, const char *u) { sprintf(b, \"%s\", u); }\n",
        ),
        "sprintf with bare `%s` must NOT be suppressed"
    );

    // sprintf whose format is a concatenated string involving a PRI* macro
    // is not a plain literal and must keep firing (the macro cannot be
    // expanded statically).
    assert!(
        !suppressed(
            "c.memory.sprintf",
            q_sprintf,
            b"void f(char *b, long long v) { sprintf(b, \"%\" PRId64, v); }\n",
        ),
        "sprintf with concatenated_string format must NOT be suppressed"
    );

    // A literal-source strcpy evaluated under an unrelated rule id must not
    // be suppressed.
    assert!(
        !suppressed(
            "c.memory.gets",
            q_strcpy,
            b"void f(char *d) { strcpy(d, \"x\"); }\n",
        ),
        "Layer D should only fire for buffer-overflow rule ids"
    );
}
|
||
|
||
/// Regression test: a Python f-string (a `string` node that has
/// `interpolation` children) must NOT be reported as a literal by
/// `is_literal_node`, while a plain string must still be.
///
/// Without this, Layer A's "all-args-literal → suppress Security finding"
/// shortcut would hide injections through `cursor.execute(f"…{x}…")` or
/// `text(f"…{x}…")`. Motivated by CVE-2025-69662 (geopandas SQLi via
/// `text(f"SELECT … '{geom_name}' …")`) and CVE-2025-24793
/// (snowflake-connector-python f-string-built CREATE STAGE / DROP).
#[test]
fn is_literal_node_rejects_python_fstring_with_interpolation() {
    let grammar = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();

    // Parses a one-line `x = <expr>` module, checks that the right-hand side
    // is a `string` node, and returns the verdict of `is_literal_node` for it.
    let mut rhs_is_literal = |src: &[u8]| -> bool {
        let tree = parser.parse(src, None).unwrap();
        let first_statement = tree.root_node().child(0);
        let assignment = first_statement
            .and_then(|statement| statement.child(0))
            .expect("assignment node");
        let rhs = assignment
            .child_by_field_name("right")
            .expect("RHS of assignment");
        assert_eq!(rhs.kind(), "string");
        is_literal_node(rhs, src)
    };

    // An f-string with one interpolation segment is not a literal.
    assert!(
        !rhs_is_literal(b"x = f\"SELECT * WHERE y = '{u}'\"\n"),
        "f-string with interpolation must not be classified as literal"
    );

    // A plain string stays a literal.
    assert!(
        rhs_is_literal(b"x = \"plain literal\"\n"),
        "plain string literal must be classified as literal"
    );
}
|
||
|
||
/// Test helper: compiles `query_str` against the tree-sitter Java grammar,
/// runs it over `tree`, and returns the node bound to capture index 0 in the
/// first match.
///
/// # Panics
///
/// Panics if the query does not compile, if it produces no match, or if the
/// first match carries no capture with index 0.
#[cfg(test)]
fn first_java_capture<'tree>(
    tree: &'tree tree_sitter::Tree,
    code: &[u8],
    query_str: &str,
) -> tree_sitter::Node<'tree> {
    use tree_sitter::StreamingIterator;

    let grammar = tree_sitter::Language::from(tree_sitter_java::LANGUAGE);
    let compiled = tree_sitter::Query::new(&grammar, query_str).expect("query compiles");
    let mut cursor = tree_sitter::QueryCursor::new();
    let mut hits = cursor.matches(&compiled, tree.root_node(), code);
    let first = hits.next().expect("query should match");

    // Locate the capture by position, then copy its node out of the match.
    let slot = first
        .captures
        .iter()
        .position(|capture| capture.index == 0)
        .expect("capture index 0");
    first.captures[slot].node
}
|
||
|
||
/// Exercises `is_call_all_args_literal` on parsed Java snippets, covering
/// both `method_invocation` and `object_creation_expression` call shapes.
#[test]
fn is_call_all_args_literal_recognises_java_call_kinds() {
    let grammar = tree_sitter::Language::from(tree_sitter_java::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();

    let q_for_name = r#"(method_invocation
        object: (identifier) @c (#eq? @c "Class")
        name: (identifier) @id (#eq? @id "forName"))
        @vuln"#;
    let q_new = r#"(object_creation_expression) @vuln"#;

    // Parses `src`, takes the node returned by `first_java_capture` for
    // `query`, and reports whether Layer A treats the call as all-literal.
    let mut all_literal = |query: &str, src: &[u8]| -> bool {
        let tree = parser.parse(src, None).unwrap();
        let cap = first_java_capture(&tree, src, query);
        is_call_all_args_literal(cap, src, "java")
    };

    // Method call with a string-literal argument: suppress.
    assert!(
        all_literal(
            q_for_name,
            b"class T { void f() throws Exception { Class.forName(\"com.foo.Bar\"); } }",
        ),
        "method_invocation with literal arg must trigger Layer A suppression"
    );

    // Method call whose argument is a class constant: suppress as well,
    // expected to be resolved by the file-level scalar-binding lookup
    // (session 0014/0015).
    assert!(
        all_literal(
            q_for_name,
            b"class T {\n private static final String D = \"com.foo.Bar\";\n void f() throws Exception { Class.forName(D); }\n}",
        ),
        "method_invocation with class-const arg must trigger Layer A suppression"
    );

    // Method call whose argument is a method parameter: keep firing.
    assert!(
        !all_literal(
            q_for_name,
            b"class T { void f(String s) throws Exception { Class.forName(s); } }",
        ),
        "method_invocation with non-literal arg must NOT trigger Layer A suppression"
    );

    // Constructor call with an empty argument list (the `new Yaml()` shape):
    // an empty list does not count as "all args are literal", since an
    // arg-less call can still carry side-effect risk via the constructor.
    assert!(
        !all_literal(q_new, b"class T { Object f() { return new Object(); } }"),
        "object_creation_expression with empty args must NOT trigger Layer A"
    );

    // Constructor call with a string-literal argument: suppress.
    assert!(
        all_literal(
            q_new,
            b"class T { Object f() { return new String(\"literal\"); } }",
        ),
        "object_creation_expression with literal arg must trigger Layer A"
    );
}
|