This commit is contained in:
Eli Peter 2026-06-05 10:16:30 -05:00 committed by GitHub
parent 55247b7fcd
commit 991c84a1eb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
1464 changed files with 225448 additions and 1985 deletions

View file

@ -102,6 +102,7 @@ fn parse_timeout_diag(path: &Path, timeout_ms: u64) -> Diag {
rollup: None,
finding_id: String::new(),
alternative_finding_ids: Vec::new(),
stable_hash: 0,
}
}
@ -234,10 +235,17 @@ fn build_taint_diag(
.map(sanitize_desc)
})
.unwrap_or_else(|| "(unknown)".into());
// Sink-callee attribution: when the sink node is an *argument* of a call
// (e.g. PHP `header("location: " . $_GET['x'])` — the `$_GET[...]` subscript
// carries `callee = "$_GET"` but `outer_callee = "header"`), the enclosing
// call is the real sink and should be displayed, not the source token.
// `outer_callee` is only populated for nested/argument positions, so for a
// plain call node it is None and we fall back to the node's own callee.
let call_site_callee = cfg_graph[finding.sink]
.call
.callee
.outer_callee
.as_deref()
.or(cfg_graph[finding.sink].call.callee.as_deref())
.map(sanitize_desc)
.unwrap_or_else(|| "(unknown)".into());
let kind_label = source_kind_label(finding.source_kind);
@ -706,6 +714,7 @@ fn build_taint_diag(
rollup: None,
finding_id: finding.finding_id.clone(),
alternative_finding_ids: finding.alternative_finding_ids.to_vec(),
stable_hash: 0,
};
// Post-fill explanation and confidence limiters
@ -779,6 +788,35 @@ fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> {
}
}
/// Table of every language the scanner can parse: each entry pairs a language
/// slug with the file extensions (written without a leading dot) that map to
/// it. This is the single source of truth shared with [`lang_for_path`]; the
/// `supported_extensions_resolve_to_their_slug` test asserts the two stay in
/// sync.
pub(crate) const SUPPORTED_LANGUAGE_EXTENSIONS: &[(&str, &[&str])] = &[
    ("rust", &["rs"]),
    ("c", &["c"]),
    (
        "cpp",
        &["cpp", "cc", "cxx", "c++", "hpp", "hxx", "hh", "h++"],
    ),
    ("java", &["java"]),
    ("go", &["go"]),
    ("php", &["php"]),
    ("python", &["py"]),
    ("typescript", &["ts", "tsx"]),
    ("javascript", &["js", "jsx"]),
    ("ruby", &["rb"]),
];

/// Look up the file extensions registered for a language slug.
///
/// `slug` is compared ASCII-case-insensitively against the entries of
/// [`SUPPORTED_LANGUAGE_EXTENSIONS`], in table order; the first matching
/// entry's extension list is returned. A slug that matches no entry yields an
/// empty slice.
pub fn extensions_for_lang(slug: &str) -> &'static [&'static str] {
    for &(name, exts) in SUPPORTED_LANGUAGE_EXTENSIONS {
        if name.eq_ignore_ascii_case(slug) {
            return exts;
        }
    }
    // No table entry carries this slug.
    &[]
}
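/// Fast binary-file guard: skip if NUL bytes make up at least 2% of the input
/// (the percentage is computed with truncating integer division and must
/// exceed 1). Empty input is never treated as binary.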
fn is_binary(bytes: &[u8]) -> bool {
bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1
@ -965,9 +1003,11 @@ fn is_test_suppressible_pattern(id: &str) -> bool {
// deterministic test data, insecure RNG used for fixture seeding.
id.ends_with(".secrets.hardcoded_secret")
|| id.ends_with(".secrets.hardcoded_key")
|| id.ends_with(".crypto.hardcoded_key")
|| id.ends_with(".crypto.math_random")
|| id.ends_with(".crypto.insecure_random")
|| id.ends_with(".crypto.weak_digest")
|| id.ends_with(".crypto.weak_algorithm")
|| id.ends_with(".crypto.md5")
|| id.ends_with(".crypto.sha1")
|| id.ends_with(".crypto.rand")
@ -1041,9 +1081,7 @@ fn downgrade_severity(s: Severity) -> Severity {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// ParsedSource + ParsedFile: shared parse/CFG pipeline
// ─────────────────────────────────────────────────────────────────────────────
/// Level 1: parsed tree + lang info. No CFG construction.
struct ParsedSource<'a> {
@ -1363,6 +1401,7 @@ impl<'a> ParsedSource<'a> {
rollup: None,
finding_id: String::new(),
alternative_finding_ids: Vec::new(),
stable_hash: 0,
});
}
}
@ -1890,7 +1929,6 @@ impl<'a> ParsedFile<'a> {
cfg: &body.graph,
entry: body.entry,
lang: caller_lang,
file_path: &self.source.file_path_str,
source_bytes: self.source.bytes,
func_summaries: self.local_summaries(),
global_summaries,
@ -1950,13 +1988,35 @@ impl<'a> ParsedFile<'a> {
cfg_analysis::Confidence::Medium => crate::evidence::Confidence::Medium,
cfg_analysis::Confidence::Low => crate::evidence::Confidence::Low,
});
// Carry the sink node's resolved Sink caps onto the structural
// finding's evidence so downstream cap-classification (and the
// eval `cap_of`) buckets `cfg-unguarded-sink` under its real cap
// (sqli/cmdi/ssrf/…) instead of the catch-all `other`. Without
// this every taint-less structural sink finding fell through to
// `other`, hiding real recall (e.g. dvpwa `cur.execute` SQLi)
// and inflating the `other` bucket. Non-sink structural findings
// (resource-leak, auth-gap) carry no Sink label, so this is 0.
let cf_sink_caps: u32 = cf
.evidence
.first()
.map(|&n| {
cfg_ctx.cfg[n].taint.labels.iter().fold(0u32, |acc, l| {
if let crate::labels::DataLabel::Sink(c) = l {
acc | c.bits()
} else {
acc
}
})
})
.unwrap_or(0);
let cf_category = FindingCategory::for_structural_rule(&cf.rule_id);
out.push(Diag {
path: self.source.path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cf.severity,
id: cf.rule_id,
category: FindingCategory::Security,
category: cf_category,
path_validated: false,
guard_kind: None,
message: Some(cf.message),
@ -1971,6 +2031,7 @@ impl<'a> ParsedFile<'a> {
kind: "sink".into(),
snippet: None,
}),
sink_caps: cf_sink_caps,
guards: vec![],
sanitizers: vec![],
state: None,
@ -1984,6 +2045,7 @@ impl<'a> ParsedFile<'a> {
rollup: None,
finding_id: String::new(),
alternative_finding_ids: Vec::new(),
stable_hash: 0,
});
}
} // end for body in bodies (CFG structural analyses)
@ -2031,7 +2093,7 @@ impl<'a> ParsedFile<'a> {
col: point.column + 1,
severity: sf.severity,
id: sf.rule_id.clone(),
category: FindingCategory::Security,
category: FindingCategory::for_structural_rule(&sf.rule_id),
path_validated: false,
guard_kind: None,
message: Some(sf.message.clone()),
@ -2064,6 +2126,7 @@ impl<'a> ParsedFile<'a> {
rollup: None,
finding_id: String::new(),
alternative_finding_ids: Vec::new(),
stable_hash: 0,
});
}
@ -2157,9 +2220,7 @@ impl<'a> ParsedFile<'a> {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Pass 1: Extract function summaries (no taint analysis)
// ─────────────────────────────────────────────────────────────────────────────
/// Extract function summaries from pre-read bytes.
///
@ -2305,7 +2366,10 @@ pub fn perf_stage_breakdown_fused(
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &taint_diags);
let _filtered: Vec<_> = ast_findings
.into_iter()
.filter(|d| !suppression.should_suppress(&d.id, d.line))
.filter(|d| {
!suppression.should_suppress(&d.id, d.line)
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
})
.collect();
let t_suppr = s_suppr.elapsed().as_micros();
@ -2449,9 +2513,7 @@ pub fn extract_all_summaries_from_bytes(
))
}
// ─────────────────────────────────────────────────────────────────────────────
// Constant-argument suppression helper
// ─────────────────────────────────────────────────────────────────────────────
/// Returns `true` when the captured call node has only literal arguments
/// (string, number, boolean, null/nil/none), or identifier arguments that
@ -5351,9 +5413,7 @@ fn has_interpolation(node: tree_sitter::Node) -> bool {
false
}
// ─────────────────────────────────────────────────────────────────────────────
// Layer B: AST pattern suppression when taint confirms safety
// ─────────────────────────────────────────────────────────────────────────────
/// Map the second segment of a pattern ID (e.g. "cmdi" from "py.cmdi.os_system")
/// to the `Cap` that taint analysis models. Returns `None` for categories taint
@ -5425,6 +5485,14 @@ struct TaintSuppressionCtx {
/// 11 inline analysis but the sink's enclosing scope has no
/// labelled Sanitizer of its own.
interproc_sanitizer_callers: HashSet<Option<String>>,
/// Union of resolved sink-cap bits for cap-specific taint findings at
/// each line. Used by [`Self::is_redundant_ast_pattern`] to drop an
/// AST-pattern finding only when the flow engine already emitted a
/// specific rule id for the same vulnerability class. Legacy generic
/// findings (`taint-unsanitised-flow`, `cfg-unguarded-sink`) are not
/// canonical enough to subsume language-specific AST rule IDs such as
/// `py.cmdi.subprocess_shell` or `c.cmdi.system`.
specific_taint_finding_caps_by_line: HashMap<usize, u32>,
}
impl TaintSuppressionCtx {
@ -5623,6 +5691,26 @@ impl TaintSuppressionCtx {
.map(|d| d.line)
.collect();
// Cap bits per line for cap-specific flow-backed findings only, so a
// redundant AST pattern at the same line+cap can be dropped in favour
// of the richer flow. Do not count legacy generic findings here:
// `taint-unsanitised-flow` and `cfg-unguarded-sink` carry evidence,
// but their rule ids are deliberately catch-alls, while AST `cmdi`,
// `sqli`, etc. IDs are the canonical namespace many tests, SARIF
// consumers, and dynamic-verification spec derivation rely on.
let mut specific_taint_finding_caps_by_line: HashMap<usize, u32> = HashMap::new();
for d in taint_diags {
if d.id.starts_with("taint-") && !d.id.starts_with("taint-unsanitised-flow") {
if let Some(caps) = d.evidence.as_ref().map(|e| e.sink_caps) {
if caps != 0 {
*specific_taint_finding_caps_by_line
.entry(d.line)
.or_default() |= caps;
}
}
}
}
// Per-function partition of taint findings. Maps each finding's
// line to the enclosing function scope by reusing
// `sink_func_at_line` (the same span/function mapping the Sink-side
@ -5646,9 +5734,30 @@ impl TaintSuppressionCtx {
engine_validated_funcs,
source_killed_funcs,
interproc_sanitizer_callers,
specific_taint_finding_caps_by_line,
}
}
/// Reports whether an AST-pattern finding merely duplicates a cap-specific
/// taint finding already recorded for the same line.
///
/// The pattern id is first mapped to the `Cap` it stands for via
/// `pattern_category_cap`; ids with no mapped cap are never redundant. The
/// cap is then tested against the union of sink-cap bits stored for `line`
/// in `specific_taint_finding_caps_by_line`. Only a shared bit counts as a
/// duplicate, so a pattern whose cap differs from the co-located flow's cap
/// (a genuinely distinct sink) is preserved, as is any pattern on a line with
/// no recorded caps.
///
/// This complements [`Self::should_suppress`]: that check drops a pattern
/// when taint found the site safe, whereas this one drops it when a same-cap
/// flow finding already reports the site as unsafe with richer evidence.
fn is_redundant_ast_pattern(&self, pattern_id: &str, line: usize) -> bool {
    // Patterns outside the cap taxonomy cannot be subsumed by a flow finding.
    let cap_bits = match pattern_category_cap(pattern_id) {
        Some(cap) => cap.bits(),
        None => return false,
    };
    match self.specific_taint_finding_caps_by_line.get(&line) {
        // Redundant only when the flow at this line covers the same cap.
        Some(&flow_caps) => flow_caps & cap_bits != 0,
        None => false,
    }
}
/// Returns `true` if this AST pattern finding should be suppressed.
fn should_suppress(&self, pattern_id: &str, line: usize) -> bool {
// Condition 1: pattern category maps to a Cap taint models
@ -5734,9 +5843,7 @@ impl TaintSuppressionCtx {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Pass 2 / singlefile: Full rule execution (AST queries + taint)
// ─────────────────────────────────────────────────────────────────────────────
/// Run all enabled analyses on pre-read bytes and return diagnostics.
///
@ -5779,11 +5886,10 @@ pub fn run_rules_on_bytes(
let suppression =
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out);
let ast_findings = parsed.source.run_ast_queries(cfg);
out.extend(
ast_findings
.into_iter()
.filter(|d| !suppression.should_suppress(&d.id, d.line)),
);
out.extend(ast_findings.into_iter().filter(|d| {
!suppression.should_suppress(&d.id, d.line)
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
}));
}
if cfg.scanner.mode == AnalysisMode::Full {
out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root));
@ -5812,9 +5918,7 @@ pub fn run_rules_on_file(
run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root)
}
// ─────────────────────────────────────────────────────────────────────────────
// Fused single-pass: extract summaries + run full analysis in one parse/CFG
// ─────────────────────────────────────────────────────────────────────────────
/// Result of a fused analysis pass: both function summaries and diagnostics.
pub struct FusedResult {
@ -5979,11 +6083,10 @@ pub fn analyse_file_fused(
if needs_cfg && cfg.scanner.mode == AnalysisMode::Full {
let suppression =
TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out);
out.extend(
ast_findings
.into_iter()
.filter(|d| !suppression.should_suppress(&d.id, d.line)),
);
out.extend(ast_findings.into_iter().filter(|d| {
!suppression.should_suppress(&d.id, d.line)
&& !suppression.is_redundant_ast_pattern(&d.id, d.line)
}));
} else {
out.extend(ast_findings);
}
@ -6086,9 +6189,7 @@ pub fn analyse_file_fused(
})
}
// ─────────────────────────────────────────────────────────────────────────────
// Text-based pattern scanning (non-tree-sitter files)
// ─────────────────────────────────────────────────────────────────────────────
/// Run text-based pattern scanners on files whose extension is not supported
/// by tree-sitter. Currently handles `.ejs` templates.