* chore: Exclude CLAUDE.md from Cargo.toml

* feat: add callgraph module and integrate into main analysis flow

* feat: enhance CLI with new severity filtering and analysis modes

* feat: update CHANGELOG with recent enhancements and fixes to severity filtering and output handling

* feat: implement state-model dataflow analysis for resource lifecycle and auth state

* feat: enhance diagnostic output formatting and add evidence structure

* feat: implement attack surface ranking for diagnostics with scoring and sorting

* feat: add comprehensive documentation for installation, usage, and rules reference

* feat: add multiple language support for command execution and evaluation endpoints

* feat: implement inline suppression for findings using `nyx:ignore` comments

* feat: add confidence levels to AST patterns and update output structure

* feat: implement low-noise prioritization system with category filtering, rollup grouping, and configurable budgets

* feat: bump version to 0.4.0 and update changelog with new features and improvements

* feat: add dead code allowances to various functions in mod.rs and real_world_tests.rs
This commit is contained in:
Eli Peter 2026-02-25 21:16:36 -05:00 committed by GitHub
parent 19b578c5c4
commit 1bbe4b1cfb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
456 changed files with 25628 additions and 1228 deletions

View file

@ -2,8 +2,10 @@ use crate::cfg::{build_cfg, export_summaries};
use crate::cfg_analysis;
use crate::commands::scan::Diag;
use crate::errors::{NyxError, NyxResult};
use crate::evidence::{Evidence, SpanEvidence, StateEvidence};
use crate::labels::{build_lang_rules, severity_for_source_kind};
use crate::patterns::Severity;
use crate::patterns::{FindingCategory, Severity};
use crate::state;
use crate::summary::{FuncSummary, GlobalSummaries};
use crate::symbol::{Lang, normalize_namespace};
use crate::taint::analyse_file;
@ -92,6 +94,23 @@ fn is_nonprod_path(path: &Path) -> bool {
false
}
/// Normalize a callee description for display.
///
/// Thin wrapper: forwards `s` to `crate::fmt::normalize_snippet` and returns
/// its result unchanged. The actual normalization rules live in that helper.
fn sanitize_desc(s: &str) -> String {
crate::fmt::normalize_snippet(s)
}
/// Short human-readable phrase describing where tainted data came from.
///
/// Each `SourceKind` variant maps to one fixed, static label; `Unknown`
/// falls back to the generic phrase "tainted data".
fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str {
    use crate::labels::SourceKind as Kind;

    match sk {
        Kind::UserInput => "user input",
        Kind::EnvironmentConfig => "environment config",
        Kind::FileSystem => "file system data",
        Kind::Database => "database result",
        Kind::Unknown => "tainted data",
    }
}
/// Downgrade severity by one tier: High→Medium, Medium→Low, Low→Low.
fn downgrade_severity(s: Severity) -> Severity {
match s {
@ -239,8 +258,45 @@ pub fn run_rules_on_bytes(
let source_byte = cfg_graph[finding.source].span.0;
let source_point = byte_offset_to_point(&_tree, source_byte);
let source_callee = cfg_graph[finding.source]
.callee
.as_deref()
.map(sanitize_desc)
.unwrap_or_else(|| "(unknown)".into());
let sink_callee = cfg_graph[finding.sink]
.callee
.as_deref()
.map(sanitize_desc)
.unwrap_or_else(|| "(unknown)".into());
let kind_label = source_kind_label(finding.source_kind);
let short_source = crate::fmt::shorten_callee(&source_callee);
let short_sink = crate::fmt::shorten_callee(&sink_callee);
let mut labels = vec![
(
"Source".into(),
format!(
"{source_callee} ({}:{})",
source_point.row + 1,
source_point.column + 1
),
),
("Sink".into(), sink_callee.to_string()),
];
if let Some(guard) = finding.guard_kind {
labels.push(("Path guard".into(), format!("{guard:?}")));
}
let file_path_owned = path.to_string_lossy().into_owned();
let mut evidence_notes = Vec::new();
if finding.path_validated {
evidence_notes.push("path_validated".into());
}
evidence_notes.push(format!("source_kind:{:?}", finding.source_kind));
out.push(Diag {
path: path.to_string_lossy().into_owned(),
path: file_path_owned.clone(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: severity_for_source_kind(finding.source_kind),
@ -249,6 +305,50 @@ pub fn run_rules_on_bytes(
source_point.row + 1,
source_point.column + 1
),
category: FindingCategory::Security,
path_validated: finding.path_validated,
guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
message: Some(format!(
"unsanitised {kind_label} flows from {short_source} \u{2192} {short_sink}"
)),
labels,
confidence: None,
evidence: Some(Evidence {
source: Some(SpanEvidence {
path: file_path_owned.clone(),
line: (source_point.row + 1) as u32,
col: (source_point.column + 1) as u32,
kind: "source".into(),
snippet: Some(short_source.clone()),
}),
sink: Some(SpanEvidence {
path: file_path_owned,
line: (sink_point.row + 1) as u32,
col: (sink_point.column + 1) as u32,
kind: "sink".into(),
snippet: Some(short_sink.clone()),
}),
guards: finding
.guard_kind
.map(|g| {
vec![SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (sink_point.row + 1) as u32,
col: 0,
kind: "guard".into(),
snippet: Some(format!("{g:?}")),
}]
})
.unwrap_or_default(),
sanitizers: vec![],
state: None,
notes: evidence_notes,
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
@ -268,14 +368,111 @@ pub fn run_rules_on_bytes(
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&_tree, cf.span.0);
let cfg_confidence = Some(match cf.confidence {
cfg_analysis::Confidence::High => crate::evidence::Confidence::High,
cfg_analysis::Confidence::Medium => crate::evidence::Confidence::Medium,
cfg_analysis::Confidence::Low => crate::evidence::Confidence::Low,
});
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cf.severity,
id: cf.rule_id,
category: FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: Some(cf.message),
labels: vec![],
confidence: cfg_confidence,
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
// ── State-model dataflow analysis ────────────────────────────────
if cfg.scanner.enable_state_analysis {
let state_findings = state::run_state_analysis(
&cfg_graph,
entry,
caller_lang,
bytes,
&summaries,
global_summaries,
);
// Collect state finding lines to dedup overlapping CFG findings.
let state_lines: std::collections::HashSet<usize> = state_findings
.iter()
.map(|sf| byte_offset_to_point(&_tree, sf.span.0).row + 1)
.collect();
for sf in &state_findings {
let point = byte_offset_to_point(&_tree, sf.span.0);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: sf.severity,
id: sf.rule_id.clone(),
category: FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: Some(sf.message.clone()),
labels: vec![],
confidence: None,
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: Some(StateEvidence {
machine: sf.machine.into(),
subject: sf.subject.clone(),
from_state: sf.from_state.into(),
to_state: sf.to_state.into(),
}),
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
// Suppress cfg-resource-leak / cfg-auth-gap when state analysis
// already covers the same line (state analysis is more precise).
if !state_findings.is_empty() {
out.retain(|d| {
!((d.id == "cfg-resource-leak" || d.id == "cfg-auth-gap")
&& state_lines.contains(&d.line))
});
}
}
}
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
@ -285,7 +482,7 @@ pub fn run_rules_on_bytes(
let mut cursor = QueryCursor::new();
for cq in compiled.iter() {
if cfg.scanner.min_severity <= cq.meta.severity {
if cq.meta.severity > cfg.scanner.min_severity {
continue;
}
let mut matches = cursor.matches(&cq.query, root, bytes);
@ -298,6 +495,31 @@ pub fn run_rules_on_bytes(
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
category: cq.meta.category.finding_category(),
path_validated: false,
guard_kind: None,
message: Some(cq.meta.description.to_owned()),
labels: vec![],
confidence: Some(cq.meta.confidence),
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
}
@ -427,8 +649,45 @@ pub fn analyse_file_fused(
let source_byte = cfg_graph[finding.source].span.0;
let source_point = byte_offset_to_point(&tree, source_byte);
let source_callee = cfg_graph[finding.source]
.callee
.as_deref()
.map(sanitize_desc)
.unwrap_or_else(|| "(unknown)".into());
let sink_callee = cfg_graph[finding.sink]
.callee
.as_deref()
.map(sanitize_desc)
.unwrap_or_else(|| "(unknown)".into());
let kind_label = source_kind_label(finding.source_kind);
let short_source = crate::fmt::shorten_callee(&source_callee);
let short_sink = crate::fmt::shorten_callee(&sink_callee);
let mut labels = vec![
(
"Source".into(),
format!(
"{source_callee} ({}:{})",
source_point.row + 1,
source_point.column + 1
),
),
("Sink".into(), sink_callee.to_string()),
];
if let Some(guard) = finding.guard_kind {
labels.push(("Path guard".into(), format!("{guard:?}")));
}
let fused_file_path = path.to_string_lossy().into_owned();
let mut fused_evidence_notes = Vec::new();
if finding.path_validated {
fused_evidence_notes.push("path_validated".into());
}
fused_evidence_notes.push(format!("source_kind:{:?}", finding.source_kind));
out.push(Diag {
path: path.to_string_lossy().into_owned(),
path: fused_file_path.clone(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: severity_for_source_kind(finding.source_kind),
@ -437,6 +696,50 @@ pub fn analyse_file_fused(
source_point.row + 1,
source_point.column + 1
),
category: FindingCategory::Security,
path_validated: finding.path_validated,
guard_kind: finding.guard_kind.map(|k| format!("{k:?}")),
message: Some(format!(
"unsanitised {kind_label} flows from {short_source} \u{2192} {short_sink}"
)),
labels,
confidence: None,
evidence: Some(Evidence {
source: Some(SpanEvidence {
path: fused_file_path.clone(),
line: (source_point.row + 1) as u32,
col: (source_point.column + 1) as u32,
kind: "source".into(),
snippet: Some(short_source.clone()),
}),
sink: Some(SpanEvidence {
path: fused_file_path.clone(),
line: (sink_point.row + 1) as u32,
col: (sink_point.column + 1) as u32,
kind: "sink".into(),
snippet: Some(short_sink.clone()),
}),
guards: finding
.guard_kind
.map(|g| {
vec![SpanEvidence {
path: fused_file_path,
line: (sink_point.row + 1) as u32,
col: 0,
kind: "guard".into(),
snippet: Some(format!("{g:?}")),
}]
})
.unwrap_or_default(),
sanitizers: vec![],
state: None,
notes: fused_evidence_notes,
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
@ -455,14 +758,108 @@ pub fn analyse_file_fused(
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&tree, cf.span.0);
let fused_cfg_confidence = Some(match cf.confidence {
cfg_analysis::Confidence::High => crate::evidence::Confidence::High,
cfg_analysis::Confidence::Medium => crate::evidence::Confidence::Medium,
cfg_analysis::Confidence::Low => crate::evidence::Confidence::Low,
});
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cf.severity,
id: cf.rule_id,
category: FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: Some(cf.message),
labels: vec![],
confidence: fused_cfg_confidence,
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
// ── State-model dataflow analysis ────────────────────────────────
if cfg.scanner.enable_state_analysis {
let state_findings = state::run_state_analysis(
&cfg_graph,
entry,
caller_lang,
bytes,
&local_summaries,
global_summaries,
);
let state_lines: std::collections::HashSet<usize> = state_findings
.iter()
.map(|sf| byte_offset_to_point(&tree, sf.span.0).row + 1)
.collect();
for sf in &state_findings {
let point = byte_offset_to_point(&tree, sf.span.0);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: sf.severity,
id: sf.rule_id.clone(),
category: FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: Some(sf.message.clone()),
labels: vec![],
confidence: None,
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: Some(StateEvidence {
machine: sf.machine.into(),
subject: sf.subject.clone(),
from_state: sf.from_state.into(),
to_state: sf.to_state.into(),
}),
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
if !state_findings.is_empty() {
out.retain(|d| {
!((d.id == "cfg-resource-leak" || d.id == "cfg-auth-gap")
&& state_lines.contains(&d.line))
});
}
}
}
// AST pattern queries
@ -472,7 +869,7 @@ pub fn analyse_file_fused(
let mut cursor = QueryCursor::new();
for cq in compiled.iter() {
if cfg.scanner.min_severity <= cq.meta.severity {
if cq.meta.severity > cfg.scanner.min_severity {
continue;
}
let mut matches = cursor.matches(&cq.query, root, bytes);
@ -485,6 +882,31 @@ pub fn analyse_file_fused(
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
category: cq.meta.category.finding_category(),
path_validated: false,
guard_kind: None,
message: Some(cq.meta.description.to_owned()),
labels: vec![],
confidence: Some(cq.meta.confidence),
evidence: Some(Evidence {
source: None,
sink: Some(SpanEvidence {
path: path.to_string_lossy().into_owned(),
line: (point.row + 1) as u32,
col: (point.column + 1) as u32,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![],
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
});
}
}

599
src/callgraph.rs Normal file
View file

@ -0,0 +1,599 @@
use crate::interop::InteropEdge;
use crate::summary::{CalleeResolution, GlobalSummaries};
use crate::symbol::FuncKey;
use petgraph::graph::NodeIndex;
use petgraph::prelude::*;
use std::collections::HashMap;
// ─────────────────────────────────────────────────────────────────────────────
// Types
// ─────────────────────────────────────────────────────────────────────────────
/// Metadata attached to each call-graph edge.
///
/// One `CallEdge` is created per callee entry that `build_call_graph`
/// manages to resolve to a target node.
#[derive(Debug, Clone)]
pub struct CallEdge {
/// The raw callee string as it appeared in source (e.g. `"env::var"`).
/// Preserved for diagnostics — **not** the normalized form used for resolution.
#[allow(dead_code)] // used for future diagnostics and path display
pub call_site: String,
}
/// A callee that could not be resolved to any known function definition.
///
/// Recorded by `build_call_graph` when same-language resolution reports
/// "not found" and no interop edge produces a target node either.
#[derive(Debug, Clone)]
#[allow(dead_code)] // fields used for future diagnostics reporting
pub struct UnresolvedCallee {
/// Key of the function whose summary listed the callee.
pub caller: FuncKey,
/// The raw (un-normalized) callee string taken from the caller's summary.
pub callee_name: String,
}
/// A callee that matched multiple function definitions — ambiguous.
///
/// No edge is added to the graph for an ambiguous callee; it is only recorded.
#[derive(Debug, Clone)]
#[allow(dead_code)] // fields used for future diagnostics reporting
pub struct AmbiguousCallee {
/// Key of the function whose summary listed the callee.
pub caller: FuncKey,
/// The raw (un-normalized) callee string taken from the caller's summary.
pub callee_name: String,
/// The candidate definitions reported by the resolver.
pub candidates: Vec<FuncKey>,
}
/// The whole-program call graph.
///
/// Nodes are [`FuncKey`]s (one per function definition across all files).
/// Edges represent call-site relationships resolved after pass 1.
///
/// Produced by [`build_call_graph`]. Callees that did not become an edge are
/// kept in the two `unresolved_*` lists instead.
pub struct CallGraph {
/// Directed graph; each edge points from the calling function to the callee.
pub graph: DiGraph<FuncKey, CallEdge>,
/// `FuncKey → NodeIndex` for quick lookup.
#[allow(dead_code)] // used for future topo-ordered analysis and call-graph queries
pub index: HashMap<FuncKey, NodeIndex>,
/// Callee strings that could not be resolved to any [`FuncKey`].
pub unresolved_not_found: Vec<UnresolvedCallee>,
/// Callee strings that matched multiple candidates.
pub unresolved_ambiguous: Vec<AmbiguousCallee>,
}
/// Result of SCC / topological analysis on the call graph.
///
/// Produced by [`analyse`].
pub struct CallGraphAnalysis {
/// Strongly connected components, in the order returned by
/// `petgraph::algo::tarjan_scc`.
pub sccs: Vec<Vec<NodeIndex>>,
/// Maps each `NodeIndex` to its SCC index in [`Self::sccs`].
#[allow(dead_code)] // used for future topo-ordered taint propagation
pub node_to_scc: HashMap<NodeIndex, usize>,
/// SCC indices in **callee-first** (leaves-first) order.
///
/// Functions with no callees appear first; callers appear later.
/// Suitable for bottom-up taint propagation.
#[allow(dead_code)] // used for future topo-ordered taint propagation
pub topo_scc_callee_first: Vec<usize>,
}
// ─────────────────────────────────────────────────────────────────────────────
// Callee-name normalization
// ─────────────────────────────────────────────────────────────────────────────
/// Reduce a qualified callee name to its final segment, which is the key
/// used for resolution.
///
/// Everything up to and including the last `::` is dropped first; from what
/// remains, everything up to and including the last `.` is dropped.
///
/// ```text
/// "env::var"               → "var"
/// "std::process::Command"  → "Command"
/// "obj.method"             → "method"
/// "pkg.mod.func"           → "func"
/// "foo"                    → "foo"   (unchanged)
/// ""                       → ""      (edge case)
/// ```
///
/// The returned slice borrows from `raw`. The original raw text is kept on
/// [`CallEdge::call_site`] for diagnostics; this function only produces the
/// lookup key.
pub(crate) fn normalize_callee_name(raw: &str) -> &str {
    // Rust-style qualification: keep what follows the last "::".
    let unqualified = match raw.rfind("::") {
        Some(sep) => &raw[sep + "::".len()..],
        None => raw,
    };
    // Method calls and Python/JS dotted paths: keep what follows the last ".".
    match unqualified.rfind('.') {
        Some(dot) => &unqualified[dot + 1..],
        None => unqualified,
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Call-graph construction
// ─────────────────────────────────────────────────────────────────────────────
/// Build the whole-program call graph from merged summaries.
///
/// Resolution mirrors `GlobalSummaries::resolve_callee_key`:
/// 1. Normalize callee name (last segment after `::` or `.`)
/// 2. Same-language, arity-filtered, namespace-disambiguated lookup
/// 3. Interop edges (explicit cross-language bridges)
///
/// Unresolved and ambiguous callees are recorded for diagnostics but
/// do **not** create edges.
pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdge]) -> CallGraph {
let mut graph = DiGraph::new();
let mut index = HashMap::new();
// 1. Create one node per FuncKey.
for (key, _) in summaries.iter() {
let idx = graph.add_node(key.clone());
index.insert(key.clone(), idx);
}
let mut unresolved_not_found = Vec::new();
let mut unresolved_ambiguous = Vec::new();
// 2. Resolve callees and add edges.
for (caller_key, summary) in summaries.iter() {
let caller_node = index[caller_key];
for raw_callee in &summary.callees {
let normalized = normalize_callee_name(raw_callee);
match summaries.resolve_callee_key(
normalized,
caller_key.lang,
&caller_key.namespace,
None,
) {
CalleeResolution::Resolved(target_key) => {
if let Some(&target_node) = index.get(&target_key) {
graph.add_edge(
caller_node,
target_node,
CallEdge {
call_site: raw_callee.clone(),
},
);
}
}
CalleeResolution::NotFound => {
// Try interop edges before recording as not-found.
if let Some(target_key) =
resolve_via_interop(raw_callee, caller_key, interop_edges)
&& let Some(&target_node) = index.get(&target_key)
{
graph.add_edge(
caller_node,
target_node,
CallEdge {
call_site: raw_callee.clone(),
},
);
continue;
}
unresolved_not_found.push(UnresolvedCallee {
caller: caller_key.clone(),
callee_name: raw_callee.clone(),
});
}
CalleeResolution::Ambiguous(candidates) => {
unresolved_ambiguous.push(AmbiguousCallee {
caller: caller_key.clone(),
callee_name: raw_callee.clone(),
candidates,
});
}
}
}
}
CallGraph {
graph,
index,
unresolved_not_found,
unresolved_ambiguous,
}
}
/// Look for an explicit cross-language bridge matching this call site.
///
/// An interop edge matches when its caller language and namespace equal the
/// caller's, its callee symbol equals `raw_callee` exactly (no normalization),
/// and its caller function is either empty (wildcard) or equal to the caller's
/// name. The target of the first matching edge is returned; `None` if no edge
/// matches.
fn resolve_via_interop(
    raw_callee: &str,
    caller_key: &FuncKey,
    interop_edges: &[InteropEdge],
) -> Option<FuncKey> {
    interop_edges
        .iter()
        .find(|edge| {
            let site = &edge.from;
            site.caller_lang == caller_key.lang
                && site.caller_namespace == caller_key.namespace
                && site.callee_symbol == raw_callee
                && (site.caller_func.is_empty() || site.caller_func == caller_key.name)
        })
        .map(|edge| edge.to.clone())
}
// ─────────────────────────────────────────────────────────────────────────────
// SCC / topological analysis
// ─────────────────────────────────────────────────────────────────────────────
/// Compute SCC decomposition and topological ordering of the call graph.
///
/// `petgraph::algo::tarjan_scc` returns SCCs in *reverse* topological order
/// of the condensation DAG — i.e. leaf SCCs (no outgoing cross-SCC edges)
/// come **first**. That is exactly the **callee-first** order suitable for
/// bottom-up taint propagation.
pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
let sccs = petgraph::algo::tarjan_scc(&cg.graph);
let mut node_to_scc = HashMap::with_capacity(cg.graph.node_count());
for (scc_idx, scc) in sccs.iter().enumerate() {
for &node in scc {
node_to_scc.insert(node, scc_idx);
}
}
// tarjan_scc already gives callee-first ordering.
let topo_scc_callee_first: Vec<usize> = (0..sccs.len()).collect();
CallGraphAnalysis {
sccs,
node_to_scc,
topo_scc_callee_first,
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use crate::interop::CallSiteKey;
    use crate::summary::{FuncSummary, merge_summaries};
    use crate::symbol::Lang;

    /// Build a minimal [`FuncSummary`]: no capabilities, no taint behaviour,
    /// only identity (name / file / language / arity) and outgoing callees.
    fn make_summary(
        name: &str,
        file_path: &str,
        lang: &str,
        param_count: usize,
        callees: Vec<&str>,
    ) -> FuncSummary {
        FuncSummary {
            name: name.into(),
            file_path: file_path.into(),
            lang: lang.into(),
            param_count,
            param_names: vec![],
            source_caps: 0,
            sanitizer_caps: 0,
            sink_caps: 0,
            propagates_taint: false,
            tainted_sink_params: vec![],
            callees: callees.into_iter().map(String::from).collect(),
        }
    }

    /// Build the [`FuncKey`] a test expects to find in the graph index.
    ///
    /// Keeps each expectation to one line instead of a four-field literal.
    fn key(lang: Lang, namespace: &str, name: &str, arity: usize) -> FuncKey {
        FuncKey {
            lang,
            namespace: namespace.into(),
            name: name.into(),
            arity: Some(arity),
        }
    }

    // ── normalize_callee_name ────────────────────────────────────────────
    #[test]
    fn normalize_callee_basic() {
        assert_eq!(normalize_callee_name("env::var"), "var");
        assert_eq!(normalize_callee_name("std::process::Command"), "Command");
        assert_eq!(normalize_callee_name("obj.method"), "method");
        assert_eq!(normalize_callee_name("pkg.mod.func"), "func");
        assert_eq!(normalize_callee_name("foo"), "foo");
        assert_eq!(normalize_callee_name(""), "");
    }

    // ── same name, different Rust modules ────────────────────────────────
    #[test]
    fn same_name_different_rust_modules() {
        let helper_a = make_summary("helper", "src/a.rs", "rust", 0, vec![]);
        let helper_b = make_summary("helper", "src/b.rs", "rust", 0, vec![]);
        let caller = make_summary("caller", "src/a.rs", "rust", 0, vec!["helper"]);
        let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
        let cg = build_call_graph(&gs, &[]);

        // Two helper nodes + one caller node = 3 nodes
        assert_eq!(cg.graph.node_count(), 3);

        // Caller is in src/a.rs, so "helper" resolves to src/a.rs::helper
        let caller_node = cg.index[&key(Lang::Rust, "src/a.rs", "caller", 0)];
        let helper_a_node = cg.index[&key(Lang::Rust, "src/a.rs", "helper", 0)];

        // Exactly one edge: caller → helper_a
        let edges: Vec<_> = cg
            .graph
            .edges(caller_node)
            .filter(|e| e.target() == helper_a_node)
            .collect();
        assert_eq!(edges.len(), 1);
        assert!(cg.unresolved_not_found.is_empty());
        assert!(cg.unresolved_ambiguous.is_empty());
    }

    // ── same name, Python vs Rust ────────────────────────────────────────
    #[test]
    fn same_name_python_and_rust() {
        let py_foo = make_summary("foo", "handler.py", "python", 0, vec![]);
        let rs_foo = make_summary("foo", "handler.rs", "rust", 0, vec![]);
        // Python caller calls "foo" — should only see the Python one
        let py_caller = make_summary("main", "app.py", "python", 0, vec!["foo"]);
        let gs = merge_summaries(vec![py_foo, rs_foo, py_caller], None);
        let cg = build_call_graph(&gs, &[]);
        assert_eq!(cg.graph.node_count(), 3);

        let caller_node = cg.index[&key(Lang::Python, "app.py", "main", 0)];
        let py_foo_node = cg.index[&key(Lang::Python, "handler.py", "foo", 0)];

        // Edge goes to Python foo, not Rust foo
        let edges: Vec<_> = cg.graph.edges(caller_node).collect();
        assert_eq!(edges.len(), 1);
        assert_eq!(edges[0].target(), py_foo_node);
    }

    // ── arity differences → separate nodes ───────────────────────────────
    #[test]
    fn arity_differences_separate_nodes() {
        let helper1 = make_summary("helper", "lib.rs", "rust", 1, vec![]);
        let helper2 = make_summary("helper", "lib.rs", "rust", 2, vec![]);
        let gs = merge_summaries(vec![helper1, helper2], None);
        let cg = build_call_graph(&gs, &[]);

        // Two separate nodes (different arity → different FuncKey)
        assert_eq!(cg.graph.node_count(), 2);
        assert!(cg.index.contains_key(&key(Lang::Rust, "lib.rs", "helper", 1)));
        assert!(cg.index.contains_key(&key(Lang::Rust, "lib.rs", "helper", 2)));
    }

    // ── recursive SCC detection ──────────────────────────────────────────
    #[test]
    fn recursive_scc_detection() {
        let a = make_summary("a", "lib.rs", "rust", 0, vec!["b"]);
        let b = make_summary("b", "lib.rs", "rust", 0, vec!["a"]);
        let gs = merge_summaries(vec![a, b], None);
        let cg = build_call_graph(&gs, &[]);
        assert_eq!(cg.graph.edge_count(), 2); // a→b and b→a

        let analysis = analyse(&cg);

        // Both nodes should be in the same SCC
        let scc_a = analysis.node_to_scc[&cg.index[&key(Lang::Rust, "lib.rs", "a", 0)]];
        let scc_b = analysis.node_to_scc[&cg.index[&key(Lang::Rust, "lib.rs", "b", 0)]];
        assert_eq!(scc_a, scc_b);
        assert_eq!(analysis.sccs[scc_a].len(), 2);
    }

    // ── unresolved callee → recorded as not found ────────────────────────
    #[test]
    fn unresolved_callee_recorded_as_not_found() {
        let caller = make_summary("caller", "lib.rs", "rust", 0, vec!["nonexistent"]);
        let gs = merge_summaries(vec![caller], None);
        let cg = build_call_graph(&gs, &[]);
        assert_eq!(cg.graph.edge_count(), 0);
        assert_eq!(cg.unresolved_not_found.len(), 1);
        assert_eq!(cg.unresolved_not_found[0].callee_name, "nonexistent");
        assert!(cg.unresolved_ambiguous.is_empty());
    }

    // ── ambiguous callee → recorded as ambiguous ─────────────────────────
    #[test]
    fn ambiguous_callee_recorded() {
        // Two "helper" functions in different namespaces.
        let helper_a = make_summary("helper", "a.rs", "rust", 0, vec![]);
        let helper_b = make_summary("helper", "b.rs", "rust", 0, vec![]);
        // Caller is in a THIRD namespace, so neither is preferred.
        let caller = make_summary("caller", "c.rs", "rust", 0, vec!["helper"]);
        let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
        let cg = build_call_graph(&gs, &[]);
        assert_eq!(cg.graph.edge_count(), 0); // no edge — ambiguous
        assert!(cg.unresolved_not_found.is_empty());
        assert_eq!(cg.unresolved_ambiguous.len(), 1);
        assert_eq!(cg.unresolved_ambiguous[0].callee_name, "helper");
        assert_eq!(cg.unresolved_ambiguous[0].candidates.len(), 2);
    }

    // ── diamond topo order (callee-first) ────────────────────────────────
    #[test]
    fn diamond_topo_callee_first() {
        // A → B, A → C, B → D, C → D
        let d = make_summary("d", "lib.rs", "rust", 0, vec![]);
        let b = make_summary("b", "lib.rs", "rust", 0, vec!["d"]);
        let c = make_summary("c", "lib.rs", "rust", 0, vec!["d"]);
        let a = make_summary("a", "lib.rs", "rust", 0, vec!["b", "c"]);
        let gs = merge_summaries(vec![a, b, c, d], None);
        let cg = build_call_graph(&gs, &[]);
        assert_eq!(cg.graph.node_count(), 4);

        let analysis = analyse(&cg);
        let scc_of =
            |name: &str| analysis.node_to_scc[&cg.index[&key(Lang::Rust, "lib.rs", name, 0)]];
        let topo_pos = |name: &str| {
            analysis
                .topo_scc_callee_first
                .iter()
                .position(|&s| s == scc_of(name))
                .unwrap()
        };

        // D (leaf) must come before B and C, which must come before A (root).
        assert!(topo_pos("d") < topo_pos("b"));
        assert!(topo_pos("d") < topo_pos("c"));
        assert!(topo_pos("b") < topo_pos("a"));
        assert!(topo_pos("c") < topo_pos("a"));
    }

    // ── interop edge resolution ──────────────────────────────────────────
    #[test]
    fn interop_edge_resolution() {
        let py_caller = make_summary("process", "handler.py", "python", 0, vec!["js_func"]);
        let js_target = make_summary("js_func", "util.js", "javascript", 1, vec![]);
        let gs = merge_summaries(vec![py_caller, js_target], None);
        let interop = vec![InteropEdge {
            from: CallSiteKey {
                caller_lang: Lang::Python,
                caller_namespace: "handler.py".into(),
                caller_func: String::new(), // wildcard
                callee_symbol: "js_func".into(),
                ordinal: 0,
            },
            to: key(Lang::JavaScript, "util.js", "js_func", 1),
            arg_map: vec![],
            ret_taints: false,
        }];
        let cg = build_call_graph(&gs, &interop);

        let caller_node = cg.index[&key(Lang::Python, "handler.py", "process", 0)];
        let target_node = cg.index[&key(Lang::JavaScript, "util.js", "js_func", 1)];
        let edges: Vec<_> = cg
            .graph
            .edges(caller_node)
            .filter(|e| e.target() == target_node)
            .collect();
        assert_eq!(edges.len(), 1);
        assert!(cg.unresolved_not_found.is_empty());
    }

    // ── namespace normalization consistency ───────────────────────────────
    #[test]
    fn namespace_normalization_consistency() {
        // FuncSummary::func_key with a scan root produces the same namespace
        // string that would be used as caller_namespace in resolution.
        let summary = make_summary("my_func", "/home/user/proj/src/lib.rs", "rust", 0, vec![]);
        let root = "/home/user/proj";
        let key = summary.func_key(Some(root));

        // The namespace in the key must be the same as what normalize_namespace produces
        let expected_ns = crate::symbol::normalize_namespace(&summary.file_path, Some(root));
        assert_eq!(key.namespace, expected_ns);
        assert_eq!(key.namespace, "src/lib.rs");
    }

    // ── raw call_site preserved on edge ──────────────────────────────────
    #[test]
    fn raw_call_site_preserved_on_edge() {
        // Callee "env::var" normalizes to "var" for resolution, but
        // the edge should retain the original raw text.
        let source = make_summary("var", "util.rs", "rust", 0, vec![]);
        let caller = make_summary("main", "util.rs", "rust", 0, vec!["env::var"]);
        let gs = merge_summaries(vec![source, caller], None);
        let cg = build_call_graph(&gs, &[]);

        let caller_node = cg.index[&key(Lang::Rust, "util.rs", "main", 0)];
        let edges: Vec<_> = cg.graph.edges(caller_node).collect();
        assert_eq!(edges.len(), 1);
        // Raw call_site preserved, not the normalized "var"
        assert_eq!(edges[0].weight().call_site, "env::var");
    }
}

View file

@ -32,6 +32,9 @@ pub enum EdgeKind {
Back, // backedge that closes a loop
}
/// Maximum number of identifiers to store from a condition expression.
const MAX_COND_VARS: usize = 8;
#[derive(Debug, Clone)]
pub struct NodeInfo {
pub kind: StmtKind,
@ -44,6 +47,12 @@ pub struct NodeInfo {
pub enclosing_func: Option<String>,
/// Per-function call ordinal (0-based, only meaningful for Call nodes).
pub call_ordinal: u32,
/// For If nodes: raw condition text (truncated to 128 chars). None for non-If nodes.
pub condition_text: Option<String>,
/// For If nodes: identifiers referenced in the condition (sorted, deduped, max 8).
pub condition_vars: Vec<String>,
/// For If nodes: whether the condition has a leading negation (`!` / `not`).
pub condition_negated: bool,
}
/// Intra-file function summary with graph-local node indices.
@ -122,6 +131,7 @@ fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option<Strin
.child_by_field_name("function")
.or_else(|| c.child_by_field_name("method"))
.or_else(|| c.child_by_field_name("name"))
.or_else(|| c.child_by_field_name("type"))
.and_then(|f| text_of(f, code)),
Kind::CallMethod => {
let func = c
@ -155,6 +165,65 @@ fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option<Strin
None
}
/// Search recursively for any nested call whose identifier classifies as a label.
/// Used for cases like `str(eval(expr))` where `str` doesn't match but `eval` does.
fn find_classifiable_inner_call<'a>(
n: Node<'a>,
lang: &str,
code: &'a [u8],
extra: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Option<(String, DataLabel)> {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
match lookup(lang, c.kind()) {
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => {
let ident = match lookup(lang, c.kind()) {
Kind::CallFn => c
.child_by_field_name("function")
.or_else(|| c.child_by_field_name("method"))
.or_else(|| c.child_by_field_name("name"))
.or_else(|| c.child_by_field_name("type"))
.and_then(|f| text_of(f, code)),
Kind::CallMethod => {
let func = c
.child_by_field_name("method")
.or_else(|| c.child_by_field_name("name"))
.and_then(|f| text_of(f, code));
let recv = c
.child_by_field_name("object")
.or_else(|| c.child_by_field_name("receiver"))
.and_then(|f| root_receiver_text(f, lang, code));
match (recv, func) {
(Some(r), Some(f)) => Some(format!("{r}.{f}")),
(_, Some(f)) => Some(f),
_ => None,
}
}
Kind::CallMacro => c
.child_by_field_name("macro")
.and_then(|f| text_of(f, code)),
_ => None,
};
if let Some(ref id) = ident
&& let Some(lbl) = classify(lang, id, extra)
{
return Some((id.clone(), lbl));
}
// Recurse into arguments of this call
if let Some(found) = find_classifiable_inner_call(c, lang, code, extra) {
return Some(found);
}
}
_ => {
if let Some(found) = find_classifiable_inner_call(c, lang, code, extra) {
return Some(found);
}
}
}
}
None
}
/// Build the dot-joined text of a member_expression / attribute / selector_expression.
/// E.g. for `process.env.CMD` this returns `"process.env.CMD"`.
fn member_expr_text(n: Node, code: &[u8]) -> Option<String> {
@ -209,6 +278,25 @@ fn first_member_label(
}
}
}
// PHP/Python/Ruby subscript access: `$_GET['cmd']`, `os.environ['KEY']`, `params[:cmd]`
// Try to classify the object (before the `[`) as a source.
"subscript_expression" | "subscript" | "element_reference" => {
if let Some(obj) = n
.child_by_field_name("object")
.or_else(|| n.child_by_field_name("value"))
.or_else(|| n.child(0))
{
if let Some(txt) = text_of(obj, code)
&& let Some(lbl) = classify(lang, &txt, extra_labels)
{
return Some(lbl);
}
// Recurse into the object for nested member accesses
if let Some(lbl) = first_member_label(obj, lang, code, extra_labels) {
return Some(lbl);
}
}
}
_ => {}
}
let mut cursor = n.walk();
@ -224,6 +312,11 @@ fn first_member_label(
fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
match n.kind() {
"member_expression" | "attribute" | "selector_expression" => member_expr_text(n, code),
"subscript_expression" | "subscript" | "element_reference" => n
.child_by_field_name("object")
.or_else(|| n.child_by_field_name("value"))
.or_else(|| n.child(0))
.and_then(|obj| text_of(obj, code)),
_ => {
let mut cursor = n.walk();
for child in n.children(&mut cursor) {
@ -237,6 +330,42 @@ fn first_member_text(n: Node, code: &[u8]) -> Option<String> {
}
/// Collect the outermost function-expression nodes found in the subtree of `n`.
///
/// This finds anonymous functions / arrow functions / closures (for example
/// callbacks passed as call arguments) that should be analysed as separate
/// function scopes. The whole subtree of `n` is walked, `n` included; the
/// walk stops descending as soon as it reaches a function node, so functions
/// nested inside a collected function are not returned — those get handled
/// when the outer function is recursed into.
fn collect_nested_function_nodes<'a>(n: Node<'a>, lang: &str) -> Vec<Node<'a>> {
    let mut funcs = Vec::new();
    collect_nested_functions_rec(n, lang, &mut funcs, false);
    funcs
}

/// Recursive worker for [`collect_nested_function_nodes`].
///
/// Appends to `out` every function node reached from `n` without passing
/// through another function node. When `inside_function` is `true`, function
/// nodes are skipped instead of collected; the only caller visible here
/// passes `false`, and the flag is forwarded unchanged on recursion.
fn collect_nested_functions_rec<'a>(
    n: Node<'a>,
    lang: &str,
    out: &mut Vec<Node<'a>>,
    inside_function: bool,
) {
    // Only treat as a function if it's a real function node (has children),
    // not a keyword token like `function` in JS which shares the same kind name.
    let is_function_node = lookup(lang, n.kind()) == Kind::Function && n.child_count() > 0;
    if is_function_node {
        if !inside_function {
            out.push(n);
        }
        // Never descend into a function body: nested functions belong to
        // that function's own analysis.
        return;
    }

    let mut cursor = n.walk();
    for child in n.children(&mut cursor) {
        collect_nested_functions_rec(child, lang, out, inside_function);
    }
}

// Check whether any descendant of `n` is a call expression.
fn has_call_descendant(n: Node, lang: &str) -> bool {
let mut cursor = n.walk();
for c in n.children(&mut cursor) {
@ -361,6 +490,36 @@ fn def_use(ast: Node, lang: &str, code: &[u8]) -> (Option<String>, Vec<String>)
(defs, uses)
}
// if-let / while-let — the `let_condition` binds a variable from
// the value expression. E.g. `if let Ok(cmd) = env::var("CMD")`
// defines `cmd` and uses `env`, `var`, `CMD`.
Kind::If | Kind::While => {
let cond = ast.child_by_field_name("condition");
if let Some(c) = cond
&& c.kind() == "let_condition"
{
let mut defs = None;
let mut uses = Vec::new();
if let Some(pat) = c.child_by_field_name("pattern") {
let mut tmp = Vec::<String>::new();
collect_idents(pat, code, &mut tmp);
// Use the last identifier collected from the pattern as the binding:
// for `Ok(cmd)` this skips the constructor `Ok` and yields the inner
// binding name `cmd`.
defs = tmp.into_iter().last();
}
if let Some(val) = c.child_by_field_name("value") {
collect_idents(val, code, &mut uses);
}
return (defs, uses);
}
let mut uses = Vec::new();
collect_idents(ast, code, &mut uses);
(None, uses)
}
// everything else: no definition, but may read vars
_ => {
let mut uses = Vec::new();
@ -370,6 +529,109 @@ fn def_use(ast: Node, lang: &str, code: &[u8]) -> (Option<String>, Vec<String>)
}
}
/// Extract raw condition metadata from an If AST node.
///
/// Returns `(condition_text, condition_vars, condition_negated)`:
/// - `condition_text`: source text of the whole condition (including any
///   leading negation), limited to 128 bytes and cut back to the nearest
///   UTF-8 character boundary so that truncation never panics;
/// - `condition_vars`: identifiers collected from the condition with a
///   leading negation stripped — sorted, deduped, and capped at
///   [`MAX_COND_VARS`];
/// - `condition_negated`: the flag reported by `detect_negation`.
///
/// The condition subtree is located via `child_by_field_name("condition")`,
/// with a positional fallback (intended for Rust `if_expression`) that takes
/// the first child that is not a block, trivia, keyword or bracket token.
/// When no condition can be found the result is `(None, vec![], false)`.
fn extract_condition_raw<'a>(
    ast: Node<'a>,
    lang: &str,
    code: &'a [u8],
) -> (Option<String>, Vec<String>, bool) {
    /// Upper bound, in bytes, on the stored condition text.
    const MAX_COND_TEXT_BYTES: usize = 128;

    // 1. Find the condition subtree.
    let cond_node = ast.child_by_field_name("condition").or_else(|| {
        // Positional fallback: the condition is the first child that is not
        // a keyword, block, or bracket token.
        let mut cursor = ast.walk();
        ast.children(&mut cursor).find(|c| {
            let k = c.kind();
            !matches!(lookup(lang, k), Kind::Block | Kind::Trivia)
                && k != "if"
                && k != "else"
                && k != "let"
                && k != "{"
                && k != "}"
                && k != "("
                && k != ")"
        })
    });

    let Some(cond) = cond_node else {
        return (None, Vec::new(), false);
    };

    // 2. Detect leading negation (`!expr`, `not expr`, Ruby `unless`).
    let (inner, negated) = detect_negation(cond, ast, lang);

    // 3. Collect identifiers from the (inner) condition subtree.
    let mut vars = Vec::new();
    collect_idents(inner, code, &mut vars);
    vars.sort();
    vars.dedup();
    vars.truncate(MAX_COND_VARS);

    // 4. Extract text, truncated. Slicing at a fixed byte offset would panic
    //    when that offset lands inside a multi-byte character (e.g. a
    //    non-ASCII string literal in the condition), so back up to the
    //    previous character boundary first. Offset 0 is always a boundary,
    //    which guarantees the loop terminates.
    let text = text_of(cond, code).map(|mut t| {
        if t.len() > MAX_COND_TEXT_BYTES {
            let mut end = MAX_COND_TEXT_BYTES;
            while !t.is_char_boundary(end) {
                end -= 1;
            }
            t.truncate(end);
        }
        t
    });

    (text, vars, negated)
}
/// Strip one leading negation from a condition.
///
/// Returns `(expression, negated)`:
/// - If `if_ast` is an `unless` node, the condition is returned unchanged
///   with `negated == true`.
/// - If `cond` is a `unary_expression`, `not_operator`,
///   `prefix_unary_expression` or `unary_not` whose first child is the token
///   `!` or `not`, the operand is returned with `negated == true`. The
///   operand is the `argument` field, else the `operand` field, else the last
///   child that is not a `!` / `not` token, else `cond` itself.
/// - Otherwise `cond` is returned unchanged with `negated == false`.
///
/// Only a single level of negation is removed; `_lang` is currently unused.
fn detect_negation<'a>(cond: Node<'a>, if_ast: Node<'a>, _lang: &str) -> (Node<'a>, bool) {
    // Ruby `unless` is mapped to Kind::If but is semantically negated.
    if if_ast.kind() == "unless" {
        return (cond, true);
    }

    // Node kinds that may wrap a negated operand.
    let wraps_operand = matches!(
        cond.kind(),
        "unary_expression" | "not_operator" | "prefix_unary_expression" | "unary_not"
    );
    if !wraps_operand {
        return (cond, false);
    }

    // A unary node is a negation only when its leading token is `!` / `not`
    // (as opposed to, say, a unary minus).
    let leads_with_not = match cond.child(0) {
        Some(op) => matches!(op.kind(), "!" | "not"),
        None => false,
    };
    if !leads_with_not {
        return (cond, false);
    }

    let operand = cond
        .child_by_field_name("argument")
        .or_else(|| cond.child_by_field_name("operand"))
        .or_else(|| {
            // Last non-operator child.
            let mut cursor = cond.walk();
            cond.children(&mut cursor)
                .filter(|c| !matches!(c.kind(), "!" | "not"))
                .last()
        })
        .unwrap_or(cond);
    (operand, true)
}
/// Create a node in one short borrow and optionally attach a taint label.
#[allow(clippy::too_many_arguments)]
fn push_node<'a>(
@ -391,6 +653,7 @@ fn push_node<'a>(
.child_by_field_name("function")
.or_else(|| ast.child_by_field_name("method"))
.or_else(|| ast.child_by_field_name("name"))
.or_else(|| ast.child_by_field_name("type"))
.and_then(|n| text_of(n, code))
.unwrap_or_default(),
@ -426,7 +689,7 @@ fn push_node<'a>(
// the whole line.
if matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment
Kind::CallWrapper | Kind::Assignment | Kind::Return
) && let Some(inner) = first_call_ident(ast, lang, code)
{
text = inner;
@ -437,6 +700,20 @@ fn push_node<'a>(
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
let mut label = classify(lang, &text, extra);
// If the outermost call didn't classify, try inner/nested calls.
// E.g. `str(eval(expr))` — `str` is not a sink, but `eval` is.
if label.is_none()
&& matches!(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment | Kind::Return
)
&& let Some((inner_text, inner_label)) =
find_classifiable_inner_call(ast, lang, code, extra)
{
label = Some(inner_label);
text = inner_text;
}
// For assignments like `element.innerHTML = value`, the inner-call heuristic
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
// If that didn't produce a label, check the LHS property name — it may be a
@ -493,18 +770,49 @@ fn push_node<'a>(
}
}
// For `if let` / `while let` patterns: try to classify the value expression
// in the let-condition as a source/sink. E.g. `if let Ok(cmd) = env::var("CMD")`
// should recognise `env::var` as a taint source and label this node accordingly.
if label.is_none()
&& matches!(lookup(lang, ast.kind()), Kind::If | Kind::While)
&& let Some(cond) = ast.child_by_field_name("condition")
&& cond.kind() == "let_condition"
&& let Some(val) = cond.child_by_field_name("value")
{
if let Some(ident) = first_call_ident(val, lang, code)
&& let Some(l) = classify(lang, &ident, extra)
{
label = Some(l);
text = ident;
}
if label.is_none()
&& let Some(ident_text) = text_of(val, code)
&& let Some(l) = classify(lang, &ident_text, extra)
{
label = Some(l);
text = ident_text;
}
}
let span = (ast.start_byte(), ast.end_byte());
/* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */
let (defines, uses) = def_use(ast, lang, code);
let callee = if kind == StmtKind::Call {
let callee = if kind == StmtKind::Call || label.is_some() {
Some(text.clone())
} else {
None
};
// Extract condition metadata for If nodes.
let (condition_text, condition_vars, condition_negated) = if kind == StmtKind::If {
extract_condition_raw(ast, lang, code)
} else {
(None, Vec::new(), false)
};
let idx = g.add_node(NodeInfo {
kind,
span,
@ -514,6 +822,9 @@ fn push_node<'a>(
callee,
enclosing_func: enclosing_func.map(|s| s.to_string()),
call_ordinal,
condition_text,
condition_vars,
condition_negated,
});
debug!(
@ -717,19 +1028,27 @@ fn build_sub<'a>(
}
exits
} else {
// No explicit else → if the then-branch falls through
// (non-empty exits), the false branch merges with those exits.
// If the then-branch terminates (break/return/continue →
// empty exits), the false branch flows from the condition
// to whatever comes next.
if then_exits.is_empty() {
vec![cond]
} else {
if let Some(&first) = then_exits.first() {
connect_all(g, &[cond], first, EdgeKind::False);
}
then_exits.clone()
}
// No explicit else → create a synthetic pass-through node
// for the false path. This avoids routing the False edge
// to a then-block exit (which would make it appear that the
// false path goes *through* the then-block) and gives
// path-sensitive analysis an explicit False edge to record
// predicates on.
let pass = g.add_node(NodeInfo {
kind: StmtKind::Seq,
span: (ast.end_byte(), ast.end_byte()),
label: None,
defines: None,
uses: Vec::new(),
callee: None,
enclosing_func: enclosing_func.map(|s| s.to_string()),
call_ordinal: 0,
condition_text: None,
condition_vars: Vec::new(),
condition_negated: false,
});
connect_all(g, &[cond], pass, EdgeKind::False);
vec![pass]
};
// Frontier = union of both branches
@ -995,7 +1314,7 @@ fn build_sub<'a>(
collect_idents(n, code, &mut tmp);
tmp.into_iter().next()
})
.unwrap_or_else(|| "<anon>".to_string());
.unwrap_or_else(|| format!("<anon@{}>", ast.start_byte()));
let entry_idx = push_node(
g,
StmtKind::Seq,
@ -1016,7 +1335,20 @@ fn build_sub<'a>(
// Snapshot the current node count so we can iterate only over nodes
// created within this function (avoids O(N²) scan of the full graph).
let fn_first_node: NodeIndex = NodeIndex::new(g.node_count());
let body = ast.child_by_field_name("body").expect("fn w/o body");
let body = ast.child_by_field_name("body").unwrap_or_else(|| {
// Some function expressions (e.g. JS anonymous `function(…) { … }`)
// don't have a named "body" field — find the first block child.
let mut c = ast.walk();
ast.children(&mut c)
.find(|n| matches!(lookup(lang, n.kind()), Kind::Block | Kind::SourceFile))
.unwrap_or_else(|| {
panic!(
"fn w/o body: kind={} text='{}'",
ast.kind(),
text_of(ast, code).unwrap_or_default()
)
})
});
let mut fn_call_ordinal: u32 = 0;
let mut fn_breaks = Vec::new();
let mut fn_continues = Vec::new();
@ -1191,6 +1523,9 @@ fn build_sub<'a>(
callee: None,
enclosing_func: Some(fn_name.clone()),
call_ordinal: 0,
condition_text: None,
condition_vars: Vec::new(),
condition_negated: false,
});
// Wire body exits (fall-through) to the exit node.
for &b in &body_exits {
@ -1300,6 +1635,28 @@ fn build_sub<'a>(
{
return Vec::new();
}
// Recurse into any function expressions nested in arguments
// (e.g. `app.get('/path', function(req, res) { ... })`)
// so that they get proper function summaries.
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[node],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
}
vec![node]
}
@ -1326,6 +1683,26 @@ fn build_sub<'a>(
{
return Vec::new();
}
// Recurse into any function expressions nested in arguments
let nested = collect_nested_function_nodes(ast, lang);
for func_node in nested {
build_sub(
func_node,
&[n],
g,
lang,
code,
summaries,
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
}
vec![n]
}
@ -1412,6 +1789,9 @@ pub(crate) fn build_cfg<'a>(
callee: None,
enclosing_func: None,
call_ordinal: 0,
condition_text: None,
condition_vars: Vec::new(),
condition_negated: false,
});
let exit = g.add_node(NodeInfo {
kind: StmtKind::Exit,
@ -1422,6 +1802,9 @@ pub(crate) fn build_cfg<'a>(
callee: None,
enclosing_func: None,
call_ordinal: 0,
condition_text: None,
condition_vars: Vec::new(),
condition_negated: false,
});
// Build the body below the synthetic ENTRY.

View file

@ -33,7 +33,6 @@ pub struct CfgFinding {
pub severity: Severity,
pub confidence: Confidence,
pub span: (usize, usize),
#[allow(dead_code)]
pub message: String,
pub evidence: Vec<NodeIndex>,
pub score: Option<f64>,

View file

@ -681,6 +681,8 @@ fn taint_and_unguarded_sink_deduped() {
source: entry,
path: vec![entry, sink_node],
source_kind: crate::labels::SourceKind::UserInput,
path_validated: false,
guard_kind: None,
}];
let findings = parse_and_run_all_with_taint(

View file

@ -1,4 +1,4 @@
use clap::{Parser, Subcommand};
use clap::{Parser, Subcommand, ValueEnum};
#[derive(Parser)]
#[command(name = "nyx")]
@ -13,10 +13,55 @@ impl Commands {
/// Whether this command produces structured (machine-readable) output on
/// stdout, meaning human status messages must be suppressed entirely.
pub fn is_structured_output(&self) -> bool {
matches!(self, Commands::Scan { format, .. } if format == "json" || format == "sarif")
matches!(self, Commands::Scan { format, .. } if *format == OutputFormat::Json || *format == OutputFormat::Sarif)
}
}
// `Console` is the `Default` and the clap default for `--format`.
// `Json` and `Sarif` are the two formats that
// `Commands::is_structured_output` treats as machine-readable.
// The `Display` impl prints each variant as its lowercase name.
// (Plain `//` comments are used here so clap-derived help text is unchanged.)
/// Output format for scan results.
#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum, Default)]
pub enum OutputFormat {
#[default]
Console,
Json,
Sarif,
}
/// Renders the format as its lowercase name (`console`, `json`, `sarif`).
impl std::fmt::Display for OutputFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            OutputFormat::Console => "console",
            OutputFormat::Json => "json",
            OutputFormat::Sarif => "sarif",
        };
        f.write_str(name)
    }
}
// `handle_command` resolves the deprecated `--no-index` and
// `--rebuild-index` flags to `Off` and `Rebuild` respectively, checking them
// before falling back to the value of `--index`.
// (Plain `//` comments are used here so clap-derived help text is unchanged.)
/// Index mode for scan operations.
#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum, Default)]
pub enum IndexMode {
/// Use index if available, build if missing (default)
#[default]
Auto,
/// Skip indexing entirely, scan filesystem directly
Off,
/// Force rebuild index before scanning
Rebuild,
}
// `handle_command` maps these onto the config's `AnalysisMode`:
// `Full` -> `AnalysisMode::Full`, `Ast` -> `AnalysisMode::Ast`, and both
// `Cfg` and `Taint` -> `AnalysisMode::Taint`.
// (Plain `//` comments are used here so clap-derived help text is unchanged.)
/// Analysis mode for scan operations.
#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum, Default)]
pub enum ScanMode {
/// Run all analyses: AST patterns + CFG + taint (default)
#[default]
Full,
/// Run AST pattern queries only (no CFG/taint)
Ast,
/// Run CFG structural analyses + taint only (no AST patterns)
Cfg,
/// Alias for cfg (CFG + taint analysis)
Taint,
}
#[derive(Subcommand)]
pub enum Commands {
/// Scan project for vulnerabilities
@ -25,35 +70,118 @@ pub enum Commands {
#[arg(default_value = ".")]
path: String,
/// Skip using/building index, scan directly
#[arg(long)]
no_index: bool,
/// Index mode: auto (default), off (no index), rebuild (force rebuild)
#[arg(long, value_enum, default_value_t = IndexMode::Auto)]
index: IndexMode,
/// Force rebuild index before scanning
#[arg(long)]
rebuild_index: bool,
/// Output format
#[arg(short, long, value_enum, default_value_t = OutputFormat::Console)]
format: OutputFormat,
/// Output format (console, json, sarif)
#[arg(short, long, default_value = "")]
format: String,
/// Show only high severity issues
/// Severity filter expression: HIGH, HIGH,MEDIUM, or >=MEDIUM
///
/// Filters findings AFTER all severity normalization (e.g. nonprod
/// downgrades). Only findings matching the expression are emitted.
/// Case-insensitive. Shell-quote expressions containing ">".
#[arg(long)]
high_only: bool,
severity: Option<String>,
#[arg(long)]
ast_only: bool,
/// Analysis mode: full (default), ast, cfg, taint
#[arg(long, value_enum, default_value_t = ScanMode::Full)]
mode: ScanMode,
#[arg(long)]
cfg_only: bool,
#[arg(long)]
/// Scan all targets (alias for --mode full)
#[arg(long, hide = true)]
all_targets: bool,
/// Include findings from test/vendor/build paths at original severity
/// (by default these are downgraded)
/// Preserve original severity for test/vendor/build paths
///
/// By default, findings in non-production paths are downgraded by one
/// severity tier. This flag preserves original severity.
#[arg(long, alias = "include-nonprod")]
keep_nonprod_severity: bool,
/// Suppress all human-readable status output
#[arg(long)]
include_nonprod: bool,
quiet: bool,
/// Exit with code 1 if any finding meets or exceeds this severity
///
/// Useful for CI gating. Example: --fail-on HIGH
#[arg(long)]
fail_on: Option<String>,
/// Disable attack-surface ranking (findings are sorted by exploitability by default)
#[arg(long)]
no_rank: bool,
/// Show inline-suppressed findings (dimmed, tagged [SUPPRESSED])
#[arg(long)]
show_suppressed: bool,
/// Show all findings: disables category filtering, rollups, and LOW budgets
#[arg(long = "all")]
show_all: bool,
/// Include Quality findings (excluded by default)
#[arg(long)]
include_quality: bool,
/// Maximum total LOW findings to show
#[arg(long, default_value_t = 20)]
max_low: u32,
/// Maximum LOW findings per file
#[arg(long, default_value_t = 1)]
max_low_per_file: u32,
/// Maximum LOW findings per rule
#[arg(long, default_value_t = 10)]
max_low_per_rule: u32,
/// Number of example locations in rollup findings
#[arg(long, default_value_t = 5)]
rollup_examples: u32,
/// Show all instances for a specific rule (bypasses rollup for that rule)
#[arg(long)]
show_instances: Option<String>,
/// Minimum attack-surface score to include in output
///
/// Findings with a rank score below this threshold are suppressed.
/// Requires ranking to be enabled (has no effect with --no-rank).
/// Example: --min-score 50
#[arg(long)]
min_score: Option<u32>,
/// Minimum confidence level to include in output
///
/// Values: low, medium, high. Findings below this level are dropped.
/// JSON/SARIF include all unless filtered.
#[arg(long)]
min_confidence: Option<String>,
// ── Deprecated aliases (hidden) ─────────────────────────────────
/// Deprecated: use --index off
#[arg(long, hide = true)]
no_index: bool,
/// Deprecated: use --index rebuild
#[arg(long, hide = true)]
rebuild_index: bool,
/// Deprecated: use --severity HIGH
#[arg(long, hide = true)]
high_only: bool,
/// Deprecated: use --mode ast
#[arg(long, hide = true)]
ast_only: bool,
/// Deprecated: use --mode cfg
#[arg(long, hide = true)]
cfg_only: bool,
},
/// Manage project indexes

View file

@ -4,9 +4,9 @@ pub mod index;
pub mod list;
pub mod scan;
use crate::cli::Commands;
use crate::cli::{Commands, IndexMode, ScanMode};
use crate::errors::NyxResult;
use crate::patterns::Severity;
use crate::patterns::{Severity, SeverityFilter};
use crate::utils::config::{AnalysisMode, Config};
use std::path::Path;
@ -19,36 +19,130 @@ pub fn handle_command(
match command {
Commands::Scan {
path,
index,
format,
severity,
mode,
all_targets,
keep_nonprod_severity,
quiet,
fail_on,
no_rank,
show_suppressed,
show_all,
include_quality,
max_low,
max_low_per_file,
max_low_per_rule,
rollup_examples,
show_instances,
min_score,
min_confidence,
// Deprecated aliases
no_index,
rebuild_index,
format,
high_only,
ast_only,
cfg_only,
all_targets,
include_nonprod,
} => {
if high_only {
config.scanner.min_severity = Severity::High
// ── Resolve deprecated aliases ──────────────────────────────
// Index mode: explicit --index wins, then deprecated flags
let effective_index = if no_index {
IndexMode::Off
} else if rebuild_index {
IndexMode::Rebuild
} else {
index
};
if ast_only {
config.scanner.mode = AnalysisMode::Ast
// Analysis mode: explicit --mode wins, then deprecated flags
let effective_mode = if ast_only {
ScanMode::Ast
} else if cfg_only {
ScanMode::Cfg
} else if all_targets {
ScanMode::Full
} else {
mode
};
if cfg_only {
config.scanner.mode = AnalysisMode::Taint
// Severity filter: explicit --severity wins, then --high-only
let severity_filter = if let Some(ref expr) = severity {
Some(SeverityFilter::parse(expr).map_err(|e| {
crate::errors::NyxError::Msg(format!("invalid --severity expression: {e}"))
})?)
} else if high_only {
Some(SeverityFilter::parse("HIGH").unwrap())
} else {
None
};
if all_targets {
config.scanner.mode = AnalysisMode::Full
// Fail-on threshold
let fail_on_sev = if let Some(ref expr) = fail_on {
Some(expr.trim().parse::<Severity>().map_err(|e| {
crate::errors::NyxError::Msg(format!("invalid --fail-on value: {e}"))
})?)
} else {
None
};
if include_nonprod {
config.scanner.include_nonprod = true
};
// ── Apply to config ─────────────────────────────────────────
scan::handle(&path, no_index, rebuild_index, format, database_dir, config)?;
match effective_mode {
ScanMode::Full => config.scanner.mode = AnalysisMode::Full,
ScanMode::Ast => config.scanner.mode = AnalysisMode::Ast,
ScanMode::Cfg | ScanMode::Taint => config.scanner.mode = AnalysisMode::Taint,
}
if keep_nonprod_severity {
config.scanner.include_nonprod = true;
}
if quiet {
config.output.quiet = true;
}
if no_rank {
config.output.attack_surface_ranking = false;
}
// Min-score: CLI wins, then config
if let Some(s) = min_score {
config.output.min_score = Some(s);
}
// Min-confidence: CLI wins, then config
if let Some(ref expr) = min_confidence {
config.output.min_confidence =
Some(expr.parse::<crate::evidence::Confidence>().map_err(|e| {
crate::errors::NyxError::Msg(format!("invalid --min-confidence value: {e}"))
})?);
}
if show_all {
config.output.show_all = true;
}
if include_quality {
config.output.include_quality = true;
}
// CLI values override config defaults (clap provides defaults)
config.output.max_low = max_low;
config.output.max_low_per_file = max_low_per_file;
config.output.max_low_per_rule = max_low_per_rule;
config.output.rollup_examples = rollup_examples;
scan::handle(
&path,
effective_index,
format,
severity_filter,
fail_on_sev,
show_suppressed,
show_instances.as_deref(),
database_dir,
config,
)?;
}
Commands::Index { action } => {
index::handle(action, database_dir, config)?;

File diff suppressed because it is too large Load diff

View file

@ -272,6 +272,18 @@ pub mod index {
line: row.get::<_, i64>(2)? as usize,
col: row.get::<_, i64>(3)? as usize,
severity: Severity::from_str(&sev_str).unwrap(),
category: crate::patterns::FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: None,
labels: vec![],
confidence: None,
evidence: None,
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
})
})?;

396
src/evidence.rs Normal file
View file

@ -0,0 +1,396 @@
//! Structured evidence and confidence types for scan diagnostics.
//!
//! These types capture the provenance of findings (source locations,
//! sanitizer/guard info, state-machine transitions) in a structured form
//! that can be serialized to JSON and consumed by ranking, filtering,
//! and downstream tooling.
use crate::commands::scan::Diag;
use crate::patterns::Severity;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::str::FromStr;
// ─────────────────────────────────────────────────────────────────────────────
// Confidence
// ─────────────────────────────────────────────────────────────────────────────
/// Confidence level for a diagnostic finding.
///
/// Ordered Low < Medium < High so that `>=` comparisons work naturally
/// for filtering (e.g. `--min-confidence medium` keeps Medium and High).
///
/// The ordering comes from the derived `PartialOrd`/`Ord`, which follow the
/// declaration order of the variants below — do not reorder them.
///
/// Text conversions: `Display` prints the capitalised variant name
/// (`Low`, `Medium`, `High`); `FromStr` is ASCII case-insensitive and accepts
/// `low`, `medium` (or `med`), and `high`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Confidence {
Low,
Medium,
High,
}
/// Prints the capitalised variant name: `Low`, `Medium` or `High`.
impl fmt::Display for Confidence {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Low => "Low",
            Self::Medium => "Medium",
            Self::High => "High",
        };
        f.write_str(name)
    }
}
/// Parses a confidence level, ignoring ASCII case.
///
/// Accepted spellings are `low`, `medium` (or the abbreviation `med`) and
/// `high`. The input is not trimmed; anything else yields an error message
/// that quotes the rejected input.
impl FromStr for Confidence {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let is = |candidate: &str| s.eq_ignore_ascii_case(candidate);

        if is("low") {
            Ok(Self::Low)
        } else if is("medium") || is("med") {
            Ok(Self::Medium)
        } else if is("high") {
            Ok(Self::High)
        } else {
            Err(format!(
                "unknown confidence level: {s:?} (expected low, medium, high)"
            ))
        }
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Evidence
// ─────────────────────────────────────────────────────────────────────────────
/// Structured evidence for a diagnostic finding.
///
/// Every field is optional: `None` / empty fields are left out of the
/// serialized form (`skip_serializing_if`). The matching `#[serde(default)]`
/// on each field lets that compact form deserialize again — without it the
/// `Vec` fields would be rejected as "missing field" whenever they had been
/// omitted for being empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Evidence {
    /// Where tainted data originated.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<SpanEvidence>,
    /// Where the dangerous operation happens.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sink: Option<SpanEvidence>,
    /// Validation guards protecting this path.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub guards: Vec<SpanEvidence>,
    /// Sanitizers applied to this path.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub sanitizers: Vec<SpanEvidence>,
    /// State-machine evidence (resource lifecycle / auth).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub state: Option<StateEvidence>,
    /// Free-form notes for ranking and display.
    ///
    /// `compute_confidence` reads two conventions from here: a note starting
    /// with `degraded:` and the exact note `path_validated`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub notes: Vec<String>,
}
impl Evidence {
    /// Reports whether this evidence carries no information at all, i.e.
    /// every optional field is `None` and every list is empty.
    pub fn is_empty(&self) -> bool {
        let has_location = self.source.is_some() || self.sink.is_some();
        let has_protection = !self.guards.is_empty() || !self.sanitizers.is_empty();
        let has_extras = self.state.is_some() || !self.notes.is_empty();
        !(has_location || has_protection || has_extras)
    }
}
/// A source-location evidence span.
///
/// Used by [`Evidence`] for the source, sink, guard and sanitizer locations.
/// NOTE(review): whether `line` / `col` are 0- or 1-based is not visible from
/// this definition — confirm against the code that constructs these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpanEvidence {
pub path: String,
pub line: u32,
pub col: u32,
/// One of: `"source"`, `"sink"`, `"guard"`, `"sanitizer"`.
pub kind: String,
// Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub snippet: Option<String>,
}
/// Evidence from a state-machine analysis (resource lifecycle / auth).
///
/// Records a single transition: which machine it belongs to, the subject it
/// concerns (when known), and the states before and after the event.
/// NOTE(review): state names are plain strings here; the set of valid values
/// is defined by whatever produces this evidence — confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StateEvidence {
/// The state machine: `"resource"` or `"auth"`.
pub machine: String,
/// Variable name if available.
// Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub subject: Option<String>,
/// State before the event.
pub from_state: String,
/// State after the event.
pub to_state: String,
}
// ─────────────────────────────────────────────────────────────────────────────
// compute_confidence
// ─────────────────────────────────────────────────────────────────────────────
/// Derive a confidence level for `diag` from its rule ID, severity and
/// evidence notes.
///
/// Rules, first match wins:
/// 1. Any evidence note starting with `degraded:` caps the result at `Low`.
/// 2. `taint-*`: `Medium` when the evidence carries the note
///    `path_validated`, otherwise `High`.
/// 3. `state-*`: `High` for use-after-close, double-close and
///    unauthed-access; `Low` for `state-resource-leak-possible`; `Medium`
///    for `state-resource-leak` and any other state rule.
/// 4. `cfg-*`: the confidence already stored on the diagnostic, or `Medium`
///    when none is set.
/// 5. Everything else (AST patterns): `Medium` for `High` severity, `Low`
///    otherwise.
///
/// Note that a pre-set `diag.confidence` is only consulted for `cfg-*`
/// findings; callers that want to keep an existing value for other rule
/// families must do so themselves.
pub fn compute_confidence(diag: &Diag) -> Confidence {
    // Notes are the only part of the evidence that influences the result.
    let notes: &[String] = match &diag.evidence {
        Some(ev) => ev.notes.as_slice(),
        None => &[],
    };

    // Degraded analysis caps confidence
    if notes.iter().any(|n| n.starts_with("degraded:")) {
        return Confidence::Low;
    }

    let id = diag.id.as_str();

    if id.starts_with("taint-") {
        // A validated path lowers confidence; every other taint finding is
        // High regardless of which spans are present.
        return if notes.iter().any(|n| n == "path_validated") {
            Confidence::Medium
        } else {
            Confidence::High
        };
    }

    if id.starts_with("state-") {
        return match id {
            "state-use-after-close" | "state-double-close" | "state-unauthed-access" => {
                Confidence::High
            }
            "state-resource-leak-possible" => Confidence::Low,
            // `state-resource-leak` and any state rule not listed above.
            _ => Confidence::Medium,
        };
    }

    if id.starts_with("cfg-") {
        // If CFG conversion already set confidence, preserve it
        return diag.confidence.unwrap_or(Confidence::Medium);
    }

    // AST patterns: High severity → Medium confidence, else Low
    if diag.severity == Severity::High {
        Confidence::Medium
    } else {
        Confidence::Low
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// Bare finding for `id` at `test.rs:1:1` with no evidence or confidence.
    fn make_diag(id: &str, severity: Severity) -> Diag {
        Diag {
            path: "test.rs".into(),
            line: 1,
            col: 1,
            severity,
            id: id.into(),
            category: crate::patterns::FindingCategory::Security,
            path_validated: false,
            guard_kind: None,
            message: None,
            labels: vec![],
            confidence: None,
            evidence: None,
            rank_score: None,
            rank_reason: None,
            suppressed: false,
            suppression: None,
            rollup: None,
        }
    }

    /// Evidence with every field empty; tests override what they need.
    fn empty_evidence() -> Evidence {
        Evidence {
            source: None,
            sink: None,
            guards: vec![],
            sanitizers: vec![],
            state: None,
            notes: vec![],
        }
    }

    /// Span of the given kind at `test.rs:1:1` without a snippet.
    fn span(kind: &str) -> SpanEvidence {
        SpanEvidence {
            path: "test.rs".into(),
            line: 1,
            col: 1,
            kind: kind.into(),
            snippet: None,
        }
    }

    #[test]
    fn compute_confidence_taint_high() {
        let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
        d.evidence = Some(Evidence {
            source: Some(SpanEvidence {
                snippet: Some("env::var(\"X\")".into()),
                ..span("source")
            }),
            sink: Some(SpanEvidence {
                line: 10,
                col: 5,
                snippet: Some("exec()".into()),
                ..span("sink")
            }),
            ..empty_evidence()
        });
        assert_eq!(compute_confidence(&d), Confidence::High);
    }

    #[test]
    fn compute_confidence_taint_validated() {
        let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
        d.evidence = Some(Evidence {
            source: Some(span("source")),
            sink: Some(SpanEvidence {
                line: 10,
                col: 5,
                ..span("sink")
            }),
            notes: vec!["path_validated".into()],
            ..empty_evidence()
        });
        assert_eq!(compute_confidence(&d), Confidence::Medium);
    }

    #[test]
    fn compute_confidence_degraded_caps_to_low() {
        let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
        d.evidence = Some(Evidence {
            notes: vec!["degraded:budget_exceeded".into()],
            ..empty_evidence()
        });
        assert_eq!(compute_confidence(&d), Confidence::Low);
    }

    #[test]
    fn compute_confidence_state_rules() {
        let cases = [
            ("state-use-after-close", Severity::High, Confidence::High),
            ("state-double-close", Severity::Medium, Confidence::High),
            ("state-unauthed-access", Severity::High, Confidence::High),
            ("state-resource-leak", Severity::Medium, Confidence::Medium),
            ("state-resource-leak-possible", Severity::Low, Confidence::Low),
        ];
        for (id, severity, expected) in cases {
            assert_eq!(compute_confidence(&make_diag(id, severity)), expected);
        }
    }

    #[test]
    fn compute_confidence_cfg_preserves_existing() {
        let mut d = make_diag("cfg-unguarded-sink", Severity::High);
        d.confidence = Some(Confidence::Low);
        assert_eq!(compute_confidence(&d), Confidence::Low);
    }

    #[test]
    fn compute_confidence_ast_low() {
        let d = make_diag("rs.code_exec.eval", Severity::Medium);
        assert_eq!(compute_confidence(&d), Confidence::Low);
    }

    #[test]
    fn compute_confidence_ast_high_severity_medium() {
        let d = make_diag("rs.code_exec.eval", Severity::High);
        assert_eq!(compute_confidence(&d), Confidence::Medium);
    }

    #[test]
    fn evidence_is_empty() {
        assert!(empty_evidence().is_empty());

        let with_source = Evidence {
            source: Some(SpanEvidence {
                path: "x.rs".into(),
                ..span("source")
            }),
            ..empty_evidence()
        };
        assert!(!with_source.is_empty());
    }

    #[test]
    fn confidence_ord() {
        assert!(Confidence::Low < Confidence::Medium);
        assert!(Confidence::Medium < Confidence::High);
        assert!(Confidence::Low < Confidence::High);
    }

    #[test]
    fn confidence_display_and_parse() {
        // Display is title case.
        assert_eq!(Confidence::Low.to_string(), "Low");
        assert_eq!(Confidence::Medium.to_string(), "Medium");
        assert_eq!(Confidence::High.to_string(), "High");

        // Parsing accepts any letter case and rejects unknown names.
        assert_eq!("low".parse::<Confidence>().unwrap(), Confidence::Low);
        assert_eq!("MEDIUM".parse::<Confidence>().unwrap(), Confidence::Medium);
        assert_eq!("High".parse::<Confidence>().unwrap(), Confidence::High);
        assert!("invalid".parse::<Confidence>().is_err());
    }

    #[test]
    fn compute_confidence_does_not_override_preset() {
        // AST patterns set confidence directly. `compute_confidence` itself
        // ignores that preset (Low for an AST rule with Low severity), which
        // is why the post-pass in scan.rs only calls it when confidence is
        // `None`.
        let mut d = make_diag("rs.quality.expect", Severity::Low);
        d.confidence = Some(Confidence::High);
        assert_eq!(compute_confidence(&d), Confidence::Low);
        // The preset on the finding is untouched by the call.
        assert_eq!(d.confidence, Some(Confidence::High));
    }

    #[test]
    fn json_omits_none_fields() {
        let json = serde_json::to_string(&empty_evidence()).unwrap();
        assert_eq!(json, "{}");
    }
}

984
src/fmt.rs Normal file
View file

@ -0,0 +1,984 @@
//! Console output formatting for scan diagnostics.
//!
//! Produces professional, security-tool-grade aligned output with a clear
//! severity hierarchy, normalised taint flow rendering, and stable wrapping.
use crate::commands::scan::{Diag, SuppressionStats};
use crate::patterns::Severity;
use console::style;
use std::collections::BTreeMap;
/// Default maximum line width when terminal size is unknown.
const DEFAULT_WIDTH: usize = 100;
// ─────────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────────
/// Render every diagnostic as console text.
///
/// Findings are grouped under a header line per file path (paths in sorted
/// order), each finding followed by a blank line. A one-line summary naming
/// `project_name` comes next, and, when `suppression_stats` reports at least
/// one suppressed finding, a footer listing the active filters.
pub fn render_console(
    diags: &[Diag],
    project_name: &str,
    suppression_stats: Option<&SuppressionStats>,
) -> String {
    let width = terminal_width();

    // Bucket findings per file; the BTreeMap keeps paths in sorted order.
    let mut by_path: BTreeMap<&str, Vec<&Diag>> = BTreeMap::new();
    for diag in diags {
        by_path.entry(&diag.path).or_default().push(diag);
    }

    let mut out = String::new();
    for (path, findings) in &by_path {
        // File path header — dim blue, never brighter than severity.
        out.push_str(&format!("{}\n", style(path).blue().dim().underlined()));
        for finding in findings {
            out.push_str(&render_diag(finding, width));
            out.push('\n'); // blank line between findings
        }
    }

    // ── Summary line ───────────────────────────────────────────────────
    let suppressed_count = diags.iter().filter(|d| d.suppressed).count();
    let summary = match suppressed_count {
        0 => format!(
            "{} '{}' generated {} {}.\n\n",
            style("warning").yellow().bold(),
            style(project_name).white().bold(),
            style(diags.len()).bold(),
            if diags.len() == 1 { "issue" } else { "issues" },
        ),
        hidden => {
            let active_count = diags.len() - hidden;
            format!(
                "{} '{}' generated {} {} ({} suppressed).\n\n",
                style("warning").yellow().bold(),
                style(project_name).white().bold(),
                style(active_count).bold(),
                if active_count == 1 { "issue" } else { "issues" },
                hidden,
            )
        }
    };
    out.push_str(&summary);

    // ── Suppression footer ─────────────────────────────────────────────
    let Some(stats) = suppression_stats else {
        return out;
    };
    let total = stats.total_suppressed();
    if total > 0 {
        // One dimmed `key value` line per active filter.
        let filter_line = |key: &str, value: String| {
            format!(" {} {}\n", style(key).dim(), style(value).dim())
        };

        out.push_str(&format!(
            "{}\n",
            style(format!("Suppressed {total} LOW/Quality findings.")).dim()
        ));
        out.push_str(&format!("{}\n", style("Active filters:").dim()));
        if !stats.include_quality {
            out.push_str(&filter_line("include_quality =", "false".to_string()));
        }
        out.push_str(&filter_line("max_low =", stats.max_low.to_string()));
        out.push_str(&filter_line(
            "max_low_per_file =",
            stats.max_low_per_file.to_string(),
        ));
        out.push_str(&filter_line(
            "max_low_per_rule =",
            stats.max_low_per_rule.to_string(),
        ));
        out.push_str(&format!(
            "\n{}\n",
            style("Use --include-quality, --max-low, or --all to adjust.").dim()
        ));
    }
    out
}
/// Normalise a code snippet for display.
///
/// Steps, in order:
/// 1. Remove `\n` and `\r` outright (nothing is inserted in their place).
/// 2. Collapse every run of whitespace into a single space.
/// 3. Remove the spaces between `)` and a following `.` in method chains.
/// 4. Trim, and if the result is longer than 120 bytes, cut it at the last
///    character boundary at or below 120 bytes and append `…`.
///
/// Cutting on a character boundary keeps the slice from panicking when the
/// snippet contains multi-byte UTF-8 characters.
pub fn normalize_snippet(s: &str) -> String {
    /// Maximum length, in bytes, kept before the `…` marker.
    const MAX_LEN: usize = 120;

    // Strip newlines/carriage returns with no replacement, then collapse
    // runs of whitespace into a single space.
    let no_newlines: String = s.chars().filter(|c| !matches!(c, '\n' | '\r')).collect();
    let collapsed = no_newlines.split_whitespace().collect::<Vec<_>>().join(" ");

    // Clean up `) .foo(` → `).foo(` and similar spacing around dots in chains.
    let cleaned = collapse_chain_spacing(&collapsed);
    let trimmed = cleaned.trim();

    if trimmed.len() <= MAX_LEN {
        return trimmed.to_string();
    }

    // Back off to a char boundary; index 0 always is one, so this terminates.
    let mut cut = MAX_LEN;
    while !trimmed.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}…", &trimmed[..cut])
}
/// Truncate method chains: keep constructor + first balanced `(...)`, then `…`.
///
/// E.g. `Command::new("sh").arg("-c").arg(&cmd)` → `Command::new("sh")…`
///
/// The input is trimmed first. It is returned unchanged (after trimming) when
/// it contains no `(`, when the first `(` is never closed, or when nothing
/// follows the first balanced call. Parentheses are counted textually, so a
/// `(` or `)` inside a string literal takes part in the balance.
#[allow(dead_code)] // public API, used by consumers
pub fn shorten_callee(s: &str) -> String {
    let s = s.trim();
    let Some(open) = s.find('(') else {
        return s.to_string();
    };

    // Scan from the first `(` for its matching `)`. The scan starts on a `(`,
    // so `depth` is at least 1 whenever a `)` is seen and cannot underflow.
    let mut depth = 0u32;
    let mut close = None;
    for (offset, ch) in s[open..].char_indices() {
        match ch {
            '(' => depth += 1,
            ')' => {
                depth -= 1;
                if depth == 0 {
                    close = Some(open + offset);
                    break;
                }
            }
            _ => {}
        }
    }

    match close {
        // Something follows the first call: drop it and mark the cut.
        Some(idx) if idx + 1 < s.len() => format!("{}…", &s[..=idx]),
        // Unbalanced, or the call is already the whole string.
        _ => s.to_string(),
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Internal rendering
// ─────────────────────────────────────────────────────────────────────────────
/// Indentation for body/evidence lines (spaces).
const BODY_INDENT: usize = 6;
/// Render one finding as a multi-line block.
///
/// The block always starts with a header line (location, severity or
/// suppression tag, rule id, optional score/confidence suffix). A rolled-up
/// finding then gets an occurrence summary and nothing else; any other
/// finding gets its message followed by its labels, or by the path guard
/// when there are no labels.
fn render_diag(d: &Diag, width: usize) -> String {
    let indent_str = " ".repeat(BODY_INDENT);
    let mut out = String::new();

    // ── Header line ──────────────────────────────────────────────────────
    // Format: ` 98:5 ⚠ [MEDIUM] taint-unsanitised-flow (Score: 87, Confidence: Medium)`
    let loc = format!("{}:{}", d.line, d.col);
    let sev = match d.suppressed {
        true => format!("{} {}", style("").dim(), style("[SUPPRESSED]").dim(),),
        false => severity_tag(d.severity),
    };
    // Build the plain text first; it is dimmed in one place below.
    let meta_text = match (d.rank_score, d.confidence) {
        (Some(s), Some(c)) => Some(format!("(Score: {}, Confidence: {c})", s as u32)),
        (Some(s), None) => Some(format!("(Score: {})", s as u32)),
        (None, Some(c)) => Some(format!("(Confidence: {c})")),
        (None, None) => None,
    };
    let meta_suffix = match meta_text {
        Some(text) => format!(" {}", style(text).dim()),
        None => String::new(),
    };
    out.push_str(&format!(
        " {} {} {}{}\n",
        style(&loc).dim(),
        sev,
        style(&d.id).dim(),
        meta_suffix,
    ));

    // ── Rollup body ─────────────────────────────────────────────────────
    if let Some(rollup) = &d.rollup {
        out.push_str(&format!(
            "{indent_str}{} ({} occurrences)\n",
            style(&d.id).dim(),
            rollup.count
        ));
        if !rollup.occurrences.is_empty() {
            let examples = rollup
                .occurrences
                .iter()
                .map(|at| format!("{}:{}", at.line, at.col))
                .collect::<Vec<String>>()
                .join(", ");
            out.push_str(&format!(
                "{indent_str}{} {}\n",
                style("Examples:").dim(),
                style(examples).dim()
            ));
        }
        out.push_str(&format!(
            "{indent_str}{}\n",
            style(format!("Run: nyx scan --show-instances {}", d.id)).dim()
        ));
        // A rollup replaces message and labels entirely.
        return out;
    }

    // ── Message body ─────────────────────────────────────────────────────
    if let Some(msg) = &d.message {
        let wrapped = wrap_text(&capitalize_first(msg), width, BODY_INDENT);
        out.push_str(&format!("{indent_str}{wrapped}\n"));
    }

    // ── Evidence labels (Source, Sink, Path guard) ───────────────────────
    if d.labels.is_empty() {
        // Without labels, fall back to the bare guard kind if there is one.
        if let Some(guard) = &d.guard_kind {
            out.push_str(&format!(
                "{indent_str}{} {}\n",
                style("Path guard:").dim(),
                style(guard).cyan(),
            ));
        }
        return out;
    }

    out.push('\n');
    // Pad every key to the longest label, +1 for ':'.
    let key_width = d.labels.iter().map(|(k, _)| k.len()).max().unwrap_or(0) + 1;
    // Continuation lines of a value line up after "key: ".
    let value_indent = BODY_INDENT + key_width + 1;
    for (label, value) in &d.labels {
        let key_str = format!("{label}:");
        let wrapped_val = wrap_text(value, width, value_indent);
        // Only the path guard value is coloured.
        let shown = if label == "Path guard" {
            style(&wrapped_val).cyan().to_string()
        } else {
            wrapped_val
        };
        out.push_str(&format!(
            "{indent_str}{:<kw$} {}\n",
            style(&key_str).dim(),
            shown,
            kw = key_width,
        ));
    }
    out
}
/// Colored severity tag with icon. The tag is the visual anchor of each finding.
///
/// - HIGH: `✖`, bold red
/// - MEDIUM: `⚠`, bold 208 (orange) — distinct from yellow
/// - LOW: `●`, 67 (muted blue-gray), not bold
///
/// The result has the shape `<icon> [<LEVEL>]`, with icon and level styled
/// separately so the brackets stay unstyled.
fn severity_tag(sev: Severity) -> String {
    match sev {
        Severity::High => format!(
            "{} [{}]",
            style("✖").red().bold(),
            style("HIGH").red().bold(),
        ),
        Severity::Medium => format!(
            "{} [{}]",
            style("⚠").color256(208).bold(),
            style("MEDIUM").color256(208).bold(),
        ),
        Severity::Low => format!(
            "{} [{}]",
            style("●").color256(67),
            style("LOW").color256(67),
        ),
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Text utilities
// ─────────────────────────────────────────────────────────────────────────────
/// Remove the spaces that separate a `)` from a following `.` in a method
/// chain, e.g. `") .foo("` → `").foo("`.
///
/// Only the space character counts here, and only runs that sit directly
/// between `)` and `.` are dropped; all other text is copied through
/// unchanged.
fn collapse_chain_spacing(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(pos) = rest.find(')') {
        // `)` is a single byte, so `pos + 1` is a valid split point.
        let (through_paren, tail) = rest.split_at(pos + 1);
        out.push_str(through_paren);

        // Drop the spaces only when they lead straight into a `.`;
        // otherwise keep the tail untouched so they are copied on the next pass.
        let after_spaces = tail.trim_start_matches(' ');
        rest = if after_spaces.starts_with('.') {
            after_spaces
        } else {
            tail
        };
    }

    out.push_str(rest);
    out
}
/// Word-wrap text to fit within `max_width`, with continuation lines indented
/// to `indent` spaces. The first line is NOT indented (caller handles that).
///
/// Every line, including the first, may hold `max_width - indent` characters,
/// because the caller prefixes the first line with the same indent. Widths
/// are counted in characters rather than bytes, so non-ASCII text is not
/// wrapped early (wide or combining characters are still counted as one
/// column each).
///
/// Text that already fits, or a zero available width, returns `text`
/// unchanged. When wrapping happens, words are re-joined with single spaces;
/// a word longer than the available width is placed on its own line and not
/// split.
fn wrap_text(text: &str, max_width: usize, indent: usize) -> String {
    let available = max_width.saturating_sub(indent);
    if available == 0 || text.chars().count() <= available {
        return text.to_string();
    }

    let indent_str = " ".repeat(indent);
    let mut result = String::with_capacity(text.len());
    // Characters already placed on the current output line.
    let mut line_len = 0usize;

    for word in text.split_whitespace() {
        let wlen = word.chars().count();
        if line_len == 0 {
            // Very first word: nothing to separate it from.
            result.push_str(word);
            line_len = wlen;
        } else if line_len + 1 + wlen > available {
            // Word (plus its separating space) would overflow: start a new line.
            result.push('\n');
            result.push_str(&indent_str);
            result.push_str(word);
            line_len = wlen;
        } else {
            result.push(' ');
            result.push_str(word);
            line_len += 1 + wlen;
        }
    }
    result
}
/// Width of the attached terminal in columns, or `DEFAULT_WIDTH` when the
/// size cannot be determined.
fn terminal_width() -> usize {
    match terminal_size::terminal_size() {
        Some((cols, _rows)) => cols.0 as usize,
        None => DEFAULT_WIDTH,
    }
}
/// Upper-case the first character of `s` and leave the rest untouched.
///
/// An empty input yields an empty string. A first character whose upper-case
/// form is several characters long is expanded in full.
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    let Some(first) = rest.next() else {
        return String::new();
    };
    first.to_uppercase().chain(rest).collect()
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    // ── Helpers ──────────────────────────────────────────────────────────

    /// Strip ANSI escape codes for testing visible content.
    fn strip_ansi(s: &str) -> String {
        let mut visible = String::new();
        let mut skipping = false;
        for ch in s.chars() {
            match ch {
                '\x1b' => skipping = true,
                'm' if skipping => skipping = false,
                _ if skipping => {}
                _ => visible.push(ch),
            }
        }
        visible
    }

    /// Security finding at 1:1 with every optional field unset. Tests
    /// override individual fields with struct-update syntax.
    fn finding(path: &str, severity: Severity, id: &str) -> Diag {
        Diag {
            path: path.into(),
            line: 1,
            col: 1,
            severity,
            id: id.into(),
            category: crate::patterns::FindingCategory::Security,
            path_validated: false,
            guard_kind: None,
            message: None,
            labels: vec![],
            confidence: None,
            evidence: None,
            rank_score: None,
            rank_reason: None,
            suppressed: false,
            suppression: None,
            rollup: None,
        }
    }

    // ── normalize_snippet ────────────────────────────────────────────────

    #[test]
    fn normalize_snippet_strips_newlines_no_space() {
        // Newlines are removed with no whitespace inserted in their place.
        assert_eq!(normalize_snippet("foo\nbar\rbaz"), "foobarbaz");
    }

    #[test]
    fn normalize_snippet_collapses_whitespace() {
        assert_eq!(
            normalize_snippet("Command::new(\"tar\") .arg(\"-czf\")"),
            "Command::new(\"tar\").arg(\"-czf\")"
        );
    }

    #[test]
    fn normalize_snippet_trims() {
        assert_eq!(normalize_snippet(" hello "), "hello");
    }

    #[test]
    fn normalize_snippet_truncates_at_120() {
        let long = "a".repeat(200);
        let result = normalize_snippet(&long);
        // 120 chars + '…' (3 bytes UTF-8)
        assert!(result.len() > 120);
        assert!(result.ends_with('…'));
    }

    #[test]
    fn normalize_snippet_short_unchanged() {
        assert_eq!(normalize_snippet("short"), "short");
    }

    // ── collapse_chain_spacing ───────────────────────────────────────────

    #[test]
    fn collapse_chain_removes_space_before_dot() {
        assert_eq!(
            collapse_chain_spacing("foo() .bar() .baz()"),
            "foo().bar().baz()"
        );
    }

    #[test]
    fn collapse_chain_preserves_non_chain_spacing() {
        assert_eq!(collapse_chain_spacing("foo() + bar()"), "foo() + bar()");
    }

    #[test]
    fn collapse_chain_multiple_spaces() {
        assert_eq!(
            collapse_chain_spacing("cmd() .arg(\"-c\")"),
            "cmd().arg(\"-c\")"
        );
    }

    // ── shorten_callee ───────────────────────────────────────────────────

    #[test]
    fn shorten_callee_truncates_chain() {
        assert_eq!(
            shorten_callee("Command::new(\"sh\").arg(\"-c\").arg(&cmd)"),
            "Command::new(\"sh\")…"
        );
    }

    #[test]
    fn shorten_callee_no_chain_unchanged() {
        assert_eq!(shorten_callee("env::var(\"HOME\")"), "env::var(\"HOME\")");
    }

    #[test]
    fn shorten_callee_nested_parens() {
        assert_eq!(shorten_callee("foo(bar(1, 2)).baz()"), "foo(bar(1, 2))…");
    }

    #[test]
    fn shorten_callee_no_parens() {
        assert_eq!(shorten_callee("simple_name"), "simple_name");
    }

    #[test]
    fn shorten_callee_empty() {
        assert_eq!(shorten_callee(""), "");
    }

    // ── wrap_text ────────────────────────────────────────────────────────

    #[test]
    fn wrap_short_text_unchanged() {
        assert_eq!(wrap_text("short text", 80, 4), "short text");
    }

    #[test]
    fn wrap_breaks_at_boundary() {
        let text = "word1 word2 word3 word4 word5";
        let result = wrap_text(text, 20, 4);
        assert!(result.contains('\n'));
        for line in result.lines().skip(1) {
            assert!(line.starts_with(" "));
        }
    }

    // ── severity_tag ─────────────────────────────────────────────────────

    #[test]
    fn severity_tags_contain_level_name() {
        let h = strip_ansi(&severity_tag(Severity::High));
        let m = strip_ansi(&severity_tag(Severity::Medium));
        let l = strip_ansi(&severity_tag(Severity::Low));
        assert!(h.contains("HIGH"), "got: {h}");
        assert!(m.contains("MEDIUM"), "got: {m}");
        assert!(l.contains("LOW"), "got: {l}");
    }

    #[test]
    fn severity_tags_have_icons() {
        let h = strip_ansi(&severity_tag(Severity::High));
        let m = strip_ansi(&severity_tag(Severity::Medium));
        let l = strip_ansi(&severity_tag(Severity::Low));
        assert!(h.contains('✖'), "HIGH should have ✖");
        assert!(m.contains('⚠'), "MEDIUM should have ⚠");
        assert!(l.contains('●'), "LOW should have ●");
    }

    // ── render_console ───────────────────────────────────────────────────

    #[test]
    fn render_console_groups_by_file() {
        let diags = vec![
            Diag {
                line: 10,
                col: 5,
                message: Some("test message".into()),
                ..finding("src/a.rs", Severity::High, "test-rule")
            },
            Diag {
                line: 20,
                ..finding("src/b.rs", Severity::Low, "another-rule")
            },
        ];
        let output = render_console(&diags, "test-project", None);
        let stripped = strip_ansi(&output);
        assert!(stripped.contains("src/a.rs"));
        assert!(stripped.contains("src/b.rs"));
        assert!(stripped.contains("2 issues"));
        assert!(stripped.contains("test-project"));
    }

    #[test]
    fn render_console_evidence_displayed() {
        let diags = vec![Diag {
            line: 42,
            col: 5,
            message: Some("unsanitised input".into()),
            labels: vec![
                ("Source".into(), "env::var(\"HOME\") at 12:3".into()),
                ("Sink".into(), "Command::new(\"sh\")".into()),
            ],
            ..finding(
                "src/main.rs",
                Severity::High,
                "taint-unsanitised-flow (source 12:3)",
            )
        }];
        let output = render_console(&diags, "proj", None);
        let stripped = strip_ansi(&output);
        assert!(stripped.contains("Source:"), "should contain Source label");
        assert!(stripped.contains("Sink:"), "should contain Sink label");
        // No backticks in output
        assert!(
            !stripped.contains('`'),
            "should not contain backticks in evidence"
        );
    }

    #[test]
    fn render_console_blank_line_between_findings() {
        let diags = vec![
            Diag {
                message: Some("first".into()),
                ..finding("src/a.rs", Severity::High, "rule-a")
            },
            Diag {
                line: 10,
                message: Some("second".into()),
                ..finding("src/a.rs", Severity::Medium, "rule-b")
            },
        ];
        let output = render_console(&diags, "proj", None);
        let stripped = strip_ansi(&output);
        // There should be a blank line between the two findings
        assert!(
            stripped.contains("First\n\n"),
            "blank line between findings: {stripped}"
        );
    }

    // ── JSON serialisation of Diag ───────────────────────────────────────

    #[test]
    fn json_omits_empty_labels() {
        let d = finding("x.rs", Severity::Low, "test");
        let json = serde_json::to_string(&d).unwrap();
        assert!(
            !json.contains("labels"),
            "empty labels should be omitted from JSON"
        );
    }

    #[test]
    fn json_omits_rank_fields_when_none() {
        let d = finding("x.rs", Severity::Low, "test");
        let json = serde_json::to_string(&d).unwrap();
        assert!(
            !json.contains("rank_score"),
            "rank_score should be omitted when None"
        );
        assert!(
            !json.contains("rank_reason"),
            "rank_reason should be omitted when None"
        );
    }

    #[test]
    fn json_includes_rank_score_when_set() {
        let d = Diag {
            rank_score: Some(120.0),
            ..finding("x.rs", Severity::High, "taint-unsanitised-flow")
        };
        let json = serde_json::to_string(&d).unwrap();
        assert!(
            json.contains("rank_score"),
            "rank_score should be present when set"
        );
        assert!(json.contains("120"), "rank_score value should appear");
    }

    // ── capitalize_first ─────────────────────────────────────────────────

    #[test]
    fn capitalize_first_works() {
        assert_eq!(capitalize_first("hello"), "Hello");
        assert_eq!(capitalize_first(""), "");
        assert_eq!(capitalize_first("A"), "A");
        assert_eq!(capitalize_first("unsanitised"), "Unsanitised");
    }

    // ── taint flow rendering (integration-style) ─────────────────────────

    #[test]
    fn taint_flow_no_broken_backticks_or_weird_spacing() {
        let raw_sink = "Command::new(\"tar\") .arg(\"-czf\") .arg(\"/backups/nightly.tar.gz\") .arg(\"/var/data\") .output()";
        let normalised = normalize_snippet(raw_sink);
        // Chain spacing should be collapsed
        assert!(
            !normalised.contains(") ."),
            "chain spacing should be collapsed: {normalised}"
        );
        assert!(!normalised.contains(" "), "no double-spaces: {normalised}");
        // Should not contain backticks
        assert!(!normalised.contains('`'), "no backticks: {normalised}");
    }

    #[test]
    fn multiline_sink_joined_and_normalised() {
        let raw = "Command::new(\"tar\")\n .arg(\"-czf\")\n .arg(\"/backups/nightly.tar.gz\")\n .arg(\"/var/data\")\n .output()";
        let normalised = normalize_snippet(raw);
        assert_eq!(
            normalised,
            "Command::new(\"tar\").arg(\"-czf\").arg(\"/backups/nightly.tar.gz\").arg(\"/var/data\").output()"
        );
    }

    // ── confidence display ──────────────────────────────────────────────

    #[test]
    fn confidence_after_score_on_header_line() {
        let d = Diag {
            line: 510,
            col: 5,
            message: Some("dangerous sink".into()),
            confidence: Some(crate::evidence::Confidence::Medium),
            rank_score: Some(36.0),
            ..finding("src/a.rs", Severity::Medium, "cfg-unguarded-sink")
        };
        let output = render_diag(&d, 120);
        let stripped = strip_ansi(&output);
        // Header line should contain score and confidence together
        let header = stripped.lines().next().unwrap();
        assert!(
            header.contains("(Score: 36, Confidence: Medium)"),
            "header should contain '(Score: 36, Confidence: Medium)': {header}"
        );
        // No standalone Confidence line
        let non_header_lines: Vec<&str> = stripped.lines().skip(1).collect();
        assert!(
            !non_header_lines
                .iter()
                .any(|l| l.trim().starts_with("Confidence:")),
            "should not have standalone Confidence line"
        );
    }

    #[test]
    fn confidence_title_case() {
        for (conf, expected) in [
            (crate::evidence::Confidence::Low, "Confidence: Low"),
            (crate::evidence::Confidence::Medium, "Confidence: Medium"),
            (crate::evidence::Confidence::High, "Confidence: High"),
        ] {
            let d = Diag {
                confidence: Some(conf),
                ..finding("x.rs", Severity::Low, "test")
            };
            let output = render_diag(&d, 100);
            let stripped = strip_ansi(&output);
            assert!(
                stripped.contains(expected),
                "expected '{expected}' in: {stripped}"
            );
        }
    }

    #[test]
    fn confidence_none_only_score() {
        let d = Diag {
            line: 10,
            col: 5,
            message: Some("test message".into()),
            rank_score: Some(42.0),
            ..finding("src/a.rs", Severity::High, "test-rule")
        };
        let output = render_diag(&d, 100);
        let stripped = strip_ansi(&output);
        let header = stripped.lines().next().unwrap();
        assert!(
            header.contains("(Score: 42)"),
            "should show score without confidence: {header}"
        );
        assert!(
            !header.contains("Confidence"),
            "should not mention confidence when None: {header}"
        );
    }

    #[test]
    fn confidence_only_no_score() {
        let d = Diag {
            line: 10,
            col: 5,
            confidence: Some(crate::evidence::Confidence::High),
            ..finding("src/a.rs", Severity::High, "test-rule")
        };
        let output = render_diag(&d, 100);
        let stripped = strip_ansi(&output);
        let header = stripped.lines().next().unwrap();
        assert!(
            header.contains("(Confidence: High)"),
            "should show confidence without score: {header}"
        );
    }

    #[test]
    fn json_omits_confidence_when_none() {
        let d = finding("x.rs", Severity::Low, "test");
        let json = serde_json::to_string(&d).unwrap();
        assert!(
            !json.contains("confidence"),
            "confidence should be omitted when None: {json}"
        );
    }
}

View file

@ -31,6 +31,10 @@ pub static RULES: &[LabelRule] = &[
matchers: &["printf", "fprintf"],
label: DataLabel::Sink(Cap::FMT_STRING),
},
LabelRule {
matchers: &["fopen", "open"],
label: DataLabel::Sink(Cap::FILE_IO),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
@ -39,6 +43,9 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"while_statement" => Kind::While,
"for_statement" => Kind::For,
"do_statement" => Kind::While,
"switch_statement" => Kind::Block,
"case_statement" => Kind::Block,
"labeled_statement" => Kind::Block,
"return_statement" => Kind::Return,
"break_statement" => Kind::Break,
@ -47,6 +54,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
// structure
"translation_unit" => Kind::SourceFile,
"compound_statement" => Kind::Block,
"else_clause" => Kind::Block,
"function_definition" => Kind::Function,
// data-flow

View file

@ -29,6 +29,10 @@ pub static RULES: &[LabelRule] = &[
matchers: &["printf", "fprintf"],
label: DataLabel::Sink(Cap::FMT_STRING),
},
LabelRule {
matchers: &["fopen", "open"],
label: DataLabel::Sink(Cap::FILE_IO),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
@ -38,15 +42,23 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"for_statement" => Kind::For,
"for_range_loop" => Kind::For,
"do_statement" => Kind::While,
"switch_statement" => Kind::Block,
"case_statement" => Kind::Block,
"labeled_statement" => Kind::Block,
"return_statement" => Kind::Return,
"throw_statement" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,
// structure
"translation_unit" => Kind::SourceFile,
"compound_statement" => Kind::Block,
"else_clause" => Kind::Block,
"function_definition" => Kind::Function,
"try_statement" => Kind::Block,
"catch_clause" => Kind::Block,
"lambda_expression" => Kind::Block,
// data-flow
"call_expression" => Kind::CallFn,
@ -63,7 +75,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"preproc_include" => Kind::Trivia,
"preproc_def" => Kind::Trivia,
"using_declaration" => Kind::Trivia,
"namespace_definition" => Kind::Trivia,
"namespace_definition" => Kind::Block,
};
pub static PARAM_CONFIG: ParamConfig = ParamConfig {

View file

@ -8,7 +8,17 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &["http.Request", "r.FormValue", "r.URL"],
matchers: &[
"http.Request",
"r.FormValue",
"r.URL",
"r.Body",
"r.Header",
"r.URL.Query",
"r.URL.Query.Get",
"Request.FormValue",
"Request.URL",
],
label: DataLabel::Source(Cap::all()),
},
// ───────── Sanitizers ──────────
@ -17,18 +27,40 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &["url.QueryEscape"],
matchers: &["url.QueryEscape", "url.PathEscape"],
label: DataLabel::Sanitizer(Cap::URL_ENCODE),
},
LabelRule {
matchers: &["filepath.Clean", "filepath.Base"],
label: DataLabel::Sanitizer(Cap::FILE_IO),
},
// ─────────── Sinks ─────────────
LabelRule {
matchers: &["exec.Command"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["db.Query", "db.Exec"],
matchers: &["db.Query", "db.Exec", "db.QueryRow", "db.Prepare"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["fmt.Fprintf", "fmt.Sprintf", "fmt.Printf"],
label: DataLabel::Sink(Cap::FMT_STRING),
},
LabelRule {
matchers: &[
"os.Open",
"os.OpenFile",
"os.Create",
"ioutil.ReadFile",
"os.ReadFile",
],
label: DataLabel::Sink(Cap::FILE_IO),
},
LabelRule {
matchers: &["template.HTML"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
@ -46,6 +78,16 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"statement_list" => Kind::Block,
"function_declaration" => Kind::Function,
"method_declaration" => Kind::Function,
"func_literal" => Kind::Function,
"expression_switch_statement" => Kind::Block,
"type_switch_statement" => Kind::Block,
"expression_case" => Kind::Block,
"type_case" => Kind::Block,
"default_case" => Kind::Block,
"select_statement" => Kind::Block,
"communication_case" => Kind::Block,
"go_statement" => Kind::Block,
"defer_statement" => Kind::Block,
// data-flow
"call_expression" => Kind::CallFn,

View file

@ -8,7 +8,19 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &["getParameter", "getInputStream", "getHeader", "getCookies"],
matchers: &[
"getParameter",
"getInputStream",
"getHeader",
"getCookies",
"getReader",
"getQueryString",
"getPathInfo",
],
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &["readObject", "readLine"],
label: DataLabel::Source(Cap::all()),
},
// ───────── Sanitizers ──────────
@ -18,13 +30,21 @@ pub static RULES: &[LabelRule] = &[
},
// ─────────── Sinks ─────────────
LabelRule {
matchers: &["Runtime.exec"],
matchers: &["Runtime.exec", "ProcessBuilder"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["executeQuery", "executeUpdate", "prepareStatement"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["Class.forName"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["println", "print", "write"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
@ -33,8 +53,10 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"while_statement" => Kind::While,
"for_statement" => Kind::For,
"enhanced_for_statement" => Kind::For,
"do_statement" => Kind::While,
"return_statement" => Kind::Return,
"throw_statement" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,
@ -46,6 +68,15 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"interface_body" => Kind::Block,
"method_declaration" => Kind::Function,
"constructor_declaration" => Kind::Function,
"switch_expression" => Kind::Block,
"switch_block" => Kind::Block,
"switch_block_statement_group" => Kind::Block,
"try_statement" => Kind::Block,
"catch_clause" => Kind::Block,
"finally_clause" => Kind::Block,
"lambda_expression" => Kind::Block,
"constructor_body" => Kind::Block,
"static_initializer" => Kind::Block,
// data-flow
"method_invocation" => Kind::CallMethod,

View file

@ -62,6 +62,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"while_statement" => Kind::While,
"for_statement" => Kind::For,
"for_in_statement" => Kind::For,
"do_statement" => Kind::While,
"return_statement" => Kind::Return,
"throw_statement" => Kind::Return,
@ -71,9 +72,24 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
// structure
"program" => Kind::SourceFile,
"statement_block" => Kind::Block,
"else_clause" => Kind::Block,
"function_declaration" => Kind::Function,
"function_expression" => Kind::Function,
"arrow_function" => Kind::Function,
"method_definition" => Kind::Function,
"generator_function_declaration" => Kind::Function,
"generator_function" => Kind::Function,
"switch_statement" => Kind::Block,
"switch_body" => Kind::Block,
"switch_case" => Kind::Block,
"switch_default" => Kind::Block,
"try_statement" => Kind::Block,
"catch_clause" => Kind::Block,
"finally_clause" => Kind::Block,
"class_declaration" => Kind::Block,
"class" => Kind::Block,
"class_body" => Kind::Block,
"export_statement" => Kind::Block,
// data-flow
"call_expression" => Kind::CallFn,

View file

@ -41,7 +41,6 @@ pub enum Kind {
InfiniteLoop,
While,
For,
LoopBody,
CallFn,
CallMethod,
CallMacro,
@ -196,7 +195,7 @@ pub fn lookup(lang: &str, raw: &str) -> Kind {
}
/// The kind of taint source, used to refine finding severity.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SourceKind {
/// Direct user input (request params, argv, stdin, form data)
UserInput,
@ -375,6 +374,11 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
let head = text.split(['(', '<']).next().unwrap_or("");
let trimmed = head.trim().as_bytes();
// For chained calls like `r.URL.Query().Get`, also strip internal
// `().` segments to produce a normalized form like `r.URL.Query.Get`.
let full_normalized = normalize_chained_call(text);
let full_norm_bytes = full_normalized.as_bytes();
// ── Check runtime (config) rules first — they take priority ──────
if let Some(extras) = extra {
// Pass 1: exact / suffix
@ -384,12 +388,8 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
if m.last() == Some(&b'_') {
continue;
}
if ends_with_ignore_case(trimmed, m) {
let start = trimmed.len() - m.len();
let ok = start == 0 || matches!(trimmed[start - 1], b'.' | b':');
if ok {
return Some(rule.label);
}
if match_suffix(trimmed, m) || match_suffix(full_norm_bytes, m) {
return Some(rule.label);
}
}
}
@ -397,7 +397,10 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
for rule in extras {
for raw in &rule.matchers {
let m = raw.as_bytes();
if m.last() == Some(&b'_') && starts_with_ignore_case(trimmed, m) {
if m.last() == Some(&b'_')
&& (starts_with_ignore_case(trimmed, m)
|| starts_with_ignore_case(full_norm_bytes, m))
{
return Some(rule.label);
}
}
@ -417,12 +420,8 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
if m.last() == Some(&b'_') {
continue;
}
if ends_with_ignore_case(trimmed, m) {
let start = trimmed.len() - m.len();
let ok = start == 0 || matches!(trimmed[start - 1], b'.' | b':');
if ok {
return Some(rule.label);
}
if match_suffix(trimmed, m) || match_suffix(full_norm_bytes, m) {
return Some(rule.label);
}
}
}
@ -431,7 +430,10 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
for rule in *rules {
for raw in rule.matchers {
let m = raw.as_bytes();
if m.last() == Some(&b'_') && starts_with_ignore_case(trimmed, m) {
if m.last() == Some(&b'_')
&& (starts_with_ignore_case(trimmed, m)
|| starts_with_ignore_case(full_norm_bytes, m))
{
return Some(rule.label);
}
}
@ -440,6 +442,58 @@ pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> O
None
}
/// Report whether `text` ends with `matcher` (as decided by
/// `ends_with_ignore_case`) *and* that suffix starts on a segment boundary:
/// either at the very beginning of `text`, or immediately after a `.` or `:`.
#[inline]
fn match_suffix(text: &[u8], matcher: &[u8]) -> bool {
    if !ends_with_ignore_case(text, matcher) {
        return false;
    }
    // NOTE(review): assumes a positive suffix match implies
    // `matcher.len() <= text.len()`, so this subtraction cannot underflow.
    match text.len() - matcher.len() {
        // The matcher covers the whole text.
        0 => true,
        // Otherwise the byte just before the suffix must be a separator.
        boundary => matches!(text[boundary - 1], b'.' | b':'),
    }
}
/// Normalize a chained method call: strip `(...)` groups that sit between
/// `.` segments or at the very end of the text.
///
/// e.g. `r.URL.Query().Get` → `r.URL.Query.Get`
/// e.g. `r.URL.Query().Get("host")` → `r.URL.Query.Get`
///
/// Details:
/// - A parenthesised group (nesting is tracked) is dropped when it is
///   followed by `.` or reaches the end of `text`; an unbalanced `(` therefore
///   discards everything after it.
/// - A group followed by anything else is kept verbatim.
/// - Scanning stops at the first `<` outside a dropped group (generic args).
///
/// Text is copied as `&str` slices rather than byte-by-byte so that
/// non-ASCII identifiers survive unchanged. Every cut point is the position
/// of an ASCII byte (`(`, `<`, the byte after `)`) or the end of the string,
/// all of which are valid UTF-8 character boundaries.
fn normalize_chained_call(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut result = String::with_capacity(text.len());
    // Start of the run of text that has been scanned but not yet copied.
    let mut run_start = 0;
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'(' => {
                // Find the index just past the matching `)` (or the end of
                // the text when the parentheses are unbalanced).
                let mut depth = 1u32;
                let mut j = i + 1;
                while j < bytes.len() && depth > 0 {
                    match bytes[j] {
                        b'(' => depth += 1,
                        b')' => depth -= 1,
                        _ => {}
                    }
                    j += 1;
                }
                if j >= bytes.len() || bytes[j] == b'.' {
                    // `Query().Get` → `Query.Get`: flush what precedes the
                    // group and resume right after it.
                    result.push_str(&text[run_start..i]);
                    run_start = j;
                    i = j;
                } else {
                    // Unusual case: keep the `(` and rescan its contents as
                    // ordinary text.
                    i += 1;
                }
            }
            b'<' => break, // Stop at generic args
            _ => i += 1,
        }
    }
    result.push_str(&text[run_start..i]);
    result
}
#[cfg(test)]
mod tests {
use super::*;

View file

@ -3,8 +3,24 @@ use phf::{Map, phf_map};
pub static RULES: &[LabelRule] = &[
// ─────────── Sources ───────────
// Note: PHP `$` prefix is stripped by collect_idents, so match without `$`.
LabelRule {
matchers: &["$_GET", "$_POST", "$_REQUEST", "$_COOKIE"],
matchers: &[
"$_GET",
"_GET",
"$_POST",
"_POST",
"$_REQUEST",
"_REQUEST",
"$_COOKIE",
"_COOKIE",
"$_FILES",
"_FILES",
"$_SERVER",
"_SERVER",
"$_ENV",
"_ENV",
],
label: DataLabel::Source(Cap::all()),
},
LabelRule {
@ -20,17 +36,44 @@ pub static RULES: &[LabelRule] = &[
matchers: &["escapeshellarg", "escapeshellcmd"],
label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["basename"],
label: DataLabel::Sanitizer(Cap::FILE_IO),
},
// ─────────── Sinks ─────────────
LabelRule {
matchers: &["system", "exec", "passthru", "shell_exec"],
matchers: &[
"system",
"exec",
"passthru",
"shell_exec",
"proc_open",
"popen",
],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["eval", "assert"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["include", "include_once", "require", "require_once"],
label: DataLabel::Sink(Cap::FILE_IO),
},
LabelRule {
matchers: &["unserialize"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["move_uploaded_file", "copy", "file_put_contents", "fwrite"],
label: DataLabel::Sink(Cap::FILE_IO),
},
LabelRule {
matchers: &["echo", "print"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &["mysqli_query", "pg_query"],
matchers: &["mysqli_query", "pg_query", "query"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
];
@ -41,16 +84,29 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"while_statement" => Kind::While,
"for_statement" => Kind::For,
"foreach_statement" => Kind::For,
"do_statement" => Kind::While,
"return_statement" => Kind::Return,
"throw_expression" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,
// structure
"program" => Kind::SourceFile,
"compound_statement" => Kind::Block,
"else_clause" => Kind::Block,
"else_if_clause" => Kind::Block,
"function_definition" => Kind::Function,
"method_declaration" => Kind::Function,
"switch_statement" => Kind::Block,
"switch_block" => Kind::Block,
"case_statement" => Kind::Block,
"default_statement" => Kind::Block,
"try_statement" => Kind::Block,
"catch_clause" => Kind::Block,
"finally_clause" => Kind::Block,
"colon_block" => Kind::Block,
"class_declaration" => Kind::Block,
// data-flow
"function_call_expression" => Kind::CallFn,

View file

@ -24,7 +24,7 @@ pub static RULES: &[LabelRule] = &[
},
LabelRule {
matchers: &["open"],
label: DataLabel::Source(Cap::all()),
label: DataLabel::Sink(Cap::FILE_IO),
},
LabelRule {
matchers: &[
@ -65,6 +65,14 @@ pub static RULES: &[LabelRule] = &[
matchers: &["cursor.execute", "cursor.executemany"],
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["send_file", "send_from_directory"],
label: DataLabel::Sink(Cap::FILE_IO),
},
LabelRule {
matchers: &["os.path.realpath"],
label: DataLabel::Sanitizer(Cap::FILE_IO),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
@ -74,13 +82,24 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"for_statement" => Kind::For,
"return_statement" => Kind::Return,
"raise_statement" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,
// structure
"module" => Kind::SourceFile,
"block" => Kind::Block,
"else_clause" => Kind::Block,
"elif_clause" => Kind::Block,
"with_statement" => Kind::Block,
"function_definition" => Kind::Function,
"try_statement" => Kind::Block,
"except_clause" => Kind::Block,
"finally_clause" => Kind::Block,
"class_definition" => Kind::Block,
"decorated_definition" => Kind::Block,
"match_statement" => Kind::Block,
"case_clause" => Kind::Block,
// data-flow
"call" => Kind::CallFn,

View file

@ -40,6 +40,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"if" => Kind::If,
"unless" => Kind::If,
"while" => Kind::While,
"until" => Kind::While,
"for" => Kind::For,
"return" => Kind::Return,
@ -49,15 +50,26 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
// structure
"program" => Kind::SourceFile,
"body_statement" => Kind::Block,
"do_block" => Kind::Block,
"do_block" => Kind::Function,
"then" => Kind::Block,
"else" => Kind::Block,
"elsif" => Kind::If,
"begin" => Kind::Block,
"rescue" => Kind::Block,
"ensure" => Kind::Block,
"case" => Kind::Block,
"when" => Kind::Block,
"class" => Kind::Block,
"module" => Kind::Block,
"do" => Kind::Block,
"block" => Kind::Function,
// data-flow
"call" => Kind::CallFn,
"method_call" => Kind::CallFn,
"assignment" => Kind::Assignment,
"method" => Kind::Function,
"singleton_method" => Kind::Function,
// trivia
"comment" => Kind::Trivia,

View file

@ -8,7 +8,7 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &["fs::read_to_string", "source_file"],
matchers: &["source_file"],
label: DataLabel::Source(Cap::all()),
},
// ───────── Sanitizers ──────────
@ -36,17 +36,29 @@ pub static RULES: &[LabelRule] = &[
matchers: &["sink_html"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &[
"fs::read_to_string",
"fs::write",
"fs::read",
"File::open",
"File::create",
],
label: DataLabel::Sink(Cap::FILE_IO),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {
// control-flow
"if_expression" => Kind::If,
"loop_expression" => Kind::InfiniteLoop,
"loop_statement" => Kind::LoopBody,
"while_statement" => Kind::While,
"while_expression" => Kind::While,
"for_statement" => Kind::For,
"for_expression" => Kind::For,
"return_statement" => Kind::Return,
"return_expression" => Kind::Return,
"break_expression" => Kind::Break,
"break_statement" => Kind::Break,
"continue_expression" => Kind::Continue,
@ -55,7 +67,17 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
// structure
"source_file" => Kind::SourceFile,
"block" => Kind::Block,
"else_clause" => Kind::Block,
"match_expression" => Kind::Block,
"match_block" => Kind::Block,
"match_arm" => Kind::Block,
"unsafe_block" => Kind::Block,
"function_item" => Kind::Function,
"closure_expression" => Kind::Block,
"async_block" => Kind::Block,
"impl_item" => Kind::Block,
"trait_item" => Kind::Block,
"declaration_list" => Kind::Block,
// data-flow
"call_expression" => Kind::CallFn,

View file

@ -50,18 +50,36 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"while_statement" => Kind::While,
"for_statement" => Kind::For,
"for_in_statement" => Kind::For,
"for_of_statement" => Kind::For,
"do_statement" => Kind::While,
"return_statement" => Kind::Return,
"throw_statement" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,
// structure
"program" => Kind::SourceFile,
"statement_block" => Kind::Block,
"else_clause" => Kind::Block,
"function_declaration" => Kind::Function,
"function_expression" => Kind::Function,
"arrow_function" => Kind::Function,
"method_definition" => Kind::Function,
"generator_function_declaration" => Kind::Function,
"generator_function" => Kind::Function,
"switch_statement" => Kind::Block,
"switch_body" => Kind::Block,
"switch_case" => Kind::Block,
"switch_default" => Kind::Block,
"try_statement" => Kind::Block,
"catch_clause" => Kind::Block,
"finally_clause" => Kind::Block,
"class_declaration" => Kind::Block,
"class" => Kind::Block,
"class_body" => Kind::Block,
"abstract_class_declaration" => Kind::Block,
"export_statement" => Kind::Block,
"enum_declaration" => Kind::Trivia,
// data-flow
"call_expression" => Kind::CallFn,

View file

@ -1,19 +1,62 @@
// Re-exports for benchmarks and integration tests.
// The binary crate (main.rs) is the primary entry point; this lib target
// exposes internals for criterion and other tooling.
//! # Nyx Scanner
//!
//! A multi-language static vulnerability scanner. Nyx parses source files with
//! [tree-sitter](https://tree-sitter.github.io/), builds intra-procedural
//! control-flow graphs ([petgraph](https://docs.rs/petgraph)), and runs
//! cross-file taint analysis with a capability-based sanitizer system.
//!
//! ## Architecture
//!
//! Nyx uses a **two-pass architecture**:
//!
//! 1. **Pass 1 — Summary extraction**: Parse each file, build a CFG per function,
//! and export a [`summary::FuncSummary`] capturing source/sanitizer/sink capabilities,
//! taint propagation behavior, and callee lists. Summaries are persisted to SQLite.
//!
//! 2. **Pass 2 — Analysis**: Load all summaries into a [`summary::GlobalSummaries`] map,
//! re-parse files, and run taint analysis with cross-file callee resolution. CFG
//! structural analysis checks for auth gaps, unguarded sinks, and resource leaks.
//!
//! ## Four Detector Families
//!
//! - **Taint** ([`taint`]) — Monotone forward dataflow tracking source-to-sink flows
//! - **CFG Structural** ([`cfg_analysis`]) — Dominator-based guard and auth-gap detection
//! - **State Model** ([`state`]) — Resource lifecycle and authentication state lattices
//! - **AST Patterns** ([`patterns`]) — Tree-sitter structural queries per language
//!
//! ## Supported Languages
//!
//! Rust, C, C++, Java, Go, PHP, Python, Ruby, TypeScript, JavaScript.
//!
//! ## Entry Points
//!
//! - [`scan_no_index`] — Run a two-pass scan without indexing (for tests)
//! - [`commands::scan::scan_filesystem`] — Filesystem scan with optional indexing
//! - [`commands::scan::scan_with_index_parallel`] — Index-backed parallel scan
//!
//! ## Documentation
//!
//! See the [`docs/`](https://github.com/elicpeter/nyx/tree/master/docs) directory
//! for user and contributor documentation.
pub mod ast;
pub mod callgraph;
pub mod cfg;
pub mod cfg_analysis;
pub(crate) mod cli;
pub mod commands;
pub mod database;
pub mod errors;
pub mod evidence;
pub mod fmt;
pub mod interop;
pub mod labels;
pub mod output;
pub mod patterns;
pub mod rank;
pub mod state;
pub mod summary;
pub mod suppress;
pub mod symbol;
pub mod taint;
pub mod utils;

View file

@ -1,15 +1,21 @@
mod ast;
mod callgraph;
mod cfg;
mod cfg_analysis;
mod cli;
mod commands;
mod database;
mod errors;
mod evidence;
mod fmt;
mod interop;
mod labels;
mod output;
mod patterns;
mod rank;
mod state;
mod summary;
mod suppress;
mod symbol;
mod taint;
mod utils;
@ -25,7 +31,7 @@ use std::fs;
use std::time::Instant;
use tracing_subscriber::fmt::time;
use tracing_subscriber::prelude::*;
use tracing_subscriber::{EnvFilter, Registry, fmt};
use tracing_subscriber::{EnvFilter, Registry, fmt as tracing_fmt};
// use tracing_appender::rolling::{RollingFileAppender, Rotation};
// use tracing_appender::non_blocking;
@ -33,7 +39,7 @@ fn init_tracing() {
// let file_appender = RollingFileAppender::new(Rotation::HOURLY, "logs", "nyx-scanner.log");
// let (file_writer, guard) = non_blocking(file_appender);
let fmt_layer = fmt::layer()
let fmt_layer = tracing_fmt::layer()
.pretty()
.with_thread_ids(true)
.with_timer(time::UtcTime::rfc_3339());
@ -56,8 +62,8 @@ fn main() -> NyxResult<()> {
tracing::debug!("CLI starting up");
let cli = Cli::parse();
let proj_dirs = ProjectDirs::from("dev", "ecpeter23", "nyx")
.ok_or("Unable to determine project directories")?;
let proj_dirs =
ProjectDirs::from("", "", "nyx").ok_or("Unable to determine project directories")?;
// todo: check if we want to actually build a config file, maybe some environments will not want to have anything written
let config_dir = proj_dirs.config_dir();
@ -83,7 +89,7 @@ fn main() -> NyxResult<()> {
commands::handle_command(cli.command, database_dir, config_dir, &mut config)?;
if !quiet {
println!(
eprintln!(
"{} in {:.3}s.",
style("Finished").green().bold(),
now.elapsed().as_secs_f32()

View file

@ -38,6 +38,11 @@ fn cfg_rule_description(id: &str) -> Option<&'static str> {
}
"cfg-resource-leak" => Some("Resource acquired but not released on all exit paths"),
"cfg-lock-not-released" => Some("Lock acquired but not released on all exit paths"),
"state-use-after-close" => Some("Variable used after its resource handle was closed"),
"state-double-close" => Some("Resource handle closed more than once"),
"state-resource-leak" => Some("Resource acquired but never closed"),
"state-resource-leak-possible" => Some("Resource may not be closed on all paths"),
"state-unauthed-access" => Some("Sensitive operation reached without authentication"),
_ => None,
}
}
@ -116,11 +121,17 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_else(|_| d.path.clone());
json!({
// Prefer the per-finding message (e.g. from state analysis) over the generic rule description.
let msg_text = d
.message
.as_deref()
.unwrap_or_else(|| rule_description(base));
let mut result = json!({
"ruleId": base,
"ruleIndex": rule_index,
"level": severity_to_level(d.severity),
"message": { "text": rule_description(base) },
"message": { "text": msg_text },
"locations": [{
"physicalLocation": {
"artifactLocation": { "uri": uri },
@ -130,7 +141,50 @@ pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
}
}
}]
})
});
// Build properties object
let mut props = serde_json::Map::new();
props.insert("category".into(), json!(d.category.to_string()));
if let Some(conf) = d.confidence {
props.insert("confidence".into(), json!(conf.to_string()));
}
// Add rollup data if present
if let Some(ref rollup) = d.rollup {
props.insert(
"rollup".into(),
json!({
"count": rollup.count,
}),
);
// Add rollup occurrences as relatedLocations
let related: Vec<Value> = rollup
.occurrences
.iter()
.enumerate()
.map(|(idx, loc)| {
json!({
"id": idx,
"physicalLocation": {
"artifactLocation": { "uri": &uri },
"region": {
"startLine": loc.line,
"startColumn": loc.col
}
}
})
})
.collect();
if !related.is_empty() {
result["relatedLocations"] = json!(related);
}
}
result["properties"] = Value::Object(props);
result
})
.collect();

View file

@ -1,40 +1,95 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// C AST patterns.
///
/// Taint rules cover `system`/`popen`/`exec*` (command injection),
/// `sprintf`/`strcpy`/`strcat` (buffer overflow sinks), and `printf`/`fprintf`
/// (format-string sinks). AST patterns here focus on **banned-by-default
/// functions** (`gets`, `scanf %s`) and **format-string** variants not covered
/// by taint, since these are dangerous regardless of data origin.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Banned functions (always dangerous) ────────────────────
Pattern {
id: "strcpy_call",
description: "strcpy() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"strcpy\")) @vuln",
id: "c.memory.gets",
description: "gets() — no bounds checking, always exploitable",
query: r#"(call_expression function: (identifier) @id (#eq? @id "gets")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "strcat_call",
description: "strcat() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"strcat\")) @vuln",
id: "c.memory.strcpy",
description: "strcpy() — no bounds checking on destination buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "strcpy")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "sprintf_call",
description: "sprintf() (no length limit)",
query: "(call_expression function: (identifier) @id (#eq? @id \"sprintf\")) @vuln",
id: "c.memory.strcat",
description: "strcat() — no bounds checking on destination buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "gets_call",
description: "gets() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"gets\")) @vuln",
id: "c.memory.sprintf",
description: "sprintf() — no length limit on output buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "scanf_with_percent_s",
description: "scanf(\"%s\") without length specifier",
query: "(call_expression function: (identifier) @id (#eq? @id \"scanf\") arguments: (argument_list (string_literal) @fmt (#match? @fmt \".*%s.*\"))) @vuln",
id: "c.memory.scanf_percent_s",
description: "scanf(\"%s\") — unbounded string read",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "scanf")
arguments: (argument_list
(string_literal) @fmt (#match? @fmt "%s")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "c.cmdi.system",
description: "system() — shell command execution",
query: r#"(call_expression function: (identifier) @id (#eq? @id "system")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
Pattern {
id: "system_call",
description: "system() shell execution",
query: "(call_expression function: (identifier) @id (#eq? @id \"system\")) @vuln",
id: "c.cmdi.popen",
description: "popen() — shell command execution with pipe",
query: r#"(call_expression function: (identifier) @id (#eq? @id "popen")) @vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier B: Format-string (variable first arg) ─────────────────────
Pattern {
id: "c.memory.printf_no_fmt",
description: "printf(var) — format-string vulnerability when first arg is not literal",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "printf")
arguments: (argument_list
. (identifier) @arg))
@vuln"#,
severity: Severity::High,
tier: PatternTier::B,
category: PatternCategory::MemorySafety,
confidence: Confidence::Medium,
},
];

View file

@ -1,40 +1,106 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// C++ AST patterns.
///
/// Inherits C banned-function concerns plus C++-specific patterns like
/// `reinterpret_cast` and `const_cast`. Taint rules overlap with C rules
/// for `system`/`sprintf`/`strcpy`/`strcat`.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Banned C functions (inherited) ─────────────────────────
Pattern {
id: "strcpy_call",
description: "strcpy() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"strcpy\")) @vuln",
id: "cpp.memory.gets",
description: "gets() — no bounds checking, always exploitable",
query: r#"(call_expression function: (identifier) @id (#eq? @id "gets")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "strcat_call",
description: "strcat() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"strcat\")) @vuln",
id: "cpp.memory.strcpy",
description: "strcpy() — no bounds checking on destination buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "strcpy")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "sprintf_call",
description: "sprintf() (no length limit)",
query: "(call_expression function: (identifier) @id (#eq? @id \"sprintf\")) @vuln",
id: "cpp.memory.strcat",
description: "strcat() — no bounds checking on destination buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "strcat")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "gets_call",
description: "gets() usage",
query: "(call_expression function: (identifier) @id (#eq? @id \"gets\")) @vuln",
id: "cpp.memory.sprintf",
description: "sprintf() — no length limit on output buffer",
query: r#"(call_expression function: (identifier) @id (#eq? @id "sprintf")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "cpp.cmdi.system",
description: "system() — shell command execution",
query: r#"(call_expression function: (identifier) @id (#eq? @id "system")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
Pattern {
id: "system_call",
description: "system() shell execution",
query: "(call_expression function: (identifier) @id (#eq? @id \"system\")) @vuln",
id: "cpp.cmdi.popen",
description: "popen() — shell command execution",
query: r#"(call_expression function: (identifier) @id (#eq? @id "popen")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Dangerous casts ────────────────────────────────────────
// C++ casts are parsed as call_expression with template_function
Pattern {
id: "cpp.memory.reinterpret_cast",
description: "reinterpret_cast — type-punning cast",
query: r#"(call_expression
function: (template_function
name: (identifier) @n (#eq? @n "reinterpret_cast")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "reinterpret_cast",
description: "reinterpret_cast usage",
query: "(reinterpret_cast_expression) @vuln",
id: "cpp.memory.const_cast",
description: "const_cast — removes const/volatile qualifier",
query: r#"(call_expression
function: (template_function
name: (identifier) @n (#eq? @n "const_cast")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
// ── Tier B: Format-string (variable first arg) ─────────────────────
Pattern {
id: "cpp.memory.printf_no_fmt",
description: "printf(var) — format-string vulnerability when first arg is not literal",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "printf")
arguments: (argument_list
. (identifier) @arg))
@vuln"#,
severity: Severity::High,
tier: PatternTier::B,
category: PatternCategory::MemorySafety,
confidence: Confidence::Medium,
},
];

View file

@ -1,34 +1,120 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// Go AST patterns.
///
/// Taint rules cover `exec.Command` (command injection), `db.Query`/`db.Exec`
/// (SQL sinks). AST patterns here focus on **TLS misconfiguration**,
/// **weak crypto**, **unsafe.Pointer**, and **hardcoded secrets**.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "exec_command",
description: "os/exec Command construction",
query: "(call_expression function: (selector_expression field: (field_identifier) @f (#eq? @f \"Command\"))) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "http_insecure_tls",
description: "&http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}",
query: "(composite_literal type: (selector_expression field: (field_identifier) @t (#eq? @t \"Transport\")) body: (literal_value (keyed_element key: (identifier) @k (#eq? @k \"TLSClientConfig\") value: (composite_literal body: (literal_value (keyed_element key: (identifier) @ik (#eq? @ik \"InsecureSkipVerify\") value: (true)))))) @vuln",
id: "go.cmdi.exec_command",
description: "exec.Command() — arbitrary process execution",
query: r#"(call_expression
function: (selector_expression
field: (field_identifier) @f (#eq? @f "Command")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Unsafe pointer ─────────────────────────────────────────
Pattern {
id: "unsafe_pointer",
description: "Use of unsafe.Pointer",
query: "(qualified_type type: (selector_expression field: (field_identifier) @f (#eq? @f \"Pointer\"))) @vuln",
severity: Severity::High,
},
Pattern {
id: "md5_sha1",
description: "crypto/md5 or crypto/sha1 usage",
query: "(call_expression function: (selector_expression object: (identifier) @pkg (#match? @pkg \"md5|sha1\"))) @vuln",
id: "go.memory.unsafe_pointer",
description: "unsafe.Pointer — bypasses Go type system",
query: r#"(call_expression
function: (selector_expression
operand: (identifier) @pkg (#eq? @pkg "unsafe")
field: (field_identifier) @f (#eq? @f "Pointer")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
// ── Tier A: TLS misconfiguration ───────────────────────────────────
Pattern {
id: "hardcoded_secret",
description: "Hard-coded string that looks like an API key/token",
query: "(interpreted_string_literal) @s (#match? @s \"(?i)(api|secret|token|password)[=:]?[ \\t]*[A-Za-z0-9_\\-]{8,}\")",
id: "go.transport.insecure_skip_verify",
description: "InsecureSkipVerify: true — disables TLS certificate validation",
query: r#"(keyed_element
(literal_element
(identifier) @k (#eq? @k "InsecureSkipVerify"))
(literal_element (true)))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::InsecureTransport,
confidence: Confidence::High,
},
// ── Tier A: Weak crypto ────────────────────────────────────────────
Pattern {
id: "go.crypto.md5",
description: "md5.New() / md5.Sum() — weak hash algorithm",
query: r#"(call_expression
function: (selector_expression
operand: (identifier) @pkg (#eq? @pkg "md5")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "go.crypto.sha1",
description: "sha1.New() / sha1.Sum() — weak hash algorithm",
query: r#"(call_expression
function: (selector_expression
operand: (identifier) @pkg (#eq? @pkg "sha1")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
// ── Tier B: SQL injection (concatenation heuristic) ────────────────
Pattern {
id: "go.sqli.query_concat",
description: "db.Query/Exec with concatenated string argument",
query: r#"(call_expression
function: (selector_expression
field: (field_identifier) @f (#match? @f "^(Query|Exec|QueryRow)$"))
arguments: (argument_list
(binary_expression) @concat))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// ── Tier A: Hardcoded secrets ──────────────────────────────────────
Pattern {
id: "go.secrets.hardcoded_key",
description: "Variable with secret-like name assigned a string literal",
query: r#"(short_var_declaration
left: (expression_list
(identifier) @name (#match? @name "(?i)(password|secret|api_?key|token|private_?key)"))
right: (expression_list
(interpreted_string_literal) @val))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Secrets,
confidence: Confidence::High,
},
// ── Tier A: Deserialization ────────────────────────────────────────
Pattern {
id: "go.deser.gob_decode",
description: "gob.NewDecoder — Go binary deserialization",
query: r#"(call_expression
function: (selector_expression
operand: (identifier) @pkg (#eq? @pkg "gob")
field: (field_identifier) @f (#eq? @f "NewDecoder")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
];

View file

@ -1,40 +1,116 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// Java AST patterns.
///
/// Taint rules cover `Runtime.exec` (command injection) and
/// `executeQuery`/`executeUpdate`/`prepareStatement` (SQL sinks).
/// AST patterns here focus on **deserialization**, **reflection**,
/// **SQL with concatenation** (Tier B heuristic), and **weak crypto**.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Deserialization ────────────────────────────────────────
Pattern {
id: "runtime_exec",
description: "Runtime.getRuntime().exec(...) arbitrary-command execution",
query: "(method_invocation object: (method_invocation name: (identifier) @n (#eq? @n \"getRuntime\")) name: (identifier) @id (#eq? @id \"exec\")) @vuln",
id: "java.deser.readobject",
description: "ObjectInputStream.readObject() — unsafe deserialization",
// Match any .readObject() call — the method name is specific enough.
query: r#"(method_invocation
name: (identifier) @id (#eq? @id "readObject"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "class_for_name",
description: "Dynamic reflection via Class.forName(...)",
query: "(method_invocation object: (identifier) @c (#eq? @c \"Class\") name: (identifier) @id (#eq? @id \"forName\")) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "object_deserialization",
description: "java.io.ObjectInputStream#readObject() deserialization",
query: "(method_invocation object: (identifier) @o (#eq? @o \"ObjectInputStream\") name: (identifier) @id (#eq? @id \"readObject\")) @vuln",
id: "java.cmdi.runtime_exec",
description: "Runtime.getRuntime().exec() — shell command execution",
query: r#"(method_invocation
object: (method_invocation
name: (identifier) @n (#eq? @n "getRuntime"))
name: (identifier) @id (#eq? @id "exec"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Reflection ─────────────────────────────────────────────
Pattern {
id: "insecure_random",
description: "java.util.Random used where SecureRandom is expected",
query: "(object_creation_expression type: (identifier) @t (#eq? @t \"Random\")) @vuln",
id: "java.reflection.class_forname",
description: "Class.forName() — dynamic class loading",
query: r#"(method_invocation
object: (identifier) @c (#eq? @c "Class")
name: (identifier) @id (#eq? @id "forName"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Reflection,
confidence: Confidence::High,
},
Pattern {
id: "thread_stop",
description: "Deprecated Thread.stop() invocation",
query: "(method_invocation name: (identifier) @id (#eq? @id \"stop\") object: (identifier) @obj (#eq? @obj \"Thread\")) @vuln",
id: "java.reflection.method_invoke",
description: "Method.invoke() — reflective method invocation",
query: r#"(method_invocation
name: (identifier) @id (#eq? @id "invoke"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Reflection,
confidence: Confidence::High,
},
// ── Tier B: SQL injection (concatenation heuristic) ────────────────
Pattern {
id: "java.sqli.execute_concat",
description: "SQL execute with concatenated string argument",
query: r#"(method_invocation
name: (identifier) @id (#match? @id "^execute(Query|Update)?$")
arguments: (argument_list
(binary_expression) @concat))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// ── Tier A: Weak crypto ────────────────────────────────────────────
Pattern {
id: "java.crypto.insecure_random",
description: "new Random() — java.util.Random is not cryptographically secure",
query: r#"(object_creation_expression
type: (type_identifier) @t (#eq? @t "Random"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "sql_concat",
description: "SQL built with string concatenation",
query: "(method_invocation name: (identifier) @id (#match? @id \"execute(Query|Update)?\") arguments: (argument_list (binary_expression) @concat)) @vuln",
id: "java.crypto.weak_digest",
description: "MessageDigest.getInstance(\"MD5\"/\"SHA1\") — weak hash algorithm",
query: r#"(method_invocation
object: (identifier) @c (#eq? @c "MessageDigest")
name: (identifier) @id (#eq? @id "getInstance")
arguments: (argument_list
(string_literal) @alg (#match? @alg "(?i)(md5|sha-?1)")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
// ── Tier A: XSS (servlet) ──────────────────────────────────────────
Pattern {
id: "java.xss.getwriter_print",
description: "response.getWriter().print/println — direct output without encoding",
query: r#"(method_invocation
object: (method_invocation
name: (identifier) @gw (#eq? @gw "getWriter"))
name: (identifier) @id (#match? @id "^(print|println|write)$"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
];

View file

@ -1,117 +1,182 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// JavaScript AST patterns.
///
/// Taint rules cover `eval` (code injection), `innerHTML` (XSS),
/// `location.href` (open redirect), and `child_process.exec/spawn` (command
/// injection). AST patterns here add **new Function()**, **document.write**,
/// **setTimeout with string**, **deserialization**, **prototype pollution**,
/// **XSS sinks** not covered by taint, and **weak crypto**.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
Pattern {
id: "eval_call",
description: "Use of eval()",
query: "(call_expression function: (identifier) @id (#eq? @id \"eval\")) @vuln",
id: "js.code_exec.eval",
description: "eval() — dynamic code execution",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "eval"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "new_function",
description: "new Function() constructor",
query: "(new_expression constructor: (identifier) @id (#eq? @id \"Function\")) @vuln",
id: "js.code_exec.new_function",
description: "new Function() constructor — eval equivalent",
query: r#"(new_expression
constructor: (identifier) @id (#eq? @id "Function"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "document_write",
description: "document.write() call",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"document\") property: (property_identifier) @prop (#eq? @prop \"write\"))) @vuln",
id: "js.code_exec.settimeout_string",
description: "setTimeout/setInterval with string argument — implicit eval",
query: r#"(call_expression
function: (identifier) @id (#match? @id "^(setTimeout|setInterval)$")
arguments: (arguments (string) @code))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: XSS sinks ──────────────────────────────────────────────
Pattern {
id: "settimeout_string",
description: "setTimeout / setInterval with a string argument",
query: "(call_expression function: (identifier) @id (#match? @id \"setTimeout|setInterval\") arguments: (arguments (string) @code . _)) @vuln",
id: "js.xss.document_write",
description: "document.write() — XSS sink",
query: r#"(call_expression
function: (member_expression
object: (identifier) @obj (#eq? @obj "document")
property: (property_identifier) @prop (#match? @prop "^(write|writeln)$")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
Pattern {
id: "json_parse",
description: "JSON.parse on dynamic string",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"JSON\") property: (property_identifier) @prop (#eq? @prop \"parse\"))) @vuln",
id: "js.xss.outer_html",
description: "Assignment to .outerHTML — XSS sink",
query: r#"(assignment_expression
left: (member_expression
property: (property_identifier) @prop (#eq? @prop "outerHTML")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
Pattern {
id: "js.xss.insert_adjacent_html",
description: "insertAdjacentHTML() — XSS sink",
query: r#"(call_expression
function: (member_expression
property: (property_identifier) @prop (#eq? @prop "insertAdjacentHTML")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
// ── Tier A: Prototype pollution ────────────────────────────────────
Pattern {
id: "js.prototype.proto_assignment",
description: "Assignment to __proto__ — prototype pollution",
query: r#"(assignment_expression
left: (member_expression
property: (property_identifier) @prop (#eq? @prop "__proto__")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Prototype,
confidence: Confidence::High,
},
Pattern {
id: "js.prototype.extend_object",
description: "Assignment to Object.prototype — prototype mutation",
query: r#"(assignment_expression
left: (member_expression
object: (member_expression
object: (identifier) @obj (#eq? @obj "Object")
property: (property_identifier) @mid (#eq? @mid "prototype"))))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Prototype,
confidence: Confidence::High,
},
// ── Tier A: Weak crypto ────────────────────────────────────────────
Pattern {
id: "js.crypto.weak_hash",
description: "crypto.createHash with weak algorithm (md5/sha1)",
query: r#"(call_expression
function: (member_expression
property: (property_identifier) @prop (#eq? @prop "createHash"))
arguments: (arguments
(string) @alg (#match? @alg "\"(md5|sha1)\"")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "outer_html_assignment",
description: "Assignment to element.outerHTML",
query: "(assignment_expression
left: (member_expression
property: (property_identifier) @prop
(#eq? @prop \"outerHTML\"))) @vuln",
id: "js.crypto.math_random",
description: "Math.random() — not cryptographically secure",
query: r#"(call_expression
function: (member_expression
object: (identifier) @obj (#eq? @obj "Math")
property: (property_identifier) @prop (#eq? @prop "random")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
// ── Tier A: Open redirect ──────────────────────────────────────────
Pattern {
id: "js.xss.location_assign",
description: "Assignment to location/location.href — open redirect",
query: r#"(assignment_expression
left: (member_expression
object: (identifier) @obj (#match? @obj "^(window|location|document)$")
property: (property_identifier) @prop (#match? @prop "^(location|href)$")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
// ── Tier A: Insecure transport ─────────────────────────────────────
Pattern {
id: "insert_adjacent_html",
description: "insertAdjacentHTML() call",
query: "(call_expression
function: (member_expression
property: (property_identifier) @prop
(#eq? @prop \"insertAdjacentHTML\"))) @vuln",
severity: Severity::Medium,
id: "js.transport.fetch_http",
description: "fetch() over plain HTTP",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "fetch")
arguments: (arguments
(string) @url (#match? @url "^\"http://")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::InsecureTransport,
confidence: Confidence::Medium,
},
// ── Tier A: Cookie manipulation ────────────────────────────────────
Pattern {
id: "location_href_assignment",
description: "Assignment to window.location / location.href",
query: "(assignment_expression
left: (member_expression
object: (identifier) @obj
(#match? @obj \"^(window|location|document|self|top|parent|frames)$\")
property: (property_identifier) @prop
(#match? @prop \"^(location|href)$\"))) @vuln",
severity: Severity::High,
},
Pattern {
id: "cookie_assignment",
id: "js.xss.cookie_write",
description: "Write to document.cookie",
query: "(assignment_expression
left: (member_expression
object: (identifier) @obj
(#eq? @obj \"document\")
property: (property_identifier) @prop
(#eq? @prop \"cookie\"))) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "proto_pollution",
description: "Assignment to __proto__ (prototype pollution)",
query: "(assignment_expression
left: (member_expression
property: (property_identifier) @prop
(#eq? @prop \"__proto__\"))) @vuln",
severity: Severity::Low,
},
Pattern {
id: "weak_hash_md5",
description: "crypto.createHash(\"md5\")",
query: "(call_expression
function: (member_expression
object: (identifier) @obj
(#eq? @obj \"crypto\")
property: (property_identifier) @prop
(#eq? @prop \"createHash\"))
arguments: (arguments
(string) @alg
(#eq? @alg \"md5\"))) @vuln",
severity: Severity::Low,
},
Pattern {
id: "regexp_constructor_string",
description: "new RegExp() with a dynamic string",
query: "(new_expression
constructor: (identifier) @id
(#eq? @id \"RegExp\")
arguments: (arguments (string) @pattern)) @vuln",
severity: Severity::Low,
},
Pattern {
id: "dangerous_extend_builtin",
description: "Extending Object.prototype (may lead to collisions/pollution)",
query: "(assignment_expression
left: (member_expression
object: (identifier) @obj
(#eq? @obj \"Object\")
property: (property_identifier) @prop
(#eq? @prop \"prototype\"))) @vuln",
query: r#"(assignment_expression
left: (member_expression
object: (identifier) @obj (#eq? @obj "document")
property: (property_identifier) @prop (#eq? @prop "cookie")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
];

View file

@ -1,3 +1,43 @@
//! # AST Pattern Conventions
//!
//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
//!
//! ## ID format
//!
//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
//!
//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
//!
//! ## Tiers
//!
//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
//! arg, format-string with variable first arg).
//!
//! ## Severity
//!
//! * **High** — command exec, deserialization, banned C functions.
//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
//!
//! Note: the default `min_severity` filter skips Low patterns; they only appear when
//! the user explicitly lowers the threshold.
//!
//! ## No-duplicate rule
//!
//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
//!
//! ## Adding a new pattern
//!
//! 1. Pick the language file under `src/patterns/<lang>.rs`.
//! 2. Choose tier, category, severity per the rules above.
//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
pub mod c;
pub mod cpp;
mod go;
@ -9,6 +49,7 @@ mod ruby;
pub mod rust;
pub mod typescript;
use crate::evidence::Confidence;
use console::style;
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
@ -16,7 +57,7 @@ use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
pub enum Severity {
High,
Medium,
@ -28,13 +69,14 @@ impl Severity {
///
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` — always 8 visible characters
/// so the column after the tag lines up regardless of severity.
#[allow(dead_code)] // public API for lib consumers
pub fn colored_tag(self) -> String {
// Visible widths: "[HIGH]" = 6, "[MEDIUM]" = 8, "[LOW]" = 5.
// Pad the *whole* tag to 8 visible chars (the longest, "[MEDIUM]").
let (label, styled_fn): (&str, fn(&str) -> String) = match self {
Severity::High => ("HIGH", |s| style(s).red().bold().to_string()),
Severity::Medium => ("MEDIUM", |s| style(s).yellow().bold().to_string()),
Severity::Low => ("LOW", |s| style(s).cyan().bold().to_string()),
Severity::Medium => ("MEDIUM", |s| style(s).color256(208).bold().to_string()),
Severity::Low => ("LOW", |s| style(s).color256(67).to_string()),
};
let bracket_len = label.len() + 2; // "[" + label + "]"
let pad = 8usize.saturating_sub(bracket_len);
@ -46,8 +88,8 @@ impl fmt::Display for Severity {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let styled = match *self {
Severity::High => style("HIGH").red().bold().to_string(),
Severity::Medium => style("MEDIUM").yellow().bold().to_string(),
Severity::Low => style("LOW").cyan().bold().to_string(),
Severity::Medium => style("MEDIUM").color256(208).bold().to_string(),
Severity::Low => style("LOW").color256(67).to_string(),
};
f.write_str(&styled)
}
@ -65,14 +107,132 @@ impl Severity {
}
impl FromStr for Severity {
// TODO: FIX
type Err = ();
type Err = String;
fn from_str(input: &str) -> Result<Self, Self::Err> {
match input.to_lowercase().as_str() {
"medium" => Ok(Severity::Medium),
"high" => Ok(Severity::High),
_ => Ok(Severity::Low),
match input.trim().to_ascii_uppercase().as_str() {
"HIGH" => Ok(Severity::High),
"MEDIUM" | "MED" => Ok(Severity::Medium),
"LOW" => Ok(Severity::Low),
other => Err(format!("unknown severity: '{other}'")),
}
}
}
/// A parsed severity filter expression.
///
/// Supports three forms:
/// - Single level: `"HIGH"` — matches only that level
/// - Comma list: `"HIGH,MEDIUM"` — matches any listed level
/// - Threshold: `">=MEDIUM"` — matches that level and above
///
/// Parsing is case-insensitive and tolerates whitespace around tokens.
///
/// [`SeverityFilter::parse`] turns the threshold form into [`SeverityFilter::AtLeast`];
/// both the single-level and the comma-list forms become [`SeverityFilter::AnyOf`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SeverityFilter {
/// Match findings at or above this level (High >= Medium >= Low).
AtLeast(Severity),
/// Match findings whose severity is in this exact set.
///
/// The vector holds the levels in the order they were written; `parse` does
/// not de-duplicate repeated levels.
AnyOf(Vec<Severity>),
}
impl SeverityFilter {
    /// Parse a severity filter expression.
    ///
    /// Examples: `"HIGH"`, `"high,medium"`, `">=MEDIUM"`, `">= low"`.
    ///
    /// A leading `>=` selects the threshold form and yields
    /// [`SeverityFilter::AtLeast`]; anything else is read as a comma-separated
    /// list (a single level is a one-element list) and yields
    /// [`SeverityFilter::AnyOf`].
    ///
    /// # Errors
    ///
    /// Returns `Err` with a human-readable message when the expression is
    /// empty or whitespace-only, or when any level token fails to parse as a
    /// [`Severity`].
    pub fn parse(expr: &str) -> Result<Self, String> {
        let trimmed = expr.trim();
        if trimmed.is_empty() {
            return Err("empty severity expression".into());
        }

        // Threshold form: >=LEVEL
        if let Some(rest) = trimmed.strip_prefix(">=") {
            return rest.parse::<Severity>().map(SeverityFilter::AtLeast);
        }

        // Comma-separated list (also handles a single value). `split` yields at
        // least one token for the non-empty input checked above, so a
        // successful collect can never produce an empty list; the first token
        // that fails to parse short-circuits with its error.
        trimmed
            .split(',')
            .map(|tok| tok.trim().parse::<Severity>())
            .collect::<Result<Vec<_>, _>>()
            .map(SeverityFilter::AnyOf)
    }

    /// Returns `true` if the given severity passes this filter.
    pub fn matches(&self, sev: Severity) -> bool {
        match self {
            // Severity ordering: High < Medium < Low (derived Ord).
            // "at least Medium" means sev <= Medium in Ord terms.
            SeverityFilter::AtLeast(threshold) => sev <= *threshold,
            SeverityFilter::AnyOf(set) => set.contains(&sev),
        }
    }
}
/// Pattern confidence tier.
///
/// * **A** Structural presence alone is high-signal (e.g. `gets()`, `eval()`).
/// * **B** Requires a simple heuristic guard in the query (e.g. SQL with
/// concatenated arg, file-open with non-literal path).
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub enum PatternTier {
/// The structural match on its own is treated as high-signal.
A,
/// The match additionally relies on a heuristic guard written into the query.
B,
}
/// High-level finding category for noise reduction and prioritization.
///
/// Its `Display` impl prints the variant name verbatim.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum FindingCategory {
/// Security-relevant finding; `PatternCategory::finding_category` returns
/// this for every pattern class except `CodeQuality`.
Security,
/// NOTE(review): never returned by `PatternCategory::finding_category`;
/// presumably assigned by other analyses — confirm against callers.
Reliability,
/// Code-quality finding; `PatternCategory::finding_category` returns this
/// for `CodeQuality`.
Quality,
}
impl std::fmt::Display for FindingCategory {
    /// Writes the bare variant name (`Security`, `Reliability` or `Quality`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            FindingCategory::Security => "Security",
            FindingCategory::Reliability => "Reliability",
            FindingCategory::Quality => "Quality",
        };
        f.write_str(label)
    }
}
/// Vulnerability class that a pattern detects.
///
/// Each `Pattern` carries exactly one of these in its `category` field.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub enum PatternCategory {
/// Process / shell command execution (e.g. Go `exec.Command`, PHP `system`).
CommandExec,
/// Dynamic code evaluation (e.g. `eval()`, JS `new Function()`).
CodeExec,
/// Unsafe deserialization (e.g. Java `readObject()`, PHP `unserialize()`).
Deserialization,
/// SQL built by string concatenation and passed to a query call.
SqlInjection,
/// File access through a non-literal path (e.g. PHP `include` with a variable).
PathTraversal,
/// Browser / servlet output sinks (e.g. `document.write`, `insertAdjacentHTML`).
Xss,
/// Weak hashes and non-cryptographic randomness (e.g. MD5, SHA-1, `Math.random()`).
Crypto,
/// Hard-coded credentials assigned to secret-like variable names.
Secrets,
/// Disabled or missing transport security (e.g. `InsecureSkipVerify: true`, `fetch("http://…")`).
InsecureTransport,
/// Reflective class loading or invocation (e.g. Java `Class.forName()`).
Reflection,
/// Memory-unsafe constructs (e.g. Go `unsafe.Pointer`, `printf` with a variable format).
MemorySafety,
/// JavaScript prototype pollution / mutation (e.g. assignment to `__proto__`).
Prototype,
/// Code-quality patterns; the only class mapped to `FindingCategory::Quality`.
CodeQuality,
}
impl PatternCategory {
    /// Map this vulnerability class to a high-level finding category.
    ///
    /// `CodeQuality` maps to [`FindingCategory::Quality`]; every other class
    /// maps to [`FindingCategory::Security`].
    ///
    /// The match is deliberately exhaustive (no `_` arm) so that adding a new
    /// `PatternCategory` variant is a compile error here until its finding
    /// category has been chosen explicitly.
    pub fn finding_category(self) -> FindingCategory {
        match self {
            PatternCategory::CodeQuality => FindingCategory::Quality,
            PatternCategory::CommandExec
            | PatternCategory::CodeExec
            | PatternCategory::Deserialization
            | PatternCategory::SqlInjection
            | PatternCategory::PathTraversal
            | PatternCategory::Xss
            | PatternCategory::Crypto
            | PatternCategory::Secrets
            | PatternCategory::InsecureTransport
            | PatternCategory::Reflection
            | PatternCategory::MemorySafety
            | PatternCategory::Prototype => FindingCategory::Security,
        }
    }
}
@ -80,7 +240,7 @@ impl FromStr for Severity {
/// One AST pattern with a tree-sitter query and meta-data.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub struct Pattern {
/// Unique identifier (snake-case preferred).
/// Unique identifier — `<lang>.<category>.<specific>` preferred.
pub id: &'static str,
/// Human-readable explanation.
pub description: &'static str,
@ -88,6 +248,12 @@ pub struct Pattern {
pub query: &'static str,
/// Rough severity bucket.
pub severity: Severity,
/// Confidence tier (A = structural, B = heuristic-guarded).
pub tier: PatternTier,
/// Vulnerability class.
pub category: PatternCategory,
/// Confidence level for findings produced by this pattern.
pub confidence: Confidence,
}
/// Global, lazily-initialised registry: lang-name → pattern slice
@ -164,3 +330,66 @@ fn load_returns_correct_pattern_slices() {
assert!(load("brainfuck").is_empty());
}
/// An unrecognised level name must be a parse error, not a silent default.
#[test]
fn severity_from_str_rejects_unknown() {
    let parsed: Result<Severity, _> = "garbage".parse();
    assert!(parsed.is_err());
}
/// A single level matches exactly that level and nothing else.
#[test]
fn severity_filter_single() {
    let filter = SeverityFilter::parse("HIGH").unwrap();
    let verdicts = [Severity::High, Severity::Medium, Severity::Low].map(|sev| filter.matches(sev));
    assert_eq!(verdicts, [true, false, false]);
}
/// A comma list matches every listed level and rejects the rest.
#[test]
fn severity_filter_comma_list() {
    let filter = SeverityFilter::parse("HIGH,MEDIUM").unwrap();
    let verdicts = [Severity::High, Severity::Medium, Severity::Low].map(|sev| filter.matches(sev));
    assert_eq!(verdicts, [true, true, false]);
}
/// `>=LEVEL` matches that level and everything more severe.
///
/// Every threshold is checked against all three levels, so a regression in
/// the ordering comparison cannot hide behind an unasserted case.
#[test]
fn severity_filter_threshold() {
    // (expression, expected verdicts for [High, Medium, Low])
    let cases = [
        (">=MEDIUM", [true, true, false]),
        (">=LOW", [true, true, true]),
        (">=HIGH", [true, false, false]),
    ];
    for (expr, expected) in cases {
        let filter = SeverityFilter::parse(expr).unwrap();
        let verdicts =
            [Severity::High, Severity::Medium, Severity::Low].map(|sev| filter.matches(sev));
        assert_eq!(verdicts, expected, "filter expression: {expr}");
    }
}
/// Lower-case level names and stray whitespace around tokens are accepted.
///
/// Both the list form and the threshold form are checked against all three
/// levels, including the rejected `Low` case for the threshold.
#[test]
fn severity_filter_case_insensitive_and_whitespace() {
    let levels = [Severity::High, Severity::Medium, Severity::Low];

    let list = SeverityFilter::parse(" high , medium ").unwrap();
    assert_eq!(levels.map(|sev| list.matches(sev)), [true, true, false]);

    let threshold = SeverityFilter::parse(">= medium").unwrap();
    assert_eq!(levels.map(|sev| threshold.matches(sev)), [true, true, false]);
}
/// Empty and whitespace-only expressions are parse errors.
#[test]
fn severity_filter_rejects_empty() {
    for expr in ["", " "] {
        assert!(SeverityFilter::parse(expr).is_err());
    }
}
/// An unknown level anywhere in the expression fails the whole parse.
#[test]
fn severity_filter_rejects_invalid_level() {
    for expr in ["CRITICAL", "HIGH,CRITICAL", ">=BOGUS"] {
        assert!(SeverityFilter::parse(expr).is_err());
    }
}

View file

@ -1,40 +1,144 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// PHP AST patterns.
///
/// Taint rules cover `system`/`exec`/`passthru`/`shell_exec` (command
/// injection), `echo`/`print` (XSS sinks), and `mysqli_query`/`pg_query`
/// (SQL sinks). AST patterns here focus on **eval**, **deserialization**,
/// **deprecated dangerous functions**, **include with variable**, and
/// **SQL concatenation** (Tier B).
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
Pattern {
id: "eval_call",
description: "eval($code) execution",
query: "(function_call_expression function: (name) @n (#eq? @n \"eval\")) @vuln",
id: "php.code_exec.eval",
description: "eval() — dynamic code execution",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "eval"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "preg_replace_e",
description: "preg_replace with deprecated /e modifier",
query: "(function_call_expression function: (name) @n (#eq? @n \"preg_replace\") arguments: (arguments (string) @pat (#match? @pat \"/.*e.*$/\"))) @vuln",
id: "php.code_exec.create_function",
description: "create_function() — deprecated eval-like constructor",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "create_function"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "create_function",
description: "create_function(...) anonymous eval-like",
query: "(function_call_expression function: (name) @n (#eq? @n \"create_function\")) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "unserialize_call",
description: "unserialize(...) on user input",
query: "(function_call_expression function: (name) @n (#eq? @n \"unserialize\")) @vuln",
id: "php.code_exec.preg_replace_e",
description: "preg_replace with /e modifier — code execution via regex",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "preg_replace")
arguments: (arguments
(argument
(string) @pat (#match? @pat "/[^/]*/[a-zA-Z]*e"))))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "mysql_query_concat",
description: "mysql_query with concatenated SQL",
query: "(function_call_expression function: (name) @n (#eq? @n \"mysql_query\") arguments: (arguments (binary_expression) @concat)) @vuln",
id: "php.code_exec.assert_string",
description: "assert() with string argument — evaluates PHP code",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "assert")
arguments: (arguments
(argument (string) @code)))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "php.cmdi.system",
description: "system/shell_exec/exec/passthru — shell command execution",
query: r#"(function_call_expression
function: (name) @n (#match? @n "^(system|shell_exec|exec|passthru|proc_open|popen)$"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Deserialization ────────────────────────────────────────
Pattern {
id: "php.deser.unserialize",
description: "unserialize() — PHP object injection",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "unserialize"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier B: SQL injection (concatenation heuristic) ────────────────
Pattern {
id: "php.sqli.query_concat",
description: "mysql_query/mysqli_query with concatenated SQL string",
query: r#"(function_call_expression
function: (name) @n (#match? @n "^(mysql_query|mysqli_query)$")
arguments: (arguments
(argument (binary_expression) @concat)))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// ── Tier B: Path traversal (include with variable) ─────────────────
Pattern {
id: "php.path.include_variable",
description: "include/require with variable path — file inclusion vulnerability",
query: r#"(include_expression (variable_name)) @vuln"#,
severity: Severity::High,
tier: PatternTier::B,
category: PatternCategory::PathTraversal,
confidence: Confidence::Medium,
},
// ── Tier A: Crypto ─────────────────────────────────────────────────
Pattern {
id: "php.crypto.md5",
description: "md5() — weak hash function",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "md5"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "system_call",
description: "system()/shell_exec()/exec() command execution",
query: "(function_call_expression function: (name) @n (#match? @n \"system|shell_exec|exec|passthru\")) @vuln",
severity: Severity::Medium,
id: "php.crypto.sha1",
description: "sha1() — weak hash function",
query: r#"(function_call_expression
function: (name) @n (#eq? @n "sha1"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "php.crypto.rand",
description: "rand()/mt_rand() — not cryptographically secure",
query: r#"(function_call_expression
function: (name) @n (#match? @n "^(rand|mt_rand)$"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
];

View file

@ -1,22 +1,178 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// Python AST patterns.
///
/// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`,
/// and `cursor.execute`. AST patterns here add coverage for **deserialization**,
/// **subprocess shell=True** (Tier B — taint doesn't check keyword args),
/// **code execution** sinks that taint cannot structurally verify, and
/// heuristics for **SQL string concatenation**, **weak hashes**, and
/// **template injection**.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
Pattern {
id: "eval_call",
description: "eval() on dynamic input",
query: "(call function: (identifier) @id (#eq? @id \"eval\")) @vuln",
id: "py.code_exec.eval",
description: "eval() — dynamic code execution",
query: r#"(call function: (identifier) @id (#eq? @id "eval")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "exec_call",
description: "exec(...) execution of dynamic code",
query: "(call function: (identifier) @id (#eq? @id \"exec\")) @vuln",
id: "py.code_exec.exec",
description: "exec() — dynamic code execution",
query: r#"(call function: (identifier) @id (#eq? @id "exec")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "subprocess_shell_true",
description: "subprocess.* with shell=True",
query: "(call function: (attribute object: (identifier) @pkg (#eq? @pkg \"subprocess\")) arguments: (argument_list . (keyword_argument name: (identifier) @k (#eq? @k \"shell\")) (true) @val)) @vuln",
id: "py.code_exec.compile",
description: "compile() with exec/eval mode — code compilation from string",
query: r#"(call function: (identifier) @id (#eq? @id "compile")) @vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "py.cmdi.os_system",
description: "os.system() — shell command execution",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "os")
attribute: (identifier) @fn (#eq? @fn "system")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
Pattern {
id: "py.cmdi.os_popen",
description: "os.popen() — shell command execution",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "os")
attribute: (identifier) @fn (#eq? @fn "popen")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier B: subprocess with shell=True ─────────────────────────────
Pattern {
id: "py.cmdi.subprocess_shell",
description: "subprocess call with shell=True",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "subprocess"))
arguments: (argument_list
(keyword_argument
name: (identifier) @k (#eq? @k "shell")
value: (true))))
@vuln"#,
severity: Severity::High,
tier: PatternTier::B,
category: PatternCategory::CommandExec,
confidence: Confidence::Medium,
},
// ── Tier A: Deserialization ────────────────────────────────────────
Pattern {
id: "py.deser.pickle_loads",
description: "pickle.loads/load — arbitrary object deserialization",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "pickle")
attribute: (identifier) @fn (#match? @fn "^loads?$")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
Pattern {
id: "py.deser.yaml_load",
description: "yaml.load() without SafeLoader — arbitrary object instantiation",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "yaml")
attribute: (identifier) @fn (#eq? @fn "load")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
Pattern {
id: "py.deser.shelve_open",
description: "shelve.open() — pickle-backed deserialization",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "shelve")
attribute: (identifier) @fn (#eq? @fn "open")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier B: SQL injection (format/concat heuristic) ────────────────
Pattern {
id: "py.sqli.execute_format",
description: "cursor.execute with string concatenation — SQL injection risk",
query: r#"(call
function: (attribute
attribute: (identifier) @fn (#eq? @fn "execute"))
arguments: (argument_list
(binary_operator) @arg))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// ── Tier A: Weak crypto ────────────────────────────────────────────
Pattern {
id: "py.crypto.md5",
description: "hashlib.md5() — weak hash algorithm",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "hashlib")
attribute: (identifier) @fn (#eq? @fn "md5")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
Pattern {
id: "py.crypto.sha1",
description: "hashlib.sha1() — weak hash algorithm",
query: r#"(call
function: (attribute
object: (identifier) @pkg (#eq? @pkg "hashlib")
attribute: (identifier) @fn (#eq? @fn "sha1")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
// ── Tier A: Template injection ─────────────────────────────────────
Pattern {
id: "py.xss.jinja_from_string",
description: "jinja2.Template from string — potential template injection",
query: r#"(call
function: (attribute
attribute: (identifier) @fn (#eq? @fn "from_string")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
];

View file

@ -1,133 +1,141 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// Ruby AST patterns.
///
/// Taint rules cover `system`/`exec` (command injection), `eval` (code
/// execution), and `puts`/`print` (output sinks). AST patterns here focus on
/// **deserialization** (YAML.load, Marshal.load), **instance_eval/class_eval**,
/// **backtick shell**, **send with dynamic arg**, and **constantize**.
pub const PATTERNS: &[Pattern] = &[
// ---------- Runtime code-execution primitives ----------
// ── Tier A: Code execution ─────────────────────────────────────────
Pattern {
id: "eval_call",
description: "Kernel#eval usage",
query: r#"
(call
(identifier) @id
(#eq? @id "eval")
) @vuln
"#,
id: "rb.code_exec.eval",
description: "Kernel#eval — dynamic code execution",
query: r#"(call (identifier) @id (#eq? @id "eval")) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "instance_eval_call",
description: "Object#instance_eval usage",
query: r#"
(call
(identifier) @id
(#eq? @id "instance_eval")
) @vuln
"#,
id: "rb.code_exec.instance_eval",
description: "instance_eval — evaluates string in object context",
query: r#"(call
method: (identifier) @id (#eq? @id "instance_eval"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "class_eval_call",
description: "Module#class_eval / module_eval usage",
query: r#"
(call
(identifier) @id
(#match? @id "^(class_eval|module_eval)$")
) @vuln
"#,
id: "rb.code_exec.class_eval",
description: "class_eval / module_eval — evaluates string in class context",
query: r#"(call
method: (identifier) @id (#match? @id "^(class_eval|module_eval)$"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ---------- Shell execution ----------
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "system_exec_interp",
description: "system/exec with string interpolation",
query: r#"
(call
method: (identifier) @m
(#match? @m "^(system|exec)$")
arguments: (argument_list
(string
(interpolation)+ @vuln
)
)
)
"#,
id: "rb.cmdi.backtick",
description: "Backtick shell execution",
query: r#"(subshell) @vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Shell execution ─────────────────────────────────────────
Pattern {
id: "rb.cmdi.system_interp",
description: "system/exec call — command execution risk",
query: r#"(call
method: (identifier) @m (#match? @m "^(system|exec)$"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CommandExec,
confidence: Confidence::High,
},
// ── Tier A: Deserialization ────────────────────────────────────────
Pattern {
id: "rb.deser.yaml_load",
description: "YAML.load — arbitrary object deserialization (use safe_load instead)",
query: r#"(call
receiver: (constant) @recv (#match? @recv "^(YAML|Psych)$")
method: (identifier) @m (#eq? @m "load"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
Pattern {
id: "backtick_command",
description: "Back-tick shell execution",
// `uname -a`
query: r#"(shell_command) @vuln"#,
id: "rb.deser.marshal_load",
description: "Marshal.load — arbitrary Ruby object deserialization",
query: r#"(call
receiver: (constant) @recv (#eq? @recv "Marshal")
method: (identifier) @m (#eq? @m "load"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ---------- Dangerous deserialisation ----------
// ── Tier A/B: Reflection ───────────────────────────────────────────
Pattern {
id: "yaml_load",
description: "YAML.load / Psych.load (arbitrary object deserialisation)",
query: r#"
(call
receiver: (constant) @recv
(#match? @recv "^(YAML|Psych)$")
method: (identifier) @m
(#eq? @m "load")
) @vuln
"#,
severity: Severity::High,
},
Pattern {
id: "marshal_load",
description: "Marshal.load usage",
query: r#"
(call
receiver: (constant) @recv
(#eq? @recv "Marshal")
method: (identifier) @m
(#eq? @m "load")
) @vuln
"#,
severity: Severity::High,
},
// ---------- Reflection / meta-programming ----------
Pattern {
id: "send_dynamic",
description: "send() with dynamic first argument (not a literal symbol)",
query: r#"
(call
method: (identifier) @m
(#eq? @m "send")
arguments: (argument_list
[
(identifier) ; send(method_name_var, )
(string (interpolation)+) ; send("user_#{role}", )
] @vuln
)
)
id: "rb.reflection.send_dynamic",
description: "send() with non-symbol argument — arbitrary method dispatch",
query: r#"(call
method: (identifier) @m (#eq? @m "send")
arguments: (argument_list
[(identifier) (string (interpolation)+)] @vuln))
"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::Reflection,
confidence: Confidence::Medium,
},
Pattern {
id: "constantize_call",
description: "ActiveSupport constantize / safe_constantize on tainted data",
query: r#"
(call
method: (identifier) @m
(#match? @m "^(constantize|safe_constantize)$")
) @vuln
"#,
id: "rb.reflection.constantize",
description: "constantize / safe_constantize — dynamic class resolution",
query: r#"(call
method: (identifier) @m (#match? @m "^(constantize|safe_constantize)$"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Reflection,
confidence: Confidence::High,
},
// ---------- Insecure resource access ----------
// ── Tier A: SSRF ───────────────────────────────────────────────────
Pattern {
id: "open_uri_http",
description: "Kernel#open with HTTP(S) URL (open-uri auto-follow)",
query: r#"
(call
method: (identifier) @m
(#eq? @m "open")
arguments: (argument_list
(string) @url
(#match? @url "^\"https?://")
)
) @vuln
"#,
id: "rb.ssrf.open_uri",
description: "Kernel#open with HTTP URL — SSRF via open-uri",
query: r#"(call
method: (identifier) @m (#eq? @m "open")
arguments: (argument_list
(string) @url (#match? @url "^\"https?://")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::InsecureTransport,
confidence: Confidence::High,
},
// ── Tier A: Crypto ─────────────────────────────────────────────────
Pattern {
id: "rb.crypto.md5",
description: "Digest::MD5 — weak hash algorithm",
query: r#"(scope_resolution
name: (constant) @c (#eq? @c "MD5"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
];

View file

@ -1,118 +1,170 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// Rust AST patterns.
///
/// Rust taint rules already cover `Command::new`/`arg`/`status`/`output` sinks
/// and `env::var` / `fs::read_to_string` sources, so we do NOT duplicate those.
/// Patterns here focus on **unsafe memory**, **panicking APIs**, and structural
/// code-quality signals specific to Rust.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Memory Safety (unsafe) ─────────────────────────────────
Pattern {
id: "unsafe_block",
description: "Use of an `unsafe` block",
id: "rs.memory.transmute",
description: "std::mem::transmute — unchecked type reinterpretation",
query: r#"(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p "mem")
name: (identifier) @f (#eq? @f "transmute")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "rs.memory.copy_nonoverlapping",
description: "ptr::copy_nonoverlapping — raw pointer memcpy",
query: r#"(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p "ptr")
name: (identifier) @f (#eq? @f "copy_nonoverlapping")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "rs.memory.get_unchecked",
description: "get_unchecked / get_unchecked_mut — unchecked indexing",
query: r#"(call_expression
function: (field_expression
field: (field_identifier) @m
(#match? @m "^get_unchecked(_mut)?$")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "rs.memory.mem_zeroed",
description: "std::mem::zeroed — zero-initialised memory may be UB for non-POD types",
query: r#"(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p "mem")
name: (identifier) @n (#eq? @n "zeroed")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "rs.memory.ptr_read",
description: "ptr::read / ptr::read_volatile — raw pointer dereference",
query: r#"(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p "ptr")
name: (identifier) @n (#match? @n "^read(_volatile)?$")))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
// ── Tier A: Code quality / robustness ──────────────────────────────
Pattern {
id: "rs.quality.unsafe_block",
description: "unsafe block — manual memory safety obligation",
query: "(unsafe_block) @vuln",
severity: Severity::High,
},
Pattern {
id: "unsafe_fn",
description: "`unsafe fn` declaration",
query: "(function_item
(function_modifiers) @mods
(#match? @mods \"^unsafe\\b\")) @vuln",
severity: Severity::High,
},
Pattern {
id: "transmute_call",
description: "`std::mem::transmute` call",
query: "(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p \"mem\")
name: (identifier) @f (#eq? @f \"transmute\")))
@vuln",
severity: Severity::High,
},
Pattern {
id: "copy_nonoverlapping",
description: "Raw pointer `copy_nonoverlapping`",
query: "(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p \"ptr\")
name: (identifier) @f (#eq? @f \"copy_nonoverlapping\")))
@vuln",
severity: Severity::High,
},
Pattern {
id: "get_unchecked",
description: "`get_unchecked` / `get_unchecked_mut` slice access",
query: "(call_expression
function: (field_expression
field: (field_identifier) @m
(#match? @m \"get_unchecked(_mut)?\"))) @vuln",
severity: Severity::High,
},
Pattern {
id: "unwrap_call",
description: "`.unwrap()` call (may panic)",
query: "(call_expression
function: (field_expression
field: (field_identifier) @name
(#eq? @name \"unwrap\"))) ; exact match
@vuln",
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "expect_call",
description: "`.expect()` call (may panic)",
query: "(call_expression
function: (field_expression
field: (field_identifier) @name
(#eq? @name \"expect\"))) @vuln",
id: "rs.quality.unsafe_fn",
description: "unsafe fn declaration",
query: r#"(function_item
(function_modifiers) @mods
(#match? @mods "^unsafe"))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
Pattern {
id: "panic_macro",
description: "`panic!` macro invocation",
query: "(macro_invocation (identifier) @id (#eq? @id \"panic\")) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "todo_or_unimplemented",
description: "`todo!()` / `unimplemented!()` placeholder",
query: "(macro_invocation
(identifier) @id
(#match? @id \"todo|unimplemented\")) @vuln",
id: "rs.quality.unwrap",
description: ".unwrap() — panics on None/Err",
query: r#"(call_expression
function: (field_expression
field: (field_identifier) @name (#eq? @name "unwrap")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::High,
},
Pattern {
id: "narrow_cast_with_as",
description: "`as` cast to an 8-/16-bit integer (possible truncation)",
query: "(type_cast_expression
type: (primitive_type) @to
(#match? @to \"^u?i(8|16)$\")) @vuln",
id: "rs.quality.expect",
description: ".expect() — panics on None/Err",
query: r#"(call_expression
function: (field_expression
field: (field_identifier) @name (#eq? @name "expect")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::High,
},
Pattern {
id: "mem_zeroed",
description: "`std::mem::zeroed()`",
query: "(call_expression function:(scoped_identifier path:(identifier)@p (#eq? @p \"mem\") name:(identifier)@n (#eq? @n \"zeroed\")))@vuln",
severity: Severity::High,
},
Pattern {
id: "mem_forget",
description: "`std::mem::forget()`",
query: "(call_expression function:(scoped_identifier path:(identifier)@p (#eq? @p \"mem\") name:(identifier)@n (#eq? @n \"forget\")))@vuln",
severity: Severity::Medium,
},
Pattern {
id: "ptr_read",
description: "`ptr::read_*` raw-ptr read",
query: "(call_expression function:(scoped_identifier path:(identifier)@p (#eq? @p \"ptr\") name:(identifier)@n (#match? @n \"read(_volatile)?\")))@vuln",
severity: Severity::High,
},
Pattern {
id: "arc_unwrap",
description: "`Arc::unwrap_or_else_unchecked`",
query: "(call_expression function:(scoped_identifier name:(identifier)@n (#eq? @n \"unwrap_or_else_unchecked\")))@vuln",
severity: Severity::High,
},
Pattern {
id: "dbg_macro",
description: "`dbg!()` left in code",
query: "(macro_invocation (identifier)@id (#eq? @id \"dbg\"))@vuln",
id: "rs.quality.panic_macro",
description: "panic! macro invocation",
query: r#"(macro_invocation (identifier) @id (#eq? @id "panic")) @vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::High,
},
Pattern {
id: "rs.quality.todo",
description: "todo!() / unimplemented!() placeholder left in code",
query: r#"(macro_invocation
(identifier) @id
(#match? @id "^(todo|unimplemented)$"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::High,
},
// ── Tier A: Narrowing cast / resource leak ─────────────────────────
Pattern {
id: "rs.memory.narrow_cast",
description: "`as` cast to 8/16-bit integer — possible truncation",
query: r#"(type_cast_expression
type: (primitive_type) @to
(#match? @to "^(u8|i8|u16|i16)$"))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::Medium,
},
Pattern {
id: "rs.memory.mem_forget",
description: "std::mem::forget — may leak resources",
query: r#"(call_expression
function: (scoped_identifier
path: (identifier) @p (#eq? @p "mem")
name: (identifier) @n (#eq? @n "forget")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::MemorySafety,
confidence: Confidence::High,
},
];

View file

@ -1,100 +1,157 @@
use crate::patterns::{Pattern, Severity};
use crate::evidence::Confidence;
use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
/// TypeScript AST patterns.
///
/// TypeScript shares most patterns with JavaScript. Taint rules cover `eval`,
/// `innerHTML`, and `child_process.*` sinks. AST patterns here mirror JS
/// patterns plus TS-specific `any` type-safety escapes.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
Pattern {
id: "eval_call",
description: "Use of eval()",
query: "(call_expression function: (identifier) @id (#eq? @id \"eval\")) @vuln",
id: "ts.code_exec.eval",
description: "eval() — dynamic code execution",
query: r#"(call_expression
function: (identifier) @id (#eq? @id "eval"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "new_function",
description: "new Function() constructor",
query: "(new_expression constructor: (identifier) @id (#eq? @id \"Function\")) @vuln",
id: "ts.code_exec.new_function",
description: "new Function() constructor — eval equivalent",
query: r#"(new_expression
constructor: (identifier) @id (#eq? @id "Function"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
Pattern {
id: "document_write",
description: "document.write() call",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"document\") property: (property_identifier) @prop (#eq? @prop \"write\"))) @vuln",
id: "ts.code_exec.settimeout_string",
description: "setTimeout/setInterval with string argument — implicit eval",
query: r#"(call_expression
function: (identifier) @id (#match? @id "^(setTimeout|setInterval)$")
arguments: (arguments (string) @code))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: XSS sinks ──────────────────────────────────────────────
Pattern {
id: "settimeout_string",
description: "setTimeout / setInterval with a string argument",
query: "(call_expression function: (identifier) @id (#match? @id \"setTimeout|setInterval\") arguments: (arguments (string) @code . _)) @vuln",
id: "ts.xss.document_write",
description: "document.write() — XSS sink",
query: r#"(call_expression
function: (member_expression
object: (identifier) @obj (#eq? @obj "document")
property: (property_identifier) @prop (#match? @prop "^(write|writeln)$")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
Pattern {
id: "any_type",
description: "Type annotation of `any`",
query: "(type_annotation (predefined_type) @t (#eq? @t \"any\")) @vuln",
severity: Severity::Low,
},
Pattern {
id: "json_parse",
description: "JSON.parse on dynamic string",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"JSON\") property: (property_identifier) @prop (#eq? @prop \"parse\"))) @vuln",
severity: Severity::Low,
},
Pattern {
id: "as_any_assertion",
description: "Type assertion to `any` using `as any`",
query: "(as_expression type: (predefined_type) @t (#eq? @t \"any\")) @vuln",
severity: Severity::Low,
},
Pattern {
id: "type_assertion_any",
description: "Type assertion to `any` using `<any>` syntax",
query: "(type_assertion type: (predefined_type) @t (#eq? @t \"any\")) @vuln",
severity: Severity::Low,
},
Pattern {
id: "outer_html_assignment",
description: "Assignment to element.outerHTML",
query: "(assignment_expression left: (member_expression property: (property_identifier) @prop (#eq? @prop \"outerHTML\"))) @vuln",
id: "ts.xss.outer_html",
description: "Assignment to .outerHTML — XSS sink",
query: r#"(assignment_expression
left: (member_expression
property: (property_identifier) @prop (#eq? @prop "outerHTML")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
Pattern {
id: "insert_adjacent_html",
description: "insertAdjacentHTML() call",
query: "(call_expression function: (member_expression property: (property_identifier) @prop (#eq? @prop \"insertAdjacentHTML\"))) @vuln",
id: "ts.xss.insert_adjacent_html",
description: "insertAdjacentHTML() — XSS sink",
query: r#"(call_expression
function: (member_expression
property: (property_identifier) @prop (#eq? @prop "insertAdjacentHTML")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
// ── Tier A: Weak crypto ────────────────────────────────────────────
Pattern {
id: "ts.crypto.math_random",
description: "Math.random() — not cryptographically secure",
query: r#"(call_expression
function: (member_expression
object: (identifier) @obj (#eq? @obj "Math")
property: (property_identifier) @prop (#eq? @prop "random")))
@vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::Crypto,
confidence: Confidence::Medium,
},
// ── Tier A: TypeScript-specific type-safety escapes ────────────────
Pattern {
id: "ts.quality.any_annotation",
description: "Type annotation of `any` — disables type checking",
query: r#"(type_annotation (predefined_type) @t (#eq? @t "any")) @vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::Medium,
},
Pattern {
id: "document_cookie_write",
id: "ts.quality.as_any",
description: "Type assertion `as any` — type-safety escape hatch",
query: r#"(as_expression (predefined_type) @t (#eq? @t "any")) @vuln"#,
severity: Severity::Low,
tier: PatternTier::A,
category: PatternCategory::CodeQuality,
confidence: Confidence::Medium,
},
// ── Tier A: Prototype pollution ────────────────────────────────────
Pattern {
id: "ts.prototype.proto_assignment",
description: "Assignment to __proto__ — prototype pollution",
query: r#"(assignment_expression
left: (member_expression
property: (property_identifier) @prop (#eq? @prop "__proto__")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Prototype,
confidence: Confidence::High,
},
// ── Tier A: Open redirect ──────────────────────────────────────────
Pattern {
id: "ts.xss.location_assign",
description: "Assignment to location/location.href — open redirect",
query: r#"(assignment_expression
left: (member_expression
object: (identifier) @obj (#match? @obj "^(window|location|document)$")
property: (property_identifier) @prop (#match? @prop "^(location|href)$")))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::High,
},
// ── Tier A: Cookie manipulation ────────────────────────────────────
Pattern {
id: "ts.xss.cookie_write",
description: "Write to document.cookie",
query: "(assignment_expression left: (member_expression object: (identifier) @obj (#eq? @obj \"document\") property: (property_identifier) @prop (#eq? @prop \"cookie\"))) @vuln",
query: r#"(assignment_expression
left: (member_expression
object: (identifier) @obj (#eq? @obj "document")
property: (property_identifier) @prop (#eq? @prop "cookie")))
@vuln"#,
severity: Severity::Low,
},
Pattern {
id: "onclick_setattribute",
description: "Element.setAttribute('onclick', …)",
query: "(call_expression function: (member_expression property: (property_identifier) @prop (#eq? @prop \"setAttribute\")) arguments: (arguments (string) @name (#eq? @name \"\\\"onclick\\\"\") . (string) @handler)) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "math_random_call",
description: "Use of Math.random() for security-sensitive randomness",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"Math\") property: (property_identifier) @prop (#eq? @prop \"random\"))) @vuln",
severity: Severity::Low,
},
Pattern {
id: "crypto_createhash_md5",
description: "Insecure hash algorithm: crypto.createHash('md5')",
query: "(call_expression function: (member_expression object: (identifier) @obj (#eq? @obj \"crypto\") property: (property_identifier) @prop (#eq? @prop \"createHash\")) arguments: (arguments (string) @alg (#match? @alg \"(?i)\\\"md5\\\"\"))) @vuln",
severity: Severity::Medium,
},
Pattern {
id: "fetch_http_url",
description: "fetch() over plain HTTP",
query: "(call_expression function: (identifier) @id (#eq? @id \"fetch\") arguments: (arguments (string) @url (#match? @url \"^\\\"http://\"))) @vuln",
severity: Severity::Low,
},
Pattern {
id: "xhr_eval_response",
description: "eval() of XMLHttpRequest.responseText",
query: "(call_expression function: (identifier) @id (#eq? @id \"eval\") arguments: (arguments (member_expression property: (property_identifier) @prop (#eq? @prop \"responseText\")))) @vuln",
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Xss,
confidence: Confidence::Medium,
},
];

646
src/rank.rs Normal file
View file

@ -0,0 +1,646 @@
//! Attack surface ranking for scan diagnostics.
//!
//! Computes a deterministic score for each [`Diag`] using only in-memory
//! information (severity, evidence, source kind, rule ID, validation state).
//! The score is used to sort findings so that truncation keeps the most
//! exploitable / important results.
use crate::commands::scan::Diag;
use crate::evidence::Evidence;
use crate::patterns::Severity;
use std::hash::{DefaultHasher, Hash, Hasher};
/// Computed attack-surface ranking for a single diagnostic.
///
/// Produced by [`compute_attack_rank`]; `rank_diags` copies `score` onto the
/// diagnostic's `rank_score` field before sorting.
#[derive(Debug, Clone)]
pub struct AttackRank {
/// Total score: the severity base plus every bonus, minus any penalty.
/// Higher scores sort first.
pub score: f64,
/// Breakdown of score components (for debug/display purposes).
///
/// Each entry is a `(component name, formatted contribution)` pair, e.g.
/// `("severity", "60")`. Optional bonuses that evaluate to zero are omitted.
#[allow(dead_code)]
pub components: Vec<(String, String)>,
}
/// Score `diag` for attack-surface ranking.
///
/// The result is the sum of:
///
/// 1. a severity base (High 60, Medium 30, Low 10),
/// 2. an analysis-kind bonus derived from the rule ID and evidence,
/// 3. an evidence-strength / source-kind bonus,
/// 4. a per-rule bonus for state-analysis findings,
/// 5. a penalty of 5 when the path is guarded by a validation predicate.
///
/// Higher means more exploitable / important. Every non-zero contribution is
/// also recorded in [`AttackRank::components`] for debug/display.
pub fn compute_attack_rank(diag: &Diag) -> AttackRank {
    // ── 1. Severity base ────────────────────────────────────────────────
    let severity_points: f64 = match diag.severity {
        Severity::High => 60.0,
        Severity::Medium => 30.0,
        Severity::Low => 10.0,
    };
    let mut score = severity_points;
    let mut components: Vec<(String, String)> =
        vec![("severity".to_string(), severity_points.to_string())];

    // ── 2–4. Optional bonuses ───────────────────────────────────────────
    //
    // Taint-confirmed findings are the strongest signal, state findings
    // come next, CFG-structural findings follow, and AST-only matches get
    // nothing. Each bonus is only listed in `components` when non-zero.
    let bonuses = [
        (
            "analysis_kind",
            analysis_kind_bonus(&diag.id, diag.evidence.as_ref()),
        ),
        ("evidence", evidence_strength(diag)),
        ("state_rule", state_finding_bonus(&diag.id)),
    ];
    for (label, bonus) in bonuses {
        score += bonus;
        if bonus != 0.0 {
            components.push((label.to_string(), bonus.to_string()));
        }
    }

    // ── 5. Path validation penalty ──────────────────────────────────────
    //
    // A guarded taint path is still worth reporting, but the guard may stop
    // the vulnerability from being triggered. The small penalty pushes
    // validated paths below otherwise-equal unvalidated ones without moving
    // them to a different ranking tier. Structured evidence, when present,
    // is authoritative; the flag on `Diag` is only the fallback.
    let guarded = match diag.evidence.as_ref() {
        Some(ev) => ev.notes.iter().any(|note| note == "path_validated"),
        None => diag.path_validated,
    };
    if guarded {
        score -= 5.0;
        components.push(("path_validated_penalty".to_string(), "-5".to_string()));
    }

    AttackRank { score, components }
}
/// Deterministic tie-breaker key for a diagnostic.
///
/// Diagnostics with identical scores are ordered by, in turn:
/// severity rank (High = 0, Medium = 1, Low = 2, so High sorts first)
///   → rule ID → file path → line → col → hash of the message.
///
/// The returned value is meant to be compared with `Ord::cmp` inside
/// `sort_by`.
pub fn sort_key(diag: &Diag) -> impl Ord {
    // Hash the message rather than cloning it: only a stable discriminator
    // is needed, not the text itself.
    let mut hasher = DefaultHasher::new();
    diag.message.hash(&mut hasher);
    let message_digest = hasher.finish();

    let severity_rank = match diag.severity {
        Severity::High => 0_u8,
        Severity::Medium => 1_u8,
        Severity::Low => 2_u8,
    };

    (
        severity_rank,
        diag.id.clone(),
        diag.path.clone(),
        diag.line,
        diag.col,
        message_digest,
    )
}
/// Sort diagnostics in-place by descending attack-surface score, then by
/// deterministic tie-breaker. Populates `rank_score` on each `Diag`.
///
/// Scores are compared with [`f64::total_cmp`], so the comparator is a total
/// order even if a score were ever NaN (which `sort_by` requires).
pub fn rank_diags(diags: &mut [Diag]) {
    // Score each diagnostic and store the result on it; scoring does not
    // read `rank_score`, so this can be done in a single pass.
    for diag in diags.iter_mut() {
        let score = compute_attack_rank(diag).score;
        diag.rank_score = Some(score);
    }

    // Descending by score (note the swapped operands), then ascending by
    // the tie-breaker key. The key is only built when scores are equal.
    diags.sort_by(|a, b| {
        let score_a = a.rank_score.unwrap_or(0.0);
        let score_b = b.rank_score.unwrap_or(0.0);
        score_b
            .total_cmp(&score_a)
            .then_with(|| sort_key(a).cmp(&sort_key(b)))
    });
}
// ─────────────────────────────────────────────────────────────────────────────
// Scoring helpers
// ─────────────────────────────────────────────────────────────────────────────
/// Bonus for the kind of analysis that produced a finding, inferred from the
/// rule ID prefix (and, for CFG rules, whether evidence is attached).
///
/// | prefix   | bonus                                   |
/// |----------|-----------------------------------------|
/// | `taint-` | 10                                      |
/// | `state-` | 8                                       |
/// | `cfg-`   | 5 with non-empty evidence, otherwise 3  |
/// | other    | 0 (AST-only pattern match)              |
fn analysis_kind_bonus(rule_id: &str, evidence: Option<&Evidence>) -> f64 {
    // Taint-confirmed flow is the strongest signal.
    if rule_id.starts_with("taint-") {
        return 10.0;
    }
    // State-model findings (resource / auth) are strong.
    if rule_id.starts_with("state-") {
        return 8.0;
    }
    // Anything that is not CFG-structural is an AST-only pattern match.
    if !rule_id.starts_with("cfg-") {
        return 0.0;
    }
    // CFG-structural findings get a boost when backed by evidence.
    let has_evidence = matches!(evidence, Some(ev) if !ev.is_empty());
    if has_evidence { 5.0 } else { 3.0 }
}
/// Bonus from evidence strength: number of evidence items and source-kind
/// priority.
fn evidence_strength(diag: &Diag) -> f64 {
let mut bonus = 0.0;
if let Some(ev) = &diag.evidence {
// Count structured evidence items (capped at 4)
let item_count = ev.source.is_some() as usize
+ ev.sink.is_some() as usize
+ (ev.guards.len() + ev.sanitizers.len()).min(2);
bonus += item_count.min(4) as f64;
// Source-kind priority from evidence notes
for note in &ev.notes {
if let Some(kind) = note.strip_prefix("source_kind:") {
bonus += source_kind_priority(kind);
break;
}
}
} else {
// Fallback for DB-cached diags without structured evidence
bonus += (diag.labels.len() as f64).min(4.0);
for (label, value) in &diag.labels {
if label == "Source" {
bonus += source_kind_priority(value);
}
}
}
bonus
}
/// Priority bonus based on the source kind string found in evidence.
///
/// UserInput / EnvironmentConfig / Unknown are most exploitable.
/// FileSystem / Database are lower because the attacker needs a more
/// indirect vector.
///
/// `source_value` is first matched exactly against the structured
/// `SourceKind` names (from evidence notes `source_kind:X`). Anything else
/// is treated as a legacy free-form label and classified by case-insensitive
/// substring search, checking the keyword groups in this order: user input,
/// environment, file system, database. The first group that matches wins,
/// so e.g. a label containing `query` is always scored as user input.
///
/// Returns 6.0 / 5.0 / 3.0 / 2.0 for those groups respectively, and 4.0 for
/// unknown or unrecognised sources.
fn source_kind_priority(source_value: &str) -> f64 {
    // Keyword groups for legacy labels, in precedence order.
    const USER_INPUT: &[&str] = &[
        "stdin",
        "argv",
        "request",
        "form",
        "query",
        "param",
        "header",
        "body",
        "read_line",
    ];
    const ENVIRONMENT: &[&str] = &["env", "var(", "getenv"];
    const FILE_SYSTEM: &[&str] = &["read", "file", "open"];
    // "query" is deliberately absent here: it is already claimed by
    // USER_INPUT above, so listing it again could never match.
    const DATABASE: &[&str] = &["fetch", "select"];

    // Structured SourceKind enum values (from evidence.notes "source_kind:X")
    match source_value {
        "UserInput" => return 6.0,
        "EnvironmentConfig" => return 5.0,
        "FileSystem" => return 3.0,
        "Database" => return 2.0,
        "Unknown" => return 4.0,
        _ => {}
    }

    // Fallback: substring matching for legacy labels
    let lower = source_value.to_ascii_lowercase();
    let mentions_any = |keywords: &[&str]| keywords.iter().any(|&kw| lower.contains(kw));

    if mentions_any(USER_INPUT) {
        // Strong user-input signals
        6.0
    } else if mentions_any(ENVIRONMENT) {
        // Environment / config — still attacker-controllable in many deployments
        5.0
    } else if mentions_any(FILE_SYSTEM) {
        // File system — needs indirect vector
        3.0
    } else if mentions_any(DATABASE) {
        // Database — needs prior injection
        2.0
    } else {
        // Unknown / unrecognised — treat as moderately exploitable
        4.0
    }
}
/// Extra points for specific state-analysis rule IDs.
///
/// The rule ID must match exactly; any other ID (including non-state rules)
/// yields `0.0`.
fn state_finding_bonus(rule_id: &str) -> f64 {
    const RULE_BONUSES: [(&str, f64); 5] = [
        ("state-use-after-close", 6.0),
        ("state-unauthed-access", 6.0),
        ("state-double-close", 3.0),
        ("state-resource-leak", 2.0),          // must-leak
        ("state-resource-leak-possible", 1.0), // may-leak
    ];

    RULE_BONUSES
        .iter()
        .find(|(id, _)| *id == rule_id)
        .map_or(0.0, |&(_, bonus)| bonus)
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
// Unit tests for the ranking module. Most tests build legacy-style diags
// (labels only, no structured evidence) via `make_diag` and compare scores.
#[cfg(test)]
mod tests {
use super::*;
/// Build a `Diag` with no structured evidence and all optional fields unset.
/// Tests that need evidence assign `evidence` on the returned value.
fn make_diag(
severity: Severity,
id: &str,
path: &str,
line: usize,
labels: Vec<(String, String)>,
path_validated: bool,
) -> Diag {
Diag {
path: path.into(),
line,
col: 1,
severity,
id: id.into(),
category: crate::patterns::FindingCategory::Security,
path_validated,
guard_kind: None,
message: None,
labels,
confidence: None,
evidence: None,
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
}
}
// ── Ordering tests ──────────────────────────────────────────────────
#[test]
fn high_taint_user_input_ranks_above_medium_file_io() {
let high_taint = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![
("Source".into(), "read_line() at 1:1".into()),
("Sink".into(), "exec()".into()),
],
false,
);
let med_file = make_diag(
Severity::Medium,
"taint-unsanitised-flow (source 5:1)",
"src/lib.rs",
20,
vec![
("Source".into(), "File::open() at 5:1".into()),
("Sink".into(), "write()".into()),
],
false,
);
let score_high = compute_attack_rank(&high_taint).score;
let score_med = compute_attack_rank(&med_file).score;
assert!(
score_high > score_med,
"high taint user-input ({score_high}) should rank above medium file-io ({score_med})"
);
}
#[test]
fn must_leak_ranks_above_may_leak() {
// The two diags also differ in severity (Medium vs Low), so the
// severity base contributes to the gap in addition to the rule bonus.
let must = make_diag(
Severity::Medium,
"state-resource-leak",
"src/db.rs",
30,
vec![],
false,
);
let may = make_diag(
Severity::Low,
"state-resource-leak-possible",
"src/db.rs",
35,
vec![],
false,
);
let score_must = compute_attack_rank(&must).score;
let score_may = compute_attack_rank(&may).score;
assert!(
score_must > score_may,
"must-leak ({score_must}) should rank above may-leak ({score_may})"
);
}
#[test]
fn cfg_without_evidence_ranks_below_taint_confirmed() {
let taint = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![
("Source".into(), "env::var(\"CMD\") at 1:1".into()),
("Sink".into(), "exec()".into()),
],
false,
);
let cfg_only = make_diag(
Severity::High,
"cfg-unguarded-sink",
"src/main.rs",
10,
vec![],
false,
);
let score_taint = compute_attack_rank(&taint).score;
let score_cfg = compute_attack_rank(&cfg_only).score;
assert!(
score_taint > score_cfg,
"taint-confirmed ({score_taint}) should rank above cfg-only ({score_cfg})"
);
}
#[test]
fn determinism_input_order_independent() {
let d1 = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"a.rs",
1,
vec![("Source".into(), "stdin at 1:1".into())],
false,
);
let d2 = make_diag(
Severity::Medium,
"cfg-unguarded-sink",
"b.rs",
2,
vec![],
false,
);
let d3 = make_diag(Severity::Low, "rs.code_exec.eval", "c.rs", 3, vec![], false);
// Same three diags, two different input orders.
let mut order_a = vec![d1.clone(), d2.clone(), d3.clone()];
let mut order_b = vec![d3, d1, d2];
rank_diags(&mut order_a);
rank_diags(&mut order_b);
let ids_a: Vec<_> = order_a.iter().map(|d| (&d.id, d.line)).collect();
let ids_b: Vec<_> = order_b.iter().map(|d| (&d.id, d.line)).collect();
assert_eq!(
ids_a, ids_b,
"ranking must be deterministic regardless of input order"
);
}
#[test]
fn path_validated_penalty_applied() {
// Identical diags except for the `path_validated` flag; with no
// structured evidence the flag on `Diag` decides the penalty.
let unvalidated = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![("Source".into(), "env::var(\"X\") at 1:1".into())],
false,
);
let validated = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![("Source".into(), "env::var(\"X\") at 1:1".into())],
true,
);
let score_unval = compute_attack_rank(&unvalidated).score;
let score_val = compute_attack_rank(&validated).score;
assert!(
score_unval > score_val,
"unvalidated ({score_unval}) should rank above validated ({score_val})"
);
}
#[test]
fn state_use_after_close_ranks_above_may_leak() {
let uac = make_diag(
Severity::High,
"state-use-after-close",
"x.rs",
1,
vec![],
false,
);
let may = make_diag(
Severity::Low,
"state-resource-leak-possible",
"x.rs",
2,
vec![],
false,
);
let score_uac = compute_attack_rank(&uac).score;
let score_may = compute_attack_rank(&may).score;
assert!(score_uac > score_may);
}
#[test]
fn unauthed_access_ranks_above_resource_leak() {
let unauth = make_diag(
Severity::High,
"state-unauthed-access",
"x.rs",
1,
vec![],
false,
);
let leak = make_diag(
Severity::Medium,
"state-resource-leak",
"x.rs",
2,
vec![],
false,
);
let score_ua = compute_attack_rank(&unauth).score;
let score_lk = compute_attack_rank(&leak).score;
assert!(score_ua > score_lk);
}
#[test]
fn ast_only_ranks_below_all_others_at_same_severity() {
// All four diags share High severity, so only the analysis-kind,
// evidence and state bonuses separate them.
let ast = make_diag(
Severity::High,
"rs.code_exec.eval",
"x.rs",
1,
vec![],
false,
);
let cfg = make_diag(
Severity::High,
"cfg-unguarded-sink",
"x.rs",
2,
vec![],
false,
);
let taint = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"x.rs",
3,
vec![("Source".into(), "env::var(\"X\") at 1:1".into())],
false,
);
let state = make_diag(
Severity::High,
"state-use-after-close",
"x.rs",
4,
vec![],
false,
);
let s_ast = compute_attack_rank(&ast).score;
let s_cfg = compute_attack_rank(&cfg).score;
let s_taint = compute_attack_rank(&taint).score;
let s_state = compute_attack_rank(&state).score;
assert!(s_ast < s_cfg, "AST ({s_ast}) < CFG ({s_cfg})");
assert!(s_ast < s_taint, "AST ({s_ast}) < taint ({s_taint})");
assert!(s_ast < s_state, "AST ({s_ast}) < state ({s_state})");
}
#[test]
fn structured_evidence_source_kind_matches_legacy() {
// Structured evidence with source_kind:UserInput note should give
// the same source-kind bonus as a legacy "Source" label with user input.
// Both sides also contribute 2 item points (source + sink vs. two
// labels), so the total scores are expected to be equal.
let mut structured = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![],
false,
);
structured.evidence = Some(crate::evidence::Evidence {
source: Some(crate::evidence::SpanEvidence {
path: "src/main.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("read_line()".into()),
}),
sink: Some(crate::evidence::SpanEvidence {
path: "src/main.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: Some("exec()".into()),
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec!["source_kind:UserInput".into()],
});
let legacy = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![
("Source".into(), "read_line() at 1:1".into()),
("Sink".into(), "exec()".into()),
],
false,
);
let score_structured = compute_attack_rank(&structured).score;
let score_legacy = compute_attack_rank(&legacy).score;
assert_eq!(
score_structured, score_legacy,
"structured ({score_structured}) should equal legacy ({score_legacy})"
);
}
#[test]
fn evidence_item_count_capped_at_4() {
let mut d = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![],
false,
);
let span = || crate::evidence::SpanEvidence {
path: "x.rs".into(),
line: 1,
col: 1,
kind: "guard".into(),
snippet: None,
};
d.evidence = Some(crate::evidence::Evidence {
source: Some(span()),
sink: Some(span()),
guards: vec![span(), span(), span()], // 3 guards
sanitizers: vec![span()], // 1 sanitizer
state: None,
notes: vec![],
});
// item_count = 1 (source) + 1 (sink) + min(2, 3 guards + 1 sanitizer) = 4
// `notes` is empty, so no source-kind priority is added and the
// evidence bonus is exactly the capped item count: 4.0.
let score = evidence_strength(&d);
assert!(
(score - 4.0).abs() < f64::EPSILON,
"evidence item count should be capped at 4, got {score}"
);
}
#[test]
fn path_validated_from_evidence_notes() {
let mut d = make_diag(
Severity::High,
"taint-unsanitised-flow (source 1:1)",
"src/main.rs",
10,
vec![],
false, // path_validated is false on Diag
);
// When evidence is present, the "path_validated" note (not the flag
// on `Diag`) decides whether the penalty applies.
d.evidence = Some(crate::evidence::Evidence {
source: None,
sink: None,
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec!["path_validated".into()],
});
let rank = compute_attack_rank(&d);
assert!(
rank.components
.iter()
.any(|(k, _)| k == "path_validated_penalty"),
"path_validated note in evidence should trigger penalty"
);
}
}

313
src/state/domain.rs Normal file
View file

@ -0,0 +1,313 @@
use super::lattice::Lattice;
use super::symbol::SymbolId;
use bitflags::bitflags;
use std::collections::{HashMap, HashSet};
// ── ResourceLifecycle ────────────────────────────────────────────────────
bitflags! {
/// Bitset of possible lifecycle states for a single resource handle.
///
/// Join = bitwise OR (a variable may be in multiple states across paths).
/// The empty set is the lattice bottom (no state observed yet).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct ResourceLifecycle: u8 {
// Each state occupies its own bit so that states from different
// paths can be OR-ed together without losing information.
const UNINIT = 0b0001;
// OPEN alone at function exit is treated as a definite leak;
// OPEN together with CLOSED means "leaks on some paths".
const OPEN = 0b0010;
const CLOSED = 0b0100;
const MOVED = 0b1000;
}
}
/// Powerset lattice over the lifecycle flags, ordered by set inclusion.
impl Lattice for ResourceLifecycle {
    /// Bottom is the empty flag set.
    fn bot() -> Self {
        Self::empty()
    }

    /// Join is set union: every state seen on either side is kept.
    fn join(&self, other: &Self) -> Self {
        self.union(*other)
    }

    /// `self ⊑ other` iff every flag set in `self` is also set in `other`.
    fn leq(&self, other: &Self) -> bool {
        other.contains(*self)
    }
}
// ── ResourceDomainState ──────────────────────────────────────────────────
/// Abstract resource state: one lifecycle bitset per interned variable ID.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct ResourceDomainState {
    /// Lifecycle bitset for each tracked symbol. A symbol with no entry is
    /// read back as the empty bitset by [`ResourceDomainState::get`].
    pub vars: HashMap<SymbolId, ResourceLifecycle>,
}

impl ResourceDomainState {
    /// Create a state that tracks no variables.
    pub fn new() -> Self {
        Self {
            vars: HashMap::new(),
        }
    }

    /// Lifecycle recorded for `sym`, or the empty bitset when untracked.
    pub fn get(&self, sym: SymbolId) -> ResourceLifecycle {
        match self.vars.get(&sym) {
            Some(&lifecycle) => lifecycle,
            None => ResourceLifecycle::empty(),
        }
    }

    /// Overwrite the lifecycle recorded for `sym` with `state`.
    pub fn set(&mut self, sym: SymbolId, state: ResourceLifecycle) {
        let _previous = self.vars.insert(sym, state);
    }
}
/// Point-wise lifting of the [`ResourceLifecycle`] lattice to a map; a
/// missing key is treated as the empty bitset.
impl Lattice for ResourceDomainState {
    /// Bottom tracks no variables at all.
    fn bot() -> Self {
        Self::new()
    }

    /// Key-wise join: keys from both sides are kept, and a key present on
    /// both sides gets the union of the two bitsets.
    fn join(&self, other: &Self) -> Self {
        let mut merged = self.clone();
        for (&sym, &incoming) in &other.vars {
            merged
                .vars
                .entry(sym)
                .and_modify(|current| *current = current.join(&incoming))
                .or_insert(incoming);
        }
        merged
    }

    /// `self ⊑ other` iff each of `self`'s bitsets is contained in the
    /// bitset `other` holds for the same symbol.
    fn leq(&self, other: &Self) -> bool {
        self.vars
            .iter()
            .all(|(&sym, lifecycle)| lifecycle.leq(&other.get(sym)))
    }
}
// ── AuthLevel ────────────────────────────────────────────────────────────
/// Simple ordered lattice for path authentication state.
///
/// Bot = `Unauthed`. Join = `min` (conservative: if any path is unauthed,
/// the joined state is unauthed).
///
/// The derived `Ord` follows declaration order, so
/// `Unauthed < Authed < Admin`; the `Lattice` impl relies on that ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum AuthLevel {
Unauthed,
Authed,
Admin,
}
/// Lattice over privilege levels in which joining two paths keeps the
/// *least* privileged level.
impl Lattice for AuthLevel {
    /// Returns `Unauthed`.
    ///
    // NOTE(review): with `join = min`, `Unauthed` absorbs every other level
    // (`x.join(&bot()) == Unauthed`) rather than acting as an identity —
    // confirm no caller relies on `x.join(&bot()) == x`.
    fn bot() -> Self {
        AuthLevel::Unauthed
    }

    /// Conservative merge: the lower (least privileged) of the two levels.
    fn join(&self, other: &Self) -> Self {
        std::cmp::min(*self, *other)
    }

    /// Order induced by `join = min`: `a ⊑ b` iff `a.join(b) == b`, which
    /// holds exactly when `a` is at least as privileged as `b`
    /// (so `Admin ⊑ Authed ⊑ Unauthed`).
    fn leq(&self, other: &Self) -> bool {
        other <= self
    }
}
// ── AuthDomainState ──────────────────────────────────────────────────────
/// Authentication state of a path: its privilege level plus the set of
/// variables that have been validated on it.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct AuthDomainState {
    /// Privilege level established so far on this path.
    pub auth_level: AuthLevel,
    /// Symbols carrying the "validated" bit.
    pub validated: HashSet<SymbolId>,
}

impl Default for AuthDomainState {
    /// Same as [`AuthDomainState::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl AuthDomainState {
    /// Unauthenticated state with no validated variables.
    pub fn new() -> Self {
        Self {
            auth_level: AuthLevel::Unauthed,
            validated: HashSet::new(),
        }
    }
}
/// Product of the [`AuthLevel`] lattice and a "validated on all paths" set.
impl Lattice for AuthDomainState {
    /// Bottom is the fresh state: unauthenticated, nothing validated.
    fn bot() -> Self {
        Self::new()
    }

    /// Merge two paths: the auth level is joined, and a variable stays
    /// validated only if it is validated on BOTH sides.
    fn join(&self, other: &Self) -> Self {
        let validated_on_both = self
            .validated
            .iter()
            .filter(|sym| other.validated.contains(*sym))
            .copied()
            .collect();
        Self {
            auth_level: self.auth_level.join(&other.auth_level),
            validated: validated_on_both,
        }
    }

    /// `self ⊑ other` iff the auth levels are ordered that way and every
    /// variable validated in `other` is also validated in `self`.
    fn leq(&self, other: &Self) -> bool {
        if !self.auth_level.leq(&other.auth_level) {
            return false;
        }
        other.validated.is_subset(&self.validated)
    }
}
// ── ProductState ─────────────────────────────────────────────────────────
/// Combined abstract state: the resource domain and the auth domain side by
/// side, each evolving independently.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ProductState {
    /// Per-variable resource lifecycle state.
    pub resource: ResourceDomainState,
    /// Path authentication state.
    pub auth: AuthDomainState,
}

impl ProductState {
    /// State to seed an analysis with: no tracked resources, unauthenticated,
    /// nothing validated.
    pub fn initial() -> Self {
        let resource = ResourceDomainState::new();
        let auth = AuthDomainState::new();
        Self { resource, auth }
    }
}
/// Component-wise lattice: every operation is delegated to both domains.
impl Lattice for ProductState {
    /// Bottom of both component domains.
    fn bot() -> Self {
        let resource = ResourceDomainState::bot();
        let auth = AuthDomainState::bot();
        Self { resource, auth }
    }

    /// Join each component with its counterpart in `other`.
    fn join(&self, other: &Self) -> Self {
        let resource = self.resource.join(&other.resource);
        let auth = self.auth.join(&other.auth);
        Self { resource, auth }
    }

    /// `self ⊑ other` iff it holds in both components.
    fn leq(&self, other: &Self) -> bool {
        if !self.resource.leq(&other.resource) {
            return false;
        }
        self.auth.leq(&other.auth)
    }
}
// Unit tests for the lattice operations of each abstract domain.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn resource_lifecycle_join_is_or() {
let a = ResourceLifecycle::OPEN;
let b = ResourceLifecycle::CLOSED;
assert_eq!(
a.join(&b),
ResourceLifecycle::OPEN | ResourceLifecycle::CLOSED
);
}
#[test]
fn resource_lifecycle_bot_identity() {
// Joining with bottom (the empty set) must leave the value unchanged.
let a = ResourceLifecycle::OPEN;
assert_eq!(a.join(&ResourceLifecycle::bot()), a);
}
#[test]
fn resource_lifecycle_leq() {
// leq is subset inclusion: {OPEN} ⊑ {OPEN, CLOSED} but not vice versa.
let a = ResourceLifecycle::OPEN;
let b = ResourceLifecycle::OPEN | ResourceLifecycle::CLOSED;
assert!(a.leq(&b));
assert!(!b.leq(&a));
}
#[test]
fn resource_domain_join_merges_keys() {
let mut a = ResourceDomainState::new();
let mut b = ResourceDomainState::new();
let sym_x = SymbolId(0);
let sym_y = SymbolId(1);
a.set(sym_x, ResourceLifecycle::OPEN);
b.set(sym_x, ResourceLifecycle::CLOSED);
b.set(sym_y, ResourceLifecycle::OPEN);
let joined = a.join(&b);
// Shared key: union of both bitsets.
assert_eq!(
joined.get(sym_x),
ResourceLifecycle::OPEN | ResourceLifecycle::CLOSED
);
// Key only present on one side is carried over unchanged.
assert_eq!(joined.get(sym_y), ResourceLifecycle::OPEN);
}
#[test]
fn auth_level_join_is_min() {
assert_eq!(
AuthLevel::Admin.join(&AuthLevel::Unauthed),
AuthLevel::Unauthed
);
assert_eq!(AuthLevel::Authed.join(&AuthLevel::Admin), AuthLevel::Authed);
assert_eq!(
AuthLevel::Authed.join(&AuthLevel::Authed),
AuthLevel::Authed
);
}
#[test]
fn auth_domain_join_intersects_validated() {
let sym_a = SymbolId(0);
let sym_b = SymbolId(1);
let sym_c = SymbolId(2);
let a = AuthDomainState {
auth_level: AuthLevel::Authed,
validated: [sym_a, sym_b].into_iter().collect(),
};
let b = AuthDomainState {
auth_level: AuthLevel::Admin,
validated: [sym_b, sym_c].into_iter().collect(),
};
let joined = a.join(&b);
// Level is the lower of the two; only sym_b is validated on both sides.
assert_eq!(joined.auth_level, AuthLevel::Authed);
assert_eq!(joined.validated, [sym_b].into_iter().collect());
}
#[test]
fn product_state_join() {
// Joining the initial state with itself must be a no-op.
let a = ProductState::initial();
let b = ProductState::initial();
let joined = a.join(&b);
assert_eq!(joined, ProductState::initial());
}
#[test]
fn may_must_leak_semantics() {
// Must-leak: OPEN only
let must_leak = ResourceLifecycle::OPEN;
assert!(must_leak.contains(ResourceLifecycle::OPEN));
assert!(!must_leak.contains(ResourceLifecycle::CLOSED));
assert!(!must_leak.contains(ResourceLifecycle::MOVED));
// May-leak: OPEN | CLOSED (some paths close, some don't)
let may_leak = ResourceLifecycle::OPEN | ResourceLifecycle::CLOSED;
assert!(may_leak.contains(ResourceLifecycle::OPEN));
assert!(may_leak.contains(ResourceLifecycle::CLOSED));
// No leak: CLOSED only
let no_leak = ResourceLifecycle::CLOSED;
assert!(!no_leak.contains(ResourceLifecycle::OPEN));
assert!(no_leak.contains(ResourceLifecycle::CLOSED));
}
// SymbolId is a newtype used in domain tests; ensure it's Copy
#[test]
fn symbol_id_is_copy() {
let s = SymbolId(0);
let s2 = s;
// `s` is still usable after the assignment only if SymbolId is Copy.
assert_eq!(s, s2);
}
}

288
src/state/engine.rs Normal file
View file

@ -0,0 +1,288 @@
use super::lattice::Lattice;
use crate::cfg::{Cfg, EdgeKind, NodeInfo};
use petgraph::graph::NodeIndex;
use petgraph::visit::EdgeRef;
use std::collections::{HashMap, VecDeque};
/// Maximum tracked variables per function (guarded degradation).
// NOTE(review): this limit is not consulted by `run_forward` in this file;
// presumably it is enforced where symbols are interned or tracked — confirm.
pub const MAX_TRACKED_VARS: usize = 64;
/// Default worklist iteration budget.
///
/// This is the value returned by the default implementation of
/// [`Transfer::iteration_budget`].
pub const MAX_WORKLIST_ITERATIONS: usize = 100_000;
/// Generic transfer function trait for forward dataflow analysis.
///
/// Domains implement this to define how abstract state flows through
/// CFG nodes and what events (findings) are emitted.
pub trait Transfer<S: Lattice> {
/// Side-channel events emitted during transfer (e.g., findings, violations).
type Event: Clone;
/// Apply the transfer function to a node, returning the output state
/// and any events.
///
/// * `node` / `info` — the CFG node being processed and its payload.
/// * `edge` — the kind of the outgoing edge the state is being propagated
///   along, or `None` when the node has no outgoing edges.
/// * `state` — the abstract state at the entry of `node`.
fn apply(
&self,
node: NodeIndex,
info: &NodeInfo,
edge: Option<EdgeKind>,
state: S,
) -> (S, Vec<Self::Event>);
/// Per-domain iteration budget. Defaults to [`MAX_WORKLIST_ITERATIONS`].
fn iteration_budget(&self) -> usize {
MAX_WORKLIST_ITERATIONS
}
/// Called when the budget is exhausted. Returns true if the engine
/// should continue with the current (non-converged) state, false to bail.
///
/// The default implementation returns `false`.
fn on_budget_exceeded(&self) -> bool {
false
}
}
/// Result of running the forward dataflow engine.
///
/// `S` is the abstract state type and `E` the transfer function's event type.
pub struct DataflowResult<S, E> {
/// Converged state at the entry of each node.
///
/// Only nodes that received a state (the entry node and everything the
/// analysis propagated to) have an entry.
pub states: HashMap<NodeIndex, S>,
/// Events emitted during Phase 2 transfer over converged states.
pub events: Vec<E>,
/// Whether the analysis converged (false if budget was hit).
#[allow(dead_code)]
pub converged: bool,
}
/// Run a forward worklist dataflow analysis over the CFG.
///
/// Two-phase design:
/// - Phase 1: fixed-point iteration to converge states (no event collection).
/// - Phase 2: single pass over converged states to collect events.
///
/// Termination is guaranteed by lattice finiteness + iteration budget.
///
/// # Budget handling
///
/// Once the number of processed worklist items exceeds
/// [`Transfer::iteration_budget`], the result is marked as not converged and
/// [`Transfer::on_budget_exceeded`] is consulted exactly once: `false` (the
/// default) stops Phase 1 immediately, `true` lets it keep iterating. In
/// both cases Phase 2 still runs over whatever states were computed.
///
/// # Determinism
///
/// Phase 2 visits nodes in ascending `NodeIndex` order, so the order of
/// `events` does not depend on hash-map iteration order.
pub fn run_forward<S: Lattice, T: Transfer<S>>(
    cfg: &Cfg,
    entry: NodeIndex,
    transfer: &T,
    initial: S,
) -> DataflowResult<S, T::Event> {
    let budget = transfer.iteration_budget();

    // Initialize entry node
    let mut states: HashMap<NodeIndex, S> = HashMap::new();
    states.insert(entry, initial);

    // ── Phase 1: fixed-point iteration (compute converged states) ─────
    let mut worklist: VecDeque<NodeIndex> = VecDeque::new();
    // Mirrors the worklist contents so membership checks are O(1) instead
    // of a linear scan of the queue.
    let mut queued: std::collections::HashSet<NodeIndex> = std::collections::HashSet::new();
    worklist.push_back(entry);
    queued.insert(entry);

    let mut iterations: usize = 0;
    let mut converged = true;

    while let Some(node) = worklist.pop_front() {
        queued.remove(&node);

        iterations += 1;
        // `converged` doubles as "budget not yet reported", so the hook is
        // invoked only the first time the budget is crossed.
        if converged && iterations > budget {
            converged = false;
            if !transfer.on_budget_exceeded() {
                break;
            }
        }

        let node_state = match states.get(&node) {
            Some(s) => s.clone(),
            None => continue,
        };
        let info = &cfg[node];

        // A node without outgoing edges (exit/dead end) simply propagates
        // nothing here; its events are collected in Phase 2.
        for edge in cfg.edges(node) {
            let edge_kind = *edge.weight();
            let target = edge.target();

            let (out_state, _events) =
                transfer.apply(node, info, Some(edge_kind), node_state.clone());

            // Join into target's state; `None` means nothing changed.
            let updated = match states.get(&target) {
                Some(existing) => {
                    let joined = existing.join(&out_state);
                    (joined != *existing).then_some(joined)
                }
                None => Some(out_state),
            };

            if let Some(new_state) = updated {
                states.insert(target, new_state);
                if queued.insert(target) {
                    worklist.push_back(target);
                }
            }
        }
    }

    // ── Phase 2: single pass over converged states to collect events ──
    let mut events: Vec<T::Event> = Vec::new();
    let mut seen_edges: std::collections::HashSet<(NodeIndex, NodeIndex)> =
        std::collections::HashSet::new();

    // Sort so that event order is reproducible across runs.
    let mut visit_order: Vec<NodeIndex> = states.keys().copied().collect();
    visit_order.sort_unstable();

    for node in visit_order {
        let node_state = match states.get(&node) {
            Some(s) => s,
            None => continue,
        };
        let info = &cfg[node];
        let edges: Vec<_> = cfg.edges(node).map(|e| (*e.weight(), e.target())).collect();

        if edges.is_empty() {
            // Exit / dead end — apply transfer for event collection.
            let (_out_state, new_events) = transfer.apply(node, info, None, node_state.clone());
            events.extend(new_events);
            continue;
        }

        for (edge_kind, target) in edges {
            // Parallel edges to the same target are processed once so the
            // same node does not report duplicate events.
            if !seen_edges.insert((node, target)) {
                continue;
            }
            let (_out_state, new_events) =
                transfer.apply(node, info, Some(edge_kind), node_state.clone());
            events.extend(new_events);
        }
    }

    DataflowResult {
        states,
        events,
        converged,
    }
}
// Engine tests: small hand-built CFGs run through `run_forward` with the
// default transfer function and the C resource pairs.
#[cfg(test)]
mod tests {
use super::*;
use crate::cfg::{EdgeKind, NodeInfo, StmtKind};
use crate::cfg_analysis::rules;
use crate::state::domain::ResourceLifecycle;
use crate::state::symbol::SymbolInterner;
use crate::state::transfer::DefaultTransfer;
use crate::symbol::Lang;
use petgraph::Graph;
/// Build a `NodeInfo` of the given kind with every other field empty;
/// tests override individual fields with struct-update syntax.
fn make_node(kind: StmtKind) -> NodeInfo {
NodeInfo {
kind,
span: (0, 0),
label: None,
defines: None,
uses: vec![],
callee: None,
enclosing_func: None,
call_ordinal: 0,
condition_text: None,
condition_vars: vec![],
condition_negated: false,
}
}
#[test]
fn linear_cfg_converges() {
use crate::state::domain::ProductState;
// Entry → fopen(f) → fclose(f) → Exit
let mut cfg: Cfg = Graph::new();
let entry = cfg.add_node(make_node(StmtKind::Entry));
let open_node = cfg.add_node(NodeInfo {
kind: StmtKind::Call,
defines: Some("f".into()),
callee: Some("fopen".into()),
..make_node(StmtKind::Call)
});
let close_node = cfg.add_node(NodeInfo {
kind: StmtKind::Call,
uses: vec!["f".into()],
callee: Some("fclose".into()),
..make_node(StmtKind::Call)
});
let exit = cfg.add_node(make_node(StmtKind::Exit));
cfg.add_edge(entry, open_node, EdgeKind::Seq);
cfg.add_edge(open_node, close_node, EdgeKind::Seq);
cfg.add_edge(close_node, exit, EdgeKind::Seq);
let interner = SymbolInterner::from_cfg(&cfg);
let transfer = DefaultTransfer {
lang: Lang::C,
resource_pairs: rules::resource_pairs(Lang::C),
interner: &interner,
};
let result = run_forward(&cfg, entry, &transfer, ProductState::initial());
// No events (clean open→close)
assert!(result.events.is_empty());
assert!(result.converged);
// At exit, f should be CLOSED
let sym_f = interner.get("f").unwrap();
let exit_state = result.states.get(&exit).unwrap();
assert_eq!(exit_state.resource.get(sym_f), ResourceLifecycle::CLOSED);
}
#[test]
fn diamond_cfg_joins_states() {
use crate::state::domain::ProductState;
//        Entry
//          |
//       fopen(f)
//          |
//          If
//        /    \
//  fclose(f)  (no close)
//        \    /
//         Exit
let mut cfg: Cfg = Graph::new();
let entry = cfg.add_node(make_node(StmtKind::Entry));
let open_node = cfg.add_node(NodeInfo {
kind: StmtKind::Call,
defines: Some("f".into()),
callee: Some("fopen".into()),
..make_node(StmtKind::Call)
});
let if_node = cfg.add_node(make_node(StmtKind::If));
let close_node = cfg.add_node(NodeInfo {
kind: StmtKind::Call,
uses: vec!["f".into()],
callee: Some("fclose".into()),
..make_node(StmtKind::Call)
});
let no_close = cfg.add_node(make_node(StmtKind::Seq));
let exit = cfg.add_node(make_node(StmtKind::Exit));
cfg.add_edge(entry, open_node, EdgeKind::Seq);
cfg.add_edge(open_node, if_node, EdgeKind::Seq);
cfg.add_edge(if_node, close_node, EdgeKind::True);
cfg.add_edge(if_node, no_close, EdgeKind::False);
cfg.add_edge(close_node, exit, EdgeKind::Seq);
cfg.add_edge(no_close, exit, EdgeKind::Seq);
let interner = SymbolInterner::from_cfg(&cfg);
let transfer = DefaultTransfer {
lang: Lang::C,
resource_pairs: rules::resource_pairs(Lang::C),
interner: &interner,
};
let result = run_forward(&cfg, entry, &transfer, ProductState::initial());
// At exit, f should be OPEN | CLOSED (may-leak)
let sym_f = interner.get("f").unwrap();
let exit_state = result.states.get(&exit).unwrap();
assert_eq!(
exit_state.resource.get(sym_f),
ResourceLifecycle::OPEN | ResourceLifecycle::CLOSED
);
}
}

355
src/state/facts.rs Normal file
View file

@ -0,0 +1,355 @@
use super::domain::{AuthLevel, ProductState, ResourceLifecycle};
use super::engine::DataflowResult;
use super::symbol::SymbolInterner;
use super::transfer::{TransferEvent, TransferEventKind};
use crate::cfg::{Cfg, StmtKind};
use crate::labels::{Cap, DataLabel};
use crate::patterns::Severity;
use crate::symbol::Lang;
use petgraph::visit::IntoNodeReferences;
/// Normalize a callee description for display.
///
/// Thin wrapper that forwards `s` to `crate::fmt::normalize_snippet` and
/// returns its result unchanged.
// NOTE(review): the exact normalisation rules live in `normalize_snippet`,
// which is not visible from this file — check there before relying on them.
fn sanitize_desc(s: &str) -> String {
crate::fmt::normalize_snippet(s)
}
/// A finding produced by state analysis.
#[derive(Debug, Clone)]
pub struct StateFinding {
/// Rule identifier, e.g. `"state-use-after-close"` or `"state-resource-leak"`.
pub rule_id: String,
/// Severity assigned to the finding.
pub severity: Severity,
/// Source span the finding is reported at, copied from a CFG node's `span`.
pub span: (usize, usize),
/// Human-readable description of the problem.
pub message: String,
/// State machine that produced this finding: `"resource"` or `"auth"`.
pub machine: &'static str,
/// Variable name involved, if available.
pub subject: Option<String>,
/// State before the event (e.g. `"closed"`, `"open"`, `"unauthed"`).
pub from_state: &'static str,
/// State after the event (e.g. `"used"`, `"closed"`, `"leaked"`, `"access"`).
pub to_state: &'static str,
}
/// Extract findings from converged dataflow state + transfer events.
pub fn extract_findings(
result: &DataflowResult<ProductState, TransferEvent>,
cfg: &Cfg,
interner: &SymbolInterner,
lang: Lang,
func_summaries: &crate::cfg::FuncSummaries,
) -> Vec<StateFinding> {
let mut findings = Vec::new();
// ── 1. Use-after-close from transfer events ──────────────────────────
for event in &result.events {
let info = &cfg[event.node];
let var_name = interner.resolve(event.var);
match event.kind {
TransferEventKind::UseAfterClose => {
findings.push(StateFinding {
rule_id: "state-use-after-close".into(),
severity: Severity::High,
span: info.span,
message: format!("variable `{var_name}` used after close"),
machine: "resource",
subject: Some(var_name.to_string()),
from_state: "closed",
to_state: "used",
});
}
TransferEventKind::DoubleClose => {
findings.push(StateFinding {
rule_id: "state-double-close".into(),
severity: Severity::Medium,
span: info.span,
message: format!("variable `{var_name}` closed twice"),
machine: "resource",
subject: Some(var_name.to_string()),
from_state: "closed",
to_state: "closed",
});
}
}
}
// ── 2. Resource leaks at Exit and function-Return nodes ──────────────
for (idx, info) in cfg.node_references() {
// Check both the file-level Exit node and the *synthesised* function
// exit node (a Return node). Skip early-return nodes — they flow
// into the synthesised exit and carry only path-specific state.
// The synthesised exit is the one Return node that does NOT have an
// outgoing edge to another Return in the same function.
let is_exit = info.kind == StmtKind::Exit;
let is_func_exit = info.kind == StmtKind::Return && info.enclosing_func.is_some();
if !is_exit && !is_func_exit {
continue;
}
if is_func_exit {
use petgraph::Direction;
let is_early_return = cfg
.neighbors_directed(idx, Direction::Outgoing)
.any(|succ| {
let s = &cfg[succ];
s.kind == StmtKind::Return && s.enclosing_func == info.enclosing_func
});
if is_early_return {
continue;
}
}
let Some(state) = result.states.get(&idx) else {
continue;
};
for (&sym, &lifecycle) in &state.resource.vars {
if !lifecycle.contains(ResourceLifecycle::OPEN) {
continue;
}
let var_name = interner.resolve(sym);
if !lifecycle.contains(ResourceLifecycle::CLOSED)
&& !lifecycle.contains(ResourceLifecycle::MOVED)
{
// Definite leak: open on all paths, never closed
// Find the acquire span by scanning backwards for this variable's define
let acquire_span = find_acquire_span(cfg, sym, interner);
findings.push(StateFinding {
rule_id: "state-resource-leak".into(),
severity: Severity::Medium,
span: acquire_span.unwrap_or(info.span),
message: format!("resource `{var_name}` is never closed"),
machine: "resource",
subject: Some(var_name.to_string()),
from_state: "open",
to_state: "leaked",
});
} else if lifecycle.contains(ResourceLifecycle::CLOSED) {
// May-leak: open on some paths, closed on others
let acquire_span = find_acquire_span(cfg, sym, interner);
findings.push(StateFinding {
rule_id: "state-resource-leak-possible".into(),
severity: Severity::Low,
span: acquire_span.unwrap_or(info.span),
message: format!("resource `{var_name}` may not be closed on all paths"),
machine: "resource",
subject: Some(var_name.to_string()),
from_state: "open",
to_state: "possibly_leaked",
});
}
}
}
// ── 3. Auth-required sinks ───────────────────────────────────────────
// Check if any function is a web entrypoint
let has_web_entrypoint = cfg.node_references().any(|(_, info)| {
if let Some(ref func_name) = info.enclosing_func {
is_web_entrypoint_simple(func_name, lang, func_summaries, cfg)
} else {
false
}
});
if has_web_entrypoint {
for (idx, info) in cfg.node_references() {
if !is_privileged_sink(info) {
continue;
}
let Some(state) = result.states.get(&idx) else {
continue;
};
if state.auth.auth_level == AuthLevel::Unauthed {
let callee_desc = sanitize_desc(info.callee.as_deref().unwrap_or("(sensitive op)"));
findings.push(StateFinding {
rule_id: "state-unauthed-access".into(),
severity: Severity::High,
span: info.span,
message: format!(
"sensitive operation `{callee_desc}` reached without authentication"
),
machine: "auth",
subject: None,
from_state: "unauthed",
to_state: "access",
});
}
}
}
// Dedup
findings.sort_by(|a, b| a.span.cmp(&b.span).then_with(|| a.rule_id.cmp(&b.rule_id)));
findings.dedup_by(|a, b| a.span == b.span && a.rule_id == b.rule_id);
findings
}
/// Locate the span of the call that defined the variable `sym`.
///
/// Walks the CFG nodes in iteration order and returns the span of the first
/// `Call` node whose `defines` equals the resolved name of `sym`. The match
/// is purely by name; the callee is not checked to be an acquire function.
/// Returns `None` when no such node exists.
fn find_acquire_span(
    cfg: &Cfg,
    sym: super::symbol::SymbolId,
    interner: &SymbolInterner,
) -> Option<(usize, usize)> {
    let wanted = interner.resolve(sym);
    cfg.node_references()
        .find(|(_, info)| info.kind == StmtKind::Call && info.defines.as_deref() == Some(wanted))
        .map(|(_, info)| info.span)
}
/// Report whether `info` is labelled as a sink with shell-execution or
/// file-I/O capability.
///
/// True only for `DataLabel::Sink` labels whose capability set intersects
/// `Cap::SHELL_ESCAPE | Cap::FILE_IO`; unlabelled nodes and every other
/// label yield `false`.
fn is_privileged_sink(info: &crate::cfg::NodeInfo) -> bool {
    matches!(
        info.label,
        Some(DataLabel::Sink(caps)) if caps.intersects(Cap::SHELL_ESCAPE | Cap::FILE_IO)
    )
}
/// Simplified web entrypoint check (avoids AnalysisContext dependency).
///
/// Decision, on the ASCII-lowercased function name:
/// - `main` is never an entrypoint (typically a CLI entry).
/// - Names starting with `handle_`, `route_` or `api_` are always entrypoints.
/// - Names starting with `serve_` or `process_`, or equal to `handler`, are
///   entrypoints only if a web-like parameter name (per `lang`) is found.
/// - Everything else is not an entrypoint.
///
/// NOTE(review): the parameter scan looks at *every* entry of
/// `func_summaries`, not only the summary of `func_name` — confirm whether
/// that is intended.
///
/// `_cfg` is accepted but unused.
fn is_web_entrypoint_simple(
    func_name: &str,
    lang: Lang,
    func_summaries: &crate::cfg::FuncSummaries,
    _cfg: &Cfg,
) -> bool {
    // Prefixes that qualify on their own.
    const STRONG_PREFIXES: [&str; 3] = ["handle_", "route_", "api_"];
    // Prefixes that additionally need a web-like parameter.
    const WEAK_PREFIXES: [&str; 2] = ["serve_", "process_"];

    let name_lower = func_name.to_ascii_lowercase();

    // Skip bare "main" — it's typically a CLI entry
    if name_lower == "main" {
        return false;
    }

    // Strong handler names are enough even without web params
    if STRONG_PREFIXES.iter().any(|p| name_lower.starts_with(*p)) {
        return true;
    }

    let is_weak_handler =
        name_lower == "handler" || WEAK_PREFIXES.iter().any(|p| name_lower.starts_with(*p));
    if !is_weak_handler {
        return false;
    }

    // Parameter names that suggest a web request/response, per language.
    let web_params: &[&str] = match lang {
        Lang::Rust => &["request", "req", "json", "query", "form", "payload", "body"],
        Lang::JavaScript | Lang::TypeScript => &["req", "request", "ctx", "res", "response"],
        Lang::Python => &["request", "req"],
        Lang::Go => &["w", "writer", "r", "req", "request"],
        Lang::Java => &["request", "req"],
        _ => &["request", "req"],
    };

    func_summaries.values().any(|summary| {
        summary
            .param_names
            .iter()
            .any(|param| web_params.contains(&param.to_ascii_lowercase().as_str()))
    })
}
// Unit tests for `extract_findings`: each test builds a tiny CFG by hand,
// runs the forward engine with the C resource pairs, and checks the findings.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cfg::{EdgeKind, NodeInfo};
    use crate::cfg_analysis::rules;
    use crate::state::domain::ProductState;
    use crate::state::engine;
    use crate::state::symbol::SymbolInterner;
    use crate::state::transfer::DefaultTransfer;
    use petgraph::Graph;
    use std::collections::HashMap;

    /// Build a `NodeInfo` of the given kind with every other field empty;
    /// tests override individual fields via struct-update syntax.
    fn make_node(kind: StmtKind) -> NodeInfo {
        NodeInfo {
            kind,
            span: (0, 0),
            label: None,
            defines: None,
            uses: vec![],
            callee: None,
            enclosing_func: None,
            call_ordinal: 0,
            condition_text: None,
            condition_vars: vec![],
            condition_negated: false,
        }
    }

    /// A resource opened and never closed yields exactly one definite-leak finding.
    #[test]
    fn detects_resource_leak() {
        // Entry → fopen(f) → Exit (no close)
        let mut cfg: Cfg = Graph::new();
        let entry = cfg.add_node(make_node(StmtKind::Entry));
        let open_node = cfg.add_node(NodeInfo {
            kind: StmtKind::Call,
            span: (10, 20),
            defines: Some("f".into()),
            callee: Some("fopen".into()),
            ..make_node(StmtKind::Call)
        });
        let exit = cfg.add_node(make_node(StmtKind::Exit));
        cfg.add_edge(entry, open_node, EdgeKind::Seq);
        cfg.add_edge(open_node, exit, EdgeKind::Seq);
        let interner = SymbolInterner::from_cfg(&cfg);
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let result = engine::run_forward(&cfg, entry, &transfer, ProductState::initial());
        // Empty function summaries: no web entrypoint, so the auth section is skipped.
        let findings = extract_findings(&result, &cfg, &interner, Lang::C, &HashMap::new());
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].rule_id, "state-resource-leak");
        assert!(findings[0].message.contains("f"));
    }

    /// A matched open/close pair produces no findings at all.
    #[test]
    fn clean_open_close_no_findings() {
        // Entry → fopen(f) → fclose(f) → Exit
        let mut cfg: Cfg = Graph::new();
        let entry = cfg.add_node(make_node(StmtKind::Entry));
        let open_node = cfg.add_node(NodeInfo {
            kind: StmtKind::Call,
            defines: Some("f".into()),
            callee: Some("fopen".into()),
            ..make_node(StmtKind::Call)
        });
        let close_node = cfg.add_node(NodeInfo {
            kind: StmtKind::Call,
            uses: vec!["f".into()],
            callee: Some("fclose".into()),
            ..make_node(StmtKind::Call)
        });
        let exit = cfg.add_node(make_node(StmtKind::Exit));
        cfg.add_edge(entry, open_node, EdgeKind::Seq);
        cfg.add_edge(open_node, close_node, EdgeKind::Seq);
        cfg.add_edge(close_node, exit, EdgeKind::Seq);
        let interner = SymbolInterner::from_cfg(&cfg);
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let result = engine::run_forward(&cfg, entry, &transfer, ProductState::initial());
        let findings = extract_findings(&result, &cfg, &interner, Lang::C, &HashMap::new());
        assert!(findings.is_empty());
    }
}

91
src/state/lattice.rs Normal file
View file

@ -0,0 +1,91 @@
/// A bounded semi-lattice with bottom element and monotone join.
///
/// Implementations must satisfy:
/// - `join` is commutative, associative, and idempotent
/// - `bot()` is the identity for `join`
/// - `leq(a, b)` iff `join(a, b) == b`
///
/// These laws are a contract on implementors; the trait itself does not
/// enforce them.
#[allow(dead_code)]
pub trait Lattice: Clone + Eq + Sized {
    /// Bottom element (least information / unreachable).
    fn bot() -> Self;
    /// Least upper bound: merge two abstract values.
    ///
    /// Returns a new value; neither operand is modified.
    fn join(&self, other: &Self) -> Self;
    /// Partial order: `self ⊑ other`.
    fn leq(&self, other: &Self) -> bool;
}
// Contract tests for `Lattice`, exercised on a small totally ordered lattice.
#[cfg(test)]
mod tests {
    use super::*;

    /// A trivial 3-element lattice for testing the trait contract.
    ///
    /// Ordered by the wrapped number: `join` is `max`, `leq` is `<=`.
    #[derive(Clone, Debug, PartialEq, Eq)]
    struct Three(u8); // 0=bot, 1, 2=top-ish

    impl Lattice for Three {
        fn bot() -> Self {
            Three(0)
        }
        fn join(&self, other: &Self) -> Self {
            Three(self.0.max(other.0))
        }
        fn leq(&self, other: &Self) -> bool {
            self.0 <= other.0
        }
    }

    /// Joining with bottom on either side returns the other operand.
    #[test]
    fn bot_identity() {
        let a = Three(1);
        assert_eq!(a.join(&Three::bot()), a);
        assert_eq!(Three::bot().join(&a), a);
    }

    /// Operand order does not affect the join.
    #[test]
    fn join_commutative() {
        let a = Three(1);
        let b = Three(2);
        assert_eq!(a.join(&b), b.join(&a));
    }

    /// Grouping does not affect the join.
    #[test]
    fn join_associative() {
        let a = Three(0);
        let b = Three(1);
        let c = Three(2);
        assert_eq!(a.join(&b).join(&c), a.join(&b.join(&c)));
    }

    /// Joining a value with itself returns that value.
    #[test]
    fn join_idempotent() {
        let a = Three(1);
        assert_eq!(a.join(&a), a);
    }

    /// Every value is below or equal to itself.
    #[test]
    fn leq_reflexive() {
        let a = Three(1);
        assert!(a.leq(&a));
    }

    /// `a ⊑ b` and `b ⊑ c` imply `a ⊑ c` on a sample chain.
    #[test]
    fn leq_transitive() {
        let a = Three(0);
        let b = Three(1);
        let c = Three(2);
        assert!(a.leq(&b));
        assert!(b.leq(&c));
        assert!(a.leq(&c));
    }

    /// The order agrees with the join on a sample pair.
    #[test]
    fn leq_consistent_with_join() {
        let a = Three(1);
        let b = Three(2);
        // a ⊑ b iff join(a, b) == b
        assert!(a.leq(&b));
        assert_eq!(a.join(&b), b);
    }
}

62
src/state/mod.rs Normal file
View file

@ -0,0 +1,62 @@
pub mod domain;
pub mod engine;
pub mod facts;
pub mod lattice;
pub mod symbol;
pub mod transfer;
use crate::cfg::{Cfg, FuncSummaries};
use crate::cfg_analysis::rules;
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
use domain::ProductState;
use engine::MAX_TRACKED_VARS;
use facts::StateFinding;
use petgraph::graph::NodeIndex;
use symbol::SymbolInterner;
use transfer::DefaultTransfer;
/// Run state-model dataflow analysis on a single function's CFG.
///
/// Steps: intern every variable name found in `cfg`, build a
/// [`DefaultTransfer`] with the resource pairs for `lang`, run the forward
/// engine from `entry` starting at [`ProductState::initial`], and convert the
/// converged result into findings with [`facts::extract_findings`].
///
/// `_source_bytes` and `_global_summaries` are accepted but not used.
///
/// Returns findings for use-after-close, double-close, resource leaks,
/// and unauthenticated access to sensitive sinks.
pub fn run_state_analysis(
    cfg: &Cfg,
    entry: NodeIndex,
    lang: Lang,
    _source_bytes: &[u8],
    func_summaries: &FuncSummaries,
    _global_summaries: Option<&GlobalSummaries>,
) -> Vec<StateFinding> {
    let _span = tracing::debug_span!("run_state_analysis").entered();

    // Symbol table covering every `defines`/`uses` name in the CFG.
    let interner = SymbolInterner::from_cfg(cfg);

    // Guarded degradation: only a warning is emitted here. Nothing is
    // truncated — the interner keeps every symbol and the analysis still runs.
    // NOTE(review): any actual cap on tracked variables would have to be
    // enforced by the engine/domain, which is not visible from this function —
    // confirm.
    let symbol_count = interner.len();
    if symbol_count > MAX_TRACKED_VARS {
        tracing::warn!(
            symbols = symbol_count,
            max = MAX_TRACKED_VARS,
            "state analysis: too many variables, capping tracking"
        );
    }

    // Transfer function parameterised by language and its acquire/release pairs.
    let transfer = DefaultTransfer {
        lang,
        resource_pairs: rules::resource_pairs(lang),
        interner: &interner,
    };

    // Forward fixpoint from the entry node, then findings extraction.
    let result = engine::run_forward(cfg, entry, &transfer, ProductState::initial());
    facts::extract_findings(&result, cfg, &interner, lang, func_summaries)
}

101
src/state/symbol.rs Normal file
View file

@ -0,0 +1,101 @@
use crate::cfg::Cfg;
use petgraph::visit::IntoNodeReferences;
use std::collections::HashMap;
/// Cheap `Copy` handle into a [`SymbolInterner`].
///
/// Wraps the index of the name in the interner that issued it; an id is only
/// meaningful together with that interner.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct SymbolId(pub(crate) u32);
/// Per-function interner: maps `String` ↔ [`SymbolId`].
///
/// Built once from CFG node `defines`/`uses`, reused throughout analysis.
#[derive(Default)]
pub struct SymbolInterner {
    /// Name → id lookup.
    to_id: HashMap<String, SymbolId>,
    /// Id → name lookup; a symbol's id is its index in this vector.
    to_str: Vec<String>,
}
impl SymbolInterner {
    /// Create an empty interner.
    pub fn new() -> Self {
        Self::default()
    }

    /// Intern a name, returning its stable [`SymbolId`].
    ///
    /// A name that was interned before gets its existing id back; a new name
    /// is appended and receives the next index as its id.
    pub fn intern(&mut self, name: &str) -> SymbolId {
        if let Some(existing) = self.get(name) {
            return existing;
        }
        // The id is the position the name is about to occupy in `to_str`.
        let fresh = SymbolId(self.to_str.len() as u32);
        self.to_str.push(name.to_owned());
        self.to_id.insert(name.to_owned(), fresh);
        fresh
    }

    /// Look up a name without interning it.
    pub fn get(&self, name: &str) -> Option<SymbolId> {
        self.to_id.get(name).copied()
    }

    /// Resolve an id back to its string.
    ///
    /// # Panics
    ///
    /// Panics if `id` is out of range for this interner (plain indexing).
    pub fn resolve(&self, id: SymbolId) -> &str {
        &self.to_str[id.0 as usize]
    }

    /// Number of interned symbols.
    pub fn len(&self) -> usize {
        self.to_str.len()
    }

    /// Whether the interner is empty.
    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.to_str.is_empty()
    }

    /// Build from a CFG: walk all nodes, intern every `defines`/`uses` string.
    ///
    /// For each node the `defines` name (if any) is interned before the
    /// node's `uses` names, so ids follow that visiting order.
    pub fn from_cfg(cfg: &Cfg) -> Self {
        let mut interner = Self::new();
        for (_, info) in cfg.node_references() {
            for name in info.defines.iter().chain(info.uses.iter()) {
                interner.intern(name);
            }
        }
        interner
    }
}
// Unit tests for `SymbolInterner`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning is idempotent per name and `resolve` returns the original string.
    #[test]
    fn intern_resolve_roundtrip() {
        let mut interner = SymbolInterner::new();
        let a = interner.intern("foo");
        let b = interner.intern("bar");
        let a2 = interner.intern("foo");
        assert_eq!(a, a2);
        assert_ne!(a, b);
        assert_eq!(interner.resolve(a), "foo");
        assert_eq!(interner.resolve(b), "bar");
    }

    /// `get` does not create entries for unknown names.
    #[test]
    fn get_returns_none_for_unknown() {
        let interner = SymbolInterner::new();
        assert!(interner.get("missing").is_none());
    }

    /// `len` counts distinct names only.
    #[test]
    fn len_tracks_unique_symbols() {
        let mut interner = SymbolInterner::new();
        interner.intern("a");
        interner.intern("b");
        interner.intern("a"); // duplicate
        assert_eq!(interner.len(), 2);
    }
}

426
src/state/transfer.rs Normal file
View file

@ -0,0 +1,426 @@
use super::domain::{AuthLevel, ProductState, ResourceLifecycle};
use super::engine::Transfer;
use super::symbol::{SymbolId, SymbolInterner};
use crate::cfg::{EdgeKind, NodeInfo, StmtKind};
use crate::cfg_analysis::rules::{self, ResourcePair};
use crate::symbol::Lang;
use petgraph::graph::NodeIndex;
/// Events emitted during transfer for illegal state transitions.
/// These are NOT lattice values — they become findings in `facts.rs`.
#[derive(Debug, Clone)]
pub struct TransferEvent {
    /// Which illegal transition was observed.
    pub kind: TransferEventKind,
    /// CFG node at which the transition was observed.
    pub node: NodeIndex,
    /// Variable involved in the transition.
    pub var: SymbolId,
}
/// The kinds of illegal resource transitions the transfer function reports.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransferEventKind {
    /// A resource-use call touched a variable whose state is exactly `CLOSED`.
    UseAfterClose,
    /// A release call touched a variable whose state is exactly `CLOSED`.
    DoubleClose,
}
/// Resource-use patterns: callees that read/write/operate on a resource handle
/// (triggering use-after-close if the handle is closed).
///
/// Entries are compared against the lowercased callee with `callee_matches`,
/// i.e. as suffixes, so a callee merely ending in one of these names also
/// counts as a use.
static RESOURCE_USE_PATTERNS: &[&str] = &[
    "read", "write", "send", "recv", "fread", "fwrite", "fgets", "fputs", "fprintf", "fscanf",
    "fflush", "fseek", "ftell", "rewind", "feof", "ferror", "fgetc", "fputc", "getc", "putc",
    "ungetc", "query", "execute", "fetch", "sendto", "recvfrom", "ioctl", "fcntl",
    // Memory access functions (for malloc/free use-after-free detection)
    "strcpy", "strncpy", "strcat", "strncat", "memcpy", "memmove", "memset", "memcmp", "strcmp",
    "strncmp", "strlen", "sprintf", "snprintf",
];
/// Auth-call matchers for admin-level privilege.
///
/// Consulted only after a callee or condition has already matched an auth
/// rule: a hit here raises the level to `AuthLevel::Admin` instead of
/// `AuthLevel::Authed`. All entries are lowercase because they are compared
/// against lowercased text.
static ADMIN_PATTERNS: &[&str] = &[
    "is_admin",
    "hasrole",
    "has_role",
    "check_admin",
    "require_admin",
];
/// Transfer function over `ProductState` covering resource lifecycle and
/// auth state.
pub struct DefaultTransfer<'a> {
    /// Language of the analysed code; selects the auth rules.
    pub lang: Lang,
    /// Acquire/release callee pairs used to track resource lifecycle.
    pub resource_pairs: &'a [ResourcePair],
    /// Interner used to map variable names on CFG nodes to symbols.
    pub interner: &'a SymbolInterner,
}
impl Transfer<ProductState> for DefaultTransfer<'_> {
    type Event = TransferEvent;

    /// Apply one CFG node to `state` and return the updated state together
    /// with any illegal-transition events observed at that node.
    ///
    /// Dispatch is by node kind: `Call` nodes go through the call handler
    /// (the only one that can emit events), `If` nodes refine state along
    /// `edge`, `Seq` nodes are treated as assignments, and every other kind
    /// leaves the state untouched.
    fn apply(
        &self,
        node_idx: NodeIndex,
        info: &NodeInfo,
        edge: Option<EdgeKind>,
        mut state: ProductState,
    ) -> (ProductState, Vec<TransferEvent>) {
        let mut emitted = Vec::new();
        match info.kind {
            StmtKind::Call => self.apply_call(node_idx, info, &mut state, &mut emitted),
            StmtKind::If => self.apply_if(info, edge, &mut state),
            StmtKind::Seq => self.apply_assignment(node_idx, info, &mut state),
            _ => {}
        }
        (state, emitted)
    }
}
impl DefaultTransfer<'_> {
    /// Apply the effects of a call node to `state`, pushing any illegal
    /// transitions onto `events`.
    ///
    /// Nodes without a callee are ignored. Otherwise the lowercased callee is
    /// checked, in this order, as: resource acquire, resource release,
    /// resource use, auth call, and validation/guard call. The phases are
    /// independent — one callee may trigger several of them.
    fn apply_call(
        &self,
        node_idx: NodeIndex,
        info: &NodeInfo,
        state: &mut ProductState,
        events: &mut Vec<TransferEvent>,
    ) {
        let callee = match &info.callee {
            Some(c) => c.to_ascii_lowercase(),
            None => return,
        };

        // ── Resource acquire ─────────────────────────────────────────────
        // The defined variable becomes OPEN when the callee matches an
        // acquire pattern that is not explicitly excluded.
        for pair in self.resource_pairs {
            let is_acquire = pair.acquire.iter().any(|a| callee_matches(&callee, a));
            let is_excluded = pair
                .exclude_acquire
                .iter()
                .any(|e| callee_matches(&callee, e));
            if is_acquire
                && !is_excluded
                && let Some(ref def) = info.defines
                && let Some(sym) = self.interner.get(def)
            {
                state.resource.set(sym, ResourceLifecycle::OPEN);
            }
        }

        // ── Resource release ─────────────────────────────────────────────
        // Track which variables have already been released to avoid double-
        // matching across multiple resource pair definitions.
        let mut released: smallvec::SmallVec<[SymbolId; 4]> = smallvec::SmallVec::new();
        for pair in self.resource_pairs {
            let is_release = pair.release.iter().any(|r| callee_matches(&callee, r));
            if is_release {
                for used in &info.uses {
                    if let Some(sym) = self.interner.get(used) {
                        if released.contains(&sym) {
                            continue;
                        }
                        let current = state.resource.get(sym);
                        if current == ResourceLifecycle::CLOSED {
                            // Double close: only when the variable is closed
                            // on every path (state is exactly CLOSED).
                            events.push(TransferEvent {
                                kind: TransferEventKind::DoubleClose,
                                node: node_idx,
                                var: sym,
                            });
                        } else if current.contains(ResourceLifecycle::OPEN) {
                            state.resource.set(sym, ResourceLifecycle::CLOSED);
                        }
                        released.push(sym);
                    }
                }
            }
        }

        // ── Resource use (read/write/etc.) ───────────────────────────────
        // Checked against the state *after* the release phase above.
        let is_use = RESOURCE_USE_PATTERNS
            .iter()
            .any(|p| callee_matches(&callee, p));
        if is_use {
            for used in &info.uses {
                if let Some(sym) = self.interner.get(used) {
                    let current = state.resource.get(sym);
                    if current == ResourceLifecycle::CLOSED {
                        events.push(TransferEvent {
                            kind: TransferEventKind::UseAfterClose,
                            node: node_idx,
                            var: sym,
                        });
                    }
                }
            }
        }

        // ── Auth call ────────────────────────────────────────────────────
        let auth_rules = rules::auth_rules(self.lang);
        let is_auth = auth_rules.iter().any(|rule| {
            rule.matchers
                .iter()
                .any(|m| callee_matches(&callee, &m.to_ascii_lowercase()))
        });
        if is_auth {
            let is_admin = ADMIN_PATTERNS.iter().any(|p| callee_matches(&callee, p));
            let new_level = if is_admin {
                AuthLevel::Admin
            } else {
                AuthLevel::Authed
            };
            // The auth level is only ever raised here, never lowered.
            if new_level > state.auth.auth_level {
                state.auth.auth_level = new_level;
            }
        }

        // ── Validation call (guard) ──────────────────────────────────────
        // Every argument of a guard-like call is recorded as validated.
        if is_guard_like(&callee) {
            for used in &info.uses {
                if let Some(sym) = self.interner.get(used) {
                    state.auth.validated.insert(sym);
                }
            }
        }
    }

    /// Refine auth/validation state along the `True` edge of an `If` node.
    ///
    /// Does nothing for any other edge, for nodes without condition text, or
    /// when the condition is negated. Matching is by substring on the
    /// lowercased condition text.
    fn apply_if(&self, info: &NodeInfo, edge: Option<EdgeKind>, state: &mut ProductState) {
        // On the True edge of an If node whose condition is an auth check,
        // refine auth level.
        let is_true_edge = matches!(edge, Some(EdgeKind::True));
        if !is_true_edge {
            return;
        }
        if let Some(ref cond) = info.condition_text {
            let cond_lower = cond.to_ascii_lowercase();

            // Auth-related condition
            let auth_rules = rules::auth_rules(self.lang);
            let is_auth_cond = auth_rules.iter().any(|rule| {
                rule.matchers
                    .iter()
                    .any(|m| cond_lower.contains(&m.to_ascii_lowercase()))
            });
            if is_auth_cond && !info.condition_negated {
                let is_admin = ADMIN_PATTERNS.iter().any(|p| cond_lower.contains(p));
                let new_level = if is_admin {
                    AuthLevel::Admin
                } else {
                    AuthLevel::Authed
                };
                if new_level > state.auth.auth_level {
                    state.auth.auth_level = new_level;
                }
            }

            // Validation-related condition
            if is_guard_like(&cond_lower) && !info.condition_negated {
                for var in &info.condition_vars {
                    if let Some(sym) = self.interner.get(var) {
                        state.auth.validated.insert(sym);
                    }
                }
            }
        }
    }

    /// Model ownership transfer on an assignment node.
    ///
    /// If the node defines a known variable and one of its used variables is
    /// currently (possibly) open, the defined variable takes over that
    /// lifecycle and the source variable becomes `MOVED`. Only the first such
    /// source is transferred.
    ///
    /// A use of the defined variable itself (`x = x …`) is skipped: it is not
    /// a move to a different owner, and treating it as one would overwrite the
    /// variable's state with `MOVED`, dropping an open resource from tracking.
    fn apply_assignment(&self, _node_idx: NodeIndex, info: &NodeInfo, state: &mut ProductState) {
        // Ownership transfer: if `defines` reassigns a tracked resource
        // variable from a `uses` variable, transfer the lifecycle.
        if let Some(ref def) = info.defines
            && let Some(def_sym) = self.interner.get(def)
        {
            // If the RHS is a tracked resource, transfer its state
            for used in &info.uses {
                if let Some(use_sym) = self.interner.get(used) {
                    // Self-assignment is not a transfer; keep the current state.
                    if use_sym == def_sym {
                        continue;
                    }
                    let lc = state.resource.get(use_sym);
                    if lc.contains(ResourceLifecycle::OPEN) {
                        state.resource.set(def_sym, lc);
                        state.resource.set(use_sym, ResourceLifecycle::MOVED);
                        return;
                    }
                }
            }
        }
    }
}
/// Check whether `callee` ends with `pattern`.
///
/// The pattern is ASCII-lowercased before comparison; `callee` is used as
/// given, so callers are expected to pass it already lowercased. This single
/// suffix test covers all supported forms:
/// - exact names (`"fopen"` matches `"fopen"`),
/// - prefixed names (`"fclose"` matches `"curlx_fclose"`),
/// - method patterns (`".close"` matches `"file.close"`).
///
/// No word boundary is required, so any callee that merely ends in the
/// pattern text matches as well.
fn callee_matches(callee: &str, pattern: &str) -> bool {
    let suffix = pattern.to_ascii_lowercase();
    callee.ends_with(suffix.as_str())
}
/// Report whether `callee` looks like a guard/validation function.
///
/// True when the text starts with one of `validate`, `sanitize`, `check_`,
/// `verify_` or `assert_`. The comparison is case-sensitive, so callers pass
/// lowercased text.
fn is_guard_like(callee: &str) -> bool {
    const GUARD_PREFIXES: [&str; 5] = ["validate", "sanitize", "check_", "verify_", "assert_"];
    for prefix in GUARD_PREFIXES.iter() {
        if callee.starts_with(*prefix) {
            return true;
        }
    }
    false
}
// Unit tests for callee matching and for single-node applications of
// `DefaultTransfer` using the C resource pairs.
#[cfg(test)]
mod tests {
    use super::*;

    /// Identical names match; different names do not.
    #[test]
    fn callee_matches_exact() {
        assert!(callee_matches("fopen", "fopen"));
        assert!(!callee_matches("fopen", "fclose"));
    }

    /// A pattern matches a callee that ends with it.
    #[test]
    fn callee_matches_suffix() {
        assert!(callee_matches("curlx_fclose", "fclose"));
    }

    /// Dot-prefixed patterns match method-style callees.
    #[test]
    fn callee_matches_dot_prefix() {
        assert!(callee_matches("file.close", ".close"));
        assert!(!callee_matches("file.close", ".open"));
    }

    /// `f = fopen()` from the initial state leaves `f` OPEN with no events.
    #[test]
    fn acquire_sets_open() {
        let mut interner = SymbolInterner::new();
        let sym_f = interner.intern("f");
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let info = NodeInfo {
            kind: StmtKind::Call,
            span: (0, 10),
            label: None,
            defines: Some("f".into()),
            uses: vec![],
            callee: Some("fopen".into()),
            enclosing_func: None,
            call_ordinal: 0,
            condition_text: None,
            condition_vars: vec![],
            condition_negated: false,
        };
        let (state, events) =
            transfer.apply(NodeIndex::new(0), &info, None, ProductState::initial());
        assert!(events.is_empty());
        assert_eq!(state.resource.get(sym_f), ResourceLifecycle::OPEN);
    }

    /// `fclose(f)` on an OPEN `f` moves it to CLOSED with no events.
    #[test]
    fn close_after_open_sets_closed() {
        let mut interner = SymbolInterner::new();
        let sym_f = interner.intern("f");
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let mut state = ProductState::initial();
        state.resource.set(sym_f, ResourceLifecycle::OPEN);
        let info = NodeInfo {
            kind: StmtKind::Call,
            span: (10, 20),
            label: None,
            defines: None,
            uses: vec!["f".into()],
            callee: Some("fclose".into()),
            enclosing_func: None,
            call_ordinal: 0,
            condition_text: None,
            condition_vars: vec![],
            condition_negated: false,
        };
        let (state, events) = transfer.apply(NodeIndex::new(1), &info, None, state);
        assert!(events.is_empty());
        assert_eq!(state.resource.get(sym_f), ResourceLifecycle::CLOSED);
    }

    /// `fclose(f)` on an already CLOSED `f` emits exactly one DoubleClose event.
    #[test]
    fn double_close_emits_event() {
        let mut interner = SymbolInterner::new();
        let sym_f = interner.intern("f");
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let mut state = ProductState::initial();
        state.resource.set(sym_f, ResourceLifecycle::CLOSED);
        let info = NodeInfo {
            kind: StmtKind::Call,
            span: (20, 30),
            label: None,
            defines: None,
            uses: vec!["f".into()],
            callee: Some("fclose".into()),
            enclosing_func: None,
            call_ordinal: 0,
            condition_text: None,
            condition_vars: vec![],
            condition_negated: false,
        };
        let (_state, events) = transfer.apply(NodeIndex::new(2), &info, None, state);
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].kind, TransferEventKind::DoubleClose);
        assert_eq!(events[0].var, sym_f);
    }

    /// `fread(f)` on a CLOSED `f` emits exactly one UseAfterClose event.
    #[test]
    fn use_after_close_emits_event() {
        let mut interner = SymbolInterner::new();
        let sym_f = interner.intern("f");
        let transfer = DefaultTransfer {
            lang: Lang::C,
            resource_pairs: rules::resource_pairs(Lang::C),
            interner: &interner,
        };
        let mut state = ProductState::initial();
        state.resource.set(sym_f, ResourceLifecycle::CLOSED);
        let info = NodeInfo {
            kind: StmtKind::Call,
            span: (30, 40),
            label: None,
            defines: None,
            uses: vec!["f".into()],
            callee: Some("fread".into()),
            enclosing_func: None,
            call_ordinal: 0,
            condition_text: None,
            condition_vars: vec![],
            condition_negated: false,
        };
        let (_state, events) = transfer.apply(NodeIndex::new(3), &info, None, state);
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].kind, TransferEventKind::UseAfterClose);
    }

    /// Guard detection is prefix-based.
    #[test]
    fn is_guard_like_check() {
        assert!(is_guard_like("validate_input"));
        assert!(is_guard_like("sanitize_html"));
        assert!(is_guard_like("check_permission"));
        assert!(!is_guard_like("open_file"));
    }
}

View file

@ -139,6 +139,22 @@ impl FuncSummary {
}
}
// ── Callee resolution ────────────────────────────────────────────────────
/// Result of resolving a bare callee name to a [`FuncKey`].
///
/// Three-valued: the call graph builder and taint engine need to distinguish
/// "no candidates at all" from "multiple candidates, can't pick one".
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CalleeResolution {
    /// Exactly one candidate matched.
    Resolved(FuncKey),
    /// No candidates found at all.
    NotFound,
    /// Multiple candidates — ambiguous, cannot pick one.
    ///
    /// Carries the keys of the candidates that could not be told apart.
    Ambiguous(Vec<FuncKey>),
}
// ── Lookup map used by the taint engine ─────────────────────────────────
/// A merged view of all function summaries keyed by qualified [`FuncKey`].
@ -216,16 +232,66 @@ impl GlobalSummaries {
}
}
#[allow(dead_code)]
/// Returns `true` when no summaries are stored (`by_key` is empty).
#[allow(dead_code)] // used by tests and future call-graph consumers
pub fn is_empty(&self) -> bool {
    self.by_key.is_empty()
}
/// Iterate over all (key, summary) pairs.
///
/// Yields references into `by_key`; nothing is cloned.
#[allow(dead_code)]
pub fn iter(&self) -> impl Iterator<Item = (&FuncKey, &FuncSummary)> {
    self.by_key.iter()
}
/// Resolve a bare (already-normalized) callee name to a [`FuncKey`].
///
/// Steps:
/// 1. Look up all same-language candidates for `callee`; none →
///    [`CalleeResolution::NotFound`].
/// 2. With an `arity_hint`, keep only candidates whose arity equals it; if
///    that leaves none → `NotFound`.
/// 3. Exactly one candidate left → [`CalleeResolution::Resolved`].
/// 4. Otherwise narrow to candidates in `caller_namespace`: exactly one →
///    `Resolved`; several → [`CalleeResolution::Ambiguous`] with those; none →
///    `Ambiguous` with every candidate from step 2.
pub fn resolve_callee_key(
    &self,
    callee: &str,
    caller_lang: Lang,
    caller_namespace: &str,
    arity_hint: Option<usize>,
) -> CalleeResolution {
    let candidates = self.lookup_same_lang(caller_lang, callee);
    if candidates.is_empty() {
        return CalleeResolution::NotFound;
    }

    // Without a hint every candidate passes the arity test.
    let by_arity: Vec<&FuncKey> = candidates
        .iter()
        .map(|(key, _)| *key)
        .filter(|key| arity_hint.map_or(true, |arity| key.arity == Some(arity)))
        .collect();

    if by_arity.is_empty() {
        return CalleeResolution::NotFound;
    }
    if by_arity.len() == 1 {
        return CalleeResolution::Resolved(by_arity[0].clone());
    }

    // Namespace disambiguation: prefer same-namespace match.
    let same_ns: Vec<&FuncKey> = by_arity
        .iter()
        .copied()
        .filter(|key| key.namespace == caller_namespace)
        .collect();

    match same_ns.len() {
        1 => CalleeResolution::Resolved(same_ns[0].clone()),
        0 => CalleeResolution::Ambiguous(by_arity.into_iter().cloned().collect()),
        _ => CalleeResolution::Ambiguous(same_ns.into_iter().cloned().collect()),
    }
}
}
impl std::fmt::Debug for GlobalSummaries {

715
src/suppress/mod.rs Normal file
View file

@ -0,0 +1,715 @@
//! Inline per-finding suppression via source-code comments.
//!
//! Supports two directive forms:
//! - `nyx:ignore <RULE_ID>[, <RULE_ID>…]` — suppress findings on the same line
//! - `nyx:ignore-next-line <RULE_ID>[, …]` — suppress findings on the next line
//!
//! Comments are detected for all supported languages without tree-sitter,
//! using a lightweight string/comment state machine.
use std::collections::HashMap;
// ─────────────────────────────────────────────────────────────────────────────
// Public types
// ─────────────────────────────────────────────────────────────────────────────
/// Whether the directive suppresses on its own line or the next line.
#[derive(Debug, Clone, serde::Serialize)]
pub enum SuppressionKind {
    /// The directive applies to findings on the line it appears on.
    SameLine,
    /// The directive applies to findings on the line after it.
    NextLine,
}
/// Metadata attached to a suppressed finding.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SuppressionMeta {
    /// Whether the matching directive was a same-line or next-line directive.
    pub kind: SuppressionKind,
    /// The pattern that matched the finding's rule ID.
    pub matched_pattern: String,
    /// 1-indexed line where the suppression directive appears.
    pub directive_line: usize,
}
// ─────────────────────────────────────────────────────────────────────────────
// Internal types
// ─────────────────────────────────────────────────────────────────────────────
/// One rule pattern from a suppression directive: either an exact rule ID or
/// a wildcard of the form `foo.*`.
#[derive(Debug)]
enum RuleMatcher {
    /// Matches a rule ID equal to the stored string.
    Exact(String),
    /// `prefix` stores everything before the trailing `.*`.
    WildcardSuffix(String),
}

impl RuleMatcher {
    /// Test `rule_id` against this matcher.
    ///
    /// `Exact` requires full equality. `WildcardSuffix` requires `rule_id` to
    /// consist of the stored prefix followed immediately by a `.` (anything
    /// may follow the dot, including nothing); the bare prefix alone does not
    /// match.
    fn matches(&self, rule_id: &str) -> bool {
        match self {
            RuleMatcher::Exact(expected) => expected == rule_id,
            RuleMatcher::WildcardSuffix(prefix) => matches!(
                rule_id.strip_prefix(prefix.as_str()),
                Some(rest) if rest.starts_with('.')
            ),
        }
    }
}
/// A parsed directive from a single comment.
#[derive(Debug)]
struct LineDirective {
    /// Same-line or next-line form of the directive.
    kind: SuppressionKind,
    /// 1-indexed line where the directive comment appears.
    directive_line: usize,
    /// Rule patterns listed in the directive; a finding is suppressed if any
    /// of them matches its rule ID.
    matchers: Vec<RuleMatcher>,
}
/// Pre-built index of suppression directives keyed by **target line** (the
/// line whose findings should be suppressed, 1-indexed).
pub struct SuppressionIndex {
    /// Target line → directives that apply to findings on that line.
    directives: HashMap<usize, Vec<LineDirective>>,
}
impl SuppressionIndex {
    /// Look up whether a finding at `line` (1-indexed) with `rule_id` is
    /// suppressed.
    ///
    /// The rule ID is first reduced with `canonical_rule_id`. The directives
    /// targeting `line` are then scanned in stored order, and within each
    /// directive its matchers in order; the first hit wins. Returns `None`
    /// when no directive targets the line or none of them matches.
    ///
    /// In the returned metadata, a wildcard matcher is rendered back as
    /// `<prefix>.*`.
    pub fn check(&self, line: usize, rule_id: &str) -> Option<SuppressionMeta> {
        let canon = canonical_rule_id(rule_id);
        self.directives.get(&line)?.iter().find_map(|dir| {
            let hit = dir.matchers.iter().find(|m| m.matches(canon))?;
            let matched_pattern = match hit {
                RuleMatcher::Exact(id) => id.clone(),
                RuleMatcher::WildcardSuffix(prefix) => format!("{prefix}.*"),
            };
            Some(SuppressionMeta {
                kind: dir.kind.clone(),
                matched_pattern,
                directive_line: dir.directive_line,
            })
        })
    }

    /// Returns `true` if no directives were found.
    pub fn is_empty(&self) -> bool {
        self.directives.is_empty()
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Canonical rule ID
// ─────────────────────────────────────────────────────────────────────────────
/// Reduce a rule ID to its canonical form by dropping a parenthetical suffix:
/// `"taint-unsanitised-flow (source 5:1)"` → `"taint-unsanitised-flow"`.
///
/// Surrounding whitespace is trimmed first. Everything from the first
/// occurrence of `" ("` onwards is then removed, along with any whitespace
/// left at the end. IDs without `" ("` are returned trimmed but otherwise
/// unchanged.
pub fn canonical_rule_id(id: &str) -> &str {
    let trimmed = id.trim();
    match trimmed.split_once(" (") {
        Some((head, _suffix)) => head.trim_end(),
        None => trimmed,
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Comment style per language
// ─────────────────────────────────────────────────────────────────────────────
/// Comment syntax family used to locate suppression directives in a file.
#[derive(Clone, Copy)]
enum CommentStyle {
    /// `//` and `/* */` — Rust, C, C++, Java, Go, JS, TS
    CStyle,
    /// `#` only — Python, Ruby
    Hash,
    /// `//`, `#`, and `/* */` — PHP
    PhpStyle,
}

/// Map a file extension to the comment style for that language.
///
/// `ext` must be the canonical lowercase extension without the leading dot
/// (`"rs"`, `"cpp"`, `"ts"`, …). Unknown extensions yield `None`.
fn comment_style_for_ext(ext: &str) -> Option<CommentStyle> {
    match ext {
        "rs" | "c" | "cpp" | "java" | "go" | "ts" | "js" => Some(CommentStyle::CStyle),
        "py" | "rb" => Some(CommentStyle::Hash),
        "php" => Some(CommentStyle::PhpStyle),
        _ => None,
    }
}

/// Map a file path to its comment style by inspecting the extension.
///
/// The extension is compared case-insensitively (ASCII), so `.RS`, `.Js` or
/// `.PHP` resolve the same way as their lowercase forms. Two variant
/// spellings are folded onto their canonical extension: `c++` → `cpp` and
/// `tsx` → `ts`.
///
/// Returns `None` when the path has no extension, the extension is not valid
/// UTF-8, or the extension is not recognised.
fn comment_style_for_path(path: &std::path::Path) -> Option<CommentStyle> {
    let ext = path.extension().and_then(|s| s.to_str())?;
    // Lowercase once so every case variant goes through the same mapping,
    // rather than special-casing a handful of uppercase spellings.
    let lower = ext.to_ascii_lowercase();
    // Normalise common variant extensions
    let norm = match lower.as_str() {
        "c++" => "cpp",
        "tsx" => "ts",
        other => other,
    };
    comment_style_for_ext(norm)
}
// ─────────────────────────────────────────────────────────────────────────────
// Parser
// ─────────────────────────────────────────────────────────────────────────────
/// Parse inline suppression directives from `source`, using comment syntax
/// appropriate for the given file path.
///
/// The source is processed line by line (1-indexed). Each directive found is
/// stored under the line it applies to: its own line for `nyx:ignore`, the
/// following line for `nyx:ignore-next-line` (dropped when there is no
/// following line).
///
/// Block-comment state is carried across lines. When a multi-line block
/// comment closes part-way through a line, the remainder of that line is
/// scanned like ordinary code with the *same* state flag, so a new `/*`
/// opened after the `*/` is still tracked on the following lines.
///
/// Returns an empty index if the source doesn't contain `nyx:ignore` or the
/// language is unsupported.
pub fn parse_inline_suppressions(path: &std::path::Path, source: &str) -> SuppressionIndex {
    let empty_index = || SuppressionIndex {
        directives: HashMap::new(),
    };

    // Fast path: no directives possible.
    if !source.contains("nyx:ignore") {
        return empty_index();
    }
    let Some(style) = comment_style_for_path(path) else {
        return empty_index();
    };

    let total_lines = source.lines().count();
    let mut index: HashMap<usize, Vec<LineDirective>> = HashMap::new();

    // File a directive (if any) under the line it targets.
    let mut record = |found: Option<LineDirective>, line_num: usize| {
        if let Some(dir) = found {
            if let Some(target) = target_line(&dir, line_num, total_lines) {
                index.entry(target).or_default().push(dir);
            }
        }
    };

    let mut in_block_comment = false;
    for (line_idx, raw_line) in source.lines().enumerate() {
        let line_num = line_idx + 1; // 1-indexed
        // The part of the line that still has to be scanned as code.
        let mut code = raw_line.trim_end_matches('\r');

        if in_block_comment {
            let Some(end_pos) = code.find("*/") else {
                // Still inside the block comment — the whole line is comment text.
                record(try_parse_directive(code, line_num), line_num);
                continue;
            };
            // Text before `*/` is comment body and may contain a directive.
            record(try_parse_directive(&code[..end_pos], line_num), line_num);
            in_block_comment = false;
            code = &code[end_pos + 2..];
        }

        // Scan code (or the remainder after a closed block comment), tracking
        // string state. The scanner sets `in_block_comment` again if it meets
        // an unterminated `/*`.
        record(
            scan_line_for_directive(code, line_num, style, &mut in_block_comment),
            line_num,
        );
    }

    SuppressionIndex { directives: index }
}
/// Work out which line a directive found on `line_num` applies to.
///
/// A same-line directive targets `line_num` itself. A next-line directive
/// targets `line_num + 1`, unless it sits on the last line of the file
/// (`line_num >= total_lines`), in which case there is nothing to suppress
/// and `None` is returned.
fn target_line(dir: &LineDirective, line_num: usize, total_lines: usize) -> Option<usize> {
    match dir.kind {
        SuppressionKind::SameLine => Some(line_num),
        // EOF — no next line — yields `None`.
        SuppressionKind::NextLine => (line_num < total_lines).then(|| line_num + 1),
    }
}
/// Scan a single line (not inside a block comment) for a suppression directive.
/// Tracks string literals to avoid false positives.
///
/// The line is walked byte by byte. Quoted strings (`"`, `'`, `` ` ``),
/// triple-quoted strings and Rust-style raw strings (`r"…"`, `r#"…"#`) are
/// skipped; the first comment that is reached decides the result:
///
/// * `//` (C-style / PHP) or `#` (hash-style / PHP): the rest of the line is
///   handed to `try_parse_directive` and its result is returned, even if it
///   is `None`.
/// * `/* … */` closed on the same line: returns the directive if the body
///   contains one, otherwise scanning continues after the `*/`.
/// * `/*` without a closing `*/`: sets `in_block_comment` and returns whatever
///   directive the rest of the line contains.
///
/// Sets `in_block_comment` to `true` if the line opens a `/* */` block that
/// doesn't close on the same line. The flag is never cleared here.
///
/// String state is local to this call, so a string literal that spans several
/// lines is not tracked across them.
fn scan_line_for_directive(
line: &str,
line_num: usize,
style: CommentStyle,
in_block_comment: &mut bool,
) -> Option<LineDirective> {
let bytes = line.as_bytes();
let len = bytes.len();
let mut i = 0;
// String state
let mut in_string: Option<u8> = None; // quote char: b'"', b'\'', b'`'
while i < len {
let ch = bytes[i];
// ── Inside a string literal ─────────────────────────────────────
if let Some(quote) = in_string {
if ch == b'\\' {
i += 2; // skip escaped char
continue;
}
// Python triple quotes
if (quote == b'"' || quote == b'\'')
&& i + 2 < len
&& bytes[i] == quote
&& bytes[i + 1] == quote
&& bytes[i + 2] == quote
{
// Check if this is a triple-quote close
// (we entered via triple-quote open, but we track single quote char)
in_string = None;
i += 3;
continue;
}
// A single matching quote also ends the string, whichever way it was opened.
if ch == quote {
in_string = None;
}
i += 1;
continue;
}
// ── Not in a string ─────────────────────────────────────────────
// Rust raw strings: r"..." or r#"..."#
// NOTE(review): this check runs for every comment style and does not
// verify that `r` starts a token — confirm that is acceptable.
if ch == b'r' && i + 1 < len {
let next = bytes[i + 1];
if next == b'"' {
// r"..." — skip to closing "
i += 2;
while i < len && bytes[i] != b'"' {
i += 1;
}
i += 1; // skip closing "
continue;
}
if next == b'#' {
// Count hashes
let hash_start = i + 1;
let mut j = i + 1;
while j < len && bytes[j] == b'#' {
j += 1;
}
let hash_count = j - hash_start;
// Only a `"` right after the hashes makes this a raw string; otherwise
// fall through and treat `r` as an ordinary byte.
if j < len && bytes[j] == b'"' {
// Skip to closing "###
let close_pat_len = 1 + hash_count; // " + hashes
i = j + 1;
'raw: while i < len {
if bytes[i] == b'"' {
// Check for matching hashes
let mut k = 1;
while k <= hash_count && i + k < len && bytes[i + k] == b'#' {
k += 1;
}
if k > hash_count {
i += close_pat_len;
break 'raw;
}
}
i += 1;
}
continue;
}
}
}
// Python triple quotes: """ or '''
if (ch == b'"' || ch == b'\'') && i + 2 < len && bytes[i + 1] == ch && bytes[i + 2] == ch {
in_string = Some(ch);
i += 3;
continue;
}
// Regular string literals
// NOTE(review): any `'` opens a string here, so an unpaired quote on the
// line (e.g. a Rust lifetime such as `&'static str`) appears to hide a
// trailing comment — confirm whether that case matters.
if ch == b'"' || ch == b'\'' || ch == b'`' {
in_string = Some(ch);
i += 1;
continue;
}
// ── Comment detection ───────────────────────────────────────────
// C-style line comment: //
let has_slash_slash = matches!(style, CommentStyle::CStyle | CommentStyle::PhpStyle);
if has_slash_slash && ch == b'/' && i + 1 < len && bytes[i + 1] == b'/' {
// Both marker bytes are ASCII, so `i + 2` is a valid slice boundary.
let comment_body = &line[i + 2..];
return try_parse_directive(comment_body, line_num);
}
// Block comment: /*
let has_block = matches!(style, CommentStyle::CStyle | CommentStyle::PhpStyle);
if has_block && ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
// Look for closing */ on the same line
let rest = &line[i + 2..];
if let Some(end) = rest.find("*/") {
let block_body = &rest[..end];
// Check directive in block body
if let Some(dir) = try_parse_directive(block_body, line_num) {
return Some(dir);
}
// Continue scanning after the block
// (`/*` = 2 bytes, body = `end` bytes, `*/` = 2 bytes)
i = i + 2 + end + 2;
continue;
} else {
// Block comment extends to next line(s)
*in_block_comment = true;
let block_body = rest;
return try_parse_directive(block_body, line_num);
}
}
// Hash comment: #
let has_hash = matches!(style, CommentStyle::Hash | CommentStyle::PhpStyle);
if has_hash && ch == b'#' {
let comment_body = &line[i + 1..];
return try_parse_directive(comment_body, line_num);
}
i += 1;
}
// Reached the end of the line without meeting a comment.
None
}
/// Try to extract a directive from a line rest (after a block comment closes).
///
/// Thin wrapper around `scan_line_for_directive` that supplies a throwaway
/// block-comment flag: if `rest` opens a new unterminated `/*`, that fact is
/// discarded and is not visible to the caller.
fn extract_from_line_rest(
rest: &str,
line_num: usize,
style: CommentStyle,
) -> Option<LineDirective> {
// Local flag only; its final value is intentionally not returned.
let mut in_block = false;
scan_line_for_directive(rest, line_num, style, &mut in_block)
}
/// Try to parse a `nyx:ignore` or `nyx:ignore-next-line` directive from
/// comment body text. Returns `None` if no directive is found.
fn try_parse_directive(text: &str, line_num: usize) -> Option<LineDirective> {
let trimmed = text.trim();
// Strip leading `*` or `* ` common in block comments (e.g. ` * nyx:ignore ...`).
let trimmed = trimmed
.strip_prefix("* ")
.or(trimmed.strip_prefix('*'))
.unwrap_or(trimmed)
.trim();
// Check for `nyx:ignore-next-line` first (longer prefix wins).
if let Some(rest) = strip_directive_prefix(trimmed, "nyx:ignore-next-line") {
let matchers = parse_rule_ids(rest);
if matchers.is_empty() {
return None;
}
return Some(LineDirective {
kind: SuppressionKind::NextLine,
directive_line: line_num,
matchers,
});
}
if let Some(rest) = strip_directive_prefix(trimmed, "nyx:ignore") {
let matchers = parse_rule_ids(rest);
if matchers.is_empty() {
return None;
}
return Some(LineDirective {
kind: SuppressionKind::SameLine,
directive_line: line_num,
matchers,
});
}
None
}
/// Remove `prefix` from the start of `text` when it forms a complete keyword.
///
/// The keyword counts as complete when nothing follows it or the next
/// character is whitespace. This is what keeps `"nyx:ignore"` from matching
/// the beginning of `"nyx:ignore-next-line"`. Returns the remaining text
/// (including the leading whitespace), or `None` when there is no match.
fn strip_directive_prefix<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
    text.strip_prefix(prefix)
        .filter(|rest| rest.chars().next().map_or(true, char::is_whitespace))
}
/// Turn a comma-separated list of rule IDs into matchers.
///
/// Each entry is trimmed and empty entries are skipped. An entry ending in
/// `.*` becomes a `WildcardSuffix` matcher holding the text before the `.*`;
/// every other entry becomes an `Exact` matcher.
fn parse_rule_ids(text: &str) -> Vec<RuleMatcher> {
    let mut matchers = Vec::new();
    for raw in text.split(',') {
        let id = raw.trim();
        if id.is_empty() {
            continue;
        }
        matchers.push(match id.strip_suffix(".*") {
            Some(prefix) => RuleMatcher::WildcardSuffix(prefix.to_string()),
            None => RuleMatcher::Exact(id.to_string()),
        });
    }
    matchers
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
// Unit tests for the inline-suppression parser. Each test feeds a small
// source snippet through `parse_inline_suppressions` and queries the
// resulting index with `check(line, rule_id)`.
use super::*;
use std::path::Path;
// Path helpers: only the extension matters, it selects the comment style.
fn rust_path() -> &'static Path {
Path::new("test.rs")
}
fn py_path() -> &'static Path {
Path::new("test.py")
}
fn rb_path() -> &'static Path {
Path::new("test.rb")
}
fn php_path() -> &'static Path {
Path::new("test.php")
}
fn js_path() -> &'static Path {
Path::new("test.js")
}
// 1. `//` comment parsing
#[test]
fn slash_slash_comment_suppresses() {
let src = "let x = 1; // nyx:ignore rule.a\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_some());
assert!(idx.check(1, "rule.b").is_none());
}
// 2. `#` comment parsing
#[test]
fn hash_comment_suppresses() {
let src = "x = 1 # nyx:ignore rule.a\n";
let idx = parse_inline_suppressions(py_path(), src);
assert!(idx.check(1, "rule.a").is_some());
}
// 3. `/* */` block comment
#[test]
fn block_comment_suppresses() {
let src = "let x = 1; /* nyx:ignore rule.a */\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_some());
}
// 4. Same-line semantics
#[test]
fn same_line_only_suppresses_own_line() {
let src = "line1\nlet x = 1; // nyx:ignore rule.a\nline3\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_none());
assert!(idx.check(2, "rule.a").is_some());
assert!(idx.check(3, "rule.a").is_none());
}
// 5. Next-line semantics
#[test]
fn next_line_suppresses_following_line() {
let src = "// nyx:ignore-next-line rule.a\nlet x = dangerous();\nline3\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_none());
assert!(idx.check(2, "rule.a").is_some());
assert!(idx.check(3, "rule.a").is_none());
}
// 6. Multiple rule IDs
#[test]
fn multiple_rule_ids() {
let src = "let x = 1; // nyx:ignore a.b.c, x.y.z\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "a.b.c").is_some());
assert!(idx.check(1, "x.y.z").is_some());
assert!(idx.check(1, "other").is_none());
}
// 7. Wildcard suffix
#[test]
fn wildcard_suffix_matching() {
let src = "let x = 1; // nyx:ignore rs.quality.*\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rs.quality.foo").is_some());
assert!(idx.check(1, "rs.quality.bar").is_some());
assert!(idx.check(1, "rs.other.foo").is_none());
// Exact match of prefix without the dot should not match
assert!(idx.check(1, "rs.quality").is_none());
}
// 8. String literal guard
#[test]
fn string_literal_not_suppressed() {
let src = "let x = \"// nyx:ignore rule.a\";\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_none());
}
// 9. Rust raw string guard
#[test]
fn rust_raw_string_not_suppressed() {
let src = "let x = r#\"// nyx:ignore rule.a\"#;\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_none());
}
// 10. Rule ID mismatch
#[test]
fn rule_id_mismatch() {
let src = "let x = 1; // nyx:ignore rule-a\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule-a").is_some());
assert!(idx.check(1, "rule-b").is_none());
}
// 11. Taint rule ID canonicalization
#[test]
fn taint_rule_id_canonicalization() {
let src = "let x = 1; // nyx:ignore taint-unsanitised-flow\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(
idx.check(1, "taint-unsanitised-flow (source 5:1)")
.is_some()
);
assert!(idx.check(1, "taint-unsanitised-flow").is_some());
}
// 12. Stacked `ignore-next-line` directives: each one targets the line
// directly below itself, so they do NOT accumulate on the same line.
#[test]
fn multiple_directives_same_target() {
let src = "// nyx:ignore-next-line rule-a\n// nyx:ignore-next-line rule-b\nlet x = dangerous();\n";
let idx = parse_inline_suppressions(rust_path(), src);
// First ignore-next-line targets line 2, second targets line 3
assert!(idx.check(2, "rule-a").is_some());
assert!(idx.check(3, "rule-b").is_some());
}
// 13. Block comment with ignore-next-line
#[test]
fn block_comment_next_line() {
let src = "/* nyx:ignore-next-line rule.a */\nlet x = dangerous();\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(2, "rule.a").is_some());
}
// 14. EOF ignore-next-line is a no-op
#[test]
fn eof_next_line_no_panic() {
let src = "// nyx:ignore-next-line rule.a";
let idx = parse_inline_suppressions(rust_path(), src);
// Line 1 is the last line, so ignore-next-line targets line 2 which doesn't exist
assert!(idx.check(1, "rule.a").is_none());
assert!(idx.check(2, "rule.a").is_none());
}
// 15. CRLF input
#[test]
fn crlf_line_endings() {
let src = "let x = 1; // nyx:ignore rule.a\r\nlet y = 2;\r\n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_some());
assert!(idx.check(2, "rule.a").is_none());
}
// 16. Whitespace tolerance
#[test]
fn whitespace_tolerance() {
let src = "let x = 1; // nyx:ignore rule.a, rule.b \n";
let idx = parse_inline_suppressions(rust_path(), src);
assert!(idx.check(1, "rule.a").is_some());
assert!(idx.check(1, "rule.b").is_some());
}
// 17. PHP multi-style comments (both `#` and `//` are honoured)
#[test]
fn php_multi_style() {
let src_hash = "<?php\n$x = 1; # nyx:ignore rule.a\n";
let src_slash = "<?php\n$x = 1; // nyx:ignore rule.b\n";
let idx_hash = parse_inline_suppressions(php_path(), src_hash);
let idx_slash = parse_inline_suppressions(php_path(), src_slash);
assert!(idx_hash.check(2, "rule.a").is_some());
assert!(idx_slash.check(2, "rule.b").is_some());
}
// ── canonical_rule_id tests ─────────────────────────────────────────
#[test]
fn canonical_strips_parenthetical() {
assert_eq!(
canonical_rule_id("taint-unsanitised-flow (source 5:1)"),
"taint-unsanitised-flow"
);
}
#[test]
fn canonical_no_parenthetical_unchanged() {
assert_eq!(canonical_rule_id("rs.quality.unwrap"), "rs.quality.unwrap");
}
#[test]
fn canonical_trims_whitespace() {
assert_eq!(canonical_rule_id(" rule.a "), "rule.a");
}
// ── Ruby hash comment ───────────────────────────────────────────────
#[test]
fn ruby_hash_comment() {
let src = "x = dangerous # nyx:ignore rule.a\n";
let idx = parse_inline_suppressions(rb_path(), src);
assert!(idx.check(1, "rule.a").is_some());
}
// ── JS template literal guard ───────────────────────────────────────
#[test]
fn js_template_literal_not_suppressed() {
let src = "let x = `// nyx:ignore rule.a`;\n";
let idx = parse_inline_suppressions(js_path(), src);
assert!(idx.check(1, "rule.a").is_none());
}
// ── Multiline block comment ─────────────────────────────────────────
#[test]
fn multiline_block_comment() {
let src = "/*\n * nyx:ignore rule.a\n */\nlet x = dangerous;\n";
let idx = parse_inline_suppressions(rust_path(), src);
// The directive is on line 2, same-line → targets line 2
assert!(idx.check(2, "rule.a").is_some());
}
}

620
src/taint/domain.rs Normal file
View file

@ -0,0 +1,620 @@
use crate::labels::{Cap, SourceKind};
use crate::state::lattice::Lattice;
use crate::state::symbol::SymbolId;
use crate::taint::path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
/// Maximum origins tracked per variable (bounded to prevent growth).
/// `merge_origins` stops appending once a merged list has reached this length.
const MAX_ORIGINS_PER_VAR: usize = 4;
/// Per-variable taint information.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct VarTaint {
/// Capability bits attached to the variable; OR-ed together when two
/// states are joined.
pub caps: Cap,
/// Up to N origins that contributed taint (bounded).
/// Equality is order-sensitive because this is a plain sequence.
pub origins: SmallVec<[TaintOrigin; 2]>,
}
/// A single taint origin — the node and classification of where taint came from.
///
/// When origin lists are merged or compared in this module, two origins are
/// treated as the same if their `node` matches; `source_kind` is not consulted.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct TaintOrigin {
/// Graph node the taint originated from.
pub node: NodeIndex,
/// Classification of the source at that node.
pub source_kind: SourceKind,
}
/// Compact bitset for up to 64 variables (indexed by SymbolId ordinal).
///
/// Ordinals of 64 and above do not fit: inserting them is a no-op and
/// `contains` reports `false` for them.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SmallBitSet(u64);

impl SmallBitSet {
    /// Single-bit mask for `id`, or `0` when the ordinal is out of range.
    fn mask(id: SymbolId) -> u64 {
        match id.0 {
            idx if idx < 64 => 1u64 << idx,
            _ => 0,
        }
    }

    /// The set with no members.
    pub fn empty() -> Self {
        SmallBitSet(0)
    }

    /// Add `id` to the set (ignored when its ordinal is 64 or larger).
    pub fn insert(&mut self, id: SymbolId) {
        self.0 |= Self::mask(id);
    }

    /// Whether `id` is a member of the set.
    pub fn contains(&self, id: SymbolId) -> bool {
        self.0 & Self::mask(id) != 0
    }

    /// Union: self | other
    pub fn union(self, other: Self) -> Self {
        SmallBitSet(self.0 | other.0)
    }

    /// Intersection: self & other
    pub fn intersection(self, other: Self) -> Self {
        SmallBitSet(self.0 & other.0)
    }

    /// Whether the set has no members.
    #[allow(dead_code)]
    pub fn is_empty(self) -> bool {
        self.0 == 0
    }

    /// Whether self is a subset of other (no bit of `self` is missing from `other`).
    #[allow(dead_code)] // used by Lattice::leq
    pub fn is_subset_of(self, other: Self) -> bool {
        self.0 & !other.0 == 0
    }

    /// Whether self is a superset of other.
    #[allow(dead_code)] // used by Lattice::leq
    pub fn is_superset_of(self, other: Self) -> bool {
        other.0 & !self.0 == 0
    }
}
/// Monotone predicate summary per variable.
///
/// Records, as two bitmasks, which whitelisted predicate kinds are known to
/// be true and which are known to be false on ALL paths reaching a point.
/// Joining two summaries keeps only the bits present in both (must-hold
/// semantics).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PredicateSummary {
    /// Bitmask: bit 0=NullCheck, 1=EmptyCheck, 2=ErrorCheck
    pub known_true: u8,
    /// Same bit layout as `known_true`, for kinds known to be false.
    pub known_false: u8,
}

impl PredicateSummary {
    /// A summary that knows nothing.
    pub fn empty() -> Self {
        PredicateSummary {
            known_true: 0,
            known_false: 0,
        }
    }

    /// Join = intersection (only predicates established on ALL paths survive).
    pub fn join(self, other: Self) -> Self {
        let known_true = self.known_true & other.known_true;
        let known_false = self.known_false & other.known_false;
        Self {
            known_true,
            known_false,
        }
    }

    /// Check for contradiction: some kind is recorded as both true and false.
    pub fn has_contradiction(self) -> bool {
        (self.known_true & self.known_false) != 0
    }

    /// Whether neither mask has any bit set.
    pub fn is_empty(self) -> bool {
        (self.known_true | self.known_false) == 0
    }
}
/// Bit index (0-2) used for a whitelisted `PredicateKind` inside the
/// `PredicateSummary` masks: NullCheck → 0, EmptyCheck → 1, ErrorCheck → 2.
/// Every other kind is not tracked and yields `None`.
pub fn predicate_kind_bit(kind: PredicateKind) -> Option<u8> {
    let bit = match kind {
        PredicateKind::NullCheck => 0,
        PredicateKind::EmptyCheck => 1,
        PredicateKind::ErrorCheck => 2,
        _ => return None,
    };
    Some(bit)
}
/// The abstract taint state at a program point.
///
/// Uses sorted SmallVec keyed by SymbolId for O(n) merge-join.
/// Variables beyond the interner's capacity are naturally excluded.
///
/// The lookup and update methods rely on `vars` and `predicates` staying
/// sorted by `SymbolId` (they use binary search); code that writes the
/// fields directly must preserve that ordering.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TaintState {
/// Per-variable taint, sorted by SymbolId.
pub vars: SmallVec<[(SymbolId, VarTaint); 16]>,
/// Variables validated on ALL paths (intersection on join).
pub validated_must: SmallBitSet,
/// Variables validated on ANY path (union on join).
pub validated_may: SmallBitSet,
/// Per-variable predicate summary (sorted by SymbolId).
/// A variable without an entry is treated as having an empty summary.
pub predicates: SmallVec<[(SymbolId, PredicateSummary); 4]>,
}
impl TaintState {
/// Create the initial state (no taint, no validation, no predicates).
pub fn initial() -> Self {
Self {
vars: SmallVec::new(),
validated_must: SmallBitSet::empty(),
validated_may: SmallBitSet::empty(),
predicates: SmallVec::new(),
}
}
/// Look up taint for a variable.
pub fn get(&self, sym: SymbolId) -> Option<&VarTaint> {
self.vars
.binary_search_by_key(&sym, |(id, _)| *id)
.ok()
.map(|idx| &self.vars[idx].1)
}
/// Insert or update taint for a variable.
pub fn set(&mut self, sym: SymbolId, taint: VarTaint) {
match self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
Ok(idx) => self.vars[idx].1 = taint,
Err(idx) => self.vars.insert(idx, (sym, taint)),
}
}
/// Remove taint for a variable.
pub fn remove(&mut self, sym: SymbolId) {
if let Ok(idx) = self.vars.binary_search_by_key(&sym, |(id, _)| *id) {
self.vars.remove(idx);
}
}
/// Set a predicate summary for a variable.
pub fn set_predicate(&mut self, sym: SymbolId, summary: PredicateSummary) {
match self.predicates.binary_search_by_key(&sym, |(id, _)| *id) {
Ok(idx) => self.predicates[idx].1 = summary,
Err(idx) => self.predicates.insert(idx, (sym, summary)),
}
}
/// Get predicate summary for a variable.
pub fn get_predicate(&self, sym: SymbolId) -> PredicateSummary {
self.predicates
.binary_search_by_key(&sym, |(id, _)| *id)
.ok()
.map(|idx| self.predicates[idx].1)
.unwrap_or_else(PredicateSummary::empty)
}
/// Check if any variable has contradictory predicates.
pub fn has_contradiction(&self) -> bool {
self.predicates.iter().any(|(_, s)| s.has_contradiction())
}
}
impl Lattice for TaintState {
    /// The bottom element is the initial (empty) state.
    ///
    /// NOTE(review): `validated_must` and `predicates` are combined by
    /// intersection in `join`, so joining any state with `bot()` clears
    /// them. `bot()` is therefore an identity for `join` only on states that
    /// carry neither — confirm the engine never joins a reached state with a
    /// bare `bot()`.
    fn bot() -> Self {
        Self::initial()
    }

    /// Combine two states arriving at the same program point.
    fn join(&self, other: &Self) -> Self {
        TaintState {
            // Per-variable: caps OR-ed, origins merged (bounded).
            vars: merge_join_vars(&self.vars, &other.vars),
            // Must hold on ALL paths → intersection.
            validated_must: self.validated_must.intersection(other.validated_must),
            // Holds on ANY path → union.
            validated_may: self.validated_may.union(other.validated_may),
            // Per-key intersection of known_true / known_false bits.
            predicates: merge_join_predicates(&self.predicates, &other.predicates),
        }
    }

    /// Partial order consistent with `join`: `self ⊑ other` when
    ///
    /// * every variable of `self` is in `other` with at least the same caps
    ///   and origins,
    /// * `self.validated_must ⊇ other.validated_must` (join intersects, so
    ///   the larger must-set is the lower element),
    /// * `self.validated_may ⊆ other.validated_may`,
    /// * `self` knows at least the predicate bits that `other` knows.
    fn leq(&self, other: &Self) -> bool {
        vars_leq(&self.vars, &other.vars)
            && self.validated_must.is_superset_of(other.validated_must)
            && self.validated_may.is_subset_of(other.validated_may)
            && predicates_leq(&self.predicates, &other.predicates)
    }
}
/// Merge-join two var lists that are each sorted by `SymbolId`.
///
/// Variables present on only one side are copied through unchanged. For a
/// variable present on both sides the caps are OR-ed and the origin lists are
/// combined with `merge_origins` (bounded). The result is sorted as well.
fn merge_join_vars(
    a: &[(SymbolId, VarTaint)],
    b: &[(SymbolId, VarTaint)],
) -> SmallVec<[(SymbolId, VarTaint); 16]> {
    use std::cmp::Ordering;

    let mut merged: SmallVec<[(SymbolId, VarTaint); 16]> =
        SmallVec::with_capacity(a.len().max(b.len()));
    let (mut ai, mut bi) = (0, 0);
    loop {
        match (a.get(ai), b.get(bi)) {
            (Some(left), Some(right)) => match left.0.cmp(&right.0) {
                Ordering::Less => {
                    merged.push(left.clone());
                    ai += 1;
                }
                Ordering::Greater => {
                    merged.push(right.clone());
                    bi += 1;
                }
                Ordering::Equal => {
                    let combined = VarTaint {
                        caps: left.1.caps | right.1.caps,
                        origins: merge_origins(&left.1.origins, &right.1.origins),
                    };
                    merged.push((left.0, combined));
                    ai += 1;
                    bi += 1;
                }
            },
            // One side is exhausted: copy whatever is left of the other.
            (Some(_), None) => {
                merged.extend(a[ai..].iter().cloned());
                break;
            }
            (None, Some(_)) => {
                merged.extend(b[bi..].iter().cloned());
                break;
            }
            (None, None) => break,
        }
    }
    merged
}
/// Combine two origin lists into one.
///
/// Starts from a copy of `a` and appends the origins of `b` whose `node` is
/// not already present, in order, stopping as soon as the list holds
/// `MAX_ORIGINS_PER_VAR` entries. `a` itself is copied as-is and is never
/// shortened.
fn merge_origins(
    a: &SmallVec<[TaintOrigin; 2]>,
    b: &SmallVec<[TaintOrigin; 2]>,
) -> SmallVec<[TaintOrigin; 2]> {
    let mut combined = a.clone();
    for candidate in b.iter() {
        if combined.len() >= MAX_ORIGINS_PER_VAR {
            break;
        }
        let already_known = combined.iter().any(|seen| seen.node == candidate.node);
        if !already_known {
            combined.push(*candidate);
        }
    }
    combined
}
/// Check if a.vars ⊑ b.vars.
///
/// Holds when every variable in `a` also appears in `b`, and for each such
/// variable `a`'s caps are a subset of `b`'s and every origin node of `a` is
/// among `b`'s origin nodes. Variables that exist only in `b` are ignored.
/// Both lists must be sorted by `SymbolId`.
#[allow(dead_code)] // called by Lattice::leq
fn vars_leq(a: &[(SymbolId, VarTaint)], b: &[(SymbolId, VarTaint)]) -> bool {
    // Part of `b` not yet consumed; only ever shrinks from the front.
    let mut remaining = b;
    for (sym, mine) in a {
        // Skip keys that exist only in `b`.
        let skipped = remaining
            .iter()
            .take_while(|(other, _)| other < sym)
            .count();
        remaining = &remaining[skipped..];

        // `b` exhausted, or its next key is larger → `sym` is missing from `b`.
        let Some(((other, theirs), tail)) = remaining.split_first() else {
            return false;
        };
        if other != sym {
            return false;
        }
        remaining = tail;

        // Cap subset check
        if mine.caps & theirs.caps != mine.caps {
            return false;
        }
        // Origins subset check (by node)
        let origins_covered = mine
            .origins
            .iter()
            .all(|origin| theirs.origins.iter().any(|known| known.node == origin.node));
        if !origins_covered {
            return false;
        }
    }
    true
}
/// Merge-join two predicate lists (each sorted by `SymbolId`) with
/// intersection semantics.
///
/// Only variables present on both sides can survive, because a missing entry
/// counts as an empty summary. Their summaries are joined bitwise, and
/// entries whose joined summary is empty are dropped.
fn merge_join_predicates(
    a: &[(SymbolId, PredicateSummary)],
    b: &[(SymbolId, PredicateSummary)],
) -> SmallVec<[(SymbolId, PredicateSummary); 4]> {
    use std::cmp::Ordering;

    let mut merged: SmallVec<[(SymbolId, PredicateSummary); 4]> = SmallVec::new();
    let (mut lhs, mut rhs) = (a, b);
    // Stop as soon as either side runs out: leftovers intersect with nothing.
    while let (Some((left_sym, left)), Some((right_sym, right))) = (lhs.first(), rhs.first()) {
        match left_sym.cmp(right_sym) {
            // Key on one side only → drop it.
            Ordering::Less => lhs = &lhs[1..],
            Ordering::Greater => rhs = &rhs[1..],
            Ordering::Equal => {
                let both = left.join(*right);
                if !both.is_empty() {
                    merged.push((*left_sym, both));
                }
                lhs = &lhs[1..];
                rhs = &rhs[1..];
            }
        }
    }
    merged
}
/// Check if a.predicates ⊑ b.predicates.
///
/// More precise (more known bits) = lower in the lattice. So `a ⊑ b` holds
/// when, for every key in `b`, `a` has an entry for the same key whose
/// `known_true` and `known_false` masks each contain all of `b`'s bits.
/// Keys that exist only in `a` are extra knowledge and are fine. Both lists
/// must be sorted by `SymbolId`.
#[allow(dead_code)] // called by Lattice::leq
fn predicates_leq(a: &[(SymbolId, PredicateSummary)], b: &[(SymbolId, PredicateSummary)]) -> bool {
    // Part of `a` not yet consumed; only ever shrinks from the front.
    let mut remaining = a;
    for (sym, required) in b {
        // Keys only in `a` (more info) are skipped.
        let skipped = remaining
            .iter()
            .take_while(|(have, _)| have < sym)
            .count();
        remaining = &remaining[skipped..];

        // `a` exhausted, or its next key is larger → `a` lacks this key.
        let Some(((have, held), tail)) = remaining.split_first() else {
            return false;
        };
        if have != sym {
            return false;
        }
        remaining = tail;

        // `a`'s masks must be supersets of `b`'s.
        if held.known_true & required.known_true != required.known_true {
            return false;
        }
        if held.known_false & required.known_false != required.known_false {
            return false;
        }
    }
    true
}
#[cfg(test)]
mod tests {
// Unit tests for the taint domain: lattice laws of `TaintState` (checked on
// hand-built example states), join behaviour of each component, and the
// `SmallBitSet` helper.
use super::*;
// Build a `(symbol, taint)` entry with the given caps and no origins.
fn make_taint(sym: u32, caps: Cap) -> (SymbolId, VarTaint) {
(
SymbolId(sym),
VarTaint {
caps,
origins: SmallVec::new(),
},
)
}
// Build a `(symbol, taint)` entry with a single origin at graph node `node`.
fn make_taint_with_origin(sym: u32, caps: Cap, node: usize) -> (SymbolId, VarTaint) {
(
SymbolId(sym),
VarTaint {
caps,
origins: smallvec::smallvec![TaintOrigin {
node: NodeIndex::new(node),
source_kind: SourceKind::Unknown,
}],
},
)
}
// Build a state whose `vars` are exactly `vars` (callers pass them sorted by
// symbol); all other components stay at their initial values.
fn state_with_vars(vars: Vec<(SymbolId, VarTaint)>) -> TaintState {
let mut s = TaintState::initial();
s.vars = SmallVec::from_vec(vars);
s
}
// ── Lattice property tests ──────────────────────────────────────────
// Note: these states carry vars only; validated sets and predicates are empty.
#[test]
fn bot_identity() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
assert_eq!(a.join(&TaintState::bot()), a);
assert_eq!(TaintState::bot().join(&a), a);
}
#[test]
fn join_commutativity() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
let b = state_with_vars(vec![make_taint(1, Cap::SHELL_ESCAPE)]);
assert_eq!(a.join(&b), b.join(&a));
}
#[test]
fn join_associativity() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
let b = state_with_vars(vec![make_taint(0, Cap::SHELL_ESCAPE)]);
let c = state_with_vars(vec![make_taint(1, Cap::HTML_ESCAPE)]);
assert_eq!(a.join(&b).join(&c), a.join(&b.join(&c)));
}
#[test]
fn join_idempotency() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
assert_eq!(a.join(&a), a);
}
#[test]
fn leq_reflexive() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
assert!(a.leq(&a));
}
#[test]
fn leq_consistent_with_join() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
let b = state_with_vars(vec![make_taint(0, Cap::ENV_VAR | Cap::SHELL_ESCAPE)]);
assert!(a.leq(&b));
assert_eq!(a.join(&b), b);
}
// ── Component-wise join behaviour ───────────────────────────────────
#[test]
fn join_merges_caps() {
let a = state_with_vars(vec![make_taint(0, Cap::ENV_VAR)]);
let b = state_with_vars(vec![make_taint(0, Cap::SHELL_ESCAPE)]);
let joined = a.join(&b);
assert_eq!(
joined.get(SymbolId(0)).unwrap().caps,
Cap::ENV_VAR | Cap::SHELL_ESCAPE
);
}
#[test]
fn join_merges_origins() {
let a = state_with_vars(vec![make_taint_with_origin(0, Cap::ENV_VAR, 1)]);
let b = state_with_vars(vec![make_taint_with_origin(0, Cap::ENV_VAR, 2)]);
let joined = a.join(&b);
assert_eq!(joined.get(SymbolId(0)).unwrap().origins.len(), 2);
}
#[test]
fn validated_must_intersection() {
let mut a = TaintState::initial();
a.validated_must.insert(SymbolId(0));
a.validated_must.insert(SymbolId(1));
let mut b = TaintState::initial();
b.validated_must.insert(SymbolId(1));
b.validated_must.insert(SymbolId(2));
let joined = a.join(&b);
// Only symbol 1 is validated on both sides.
assert!(!joined.validated_must.contains(SymbolId(0)));
assert!(joined.validated_must.contains(SymbolId(1)));
assert!(!joined.validated_must.contains(SymbolId(2)));
}
#[test]
fn validated_may_union() {
let mut a = TaintState::initial();
a.validated_may.insert(SymbolId(0));
let mut b = TaintState::initial();
b.validated_may.insert(SymbolId(1));
let joined = a.join(&b);
assert!(joined.validated_may.contains(SymbolId(0)));
assert!(joined.validated_may.contains(SymbolId(1)));
}
#[test]
fn predicate_contradiction() {
let mut state = TaintState::initial();
state.set_predicate(
SymbolId(0),
PredicateSummary {
known_true: 1, // NullCheck true
known_false: 1, // NullCheck false
},
);
assert!(state.has_contradiction());
}
#[test]
fn predicate_no_contradiction() {
let mut state = TaintState::initial();
state.set_predicate(
SymbolId(0),
PredicateSummary {
known_true: 1, // NullCheck true
known_false: 2, // EmptyCheck false (different kind)
},
);
assert!(!state.has_contradiction());
}
#[test]
fn predicate_join_intersection() {
let mut a = TaintState::initial();
a.set_predicate(
SymbolId(0),
PredicateSummary {
known_true: 0b011, // NullCheck + EmptyCheck
known_false: 0,
},
);
let mut b = TaintState::initial();
b.set_predicate(
SymbolId(0),
PredicateSummary {
known_true: 0b010, // EmptyCheck only
known_false: 0,
},
);
let joined = a.join(&b);
let pred = joined.get_predicate(SymbolId(0));
assert_eq!(pred.known_true, 0b010); // only EmptyCheck on both paths
}
// ── SmallBitSet tests ───────────────────────────────────────────────
#[test]
fn small_bitset_basic() {
let mut bs = SmallBitSet::empty();
assert!(bs.is_empty());
bs.insert(SymbolId(0));
assert!(bs.contains(SymbolId(0)));
assert!(!bs.contains(SymbolId(1)));
assert!(!bs.is_empty());
}
#[test]
fn small_bitset_union_intersection() {
let mut a = SmallBitSet::empty();
a.insert(SymbolId(0));
a.insert(SymbolId(2));
let mut b = SmallBitSet::empty();
b.insert(SymbolId(1));
b.insert(SymbolId(2));
let u = a.union(b);
assert!(u.contains(SymbolId(0)));
assert!(u.contains(SymbolId(1)));
assert!(u.contains(SymbolId(2)));
let i = a.intersection(b);
assert!(!i.contains(SymbolId(0)));
assert!(!i.contains(SymbolId(1)));
assert!(i.contains(SymbolId(2)));
}
}

View file

@ -1,11 +1,21 @@
use crate::cfg::{Cfg, FuncSummaries, NodeInfo, StmtKind};
pub mod domain;
pub mod path_state;
pub mod transfer;
use crate::cfg::{Cfg, FuncSummaries};
use crate::interop::InteropEdge;
use crate::labels::{Cap, DataLabel, SourceKind};
use crate::labels::SourceKind;
use crate::state::engine::{self, MAX_TRACKED_VARS};
use crate::state::lattice::Lattice;
use crate::state::symbol::SymbolInterner;
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
use domain::TaintState;
use path_state::PredicateKind;
use petgraph::graph::NodeIndex;
use std::collections::HashMap;
use tracing::debug;
use petgraph::visit::IntoNodeReferences;
use std::collections::HashSet;
use transfer::{TaintEvent, TaintTransfer};
/// A detected taint finding with both source and sink locations.
#[derive(Debug, Clone)]
@ -20,269 +30,23 @@ pub struct Finding {
pub path: Vec<NodeIndex>,
/// The kind of source that originated the taint.
pub source_kind: SourceKind,
}
/// Order-independent hash of a taint map.
///
/// Every `(name, caps)` entry is hashed on its own (FNV-1a-style mixing of
/// the key bytes followed by the cap bits) and the per-entry hashes are
/// XOR-ed together, so the result does not depend on iteration order and
/// needs no allocation or sorting. An empty map hashes to `0`.
fn taint_hash(taint: &HashMap<String, Cap>) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    taint
        .iter()
        .map(|(name, caps)| {
            let key_hash = name.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
                (acc ^ byte as u64).wrapping_mul(FNV_PRIME)
            });
            (key_hash ^ caps.bits() as u64).wrapping_mul(FNV_PRIME)
        })
        .fold(0u64, |acc, entry_hash| acc ^ entry_hash)
}
/// Resolved summary for a callee — a uniform view regardless of whether the
/// summary came from a local (same-file) or global (cross-file) source.
struct ResolvedSummary {
/// Source capability bits copied from the callee's summary.
source_caps: Cap,
/// Sanitizer capability bits copied from the callee's summary.
sanitizer_caps: Cap,
/// Sink capability bits copied from the callee's summary.
sink_caps: Cap,
/// The summary's `propagates_taint` flag, copied verbatim.
propagates_taint: bool,
}
/// Resolves `callee` to a [`ResolvedSummary`] using conservative,
/// same-language lookup.
///
/// Returns `None` when the callee is unknown or when the match is ambiguous.
///
/// Lookup order:
/// 1. Local (same-file) summaries: exact name, same language, same namespace.
/// 2. Global same-language summaries via `lookup_same_lang`; several hits are
///    narrowed by the caller's namespace and must end up unambiguous.
/// 3. Interop edges: explicit cross-language bridges.
/// 4. Nothing else — there is no implicit cross-language fallback.
#[allow(clippy::too_many_arguments)]
fn resolve_callee(
    callee: &str,
    caller_lang: Lang,
    caller_namespace: &str,
    caller_func: &str,
    call_ordinal: u32,
    local: &FuncSummaries,
    global: Option<&GlobalSummaries>,
    interop_edges: &[InteropEdge],
) -> Option<ResolvedSummary> {
    // 1) Local (same-file): name + language + namespace must all agree.
    let local_hits: Vec<_> = local
        .iter()
        .filter(|(key, _)| {
            key.name == callee && key.lang == caller_lang && key.namespace == caller_namespace
        })
        .collect();
    match local_hits.as_slice() {
        // No local candidate: continue with the global lookup.
        [] => {}
        [(_, summary)] => {
            return Some(ResolvedSummary {
                source_caps: summary.source_caps,
                sanitizer_caps: summary.sanitizer_caps,
                sink_caps: summary.sink_caps,
                propagates_taint: summary.propagates_taint,
            });
        }
        // Several local candidates. Arity-based disambiguation may come
        // later; until then the conservative answer is "unresolved".
        _ => return None,
    }

    // 2) Global same-language lookup.
    if let Some(gs) = global {
        let candidates = gs.lookup_same_lang(caller_lang, callee);
        match candidates.len() {
            0 => {}
            1 => {
                let (_, fs) = &candidates[0];
                return Some(ResolvedSummary {
                    source_caps: fs.source_caps(),
                    sanitizer_caps: fs.sanitizer_caps(),
                    sink_caps: fs.sink_caps(),
                    propagates_taint: fs.propagates_taint,
                });
            }
            _ => {
                // Several candidates: only a unique hit in the caller's own
                // namespace is accepted; anything else stays ambiguous.
                let same_ns: Vec<_> = candidates
                    .iter()
                    .filter(|(key, _)| key.namespace == caller_namespace)
                    .collect();
                return match same_ns.as_slice() {
                    [(_, fs)] => Some(ResolvedSummary {
                        source_caps: fs.source_caps(),
                        sanitizer_caps: fs.sanitizer_caps(),
                        sink_caps: fs.sink_caps(),
                        propagates_taint: fs.propagates_taint,
                    }),
                    _ => None,
                };
            }
        }
    }

    // 3) Interop edges: explicit cross-language bridges. The first matching
    //    edge whose target exists in the global summaries wins.
    // 4) If none does, the result is `None` — no cross-language fallback.
    interop_edges
        .iter()
        .filter(|edge| {
            let from = &edge.from;
            from.caller_lang == caller_lang
                && from.caller_namespace == caller_namespace
                && from.callee_symbol == callee
                && (from.caller_func.is_empty() || from.caller_func == caller_func)
                && (from.ordinal == 0 || from.ordinal == call_ordinal)
        })
        .find_map(|edge| {
            // Look the target up in the global summaries by its exact key.
            let fs = global?.get(&edge.to)?;
            Some(ResolvedSummary {
                source_caps: fs.source_caps(),
                sanitizer_caps: fs.sanitizer_caps(),
                sink_caps: fs.sink_caps(),
                propagates_taint: fs.propagates_taint,
            })
        })
}
/// Apply taint transfer for a single node, mutating `out` in place.
///
/// Callers should clone the taint map before calling if they need
/// the original state preserved.
fn apply_taint(
node: &NodeInfo,
out: &mut HashMap<String, Cap>,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
caller_lang: Lang,
caller_namespace: &str,
interop_edges: &[InteropEdge],
) {
debug!(target: "taint", "Applying taint to node: {:?}", node);
debug!(target: "taint", "Taint: {:?}", out);
let caller_func = node.enclosing_func.as_deref().unwrap_or("");
match node.label {
// A new untrusted value enters the program
Some(DataLabel::Source(bits)) => {
if let Some(v) = &node.defines {
out.insert(v.clone(), bits);
}
}
// Sanitizer: propagate input taint through the assignment FIRST,
// then strip the sanitizer's capability bits. This ensures that
// `let y = sanitize_html(&x)` gives y the taint of x minus the
// HTML_ESCAPE bit — rather than leaving y completely clean (which
// would hide "wrong sanitiser for this sink" bugs).
Some(DataLabel::Sanitizer(bits)) => {
if let Some(v) = &node.defines {
// 1. Propagate: union taint from all read variables
let mut combined = Cap::empty();
for u in &node.uses {
if let Some(b) = out.get(u) {
combined |= *b;
}
}
// 2. Strip the sanitiser's bits
let new = combined & !bits;
if new.is_empty() {
out.remove(v);
} else {
out.insert(v.clone(), new);
}
}
}
// A function call — resolve against local + global summaries
_ if node.kind == StmtKind::Call => {
if let Some(callee) = &node.callee
&& let Some(resolved) = resolve_callee(
callee,
caller_lang,
caller_namespace,
caller_func,
node.call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
{
// Build the return value's taint bits in stages, then
// write once at the end. Order matters:
//
// 1. Start with fresh source taint (if the callee is a source)
// 2. Union with propagated arg taint (if the callee propagates)
// 3. Strip sanitizer bits last (so sanitization always wins)
let mut return_bits = Cap::empty();
// ── 1. Source behaviour ──
return_bits |= resolved.source_caps;
// ── 2. Propagation ──
if resolved.propagates_taint {
for u in &node.uses {
if let Some(bits) = out.get(u) {
return_bits |= *bits;
}
}
}
// ── 3. Sanitizer behaviour (applied last so it always wins) ──
return_bits &= !resolved.sanitizer_caps;
// ── Write the result ──
if let Some(v) = &node.defines {
if return_bits.is_empty() {
out.remove(v);
} else {
out.insert(v.clone(), return_bits);
}
}
// ── Sink behaviour: handled in the main analysis loop
// (checked via node.label or resolved summary) ──
return;
}
// Unresolved call — fall through to default gen/kill below
}
// All other statements: classic gen/kill for assignments
_ => {}
}
// Default gen/kill: propagate taint through variable assignments
if !matches!(
node.label,
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
) && let Some(d) = &node.defines
{
let mut combined = Cap::empty();
for u in &node.uses {
if let Some(bits) = out.get(u) {
combined |= *bits;
}
}
if combined.is_empty() {
out.remove(d);
} else {
out.insert(d.clone(), combined);
}
}
/// Whether all tainted sink variables are guarded by a validation
/// predicate on this path (metadata only — does not change severity).
#[allow(dead_code)] // surfaced in Diag output (task 4)
pub path_validated: bool,
/// The kind of validation guard protecting this path, if any.
#[allow(dead_code)] // surfaced in Diag output (task 4)
pub guard_kind: Option<PredicateKind>,
}
/// Run taint analysis on a single file's CFG.
///
/// `global_summaries` is `None` for pass1 / singlefile mode and
/// `Some(&map)` for pass2 crossfile analysis.
/// Uses a monotone forward dataflow analysis via `state::engine::run_forward`
/// with the `TaintTransfer` function. Termination is guaranteed by lattice
/// finiteness (bounded `Cap` bits × bounded variable count).
///
/// For JS/TS files: uses a two-level solve to prevent cross-function taint
/// leakage while preserving global-to-function flows.
pub fn analyse_file(
cfg: &Cfg,
entry: NodeIndex,
@ -292,162 +56,155 @@ pub fn analyse_file(
caller_namespace: &str,
interop_edges: &[InteropEdge],
) -> Vec<Finding> {
use std::collections::{HashMap, HashSet, VecDeque};
let _span = tracing::debug_span!("taint_analyse_file").entered();
/// Queue item: current CFG node + taint map that holds here
#[derive(Clone)]
struct Item {
node: NodeIndex,
taint: HashMap<String, Cap>,
// 1. Build symbol interner from CFG
let interner = SymbolInterner::from_cfg(cfg);
if interner.len() > MAX_TRACKED_VARS {
tracing::warn!(
symbols = interner.len(),
max = MAX_TRACKED_VARS,
"taint analysis: too many variables, some will be ignored"
);
}
// (node, taint_hash) → predecessor key (for path rebuild)
type Key = (NodeIndex, u64);
let mut pred: HashMap<Key, Key> = HashMap::new();
// 2. Build base transfer function
let base_transfer = TaintTransfer {
lang: caller_lang,
namespace: caller_namespace,
interner: &interner, // also used for events_to_findings below
local_summaries,
global_summaries,
interop_edges,
global_seed: None,
scope_filter: None,
};
// Seen states so we do not revisit them infinitely
let mut seen: HashSet<Key> = HashSet::new();
// 3. Run analysis (two-level for JS/TS, single-pass otherwise)
let events = if matches!(caller_lang, Lang::JavaScript | Lang::TypeScript) {
analyse_js_two_level(cfg, entry, &interner, &base_transfer)
} else {
let result = engine::run_forward(cfg, entry, &base_transfer, TaintState::initial());
result.events
};
// Resulting findings: (sink_node, source_node, full_path)
let mut findings: Vec<Finding> = Vec::new();
// 4. Convert events to findings
let mut findings = events_to_findings(&events, &interner);
let mut q = VecDeque::new();
q.push_back(Item {
node: entry,
taint: HashMap::new(),
});
seen.insert((entry, 0));
// 5. Deduplicate findings by (sink, source), prefer path_validated=true
findings.sort_by_key(|f| (f.sink.index(), f.source.index(), !f.path_validated));
findings.dedup_by_key(|f| (f.sink, f.source));
while let Some(Item { node, taint }) = q.pop_front() {
let caller_func = cfg[node].enclosing_func.as_deref().unwrap_or("");
let mut out = taint.clone();
apply_taint(
&cfg[node],
&mut out,
local_summaries,
global_summaries,
caller_lang,
caller_namespace,
interop_edges,
);
findings
}
// ── Sink check ──────────────────────────────────────────────────
// Two ways a node can be a sink:
// 1. Its AST label says Sink (existing inline labels)
// 2. Its callee resolves to a function with sink_caps (cross-file)
let sink_caps = match cfg[node].label {
Some(DataLabel::Sink(caps)) => caps,
_ => {
// check if callee resolves to a sink
cfg[node]
.callee
.as_ref()
.and_then(|c| {
resolve_callee(
c,
caller_lang,
caller_namespace,
caller_func,
cfg[node].call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
})
.filter(|r| !r.sink_caps.is_empty())
.map(|r| r.sink_caps)
.unwrap_or(Cap::empty())
}
/// JS/TS two-level solve to prevent cross-function taint leakage.
///
/// Level 1: Solve top-level code (nodes where `enclosing_func.is_none()`).
/// Level 2: For each function, solve seeded with top-level taint.
fn analyse_js_two_level(
cfg: &Cfg,
entry: NodeIndex,
_interner: &SymbolInterner,
base_transfer: &TaintTransfer,
) -> Vec<TaintEvent> {
// Level 1: solve top-level only
let toplevel_transfer = TaintTransfer {
lang: base_transfer.lang,
namespace: base_transfer.namespace,
interner: base_transfer.interner,
local_summaries: base_transfer.local_summaries,
global_summaries: base_transfer.global_summaries,
interop_edges: base_transfer.interop_edges,
global_seed: None,
scope_filter: Some(None), // top-level only (enclosing_func == None)
};
let toplevel_result =
engine::run_forward(cfg, entry, &toplevel_transfer, TaintState::initial());
// Extract top-level taint state at the last converged point
let toplevel_state = extract_exit_state(&toplevel_result.states);
// Level 2: solve each function seeded with top-level state
let mut all_events = toplevel_result.events;
let func_entries = find_function_entries(cfg);
for (func_name, func_entry) in &func_entries {
let func_transfer = TaintTransfer {
lang: base_transfer.lang,
namespace: base_transfer.namespace,
interner: base_transfer.interner,
local_summaries: base_transfer.local_summaries,
global_summaries: base_transfer.global_summaries,
interop_edges: base_transfer.interop_edges,
global_seed: Some(&toplevel_state),
scope_filter: Some(Some(func_name.as_str())),
};
if !sink_caps.is_empty() {
let bad = cfg[node]
.uses
.iter()
.any(|u| out.get(u).is_some_and(|b| (*b & sink_caps) != Cap::empty()));
if bad {
// Reconstruct path backwards from sink to source.
//
// A node is considered a "source" if:
// 1. It has an inline DataLabel::Source (same-file), OR
// 2. It is a Call whose callee resolves to a source via
// local or global summaries (cross-file).
let sink_node = node;
let mut path = vec![node];
let mut source_node = node; // fallback: sink itself
let mut key = (node, taint_hash(&taint));
let func_result =
engine::run_forward(cfg, *func_entry, &func_transfer, TaintState::initial());
all_events.extend(func_result.events);
}
while let Some(&(prev, prev_hash)) = pred.get(&key) {
path.push(prev);
all_events
}
// Check inline source label
if matches!(cfg[prev].label, Some(DataLabel::Source(_))) {
source_node = prev;
break;
}
/// Collapses the converged per-node states into a single [`TaintState`].
///
/// Starts from `TaintState::initial()` and joins in the state of every node
/// present in `states` (all recorded nodes, not only exit nodes).
fn extract_exit_state(states: &std::collections::HashMap<NodeIndex, TaintState>) -> TaintState {
    states
        .values()
        .fold(TaintState::initial(), |joined, node_state| joined.join(node_state))
}
// Check cross-file source via resolved callee summary
let prev_caller_func = cfg[prev].enclosing_func.as_deref().unwrap_or("");
if cfg[prev].kind == StmtKind::Call
&& let Some(callee) = &cfg[prev].callee
&& let Some(resolved) = resolve_callee(
callee,
caller_lang,
caller_namespace,
prev_caller_func,
cfg[prev].call_ordinal,
local_summaries,
global_summaries,
interop_edges,
)
&& !resolved.source_caps.is_empty()
{
source_node = prev;
break;
}
/// Find function entry nodes: (func_name, entry_node) pairs.
///
/// A function entry is the first node with a given `enclosing_func` value.
fn find_function_entries(cfg: &Cfg) -> Vec<(String, NodeIndex)> {
let mut seen = HashSet::new();
let mut entries = Vec::new();
key = (prev, prev_hash);
}
path.reverse();
// Infer the source kind from the source node's label and callee
let source_kind = match cfg[source_node].label {
Some(DataLabel::Source(caps)) => {
let callee = cfg[source_node].callee.as_deref().unwrap_or("");
crate::labels::infer_source_kind(caps, callee)
}
_ => SourceKind::Unknown,
};
findings.push(Finding {
sink: sink_node,
source: source_node,
path,
source_kind,
});
}
for (idx, info) in cfg.node_references() {
if let Some(ref func_name) = info.enclosing_func
&& seen.insert(func_name.clone())
{
entries.push((func_name.clone(), idx));
}
}
// enqueue successors — cache hashes to avoid recomputation
let out_h = taint_hash(&out);
let in_h = taint_hash(&taint);
let succs: Vec<_> = cfg.neighbors(node).collect();
for (i, succ) in succs.iter().enumerate() {
let key = (*succ, out_h);
if !seen.contains(&key) {
seen.insert(key);
pred.insert(key, (node, in_h));
// Move the map into the last successor to avoid a clone
let taint_for_succ = if i + 1 == succs.len() {
std::mem::take(&mut out)
} else {
out.clone()
};
q.push_back(Item {
node: *succ,
taint: taint_for_succ,
});
entries
}
/// Convert TaintEvents into Findings.
fn events_to_findings(events: &[TaintEvent], _interner: &SymbolInterner) -> Vec<Finding> {
let mut findings = Vec::new();
for event in events {
let TaintEvent::SinkReached {
sink_node,
tainted_vars,
all_validated,
guard_kind,
..
} = event;
// Collect unique origins across all tainted vars at this sink
let mut seen_origins: HashSet<(usize, usize)> = HashSet::new();
for (_sym, _caps, origins) in tainted_vars {
for origin in origins {
if seen_origins.insert((origin.node.index(), sink_node.index())) {
findings.push(Finding {
sink: *sink_node,
source: origin.node,
path: vec![origin.node, *sink_node],
source_kind: origin.source_kind,
path_validated: *all_validated,
guard_kind: *guard_kind,
});
}
}
}
}

234
src/taint/path_state.rs Normal file
View file

@ -0,0 +1,234 @@
// ─── PredicateKind ───────────────────────────────────────────────────────────
/// Classification of what an if-condition tests.
///
/// Determined by heuristic analysis of the raw condition text.
/// Classification is conservative: prefer [`Unknown`](PredicateKind::Unknown)
/// over a wrong guess.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PredicateKind {
    /// `x.is_none()`, `x == null`, `x == nil`, `x is None`
    NullCheck,
    /// `x.is_empty()`, `x.len() == 0`, `x == ""`
    EmptyCheck,
    /// `x.is_err()`, `x.is_ok()`, `err != nil`
    ErrorCheck,
    /// Call to a validation/guard function: `validate(x)`, `is_safe(x)`
    ValidationCall,
    /// Call to a sanitizer function: `sanitize(x)`, `escape(x)`
    SanitizerCall,
    /// Comparison operators: `x == 5`, `x > threshold`
    Comparison,
    /// Generic boolean test — cannot classify further.
    Unknown,
}

/// Classify a raw condition text into a [`PredicateKind`].
///
/// Matching is case-insensitive (ASCII) and purely textual. The checks run
/// in a fixed priority order: error checks, null checks, empty checks,
/// call-based kinds, then comparison operators.
///
/// # Rules
///
/// - Empty text → [`Unknown`](PredicateKind::Unknown).
/// - `ValidationCall` / `SanitizerCall` require a `(` in the text **and** a
///   matching callee token. The callee token is the identifier that directly
///   precedes the first `(`, so a negation or a preceding operand does not
///   hide it (`!is_safe(x)`, `ready && is_safe(x)`), and text that merely
///   appears earlier in the condition is not mistaken for the callee
///   (`is_valid_flag && run(x)`). This also avoids misclassifying
///   comparisons like `x_valid == true`.
/// - Prefers [`Unknown`](PredicateKind::Unknown) over false positives.
pub fn classify_condition(text: &str) -> PredicateKind {
    if text.is_empty() {
        return PredicateKind::Unknown;
    }
    let lower = text.to_ascii_lowercase();

    // ── Error checks (before null checks: `err != nil` is an error check,
    //    not a null check, even though it contains `!= nil`) ──────────────
    if lower.contains("is_err")
        || lower.contains("is_ok")
        || lower.contains("err != nil")
        || lower.contains("err == nil")
        || lower.contains("error != nil")
        || lower.contains("error == nil")
    {
        return PredicateKind::ErrorCheck;
    }

    // ── Null checks ──────────────────────────────────────────────────────
    if lower.contains("is_none")
        || lower.contains("is_some")
        || lower.contains("== none")
        || lower.contains("!= none")
        || lower.contains("is none")
        || lower.contains("is not none")
        || lower.contains("== null")
        || lower.contains("!= null")
        || lower.contains("=== null")
        || lower.contains("!== null")
        || lower.contains("== nil")
        || lower.contains("!= nil")
    {
        return PredicateKind::NullCheck;
    }

    // ── Empty checks ─────────────────────────────────────────────────────
    if lower.contains("is_empty")
        || lower.contains(".len() == 0")
        || lower.contains(".len() != 0")
        || lower.contains(".length == 0")
        || lower.contains(".length === 0")
        || lower.contains(".length != 0")
        || lower.contains(".length !== 0")
        || lower.contains("== \"\"")
        || lower.contains("== ''")
    {
        return PredicateKind::EmptyCheck;
    }

    // ── Call-based kinds (require `(` to be present) ─────────────────────
    if lower.contains('(') {
        // Everything in front of the first `(`.
        let callee_part = lower.split('(').next().unwrap_or("");
        // The bare callee name is the identifier that ends right before the
        // `(`: cut at the last character that cannot be part of an
        // identifier. This drops receivers and paths (`x.`, `a::`) as well as
        // prefixes such as `!` or `cond && `, which would otherwise defeat
        // the `starts_with` checks below.
        let bare = callee_part
            .trim_end()
            .rsplit(|c: char| !(c.is_alphanumeric() || c == '_'))
            .next()
            .unwrap_or("");

        // Validation
        if bare.contains("valid")
            || bare.contains("check")
            || bare.contains("verify")
            || bare.starts_with("is_safe")
            || bare.starts_with("is_authorized")
            || bare.starts_with("is_authenticated")
        {
            return PredicateKind::ValidationCall;
        }

        // Sanitizer
        if bare.contains("sanitiz") || bare.contains("escape") || bare.contains("encode") {
            return PredicateKind::SanitizerCall;
        }
    }

    // ── Comparison operators ─────────────────────────────────────────────
    if lower.contains("==")
        || lower.contains("!=")
        || lower.contains(">=")
        || lower.contains("<=")
        || lower.contains(" > ")
        || lower.contains(" < ")
    {
        return PredicateKind::Comparison;
    }

    PredicateKind::Unknown
}
// ─── Tests ───────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    // Variants are glob-imported and the function aliased so every case fits
    // on one line and a failure still points at the exact case.
    use super::PredicateKind::*;
    use super::classify_condition as classify;

    // ── classify_condition ────────────────────────────────────────────────

    /// Empty condition text cannot be classified.
    #[test]
    fn classify_empty_is_unknown() {
        assert_eq!(classify(""), Unknown);
    }

    /// Null/None/nil tests across several language spellings.
    #[test]
    fn classify_null_checks() {
        assert_eq!(classify("x.is_none()"), NullCheck);
        assert_eq!(classify("x == null"), NullCheck);
        assert_eq!(classify("x != nil"), NullCheck);
        assert_eq!(classify("x is None"), NullCheck);
        assert_eq!(classify("x === null"), NullCheck);
    }

    /// Error checks win over null checks (`err != nil`).
    #[test]
    fn classify_error_checks() {
        assert_eq!(classify("x.is_err()"), ErrorCheck);
        assert_eq!(classify("err != nil"), ErrorCheck);
        assert_eq!(classify("x.is_ok()"), ErrorCheck);
    }

    /// Emptiness tests on strings and collections.
    #[test]
    fn classify_empty_checks() {
        assert_eq!(classify("x.is_empty()"), EmptyCheck);
        assert_eq!(classify("x.len() == 0"), EmptyCheck);
        assert_eq!(classify("x.length === 0"), EmptyCheck);
    }

    /// Calls whose callee name looks like a validation/guard function.
    #[test]
    fn classify_validation_call() {
        assert_eq!(classify("validate(x)"), ValidationCall);
        assert_eq!(classify("is_safe(input)"), ValidationCall);
        assert_eq!(classify("check_auth(req)"), ValidationCall);
        assert_eq!(classify("input.verify(sig)"), ValidationCall);
    }

    /// Without call syntax (`(`) a validation-like name is not a ValidationCall.
    #[test]
    fn classify_validation_requires_paren() {
        assert_eq!(classify("x_valid == true"), Comparison);
        assert_eq!(classify("is_valid && ready"), Unknown);
    }

    /// Calls whose callee name looks like a sanitizer.
    #[test]
    fn classify_sanitizer_call() {
        assert_eq!(classify("sanitize(x)"), SanitizerCall);
        assert_eq!(classify("html_escape(s)"), SanitizerCall);
        assert_eq!(classify("url_encode(path)"), SanitizerCall);
    }

    /// Plain comparison operators.
    #[test]
    fn classify_comparison() {
        assert_eq!(classify("x == 5"), Comparison);
        assert_eq!(classify("x != y"), Comparison);
        assert_eq!(classify("a >= b"), Comparison);
    }

    /// Anything that matches no rule falls back to Unknown.
    #[test]
    fn classify_unknown_fallback() {
        assert_eq!(classify("flag"), Unknown);
        assert_eq!(classify("a && b"), Unknown);
    }
}

View file

@ -1,6 +1,7 @@
use super::*;
use crate::cfg::FuncSummaries;
use crate::interop::InteropEdge;
use crate::labels::Cap;
use crate::symbol::FuncKey;
#[test]
@ -52,8 +53,10 @@ fn taint_through_if_else() {
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
// exactly one path (via the True branch) should be flagged
assert_eq!(findings.len(), 1);
// Both branches have findings: the true branch uses unsanitized `x`,
// the else branch uses `safe` which was sanitized with HTML_ESCAPE
// but the sink requires SHELL_ESCAPE (wrong sanitizer → still tainted).
assert_eq!(findings.len(), 2);
}
#[test]
@ -2218,3 +2221,318 @@ fn return_call_recognized_as_source() {
"foo() should have source_caps set because env::var is called inside return"
);
}
// ─── Path-sensitive analysis tests ───────────────────────────────────────────
#[test]
fn validate_and_early_return() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Guard-then-use: when validation fails the function returns early, so
    // the sink that follows the guard sits on the "validated" path.
    //
    // The CFG inserts a synthetic pass-through node for the false path, with
    // an explicit False edge leaving the If node. The sink is reached via:
    // cond → (False) → pass-through → (Seq) → sink.
    // The predicate on that False edge records that `!validate(&x)` was
    // false (i.e. validation passed), which makes the sink path-guarded.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if !validate(&x) { return; }
Command::new("sh").arg(x).status().unwrap();
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // `validate` does not kill taint, so the flow is still reported — but
    // the finding must carry the path_validated annotation, because the
    // false path (validation passed) holds a ValidationCall predicate with
    // polarity=true.
    assert_eq!(findings.len(), 1, "should still detect the taint flow");
    let finding = &findings[0];
    assert!(
        finding.path_validated,
        "finding should be marked as path_validated (early-return guard detected)"
    );
    assert_eq!(
        finding.guard_kind,
        Some(PredicateKind::ValidationCall),
        "guard_kind should be ValidationCall"
    );
}
#[test]
fn validate_in_if_else_path_validated() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // if/else whose True branch (validation passed) contains the sink.
    // Detectable because the If node has genuine True/False branches.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if validate(&x) {
Command::new("sh").arg(&x).status().unwrap();
} else {
println!("invalid input");
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    assert_eq!(findings.len(), 1, "should detect the taint flow");
    let finding = &findings[0];
    assert!(
        finding.path_validated,
        "finding should be path_validated (sink in validated branch)"
    );
    assert_eq!(
        finding.guard_kind,
        Some(PredicateKind::ValidationCall),
        "guard_kind should be ValidationCall"
    );
}
#[test]
fn sink_on_failed_validation_branch() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // The sink lives inside the branch taken when validation FAILED
    // (negated condition), so the finding must not count as validated.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if !validate(&x) {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    assert_eq!(findings.len(), 1, "should detect taint flow to sink");
    let validated = findings[0].path_validated;
    assert!(
        !validated,
        "finding should NOT be path_validated (sink is in failed-validation branch)"
    );
}
#[test]
fn contradictory_null_check_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // The inner branch is infeasible: the first guard returns early when
    // `x.is_none()`, so on the fall-through path that NullCheck has
    // polarity=false. Entering the True branch of the second, identical
    // `if x.is_none()` would need polarity=true — a contradiction.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").ok();
if x.is_none() { return; }
if x.is_none() {
Command::new("sh").arg("dangerous").status().unwrap();
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // Besides the branch being infeasible, the sink argument "dangerous" is
    // a string literal (not tainted), so no finding is expected.
    let count = findings.len();
    assert!(
        findings.is_empty(),
        "inner branch is infeasible — should produce no findings (got {})",
        count
    );
}
#[test]
fn sanitize_one_branch_no_regression() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Same program as the existing `taint_through_if_else` test: one branch
    // passes the raw value to the sink, the other passes an HTML-escaped
    // copy. Checks that path sensitivity keeps the finding count at 2.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("DANGEROUS").unwrap();
let safe = html_escape::encode_safe(&x);
if x.len() > 5 {
Command::new("sh").arg(&x).status().unwrap(); // UNSAFE
} else {
Command::new("sh").arg(&safe).status().unwrap(); // SAFE
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // Both branches are reported: the true branch uses the unsanitized `x`;
    // the else branch uses `safe`, which went through an HTML_ESCAPE
    // sanitizer while the sink needs SHELL_ESCAPE. There used to be only one
    // finding because the else_clause was silently dropped from the CFG.
    let reported = findings.len();
    assert_eq!(
        reported,
        2,
        "two findings expected (both branches reach sink with wrong/no sanitizer)"
    );
}
#[test]
fn path_state_budget_graceful() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Nine nested ifs with the sink at the innermost level. PathState is
    // expected to truncate gracefully once MAX_PATH_PREDICATES is exceeded.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if x.len() > 1 {
if x.len() > 2 {
if x.len() > 3 {
if x.len() > 4 {
if x.len() > 5 {
if x.len() > 6 {
if x.len() > 7 {
if x.len() > 8 {
if x.len() > 9 {
Command::new("sh").arg(&x).status().unwrap();
}
}
}
}
}
}
}
}
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // Truncating the path state must not turn into a false negative.
    let reported = findings.len();
    assert_eq!(
        reported,
        1,
        "should detect taint flow even with truncated PathState"
    );
}
#[test]
fn unknown_predicate_not_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Comparison predicates are NOT on the contradiction whitelist, so even
    // comparisons that look contradictory must leave the path in place.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
if x.len() > 5 { return; }
if x.len() > 5 {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // Not whitelisted → no pruning → the flow is still reported.
    let reported = findings.len();
    assert_eq!(
        reported,
        1,
        "Comparison predicate should not cause contradiction pruning"
    );
}
#[test]
fn multi_var_predicate_not_pruned() {
    use crate::cfg::build_cfg;
    use tree_sitter::Language;

    // Multi-variable conditions must never be pruned for contradiction,
    // even when the predicate kind is on the whitelist.
    let source: &[u8] = br#"
use std::env; use std::process::Command;
fn main() {
let x = env::var("INPUT").unwrap();
let y = env::var("OTHER").ok();
if y.is_none() { return; }
if y.is_none() {
Command::new("sh").arg(&x).status().unwrap();
}
}"#;

    let rust = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&rust).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (cfg, entry, summaries) = build_cfg(&tree, source, "rust", "test.rs", None);
    let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);

    // The condition `y.is_none()` mentions two identifiers: `y` and the
    // method name `is_none`. If collect_idents reports both (i.e. returns
    // ["y", "is_none"]), the predicate has 2 vars, counts as multi-var, and
    // contradiction pruning does not fire — so the finding exists.
    // NOTE(review): the actual behaviour depends on how many idents
    // collect_idents extracts from `y.is_none()`; confirm against it.
    let flow_detected = !findings.is_empty();
    assert!(
        flow_detected,
        "multi-var predicate should not be pruned; flow should be detected"
    );
}

458
src/taint/transfer.rs Normal file
View file

@ -0,0 +1,458 @@
use crate::callgraph::normalize_callee_name;
use crate::cfg::{EdgeKind, FuncSummaries, NodeInfo, StmtKind};
use crate::interop::InteropEdge;
use crate::labels::{Cap, DataLabel};
use crate::state::engine::Transfer;
use crate::state::lattice::Lattice;
use crate::state::symbol::{SymbolId, SymbolInterner};
use crate::summary::{CalleeResolution, GlobalSummaries};
use crate::symbol::Lang;
use crate::taint::domain::{TaintOrigin, TaintState, VarTaint, predicate_kind_bit};
use crate::taint::path_state::{PredicateKind, classify_condition};
use petgraph::graph::NodeIndex;
use smallvec::SmallVec;
/// Events emitted by the taint transfer function during Phase 2.
#[derive(Clone, Debug)]
pub enum TaintEvent {
/// Emitted by `TaintTransfer::apply` when a node resolves to non-empty sink
/// capabilities and at least one of the variables it uses carries taint
/// that overlaps those capabilities.
SinkReached {
/// The CFG node at which the sink was reached.
sink_node: NodeIndex,
/// One entry per offending use: the variable's symbol, the variable's
/// full capability set (not just the overlap with the sink), and the
/// origins recorded for its taint.
tainted_vars: Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)>,
/// The sink capabilities resolved for `sink_node`.
// NOTE(review): `allow(dead_code)` suggests nothing reads this field yet;
// confirm before relying on it downstream.
#[allow(dead_code)]
sink_caps: Cap,
/// `true` when every symbol in `tainted_vars` is in the state's
/// `validated_may` set at the sink.
all_validated: bool,
/// `Some(PredicateKind::ValidationCall)` when `all_validated` is true,
/// `None` otherwise.
guard_kind: Option<PredicateKind>,
},
}
/// Taint transfer function for forward dataflow analysis.
///
/// Holds the read-only context needed to interpret a single CFG node:
/// the symbol interner plus the three callee-summary sources consulted by
/// callee resolution (same-file summaries, global summaries, interop edges).
pub struct TaintTransfer<'a> {
/// Language of the file being analysed; callee candidates must match it.
pub lang: Lang,
/// Namespace of the file being analysed; used to match same-file summaries
/// and interop edges, and passed to global callee resolution.
pub namespace: &'a str,
/// Maps variable names to `SymbolId`s. Names that are not interned are
/// silently ignored by the transfer function.
pub interner: &'a SymbolInterner,
/// Same-file function summaries; consulted first during callee resolution.
pub local_summaries: &'a FuncSummaries,
/// Cross-file summaries; consulted after the local ones. Interop edges can
/// only be resolved when this is `Some`.
pub global_summaries: Option<&'a GlobalSummaries>,
/// Explicit cross-language call edges; consulted last during callee resolution.
pub interop_edges: &'a [InteropEdge],
/// For JS two-level solve: top-level taint state seeded into function solves.
/// Used as a fallback when a variable has no taint in the current state.
pub global_seed: Option<&'a TaintState>,
/// Optional scope filter: if set, only process nodes whose enclosing_func matches.
/// None = process all nodes. Some(None) = top-level only. Some(Some(name)) = function only.
pub scope_filter: Option<Option<&'a str>>,
}
impl Transfer<TaintState> for TaintTransfer<'_> {
type Event = TaintEvent;
fn apply(
&self,
node: NodeIndex,
info: &NodeInfo,
edge: Option<EdgeKind>,
mut state: TaintState,
) -> (TaintState, Vec<TaintEvent>) {
let mut events = Vec::new();
// Scope filter: skip nodes outside our scope (return state unchanged)
if let Some(ref filter) = self.scope_filter {
let node_func = info.enclosing_func.as_deref();
if node_func != *filter {
return (state, events);
}
}
let caller_func = info.enclosing_func.as_deref().unwrap_or("");
// ── Apply taint transfer ────────────────────────────────────────
match info.label {
Some(DataLabel::Source(bits)) => {
self.apply_source(node, info, bits, &mut state);
}
Some(DataLabel::Sanitizer(bits)) => {
self.apply_sanitizer(info, bits, &mut state);
}
_ if info.kind == StmtKind::Call => {
self.apply_call(node, info, caller_func, &mut state);
}
_ => {
self.apply_assignment(info, &mut state);
}
}
// ── If-node predicate handling (edge-aware) ─────────────────────
if info.kind == StmtKind::If
&& !info.condition_vars.is_empty()
&& matches!(edge, Some(EdgeKind::True) | Some(EdgeKind::False))
{
let cond_text = info.condition_text.as_deref().unwrap_or("");
let kind = classify_condition(cond_text);
let polarity = matches!(edge, Some(EdgeKind::True)) ^ info.condition_negated;
// ValidationCall handling
if kind == PredicateKind::ValidationCall && polarity {
for var in &info.condition_vars {
if let Some(sym) = self.interner.get(var) {
state.validated_may.insert(sym);
state.validated_must.insert(sym);
}
}
}
// Predicate summary for whitelisted kinds (contradiction pruning)
if let Some(bit_idx) = predicate_kind_bit(kind) {
for var in &info.condition_vars {
if let Some(sym) = self.interner.get(var) {
let mut summary = state.get_predicate(sym);
if polarity {
summary.known_true |= 1 << bit_idx;
} else {
summary.known_false |= 1 << bit_idx;
}
state.set_predicate(sym, summary);
}
}
}
// Contradiction pruning: if any variable has contradictory predicates,
// this is an infeasible path → return bot (monotonically kills branch).
if state.has_contradiction() {
return (TaintState::bot(), events);
}
}
// ── Sink check ──────────────────────────────────────────────────
let sink_caps = self.resolve_sink_caps(info, caller_func);
if !sink_caps.is_empty() {
let tainted_vars = self.collect_tainted_sink_vars(info, &state, sink_caps);
if !tainted_vars.is_empty() {
let all_validated = tainted_vars
.iter()
.all(|(sym, _, _)| state.validated_may.contains(*sym));
let guard_kind = if all_validated {
Some(PredicateKind::ValidationCall)
} else {
None
};
events.push(TaintEvent::SinkReached {
sink_node: node,
tainted_vars,
sink_caps,
all_validated,
guard_kind,
});
}
}
(state, events)
}
fn iteration_budget(&self) -> usize {
100_000
}
fn on_budget_exceeded(&self) -> bool {
tracing::warn!("taint analysis: worklist budget exceeded, returning partial results");
false
}
}
impl TaintTransfer<'_> {
/// Apply a Source label: insert taint for the defined variable.
fn apply_source(&self, node: NodeIndex, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
let callee = info.callee.as_deref().unwrap_or("");
let source_kind = crate::labels::infer_source_kind(bits, callee);
let origin = TaintOrigin { node, source_kind };
match state.get(sym) {
Some(existing) => {
let mut new_taint = existing.clone();
new_taint.caps |= bits;
if new_taint.origins.len() < 4
&& !new_taint.origins.iter().any(|o| o.node == node)
{
new_taint.origins.push(origin);
}
state.set(sym, new_taint);
}
None => {
state.set(
sym,
VarTaint {
caps: bits,
origins: SmallVec::from_elem(origin, 1),
},
);
}
}
}
}
/// Apply a Sanitizer label: propagate input taint, then strip sanitizer bits.
fn apply_sanitizer(&self, info: &NodeInfo, bits: Cap, state: &mut TaintState) {
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
let new_caps = combined_caps & !bits;
if new_caps.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: new_caps,
origins: combined_origins,
},
);
}
}
}
/// Apply a function call: resolve callee and compute return taint.
fn apply_call(
&self,
node: NodeIndex,
info: &NodeInfo,
caller_func: &str,
state: &mut TaintState,
) {
if let Some(ref callee) = info.callee
&& let Some(resolved) = self.resolve_callee(callee, caller_func, info.call_ordinal)
{
let mut return_bits = Cap::empty();
let mut return_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
// 1. Source behaviour
if !resolved.source_caps.is_empty() {
return_bits |= resolved.source_caps;
let callee_str = info.callee.as_deref().unwrap_or("");
let source_kind =
crate::labels::infer_source_kind(resolved.source_caps, callee_str);
let origin = TaintOrigin { node, source_kind };
if !return_origins.iter().any(|o| o.node == node) {
return_origins.push(origin);
}
}
// 2. Propagation
if resolved.propagates_taint {
let (use_caps, use_origins) = self.collect_uses_taint(info, state);
return_bits |= use_caps;
for orig in &use_origins {
if return_origins.len() < 4
&& !return_origins.iter().any(|o| o.node == orig.node)
{
return_origins.push(*orig);
}
}
}
// 3. Sanitizer behaviour (applied last so it always wins)
return_bits &= !resolved.sanitizer_caps;
// Write result
if let Some(ref v) = info.defines
&& let Some(sym) = self.interner.get(v)
{
if return_bits.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: return_bits,
origins: return_origins,
},
);
}
}
return;
}
// Unresolved call — fall through to default gen/kill
self.apply_assignment(info, state);
}
/// Default gen/kill: propagate taint through variable assignments.
fn apply_assignment(&self, info: &NodeInfo, state: &mut TaintState) {
if matches!(
info.label,
Some(DataLabel::Source(_)) | Some(DataLabel::Sanitizer(_))
) {
return;
}
if let Some(ref d) = info.defines
&& let Some(sym) = self.interner.get(d)
{
let (combined_caps, combined_origins) = self.collect_uses_taint(info, state);
if combined_caps.is_empty() {
state.remove(sym);
} else {
state.set(
sym,
VarTaint {
caps: combined_caps,
origins: combined_origins,
},
);
}
}
}
/// Collect taint from all `uses` variables (union of caps + merge origins).
fn collect_uses_taint(
&self,
info: &NodeInfo,
state: &TaintState,
) -> (Cap, SmallVec<[TaintOrigin; 2]>) {
let mut combined_caps = Cap::empty();
let mut combined_origins: SmallVec<[TaintOrigin; 2]> = SmallVec::new();
for u in &info.uses {
let taint = self.lookup_var(u, state);
if let Some(t) = taint {
combined_caps |= t.caps;
for orig in &t.origins {
if combined_origins.len() < 4
&& !combined_origins.iter().any(|o| o.node == orig.node)
{
combined_origins.push(*orig);
}
}
}
}
(combined_caps, combined_origins)
}
/// Look up a variable's taint, falling back to global_seed for JS two-level solve.
fn lookup_var<'a>(&'a self, name: &str, state: &'a TaintState) -> Option<&'a VarTaint> {
if let Some(sym) = self.interner.get(name) {
if let Some(taint) = state.get(sym) {
return Some(taint);
}
// Fall back to global seed (JS two-level solve)
if let Some(seed) = self.global_seed {
return seed.get(sym);
}
}
None
}
/// Resolve sink caps from label or callee summary.
fn resolve_sink_caps(&self, info: &NodeInfo, caller_func: &str) -> Cap {
match info.label {
Some(DataLabel::Sink(caps)) => caps,
_ => info
.callee
.as_ref()
.and_then(|c| self.resolve_callee(c, caller_func, info.call_ordinal))
.filter(|r| !r.sink_caps.is_empty())
.map(|r| r.sink_caps)
.unwrap_or(Cap::empty()),
}
}
/// Collect tainted variables at a sink node.
fn collect_tainted_sink_vars(
&self,
info: &NodeInfo,
state: &TaintState,
sink_caps: Cap,
) -> Vec<(SymbolId, Cap, SmallVec<[TaintOrigin; 2]>)> {
let mut result = Vec::new();
for u in &info.uses {
if let Some(taint) = self.lookup_var(u, state)
&& (taint.caps & sink_caps) != Cap::empty()
&& let Some(sym) = self.interner.get(u)
{
result.push((sym, taint.caps, taint.origins.clone()));
}
}
result
}
/// Resolve a callee name to its summary (local → global → interop).
fn resolve_callee(
&self,
callee: &str,
caller_func: &str,
call_ordinal: u32,
) -> Option<ResolvedSummary> {
let normalized = normalize_callee_name(callee);
// 1) Local (same-file)
let local_matches: Vec<_> = self
.local_summaries
.iter()
.filter(|(k, _)| {
k.name == normalized && k.lang == self.lang && k.namespace == self.namespace
})
.collect();
if local_matches.len() == 1 {
let (_, ls) = local_matches[0];
return Some(ResolvedSummary {
source_caps: ls.source_caps,
sanitizer_caps: ls.sanitizer_caps,
sink_caps: ls.sink_caps,
propagates_taint: ls.propagates_taint,
});
}
if local_matches.len() > 1 {
return None;
}
// 2) Global same-language
if let Some(gs) = self.global_summaries {
match gs.resolve_callee_key(normalized, self.lang, self.namespace, None) {
CalleeResolution::Resolved(target_key) => {
if let Some(fs) = gs.get(&target_key) {
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
}
CalleeResolution::NotFound | CalleeResolution::Ambiguous(_) => {}
}
}
// 3) Interop edges
for edge in self.interop_edges {
if edge.from.caller_lang == self.lang
&& edge.from.caller_namespace == self.namespace
&& edge.from.callee_symbol == callee
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_func)
&& (edge.from.ordinal == 0 || edge.from.ordinal == call_ordinal)
&& let Some(gs) = self.global_summaries
&& let Some(fs) = gs.get(&edge.to)
{
return Some(ResolvedSummary {
source_caps: fs.source_caps(),
sanitizer_caps: fs.sanitizer_caps(),
sink_caps: fs.sink_caps(),
propagates_taint: fs.propagates_taint,
});
}
}
None
}
}
/// Resolved summary for a callee.
///
/// A flattened view of whichever summary (same-file, global, or interop
/// target) the callee resolved to.
struct ResolvedSummary {
/// Capabilities the callee introduces on its return value by itself.
source_caps: Cap,
/// Capabilities stripped from the return value; applied after source and
/// propagated taint have been combined.
sanitizer_caps: Cap,
/// Capabilities for which a call to this callee is treated as a sink.
sink_caps: Cap,
/// When true, taint of the call's used variables flows into the result.
propagates_taint: bool,
}

View file

@ -61,6 +61,10 @@ pub struct ScannerConfig {
/// benchmarks, etc.) at their original severity. When false (default),
/// findings in these paths are downgraded by one severity tier.
pub include_nonprod: bool,
/// Enable the state-model dataflow engine for resource lifecycle and
/// auth-state analysis. Default: false (opt-in).
pub enable_state_analysis: bool,
}
impl Default for ScannerConfig {
fn default() -> Self {
@ -94,6 +98,7 @@ impl Default for ScannerConfig {
follow_symlinks: false,
scan_hidden_files: false,
include_nonprod: false,
enable_state_analysis: false,
}
}
}
@ -135,6 +140,60 @@ pub struct OutputConfig {
/// The maximum number of results to show.
pub max_results: Option<u32>,
/// Enable attack-surface ranking to sort findings by exploitability.
pub attack_surface_ranking: bool,
/// Minimum attack-surface score to include in output.
/// Findings below this threshold are dropped after ranking.
/// `None` means no minimum (all findings shown).
pub min_score: Option<u32>,
/// Minimum confidence level to include in output.
/// `None` means no minimum (all findings shown).
#[serde(
default,
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_confidence_opt"
)]
pub min_confidence: Option<crate::evidence::Confidence>,
/// Include Quality-category findings (excluded by default).
#[serde(default)]
pub include_quality: bool,
/// Show all findings: disables category filtering, rollups, and LOW budgets.
#[serde(default)]
pub show_all: bool,
/// Maximum total LOW findings to show.
#[serde(default = "default_max_low")]
pub max_low: u32,
/// Maximum LOW findings per file.
#[serde(default = "default_max_low_per_file")]
pub max_low_per_file: u32,
/// Maximum LOW findings per rule.
#[serde(default = "default_max_low_per_rule")]
pub max_low_per_rule: u32,
/// Number of example locations to store in rollup findings.
#[serde(default = "default_rollup_examples")]
pub rollup_examples: u32,
}
/// Serde default for `OutputConfig::max_low`.
fn default_max_low() -> u32 {
    const MAX_LOW: u32 = 20;
    MAX_LOW
}
/// Serde default for `OutputConfig::max_low_per_file`.
fn default_max_low_per_file() -> u32 {
    const MAX_LOW_PER_FILE: u32 = 1;
    MAX_LOW_PER_FILE
}
/// Serde default for `OutputConfig::max_low_per_rule`.
fn default_max_low_per_rule() -> u32 {
    const MAX_LOW_PER_RULE: u32 = 10;
    MAX_LOW_PER_RULE
}
/// Serde default for `OutputConfig::rollup_examples`.
fn default_rollup_examples() -> u32 {
    const ROLLUP_EXAMPLES: u32 = 5;
    ROLLUP_EXAMPLES
}
impl Default for OutputConfig {
@ -143,10 +202,36 @@ impl Default for OutputConfig {
default_format: "console".into(),
quiet: false,
max_results: None,
attack_surface_ranking: true,
min_score: None,
min_confidence: None,
include_quality: false,
show_all: false,
max_low: 20,
max_low_per_file: 1,
max_low_per_rule: 10,
rollup_examples: 5,
}
}
}
/// Deserialize an optional Confidence from a TOML string.
fn deserialize_confidence_opt<'de, D>(
deserializer: D,
) -> Result<Option<crate::evidence::Confidence>, D::Error>
where
D: serde::Deserializer<'de>,
{
let opt: Option<String> = Option::deserialize(deserializer)?;
match opt {
None => Ok(None),
Some(s) => s
.parse::<crate::evidence::Confidence>()
.map(Some)
.map_err(serde::de::Error::custom),
}
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(default)]
pub struct PerformanceConfig {
@ -303,6 +388,7 @@ fn merge_configs(mut default: Config, user: Config) -> Config {
default.scanner.follow_symlinks = user.scanner.follow_symlinks;
default.scanner.scan_hidden_files = user.scanner.scan_hidden_files;
default.scanner.include_nonprod = user.scanner.include_nonprod;
default.scanner.enable_state_analysis = user.scanner.enable_state_analysis;
// Merge exclusion lists (default ⊔ user), then sort & dedupe
default
@ -328,6 +414,15 @@ fn merge_configs(mut default: Config, user: Config) -> Config {
default.output.default_format = user.output.default_format;
default.output.quiet = user.output.quiet;
default.output.max_results = user.output.max_results;
default.output.attack_surface_ranking = user.output.attack_surface_ranking;
default.output.min_score = user.output.min_score;
default.output.min_confidence = user.output.min_confidence;
default.output.include_quality = user.output.include_quality;
default.output.show_all = user.output.show_all;
default.output.max_low = user.output.max_low;
default.output.max_low_per_file = user.output.max_low_per_file;
default.output.max_low_per_rule = user.output.max_low_per_rule;
default.output.rollup_examples = user.output.rollup_examples;
// --- PerformanceConfig ---
default.performance.max_depth = user.performance.max_depth;

View file

@ -147,8 +147,8 @@ pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver<Paths>, JoinHan
#[test]
fn walker_respects_excluded_extensions() {
let tmp = tempfile::tempdir().unwrap();
std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap();
std::fs::write(tmp.path().join("skip.txt"), "ignored").unwrap();
std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap(); // nyx:ignore cfg-unguarded-sink
std::fs::write(tmp.path().join("skip.txt"), "ignored").unwrap(); // nyx:ignore cfg-unguarded-sink
let mut cfg = Config::default();
cfg.scanner.excluded_extensions = vec!["txt".into()];