nyx/src/evidence.rs
2026-05-07 01:29:31 -04:00

1414 lines
53 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Structured evidence and confidence types for scan diagnostics.
//!
//! These types capture the provenance of findings (source locations,
//! sanitizer/guard info, state-machine transitions) in a structured form
//! that can be serialized to JSON and consumed by ranking, filtering,
//! and downstream tooling.
#![allow(clippy::collapsible_if)]
use crate::commands::scan::Diag;
use crate::patterns::Severity;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::str::FromStr;
// ─────────────────────────────────────────────────────────────────────────────
// Confidence
// ─────────────────────────────────────────────────────────────────────────────
/// Confidence level for a diagnostic finding.
///
/// Ordered Low < Medium < High so that `>=` comparisons work naturally
/// for filtering (e.g. `--min-confidence medium` keeps Medium and High).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Confidence {
Low,
Medium,
High,
}
impl fmt::Display for Confidence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Low => write!(f, "Low"),
Self::Medium => write!(f, "Medium"),
Self::High => write!(f, "High"),
}
}
}
impl FromStr for Confidence {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_ascii_lowercase().as_str() {
"low" => Ok(Self::Low),
"medium" | "med" => Ok(Self::Medium),
"high" => Ok(Self::High),
_ => Err(format!(
"unknown confidence level: {s:?} (expected low, medium, high)"
)),
}
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Flow Steps
// ─────────────────────────────────────────────────────────────────────────────
/// The kind of operation at a flow step.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FlowStepKind {
/// A source read: user input, environment variable, network data, etc.
Source,
/// A local assignment propagating taint from one variable to another.
Assignment,
/// A function call through which taint flows (via argument or return value).
Call,
/// An SSA phi node merging tainted values from multiple predecessors.
Phi,
/// The dangerous sink where tainted data is consumed.
Sink,
}
impl fmt::Display for FlowStepKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Source => write!(f, "source"),
Self::Assignment => write!(f, "assignment"),
Self::Call => write!(f, "call"),
Self::Phi => write!(f, "phi"),
Self::Sink => write!(f, "sink"),
}
}
}
/// A single step in a taint flow path (display-ready).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlowStep {
/// 1-based position of this step in the flow (source = 1, sink = N).
pub step: u32,
pub kind: FlowStepKind,
/// Project-relative file path where this step occurs.
pub file: String,
/// 1-based line number of the operation.
pub line: u32,
/// 0-based column offset of the operation.
pub col: u32,
/// Source code snippet at this location, if available.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub snippet: Option<String>,
/// SSA variable name carrying taint at this step.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub variable: Option<String>,
/// For [`FlowStepKind::Call`] steps, the name of the function called.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub callee: Option<String>,
/// Name of the enclosing function at this step.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub function: Option<String>,
/// True when this step crosses a file boundary, resolved via a cross-file
/// summary rather than direct SSA flow.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub is_cross_file: bool,
}
// ─────────────────────────────────────────────────────────────────────────────
// Symbolic verdict
// ─────────────────────────────────────────────────────────────────────────────
/// Symbolic verification verdict for a taint path.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Verdict {
/// Constraint solver confirmed the path is feasible.
Confirmed,
/// Constraint solver proved the path is infeasible.
Infeasible,
/// Constraint solver could not determine feasibility.
Inconclusive,
/// No symbolic analysis was attempted for this finding.
NotAttempted,
}
/// Summary of symbolic constraint analysis for a finding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolicVerdict {
/// The outcome of symbolic path feasibility analysis.
pub verdict: Verdict,
/// Number of path constraints checked during analysis.
#[serde(default)]
pub constraints_checked: u32,
/// Number of distinct paths explored from source to sink.
#[serde(default)]
pub paths_explored: u32,
/// Human-readable witness or proof sketch.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub witness: Option<String>,
/// Interprocedural call chains leading to callee-internal sinks.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub interproc_call_chains: Vec<Vec<String>>,
/// Cutoff/fallback reasons that limited analysis precision.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub cutoff_notes: Vec<String>,
}
// ─────────────────────────────────────────────────────────────────────────────
// Evidence
// ─────────────────────────────────────────────────────────────────────────────
/// Structured evidence for a diagnostic finding.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Evidence {
/// Where tainted data originated.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source: Option<SpanEvidence>,
/// Where the dangerous operation happens.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub sink: Option<SpanEvidence>,
/// Validation guards protecting this path.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub guards: Vec<SpanEvidence>,
/// Sanitizers applied to this path.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sanitizers: Vec<SpanEvidence>,
/// State-machine evidence (resource lifecycle / auth).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub state: Option<StateEvidence>,
/// Free-form notes for ranking and display.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub notes: Vec<String>,
/// Kind of taint source (structured; replaces "source_kind:..." in notes).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_kind: Option<crate::labels::SourceKind>,
/// Number of SSA blocks between source and sink.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hop_count: Option<u16>,
/// Whether this finding was resolved via a cross-function summary.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub uses_summary: bool,
/// Number of matching capability bits between source and sink.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cap_specificity: Option<u8>,
/// Step-by-step taint flow from source to sink.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub flow_steps: Vec<FlowStep>,
/// Human-readable explanation of the finding.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub explanation: Option<String>,
/// Reasons why confidence is not higher.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub confidence_limiters: Vec<String>,
/// Symbolic constraint analysis verdict for this finding's taint path.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub symbolic: Option<SymbolicVerdict>,
/// Resolved sink capability bits (u32 from `Cap::bits()`).
///
/// Used by deduplication to distinguish findings that share a
/// `(path, line, severity)` key but target different sinks (e.g.
/// `sink_sql(x); sink_shell(x);` on the same line). 0 when the sink
/// caps could not be resolved at the CFG node (e.g. pure summary
/// resolution where the caller's sink node carries no label).
#[serde(default, skip_serializing_if = "is_zero_cap_bits")]
pub sink_caps: u32,
/// Engine provenance notes attached to this finding (e.g. "worklist
/// iteration budget was hit before convergence"), propagated from
/// [`crate::taint::Finding::engine_notes`]. Empty for typical
/// under-budget findings and skipped during serialization in that case.
#[serde(default, skip_serializing_if = "smallvec::SmallVec::is_empty")]
pub engine_notes: smallvec::SmallVec<[crate::engine_notes::EngineNote; 2]>,
/// For `Cap::DATA_EXFIL` findings, the destination object-literal field
/// the tainted value reached (e.g. `"body"`, `"headers"`, `"json"`).
/// `None` for non-exfil findings, for exfil findings whose payload arg
/// was not an object literal, or when the sink was resolved through a
/// summary path that did not preserve destination metadata.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub data_exfil_field: Option<String>,
}
fn is_zero_cap_bits(v: &u32) -> bool {
*v == 0
}
impl Evidence {
/// Returns `true` if the evidence contains no useful data.
pub fn is_empty(&self) -> bool {
self.source.is_none()
&& self.sink.is_none()
&& self.guards.is_empty()
&& self.sanitizers.is_empty()
&& self.state.is_none()
&& self.notes.is_empty()
&& self.source_kind.is_none()
&& self.hop_count.is_none()
&& !self.uses_summary
&& self.cap_specificity.is_none()
&& self.flow_steps.is_empty()
&& self.explanation.is_none()
&& self.confidence_limiters.is_empty()
&& self.symbolic.is_none()
&& self.sink_caps == 0
&& self.engine_notes.is_empty()
}
}
/// A source-location evidence span.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpanEvidence {
pub path: String,
pub line: u32,
pub col: u32,
/// One of: `"source"`, `"sink"`, `"guard"`, `"sanitizer"`.
pub kind: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub snippet: Option<String>,
}
/// Evidence from a state-machine analysis (resource lifecycle / auth).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StateEvidence {
/// The state machine: `"resource"` or `"auth"`.
pub machine: String,
/// Variable name if available.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub subject: Option<String>,
/// State before the event.
pub from_state: String,
/// State after the event.
pub to_state: String,
}
// ─────────────────────────────────────────────────────────────────────────────
// compute_confidence
// ─────────────────────────────────────────────────────────────────────────────
/// Derive a confidence level for `diag` based on its rule ID, severity,
/// evidence, and analysis kind.
///
/// This is called as a post-pass after all findings are collected; findings
/// that already have a confidence set (e.g. from CFG analysis) are preserved.
///
/// When the finding carries engine provenance notes whose
/// [`crate::engine_notes::LossDirection`] is `OverReport` or `Bail`,
/// the computed confidence is capped at `Medium` regardless of the
/// points-based taint score. `OverReport` means precision was widened
/// (validation guards may have been lost, so the finding is more
/// likely to be a false positive); `Bail` means analysis of the body
/// aborted before producing a trustworthy result. `UnderReport` notes
/// (e.g. `WorklistCapped`) do *not* cap confidence, the reported flow
/// is still real, just surrounded by an incomplete result set.
pub fn compute_confidence(diag: &Diag) -> Confidence {
// Degraded analysis caps confidence
if let Some(ev) = &diag.evidence
&& ev.notes.iter().any(|n| n.starts_with("degraded:"))
{
return Confidence::Low;
}
let id = &diag.id;
let base = if id.starts_with("taint-data-exfiltration") {
// DATA_EXFIL is calibrated independently from the generic taint path:
// the value at risk is the leak of an *already-sensitive* source, not
// the construction of an attacker payload, so the points-based scoring
// tuned for code-exec / SSRF / SQLi over-credits these findings. Route
// to a narrower decision tree that asks "did we corroborate a real
// string body leaving the process?" instead.
compute_data_exfil_confidence(diag)
} else if id.starts_with("taint-") {
compute_taint_confidence(diag)
} else if id.starts_with("state-") {
match id.as_str() {
"state-use-after-close" => Confidence::High,
"state-double-close" => Confidence::High,
"state-unauthed-access" => Confidence::High,
"state-resource-leak" => Confidence::Medium,
"state-resource-leak-possible" => Confidence::Low,
_ => Confidence::Medium,
}
} else if id.starts_with("cfg-") {
// If CFG conversion already set confidence, preserve it
diag.confidence.unwrap_or(Confidence::Medium)
} else if diag.severity == Severity::High {
// AST patterns: High severity → Medium confidence, else Low
Confidence::Medium
} else {
Confidence::Low
};
apply_engine_notes_cap(diag, base)
}
/// Cap `base` at `Medium` when the finding carries any engine note
/// whose direction is [`crate::engine_notes::LossDirection::OverReport`]
/// or [`crate::engine_notes::LossDirection::Bail`].
///
/// Returns `base` unchanged when no evidence is present, no notes are
/// attached, or only `Informational` / `UnderReport` notes are present.
fn apply_engine_notes_cap(diag: &Diag, base: Confidence) -> Confidence {
let Some(ev) = &diag.evidence else {
return base;
};
let Some(worst) = crate::engine_notes::worst_direction(&ev.engine_notes) else {
return base;
};
match worst {
crate::engine_notes::LossDirection::OverReport
| crate::engine_notes::LossDirection::Bail => base.min(Confidence::Medium),
// UnderReport: result set is a lower bound, but the emitted
// finding itself remains as credible as the analysis decided.
// Do not cap, the rank completeness penalty is the right lever
// for that case (see rank.rs::completeness_penalty).
crate::engine_notes::LossDirection::UnderReport => base,
// Informational is filtered out upstream by `worst_direction`,
// but keep the arm to force a decision if the enum grows.
crate::engine_notes::LossDirection::Informational => base,
}
}
/// Points-based confidence scoring for taint findings.
///
/// Uses evidence metadata (source kind, path length, validation, cap
/// specificity, summary resolution) to produce a nuanced confidence level
/// instead of the previous flat High assignment.
fn compute_taint_confidence(diag: &Diag) -> Confidence {
let ev = match &diag.evidence {
Some(e) => e,
None => return Confidence::High, // no evidence struct → conservative High
};
let mut score: i32 = 0;
// Source kind (prefer structured field, fall back to notes)
score += match ev.source_kind {
Some(kind) => structured_source_kind_score(kind),
None => source_kind_score(&ev.notes),
};
// Evidence completeness
let has_source = ev.source.is_some();
let has_sink = ev.sink.is_some();
let has_snippet = ev.source.as_ref().is_some_and(|s| s.snippet.is_some())
|| ev.sink.as_ref().is_some_and(|s| s.snippet.is_some());
score += if has_source && has_sink && has_snippet {
3
} else if has_source && has_sink {
2
} else {
1
};
// Hop count penalty (prefer structured field)
score += match ev.hop_count {
Some(count) => match count {
0..=3 => 0,
4..=8 => -1,
_ => -2,
},
None => hop_count_score(&ev.notes),
};
// Path validation penalty (use Diag field directly)
if diag.path_validated {
score -= 3;
}
// Cap specificity bonus (prefer structured field)
score += match ev.cap_specificity {
Some(count) => {
if count == 1 {
1
} else {
0
}
}
None => cap_specificity_score(&ev.notes),
};
// Summary resolution penalty (prefer structured field)
if ev.uses_summary || ev.notes.iter().any(|n| n == "uses_summary") {
score -= 1;
}
// Symbolic verdict adjustments
if let Some(ref sv) = ev.symbolic {
match sv.verdict {
Verdict::Infeasible => score -= 5,
Verdict::Confirmed => {
// Stronger bonus when extract_witness produced a concrete payload
// (contains "flows to" or "reaches"); raw Display-only fallback
// from get_sink_witness does not contain these phrases.
if sv
.witness
.as_ref()
.is_some_and(|w| w.contains("flows to") || w.contains("reaches"))
{
score += 3;
} else {
score += 2;
}
}
Verdict::Inconclusive | Verdict::NotAttempted => {}
}
// Backwards-driven corroboration / infeasibility. We
// deliberately use a smaller magnitude than the symex verdict so
// symex (which reasons about concrete payloads) stays the stronger
// signal; backwards is a structural agreement check.
use crate::taint::backwards::{NOTE_BUDGET, NOTE_CONFIRMED, NOTE_INFEASIBLE};
if sv.cutoff_notes.iter().any(|n| n == NOTE_CONFIRMED) {
score += 1;
}
if sv.cutoff_notes.iter().any(|n| n == NOTE_INFEASIBLE) {
score -= 3;
}
let _ = NOTE_BUDGET;
}
match score {
5.. => Confidence::High,
2..=4 => Confidence::Medium,
_ => Confidence::Low,
}
}
/// Confidence routing for `taint-data-exfiltration` findings.
///
/// The generic taint scorer ranks DATA_EXFIL too aggressively: a Sensitive
/// source plus a sink call is enough to push it into the Medium/High band,
/// but the leak class needs corroboration that a real string body actually
/// leaves the process (otherwise we surface every `fetch(..., {body: x})`
/// where `x` happens to be Sensitive-tagged). This routing is deliberately
/// capped at Medium and only fires Medium when the symbolic execution
/// verdict confirms the path (abstract interpretation participates only as
/// a sink-suppression filter inside SSA taint and does not surface a
/// separate verdict here).
///
/// Routing:
/// * Source < Sensitive → Low (caller already strips DATA_EXFIL for
/// Plain sources, but defensively floor here).
/// * Symbolic verdict `Confirmed` → Medium (symex produced a witness
/// that a tainted string reaches the body argument).
/// * Symbolic verdict `Inconclusive` / `NotAttempted` / no symbolic
/// analysis → Low (instruction's "Inconclusive" tier; the `Confidence`
/// enum has no separate Inconclusive variant so it floors to Low).
/// * Symbolic verdict `Infeasible` → Low (path proven dead).
///
/// After routing, a `path_validated` guard on the diag drops the result
/// one tier (Medium → Low; Low stays Low) and `apply_engine_notes_cap`
/// applies the standard engine-notes cap.
fn compute_data_exfil_confidence(diag: &Diag) -> Confidence {
let ev = match &diag.evidence {
Some(e) => e,
None => return Confidence::Low,
};
let is_sensitive = ev
.source_kind
.map(|k| k.sensitivity() >= crate::labels::Sensitivity::Sensitive)
.unwrap_or(false);
if !is_sensitive {
return Confidence::Low;
}
let mut base = match ev.symbolic.as_ref().map(|s| s.verdict) {
Some(Verdict::Confirmed) => Confidence::Medium,
Some(Verdict::Infeasible) => Confidence::Low,
Some(Verdict::Inconclusive) | Some(Verdict::NotAttempted) | None => Confidence::Low,
};
// Guarded flow: drop a tier. A validation predicate on the path means
// the leak may be unreachable in practice, so the corroborated witness
// is downgraded one step (Medium → Low; Low stays Low).
if diag.path_validated && base > Confidence::Low {
base = Confidence::Low;
}
apply_engine_notes_cap(diag, base)
}
/// Score a structured `SourceKind` value.
///
/// UserInput=+3, EnvironmentConfig=+2, Unknown/FileSystem=+1, Database/CaughtException=0.
fn structured_source_kind_score(kind: crate::labels::SourceKind) -> i32 {
use crate::labels::SourceKind;
match kind {
// Cookie / Header carry auth material, score them at the same
// ranking weight as direct user input rather than the lower
// FileSystem/Database tiers.
SourceKind::UserInput | SourceKind::Cookie | SourceKind::Header => 3,
SourceKind::EnvironmentConfig => 2,
SourceKind::Unknown | SourceKind::FileSystem => 1,
SourceKind::Database | SourceKind::CaughtException => 0,
}
}
/// Extract source_kind from evidence notes and return points (legacy fallback).
///
/// UserInput=+3, EnvironmentConfig=+2, Unknown/FileSystem=+1, Database/CaughtException=0.
fn source_kind_score(notes: &[String]) -> i32 {
for note in notes {
if let Some(kind) = note.strip_prefix("source_kind:") {
return match kind {
"UserInput" => 3,
"EnvironmentConfig" => 2,
"Unknown" | "FileSystem" => 1,
_ => 0, // Database, CaughtException, etc.
};
}
}
1 // conservative default if missing
}
/// Extract hop_count from evidence notes and return penalty.
///
/// 03 blocks = 0, 48 = 1, 9+ = 2.
fn hop_count_score(notes: &[String]) -> i32 {
for note in notes {
if let Some(count_str) = note.strip_prefix("hop_count:") {
if let Ok(count) = count_str.parse::<u16>() {
return match count {
0..=3 => 0,
4..=8 => -1,
_ => -2,
};
}
}
}
0 // no hop info → no penalty
}
/// Extract cap_specificity from evidence notes and return bonus.
///
/// 1 bit (exact match) = +1, otherwise 0.
fn cap_specificity_score(notes: &[String]) -> i32 {
for note in notes {
if let Some(count_str) = note.strip_prefix("cap_specificity:") {
if let Ok(count) = count_str.parse::<u8>() {
return if count == 1 { 1 } else { 0 };
}
}
}
0
}
// ─────────────────────────────────────────────────────────────────────────────
// Explanation & Confidence Limiters
// ─────────────────────────────────────────────────────────────────────────────
/// Generate a human-readable explanation of a taint finding from its evidence.
pub fn generate_explanation(diag: &Diag) -> Option<String> {
let ev = diag.evidence.as_ref()?;
let source = ev.source.as_ref()?;
let sink = ev.sink.as_ref()?;
let source_callee = source.snippet.as_deref().unwrap_or("(unknown source)");
let sink_callee = sink.snippet.as_deref().unwrap_or("(unknown sink)");
// Extract source kind label (prefer structured field)
let source_kind_label = if let Some(kind) = ev.source_kind {
use crate::labels::SourceKind;
match kind {
SourceKind::UserInput => "user input",
SourceKind::Cookie => "cookie",
SourceKind::Header => "request header",
SourceKind::EnvironmentConfig => "environment/config",
SourceKind::Database => "database",
SourceKind::FileSystem => "file system",
SourceKind::CaughtException => "caught exception",
SourceKind::Unknown => "unclassified",
}
} else {
// Legacy fallback: parse from notes
let kind_str = ev
.notes
.iter()
.find_map(|n| n.strip_prefix("source_kind:"))
.unwrap_or("unknown");
match kind_str {
"UserInput" => "user input",
"EnvironmentConfig" => "environment/config",
"Database" => "database",
"FileSystem" => "file system",
"CaughtException" => "caught exception",
_ => "unclassified",
}
};
// Extract category from rule ID
let category = diag
.id
.strip_prefix("taint-unsanitised-flow")
.map(|_| extract_category_from_id(&diag.id))
.unwrap_or_else(|| "injection".to_string());
let step_count = ev.flow_steps.len();
let mut explanation = if step_count > 2 {
format!(
"Unsanitised {source_kind_label} data flows from {source_callee} (line {}) through {} steps to {sink_callee} (line {}), creating a potential {category} vulnerability.",
source.line,
step_count - 2, // exclude source and sink themselves
sink.line,
)
} else {
format!(
"Unsanitised {source_kind_label} data flows from {source_callee} (line {}) to {sink_callee} (line {}), creating a potential {category} vulnerability.",
source.line, sink.line,
)
};
// Conditional addenda
if diag.path_validated {
if let Some(ref guard) = diag.guard_kind {
explanation.push_str(&format!(
" A {guard} guard was detected but may not be sufficient."
));
}
}
if ev.uses_summary || ev.notes.iter().any(|n| n == "uses_summary") {
explanation.push_str(" The flow crosses function boundaries via summary resolution.");
}
Some(explanation)
}
/// Extract a vulnerability category label from the Diag (used in explanation text).
fn extract_category_from_id(id: &str) -> String {
// Rule IDs like "taint-unsanitised-flow (source 3:1)", category comes
// from the finding category field, but we approximate from the ID here.
if id.contains("sql") || id.contains("SQL") {
"SQL injection".to_string()
} else if id.contains("xss") || id.contains("XSS") {
"XSS".to_string()
} else {
"injection".to_string()
}
}
/// Compute reasons why confidence is not higher.
pub fn compute_confidence_limiters(diag: &Diag) -> Vec<String> {
let mut limiters = Vec::new();
let ev = match &diag.evidence {
Some(e) => e,
None => return limiters,
};
// Hop count (prefer structured field)
let hop = ev.hop_count.or_else(|| {
ev.notes
.iter()
.find_map(|n| n.strip_prefix("hop_count:")?.parse::<u16>().ok())
});
if let Some(count) = hop {
if count >= 4 {
limiters.push(format!(
"Taint path spans {count} blocks, increasing chance of intermediate sanitization"
));
}
}
// Summary resolution (prefer structured field)
if ev.uses_summary || ev.notes.iter().any(|n| n == "uses_summary") {
limiters.push("Flow resolved via cross-function summary (may be imprecise)".into());
}
// Path validated (use Diag field directly)
if diag.path_validated {
limiters.push("Validation guard detected on path (may provide protection)".into());
}
// Cap specificity (prefer structured field)
let cap_spec = ev.cap_specificity.or_else(|| {
ev.notes
.iter()
.find_map(|n| n.strip_prefix("cap_specificity:")?.parse::<u8>().ok())
});
if cap_spec == Some(0) {
limiters.push("Source and sink capability types do not match specifically".into());
}
// Source kind unknown (prefer structured field)
let is_unknown = ev.source_kind == Some(crate::labels::SourceKind::Unknown)
|| ev.notes.iter().any(|n| n == "source_kind:Unknown");
if is_unknown {
limiters.push("Source type is unclassified (lower exploitation confidence)".into());
}
// Symbolic verdict
if let Some(ref sv) = ev.symbolic {
if sv.verdict == Verdict::Infeasible {
limiters.push("Symbolic analysis proved this path is infeasible".into());
}
}
// Demand-driven backwards analysis notes (stored on
// `symbolic.cutoff_notes` so the evidence pipeline already plumbs
// them). When the backwards walk proved the flow infeasible or ran
// out of budget, surface a user-readable limiter.
if let Some(ref sv) = ev.symbolic {
use crate::taint::backwards::{NOTE_BUDGET, NOTE_CONFIRMED, NOTE_INFEASIBLE};
if sv.cutoff_notes.iter().any(|n| n == NOTE_INFEASIBLE) {
limiters.push("Backwards demand-driven analysis proved this flow infeasible".into());
} else if sv.cutoff_notes.iter().any(|n| n == NOTE_BUDGET) {
limiters.push(
"Backwards demand-driven analysis exceeded its budget (verdict not reached)".into(),
);
}
// Confirmation is *not* a limiter, it is a positive signal. The
// taint-confidence scorer picks it up separately.
let _ = NOTE_CONFIRMED;
}
limiters
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
use crate::labels::SourceKind;
fn make_diag(id: &str, severity: Severity) -> Diag {
Diag {
path: "test.rs".into(),
line: 1,
col: 1,
severity,
id: id.into(),
category: crate::patterns::FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: None,
labels: vec![],
confidence: None,
evidence: None,
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
finding_id: String::new(),
alternative_finding_ids: Vec::new(),
}
}
#[test]
fn compute_confidence_taint_strong_path() {
// UserInput(+3) + source+sink+snippet(+3) + short path(0) + cap_specificity:1(+1) = 7 → High
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("env::var(\"X\")".into()),
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: Some("exec()".into()),
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![
"source_kind:UserInput".into(),
"hop_count:1".into(),
"cap_specificity:1".into(),
],
source_kind: Some(crate::labels::SourceKind::UserInput),
hop_count: Some(1),
cap_specificity: Some(1),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn compute_confidence_taint_medium_path() {
// EnvironmentConfig(+2) + source+sink no snippet(+2) + hop_count:5(1) = 3 → Medium
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: None,
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec!["source_kind:EnvironmentConfig".into(), "hop_count:5".into()],
source_kind: Some(crate::labels::SourceKind::EnvironmentConfig),
hop_count: Some(5),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn compute_confidence_taint_weak_path() {
// Database(0) + source+sink no snippet(+2) + hop_count:12(2) + uses_summary(1) = 1 → Low
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: None,
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 20,
col: 5,
kind: "sink".into(),
snippet: None,
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec![
"source_kind:Database".into(),
"hop_count:12".into(),
"uses_summary".into(),
],
source_kind: Some(crate::labels::SourceKind::Database),
hop_count: Some(12),
uses_summary: true,
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Low);
}
#[test]
fn compute_confidence_taint_validated_with_source() {
// UserInput(+3) + source+sink+snippet(+3) + path_validated(3) = 3 → Medium
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.path_validated = true;
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("req.query".into()),
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: Some("exec()".into()),
}),
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec!["path_validated".into(), "source_kind:UserInput".into()],
source_kind: Some(crate::labels::SourceKind::UserInput),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn compute_confidence_taint_no_evidence() {
// No Evidence struct → conservative High
let d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn compute_confidence_degraded_caps_to_low() {
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: None,
sink: None,
guards: vec![],
sanitizers: vec![],
state: None,
notes: vec!["degraded:budget_exceeded".into()],
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Low);
}
#[test]
fn compute_confidence_state_rules() {
assert_eq!(
compute_confidence(&make_diag("state-use-after-close", Severity::High)),
Confidence::High,
);
assert_eq!(
compute_confidence(&make_diag("state-double-close", Severity::Medium)),
Confidence::High,
);
assert_eq!(
compute_confidence(&make_diag("state-unauthed-access", Severity::High)),
Confidence::High,
);
assert_eq!(
compute_confidence(&make_diag("state-resource-leak", Severity::Medium)),
Confidence::Medium,
);
assert_eq!(
compute_confidence(&make_diag("state-resource-leak-possible", Severity::Low)),
Confidence::Low,
);
}
#[test]
fn compute_confidence_cfg_preserves_existing() {
let mut d = make_diag("cfg-unguarded-sink", Severity::High);
d.confidence = Some(Confidence::Low);
assert_eq!(compute_confidence(&d), Confidence::Low);
}
#[test]
fn compute_confidence_ast_low() {
let d = make_diag("rs.code_exec.eval", Severity::Medium);
assert_eq!(compute_confidence(&d), Confidence::Low);
}
#[test]
fn compute_confidence_ast_high_severity_medium() {
let d = make_diag("rs.code_exec.eval", Severity::High);
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
// ── engine_notes direction-aware capping ────────────────────────
fn taint_high_confidence_diag() -> Diag {
// A known-High taint configuration: UserInput + source+sink+snippet +
// short path + cap_specificity=1 → score 7 → High. Re-used as the
// "clean" baseline for every engine-notes cap test.
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("req.query.id".into()),
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 5,
col: 1,
kind: "sink".into(),
snippet: Some("exec(id)".into()),
}),
source_kind: Some(SourceKind::UserInput),
cap_specificity: Some(1),
hop_count: Some(1),
..Default::default()
});
d
}
fn with_notes(mut d: Diag, notes: Vec<crate::engine_notes::EngineNote>) -> Diag {
let mut ev = d.evidence.clone().unwrap_or_default();
ev.engine_notes = smallvec::SmallVec::from_vec(notes);
d.evidence = Some(ev);
d
}
#[test]
fn confidence_uncapped_without_engine_notes() {
assert_eq!(
compute_confidence(&taint_high_confidence_diag()),
Confidence::High,
"baseline must be High so cap tests have something to cap"
);
}
#[test]
fn confidence_not_capped_by_under_report() {
// UnderReport indicates we may have missed OTHER findings. The
// finding we *did* emit is still sound; its confidence stays High.
let d = with_notes(
taint_high_confidence_diag(),
vec![crate::engine_notes::EngineNote::WorklistCapped { iterations: 100 }],
);
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn confidence_capped_at_medium_by_over_report() {
// OverReport (PredicateStateWidened) means validation predicates
// were lost, the emitted finding is more likely to be spurious.
let d = with_notes(
taint_high_confidence_diag(),
vec![crate::engine_notes::EngineNote::PredicateStateWidened],
);
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn confidence_capped_at_medium_by_bail() {
let d = with_notes(
taint_high_confidence_diag(),
vec![crate::engine_notes::EngineNote::ParseTimeout { timeout_ms: 1000 }],
);
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn confidence_cap_does_not_upgrade_low() {
// `base.min(Medium)` is what caps, it must not *raise* a Low
// baseline to Medium. Use a taint finding with weak evidence so
// the points scorer gives us Low, then attach a Bail note.
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::Low);
d.evidence = Some(Evidence {
source: None,
sink: None,
source_kind: Some(SourceKind::Database),
hop_count: Some(10),
..Default::default()
});
d = with_notes(
d,
vec![crate::engine_notes::EngineNote::ParseTimeout { timeout_ms: 100 }],
);
assert_eq!(
compute_confidence(&d),
Confidence::Low,
"Bail cap must never raise Low → Medium"
);
}
#[test]
fn confidence_not_capped_by_informational() {
let d = with_notes(
taint_high_confidence_diag(),
vec![crate::engine_notes::EngineNote::InlineCacheReused],
);
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn confidence_cap_applies_to_state_findings_too() {
// state-use-after-close is High by default; an OverReport note
// on it must cap it to Medium, same as the taint path.
let d = with_notes(
make_diag("state-use-after-close", Severity::High),
vec![crate::engine_notes::EngineNote::PredicateStateWidened],
);
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn confidence_cap_chooses_worst_when_mixed() {
// UnderReport alone does not cap; OverReport does. Mixing them
// must apply the cap (worst-direction wins).
let d = with_notes(
taint_high_confidence_diag(),
vec![
crate::engine_notes::EngineNote::WorklistCapped { iterations: 10 },
crate::engine_notes::EngineNote::PredicateStateWidened,
],
);
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn evidence_is_empty() {
let ev = Evidence::default();
assert!(ev.is_empty());
let ev2 = Evidence {
source: Some(SpanEvidence {
path: "x.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: None,
}),
..Default::default()
};
assert!(!ev2.is_empty());
}
#[test]
fn confidence_ord() {
assert!(Confidence::Low < Confidence::Medium);
assert!(Confidence::Medium < Confidence::High);
assert!(Confidence::Low < Confidence::High);
}
#[test]
fn confidence_display_and_parse() {
assert_eq!(Confidence::Low.to_string(), "Low");
assert_eq!(Confidence::Medium.to_string(), "Medium");
assert_eq!(Confidence::High.to_string(), "High");
assert_eq!("low".parse::<Confidence>().unwrap(), Confidence::Low);
assert_eq!("MEDIUM".parse::<Confidence>().unwrap(), Confidence::Medium);
assert_eq!("High".parse::<Confidence>().unwrap(), Confidence::High);
assert!("invalid".parse::<Confidence>().is_err());
}
#[test]
fn compute_confidence_does_not_override_preset() {
// AST patterns set confidence directly; compute_confidence must not overwrite.
let mut d = make_diag("rs.quality.expect", Severity::Low);
d.confidence = Some(Confidence::High);
// The post-pass only runs when confidence is None, but verify compute_confidence
// itself would return something different (Low for AST + Low severity), proving
// the guard in scan.rs is necessary.
assert_eq!(compute_confidence(&d), Confidence::Low);
// The actual guard: confidence is already Some, so scan.rs skips compute_confidence.
assert_eq!(d.confidence, Some(Confidence::High));
}
#[test]
fn json_omits_none_fields() {
let ev = Evidence::default();
let json = serde_json::to_string(&ev).unwrap();
assert_eq!(json, "{}");
}
#[test]
fn symbolic_verdict_serde_round_trip() {
for verdict in [
Verdict::Confirmed,
Verdict::Infeasible,
Verdict::Inconclusive,
Verdict::NotAttempted,
] {
let sv = SymbolicVerdict {
verdict,
constraints_checked: 42,
paths_explored: 7,
witness: Some("x=null forces false branch".into()),
interproc_call_chains: Vec::new(),
cutoff_notes: Vec::new(),
};
let json = serde_json::to_string(&sv).unwrap();
let rt: SymbolicVerdict = serde_json::from_str(&json).unwrap();
assert_eq!(rt.verdict, verdict);
assert_eq!(rt.constraints_checked, 42);
assert_eq!(rt.paths_explored, 7);
assert_eq!(rt.witness.as_deref(), Some("x=null forces false branch"));
}
// Verify snake_case serialization
let json = serde_json::to_string(&Verdict::NotAttempted).unwrap();
assert_eq!(json, "\"not_attempted\"");
}
#[test]
fn evidence_with_symbolic_not_empty() {
let ev = Evidence {
symbolic: Some(SymbolicVerdict {
verdict: Verdict::Confirmed,
constraints_checked: 1,
paths_explored: 1,
witness: None,
interproc_call_chains: Vec::new(),
cutoff_notes: Vec::new(),
}),
..Default::default()
};
assert!(!ev.is_empty());
}
#[test]
fn symbolic_witness_omitted_when_none() {
let sv = SymbolicVerdict {
verdict: Verdict::Inconclusive,
constraints_checked: 0,
paths_explored: 0,
witness: None,
interproc_call_chains: Vec::new(),
cutoff_notes: Vec::new(),
};
let json = serde_json::to_string(&sv).unwrap();
assert!(!json.contains("witness"));
}
#[test]
fn compute_confidence_structured_fields_only() {
// Structured fields without notes → same result as with notes
// UserInput(+3) + source+sink+snippet(+3) + hop_count:1(0) + cap_specificity:1(+1) = 7 → High
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("req.query".into()),
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: Some("exec()".into()),
}),
source_kind: Some(crate::labels::SourceKind::UserInput),
hop_count: Some(1),
cap_specificity: Some(1),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn compute_confidence_notes_only_backward_compat() {
// Notes only (no structured fields) → backward compatible
// EnvironmentConfig(+2) + source+sink(+2) + hop_count:5(1) = 3 → Medium
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: None,
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: None,
}),
notes: vec!["source_kind:EnvironmentConfig".into(), "hop_count:5".into()],
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Medium);
}
#[test]
fn compute_confidence_symbolic_infeasible_demotes() {
// UserInput(+3) + source+sink+snippet(+3) + Infeasible(5) = 1 → Low
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: Some("req.query".into()),
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: Some("exec()".into()),
}),
source_kind: Some(crate::labels::SourceKind::UserInput),
symbolic: Some(SymbolicVerdict {
verdict: Verdict::Infeasible,
constraints_checked: 3,
paths_explored: 1,
witness: None,
interproc_call_chains: Vec::new(),
cutoff_notes: Vec::new(),
}),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::Low);
}
#[test]
fn compute_confidence_symbolic_confirmed_boosts() {
// EnvironmentConfig(+2) + source+sink(+2) + Confirmed(+2) = 6 → High
let mut d = make_diag("taint-unsanitised-flow (source 1:1)", Severity::High);
d.evidence = Some(Evidence {
source: Some(SpanEvidence {
path: "test.rs".into(),
line: 1,
col: 1,
kind: "source".into(),
snippet: None,
}),
sink: Some(SpanEvidence {
path: "test.rs".into(),
line: 10,
col: 5,
kind: "sink".into(),
snippet: None,
}),
source_kind: Some(crate::labels::SourceKind::EnvironmentConfig),
symbolic: Some(SymbolicVerdict {
verdict: Verdict::Confirmed,
constraints_checked: 2,
paths_explored: 1,
witness: None,
interproc_call_chains: Vec::new(),
cutoff_notes: Vec::new(),
}),
..Default::default()
});
assert_eq!(compute_confidence(&d), Confidence::High);
}
#[test]
fn evidence_with_structured_fields_not_empty() {
let ev = Evidence {
source_kind: Some(crate::labels::SourceKind::UserInput),
..Default::default()
};
assert!(!ev.is_empty());
let ev2 = Evidence {
uses_summary: true,
..Default::default()
};
assert!(!ev2.is_empty());
}
#[test]
fn source_kind_serde_round_trip() {
use crate::labels::SourceKind;
for kind in [
SourceKind::UserInput,
SourceKind::EnvironmentConfig,
SourceKind::FileSystem,
SourceKind::Database,
SourceKind::CaughtException,
SourceKind::Unknown,
] {
let json = serde_json::to_string(&kind).unwrap();
let rt: SourceKind = serde_json::from_str(&json).unwrap();
assert_eq!(rt, kind);
}
// Verify snake_case serialization
let json = serde_json::to_string(&crate::labels::SourceKind::UserInput).unwrap();
assert_eq!(json, "\"user_input\"");
}
}